From 83e4f7a530afed8642045eebca7cf6b7863e94a2 Mon Sep 17 00:00:00 2001 From: Jeff Heller Date: Mon, 25 May 2026 12:21:56 -0600 Subject: [PATCH 1/8] =?UTF-8?q?feat:=20scale=20telemetry=20=E2=80=94=20SoC?= =?UTF-8?q?=20temp,=20weight-stall=20watchdog,=20reset=20reason?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diagnostics for the field "weight stops being collected under sustained load" reports (suspected thermal). Adds to the WS status frame and serial logs: - soc_temp_c / soc_temp_max_c: live + peak ESP32-S3 die temperature (temperatureRead()), sampled every 2s on the main loop. - weight_stalled + stall_count + last_stall_ms + last_stall_temp_c: a watchdog in pureScale() that flags when the ADS1232 raw value is frozen/railed for >8s (a live cell dithers every sample), recording the die temp at the moment of the stall to correlate failures with heat. (This failure is not firmware-recoverable, so it's surfaced, not silently retried.) - reset_reason: esp_reset_reason() captured at boot, so a brownout/panic/WDT reset is attributable instead of looking like a clean power-on. Telemetry-only; no behavior change to the weight/WiFi/BLE paths. Co-Authored-By: Claude Opus 4.7 --- include/parameter.h | 17 +++++++++++ include/websocket.h | 22 +++++++++++--- src/hds.ino | 71 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 4 deletions(-) diff --git a/include/parameter.h b/include/parameter.h index 4f12594..a795110 100644 --- a/include/parameter.h +++ b/include/parameter.h @@ -196,6 +196,23 @@ static bool b_adc_recovery_active = false; static uint8_t i_adc_recovery_count = 0; //bool b_tempDisablePowerOff = true; +// Instrumentation for diagnosing the "weight stops being collected" failure +// under sustained load (suspected thermal). b_weightStalled is set by the +// pureScale() stall watchdog when the ADC raw value is frozen/railed; written on +// the main loop, read in the WS status frame. 
g_resetReason is the last reset +// cause (esp_reset_reason()) captured at boot, surfaced for fleet telemetry. +volatile bool b_weightStalled = false; +const char *g_resetReason = "unknown"; +// Peak/last-event stats since boot (survive nothing; reset on reboot, which +// g_resetReason then explains). Sampled on the main loop, read in the WS status +// frame. g_socTempMaxC = highest SoC die temp seen; the *_stall_* fields capture +// the most recent stall so the failure is visible after the fact. +volatile float g_socTempC = 0.0f; // latest SoC temperature (C) +volatile float g_socTempMaxC = -100.0f; // peak SoC temperature since boot (C) +volatile uint32_t g_stallCount = 0; // number of weight-stall events since boot +volatile unsigned long g_lastStallMs = 0; // millis() of the last stall onset (0 = none) +volatile float g_lastStallTempC = 0.0f; // SoC temp when the last stall began + bool b_negativeWeight = false; bool b_weight_quick_zero = false; //Tare后快速显示为0优化 diff --git a/include/websocket.h b/include/websocket.h index b1185c5..09906e2 100644 --- a/include/websocket.h +++ b/include/websocket.h @@ -183,7 +183,7 @@ void sendWebsocketRateInfo(AsyncWebSocketClient *client, const char *status) { } void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { - client->printf("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu}", + 
client->printf("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"reset_reason\":\"%s\"}", status, FIRMWARE_VER, f_displayedValue, @@ -198,7 +198,14 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { b_softSleep ? "true" : "false", b_websocketEventsEnabled ? "true" : "false", websocketRateForInterval(weightWebsocketNotifyInterval), - weightWebsocketNotifyInterval); + weightWebsocketNotifyInterval, + g_socTempC, + g_socTempMaxC, + b_weightStalled ? "true" : "false", + (unsigned long)g_stallCount, + g_lastStallMs, + g_lastStallTempC, + g_resetReason); } // Broadcast via printfAll(): it holds the library's client-list mutex and @@ -211,7 +218,7 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { // without blocking the others. 
void sendWebsocketStatusAll(const char *status) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; - websocket.printfAll("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu}", + websocket.printfAll("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"reset_reason\":\"%s\"}", status, FIRMWARE_VER, f_displayedValue, @@ -226,7 +233,14 @@ void sendWebsocketStatusAll(const char *status) { b_softSleep ? "true" : "false", b_websocketEventsEnabled ? "true" : "false", websocketRateForInterval(weightWebsocketNotifyInterval), - weightWebsocketNotifyInterval); + weightWebsocketNotifyInterval, + g_socTempC, + g_socTempMaxC, + b_weightStalled ? "true" : "false", + (unsigned long)g_stallCount, + g_lastStallMs, + g_lastStallTempC, + g_resetReason); } void sendWebsocketWeightAll(float grams, unsigned long ms) { diff --git a/src/hds.ino b/src/hds.ino index aac5ec8..b833d38 100644 --- a/src/hds.ino +++ b/src/hds.ino @@ -392,10 +392,31 @@ void wifi_init() { MyUsbCallbacks usbCallbacks; +// Map esp_reset_reason() to a short string for boot logging + WS telemetry, so a +// spontaneous reset (brownout / panic / watchdog) is attributable instead of +// looking like a clean power-on. 
+const char *resetReasonStr(esp_reset_reason_t r) { + switch (r) { + case ESP_RST_POWERON: return "poweron"; + case ESP_RST_EXT: return "ext"; + case ESP_RST_SW: return "sw"; + case ESP_RST_PANIC: return "panic"; + case ESP_RST_INT_WDT: return "int_wdt"; + case ESP_RST_TASK_WDT: return "task_wdt"; + case ESP_RST_WDT: return "wdt"; + case ESP_RST_DEEPSLEEP: return "deepsleep"; + case ESP_RST_BROWNOUT: return "brownout"; + case ESP_RST_SDIO: return "sdio"; + default: return "unknown"; + } +} + void setup() { Serial.begin(115200); while (!Serial) // Wait for the Serial port to initialize (typically used in Arduino to ensure the Serial monitor is ready) ; + g_resetReason = resetReasonStr(esp_reset_reason()); + Serial.printf("[boot] reset_reason=%s\n", g_resetReason); if (!EEPROM.begin(512)) { Serial.println("EEPROM init failed!"); while (1) { @@ -932,6 +953,35 @@ void pureScale() { t_lastScaleData = millis(); } + // Stall watchdog: a live load cell's raw 24-bit value dithers on every + // conversion (~10/s). If the raw value is byte-identical for >8 s it's frozen + // or railed (0 / 0xFFFFFF) -- the "weight stops being collected" failure + // (suspected thermal/analog) that an in-firmware ADC power-cycle can't fix. + // Surface it (flag + one-shot log) instead of silently streaming a stuck value. 
+ { + static long lastRaw = 0x7FFFFFFFL; + static unsigned long t_rawChange = 0; + long raw = scale.getDebugInfo().rawValue; + unsigned long nowMs = millis(); + if (t_rawChange == 0) t_rawChange = nowMs; + if (raw != lastRaw) { + lastRaw = raw; + t_rawChange = nowMs; + if (b_weightStalled) { + b_weightStalled = false; + Serial.println("[adc] weight readings resumed"); + } + } else if (!b_weightStalled && nowMs - t_rawChange > 8000) { + b_weightStalled = true; + g_stallCount++; + g_lastStallMs = nowMs; + g_lastStallTempC = g_socTempC; + Serial.printf("[adc] WEIGHT STALLED #%lu: raw frozen at %ld for >8s soc=%.1fC heap=%lu\n", + (unsigned long)g_stallCount, raw, g_lastStallTempC, + (unsigned long)ESP.getFreeHeap()); + } + } + if (scale.update()) { b_newDataReady = true; t_lastScaleData = millis(); @@ -1273,6 +1323,27 @@ void loop() { return; } + // SoC-temperature sampler + peak tracking (diagnosing the suspected thermal + // stall). Runs every 2 s regardless of WiFi/charging; prints a summary every + // 10 s so a serial capture during a stress run shows the temp trend, and feeds + // g_socTempC/Max into the WS status frame. 
+ { + static unsigned long t_tempSample = 0, t_tempLog = 0; + unsigned long nowMs = millis(); + if (nowMs - t_tempSample >= 2000) { + t_tempSample = nowMs; + float t = temperatureRead(); + g_socTempC = t; + if (t > g_socTempMaxC) g_socTempMaxC = t; + if (nowMs - t_tempLog >= 10000) { + t_tempLog = nowMs; + Serial.printf("[temp] soc=%.1fC max=%.1fC stalls=%lu last_stall=%lums stall_temp=%.1fC heap=%lu\n", + g_socTempC, g_socTempMaxC, (unsigned long)g_stallCount, + g_lastStallMs, g_lastStallTempC, (unsigned long)ESP.getFreeHeap()); + } + } + } + if (bleState == CONNECTED && b_requireHeartBeat && millis() - t_firstConnect > HEARTBEAT_TIMEOUT) { if (millis() - t_heartBeat > HEARTBEAT_TIMEOUT) { disconnectBLE(); From 46f4df8ffe7bcd071075c21b7a1c3fbe2390b929 Mon Sep 17 00:00:00 2001 From: Jeff Heller Date: Mon, 25 May 2026 12:21:56 -0600 Subject: [PATCH 2/8] test: add thermal_load_test.sh (1h multi-protocol soak + telemetry monitor) Drives USB 10Hz + WS 10Hz + HTTP/WS churn + mDNS (BT driven externally) and polls the new temp/stall telemetry every ~60s, watching for the weight-stall failure and the die temp at which it occurs. Co-Authored-By: Claude Opus 4.7 --- tools/thermal_load_test.sh | 84 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100755 tools/thermal_load_test.sh diff --git a/tools/thermal_load_test.sh b/tools/thermal_load_test.sh new file mode 100755 index 0000000..605bea8 --- /dev/null +++ b/tools/thermal_load_test.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# 1-hour multi-protocol thermal/stall load test. +# - drives: USB 10 Hz binary + WS 10 Hz stream + HTTP/WS churn + mDNS +# - NOT driven here: BT (the user's app drives it concurrently) +# - monitors: the WS status-frame telemetry (soc_temp_c/max, weight_stalled, +# stall_count, last_stall_ms/temp) every ~60 s, watching for the +# "weight stops being collected" failure and the temp at which it hits. +# Opening USB reboots the scale once (clean baseline); reconnect BT afterward. 
+set -u +cd "$(dirname "$0")/.." +IP=192.168.10.242 +HOST=hds.local +DUR="${1:-3600}" +PORT="$(ls /dev/cu.*usbserial* 2>/dev/null | head -1)" +LOG=/tmp/thermal; rm -rf "$LOG"; mkdir -p "$LOG" +ts(){ date +%H:%M:%S; } +echo "[thermal] START $(ts) dur=${DUR}s port=$PORT" + +# 1) USB 10 Hz (opening the port pulses DTR/RTS -> one reboot -> clean baseline) +python3 -u tools/usb_rate_check.py "$PORT" --seconds "$DUR" --mult 1 --boot-wait 8 > "$LOG/usb.log" 2>&1 & +echo "[thermal] USB launched (scale rebooting) @ $(ts)" + +# 2) detect the reboot, then wait for full recovery +down=0 +for i in $(seq 1 30); do + if ! ping -c1 -t1 "$HOST" >/dev/null 2>&1; then down=1; echo "[thermal] reboot detected @ $(ts)"; break; fi + sleep 1 +done +until ping -c1 -t1 "$HOST" >/dev/null 2>&1; do sleep 1; done +sleep 4 +echo "[thermal] WiFi back @ $(ts) (reboot_detected=$down) -- RECONNECT BT APP NOW" + +RDUR=$((DUR-60)); [ "$RDUR" -lt 120 ] && RDUR=120 + +# 3) WiFi load +python3 -u tools/ws_drop_repro.py "$IP" --rate 10 --duration "$RDUR" --print-every 120 > "$LOG/ws.log" 2>&1 & +python3 -u tools/conn_churn.py "$IP" --http --ws --rate 0.5 --workers 1 --duration "$RDUR" > "$LOG/churn.log" 2>&1 & +python3 -u tools/mdns_stress.py --host "$HOST" --rate 1 --duration "$RDUR" --resolver > "$LOG/mdns.log" 2>&1 & + +# 4) telemetry monitor: one WS client, events on, log status every ~60 s +python3 -u - "$HOST" "$RDUR" > "$LOG/telemetry.log" 2>&1 <<'PY' & +import json,sys,time,websocket +host=sys.argv[1]; dur=int(sys.argv[2]); end=time.time()+dur +def connect(): + w=websocket.create_connection("ws://%s/snapshot"%host,timeout=8); w.settimeout(1.0) + try: w.send('{"command":"events","action":"on"}') + except Exception: pass + return w +ws=connect(); prev_stalls=0 +while time.time()prev_stalls else "" + prev_stalls=sc or 0 + print("[%s] soc=%5.1fC max=%5.1fC stalled=%-5s stalls=%s last_stall_ms=%s stall_temp=%s grams=%s chg=%s%s"%( + time.strftime('%H:%M:%S'), st.get('soc_temp_c',-1), 
st.get('soc_temp_max_c',-1), + st.get('weight_stalled'), sc, st.get('last_stall_ms'), st.get('last_stall_temp_c'), + st.get('grams'), st.get('charging'), flag), flush=True) + else: + print("[%s] NO STATUS FRAME (reconnecting)"%time.strftime('%H:%M:%S'), flush=True) + try: ws.close() + except Exception: pass + try: ws=connect() + except Exception as e: print("reconnect failed:",e,flush=True); time.sleep(5) + time.sleep(58) +try: ws.close() +except Exception: pass +PY + +echo "[thermal] load + telemetry running ${RDUR}s @ $(ts)" +wait +echo "[thermal] DONE $(ts)" +echo "===== TELEMETRY (temp / stall) ====="; cat "$LOG/telemetry.log" +echo "===== peak temp / any stalls ====="; grep -E "STALL|max=" "$LOG/telemetry.log" | tail -3 +echo "===== WS (drops) ====="; tail -12 "$LOG/ws.log" +echo "===== USB ====="; tail -6 "$LOG/usb.log" +echo "===== churn ====="; tail -3 "$LOG/churn.log" +echo "===== mDNS ====="; tail -3 "$LOG/mdns.log" From 8e345d9af3b78852593ca029c4ef5c17fa9fdd8c Mon Sep 17 00:00:00 2001 From: Jeff Heller Date: Mon, 25 May 2026 12:27:39 -0600 Subject: [PATCH 3/8] fix: stall watchdog skip during ADC recovery + throttle to 250ms Review follow-ups on the telemetry watchdog: - Skip the stall check while b_adc_recovery_active (the ADS1232 power-cycle freezes the raw value by design); re-seed the window on resume so a genuine signal-timeout recovery isn't miscounted as a railed/frozen stall. - Check every 250 ms instead of every loop iteration -- the ADC only produces ~10 samples/s, so polling getDebugInfo() at full loop rate (with its sqrt + dataset passes) just burns CPU/heat, which is counterproductive on the chip we're trying to characterize. 
Co-Authored-By: Claude Opus 4.7 --- src/hds.ino | 51 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/hds.ino b/src/hds.ino index b833d38..b1d6bd4 100644 --- a/src/hds.ino +++ b/src/hds.ino @@ -958,27 +958,42 @@ void pureScale() { // or railed (0 / 0xFFFFFF) -- the "weight stops being collected" failure // (suspected thermal/analog) that an in-firmware ADC power-cycle can't fix. // Surface it (flag + one-shot log) instead of silently streaming a stuck value. + // Skipped while a deliberate ADC power-cycle recovery is in progress (raw is + // frozen by definition then, which is not the failure we're detecting), and + // the window is reset on resume. Checked every 250 ms (not every loop) -- the + // ADC only produces ~10 samples/s, so polling faster just burns CPU/heat. { static long lastRaw = 0x7FFFFFFFL; - static unsigned long t_rawChange = 0; - long raw = scale.getDebugInfo().rawValue; - unsigned long nowMs = millis(); - if (t_rawChange == 0) t_rawChange = nowMs; - if (raw != lastRaw) { - lastRaw = raw; - t_rawChange = nowMs; - if (b_weightStalled) { - b_weightStalled = false; - Serial.println("[adc] weight readings resumed"); + static unsigned long t_rawChange = 0; // 0 = (re)seed window on next sample + static unsigned long t_stallCheck = 0; + if (b_adc_recovery_active) { + // Deliberate ADC power-cycle in progress: raw is frozen by design, not by + // the failure we detect. Re-seed the window so we don't false-trip when + // streaming resumes. 
+ t_rawChange = 0; + } else if (millis() - t_stallCheck >= 250) { + unsigned long nowMs = millis(); + t_stallCheck = nowMs; + long raw = scale.getDebugInfo().rawValue; + if (t_rawChange == 0) { + lastRaw = raw; + t_rawChange = nowMs; + } else if (raw != lastRaw) { + lastRaw = raw; + t_rawChange = nowMs; + if (b_weightStalled) { + b_weightStalled = false; + Serial.println("[adc] weight readings resumed"); + } + } else if (!b_weightStalled && nowMs - t_rawChange > 8000) { + b_weightStalled = true; + g_stallCount++; + g_lastStallMs = nowMs; + g_lastStallTempC = g_socTempC; + Serial.printf("[adc] WEIGHT STALLED #%lu: raw frozen at %ld for >8s soc=%.1fC heap=%lu\n", + (unsigned long)g_stallCount, raw, g_lastStallTempC, + (unsigned long)ESP.getFreeHeap()); } - } else if (!b_weightStalled && nowMs - t_rawChange > 8000) { - b_weightStalled = true; - g_stallCount++; - g_lastStallMs = nowMs; - g_lastStallTempC = g_socTempC; - Serial.printf("[adc] WEIGHT STALLED #%lu: raw frozen at %ld for >8s soc=%.1fC heap=%lu\n", - (unsigned long)g_stallCount, raw, g_lastStallTempC, - (unsigned long)ESP.getFreeHeap()); } } From cacc2ba665becf968178123387e5b93a5691e7d5 Mon Sep 17 00:00:00 2001 From: Jeff Heller Date: Mon, 25 May 2026 12:39:43 -0600 Subject: [PATCH 4/8] fix: telemetry review follow-ups (NaN guard, volatile, recovery count) From the toolkit review of PR #57: - temperatureRead() NaN guard: don't poison g_socTempC/Max (NaN -> invalid JSON and a frozen peak since NaN compares false); keep last valid + log once. - g_resetReason is now volatile (CLAUDE.md: cross-task globals read on the AsyncTCP status path); status frame casts it for printf. - Expose adc_recovery_count in the status frame: a *perpetual* ADC recovery loop keeps re-seeding the stall window so weight_stalled may never trip -- the climbing recovery count makes that failure mode visible. i_adc_recovery_count is now volatile (newly read cross-task). 
- reset_reason: numeric "unknown_%d" fallback so unmapped IDF reset reasons (CPU_LOCKUP/USB/JTAG) stay attributable. - Comment fixes: volatile cross-task rationale; stall-window re-seed wording + recovery-loop blind-spot note; last_stall_temp_c valid-only-if last_stall_ms. Co-Authored-By: Claude Opus 4.7 --- include/parameter.h | 31 +++++++++++++++++----------- include/websocket.h | 10 +++++---- src/hds.ino | 50 ++++++++++++++++++++++++++++++++++----------- 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/include/parameter.h b/include/parameter.h index a795110..0e84023 100644 --- a/include/parameter.h +++ b/include/parameter.h @@ -193,25 +193,32 @@ static const unsigned long ZERO_DISPLAY_MISMATCH_TIMEOUT = 1500; static const float ZERO_DISPLAY_MISMATCH_THRESHOLD = 0.5; static const uint8_t ADC_ERROR_RECOVERY_COUNT = 2; static bool b_adc_recovery_active = false; -static uint8_t i_adc_recovery_count = 0; +// volatile: incremented on the main loop (ADC power-cycle recovery) and now also +// read in the WS status frame (which can be built on the AsyncTCP task). +static volatile uint8_t i_adc_recovery_count = 0; //bool b_tempDisablePowerOff = true; // Instrumentation for diagnosing the "weight stops being collected" failure -// under sustained load (suspected thermal). b_weightStalled is set by the -// pureScale() stall watchdog when the ADC raw value is frozen/railed; written on -// the main loop, read in the WS status frame. g_resetReason is the last reset -// cause (esp_reset_reason()) captured at boot, surfaced for fleet telemetry. +// under sustained load (suspected thermal). These are all written on the main +// loop and read by the WS status frame, which is built BOTH on the main loop +// (periodic) AND on the AsyncTCP task (command responses) -- so the read crosses +// a task boundary. volatile prevents the AsyncTCP reader caching a stale value +// (single aligned scalars => the load/store is atomic on Xtensa, no mutex +// needed). 
b_weightStalled is set by the pureScale() stall watchdog when the ADC +// raw value is frozen/railed. volatile bool b_weightStalled = false; -const char *g_resetReason = "unknown"; -// Peak/last-event stats since boot (survive nothing; reset on reboot, which -// g_resetReason then explains). Sampled on the main loop, read in the WS status -// frame. g_socTempMaxC = highest SoC die temp seen; the *_stall_* fields capture -// the most recent stall so the failure is visible after the fact. +// volatile for the same cross-task reason; written once at boot in setup(). +volatile const char *g_resetReason = "unknown"; +// Peak/last-event stats since boot (no NVS; reset on reboot, which g_resetReason +// then explains). g_socTempMaxC = highest SoC die temp seen. The *_stall_* +// fields capture the most recent stall so the failure is visible after the fact +// -- consumers must treat last_stall_temp_c as valid only when g_lastStallMs != 0 +// (0.0 otherwise means "no stall yet", not a real 0 C reading). 
volatile float g_socTempC = 0.0f; // latest SoC temperature (C) -volatile float g_socTempMaxC = -100.0f; // peak SoC temperature since boot (C) +volatile float g_socTempMaxC = -100.0f; // peak SoC temperature since boot (C); -100 = no valid sample yet volatile uint32_t g_stallCount = 0; // number of weight-stall events since boot volatile unsigned long g_lastStallMs = 0; // millis() of the last stall onset (0 = none) -volatile float g_lastStallTempC = 0.0f; // SoC temp when the last stall began +volatile float g_lastStallTempC = 0.0f; // SoC temp when the last stall began (valid only if g_lastStallMs != 0) bool b_negativeWeight = false; diff --git a/include/websocket.h b/include/websocket.h index 09906e2..315f242 100644 --- a/include/websocket.h +++ b/include/websocket.h @@ -183,7 +183,7 @@ void sendWebsocketRateInfo(AsyncWebSocketClient *client, const char *status) { } void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { - client->printf("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"reset_reason\":\"%s\"}", + 
client->printf("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"adc_recovery_count\":%u,\"reset_reason\":\"%s\"}", status, FIRMWARE_VER, f_displayedValue, @@ -205,7 +205,8 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { (unsigned long)g_stallCount, g_lastStallMs, g_lastStallTempC, - g_resetReason); + (unsigned)i_adc_recovery_count, + (const char *)g_resetReason); } // Broadcast via printfAll(): it holds the library's client-list mutex and @@ -218,7 +219,7 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { // without blocking the others. 
void sendWebsocketStatusAll(const char *status) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; - websocket.printfAll("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"reset_reason\":\"%s\"}", + websocket.printfAll("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"adc_recovery_count\":%u,\"reset_reason\":\"%s\"}", status, FIRMWARE_VER, f_displayedValue, @@ -240,7 +241,8 @@ void sendWebsocketStatusAll(const char *status) { (unsigned long)g_stallCount, g_lastStallMs, g_lastStallTempC, - g_resetReason); + (unsigned)i_adc_recovery_count, + (const char *)g_resetReason); } void sendWebsocketWeightAll(float grams, unsigned long ms) { diff --git a/src/hds.ino b/src/hds.ino index b1d6bd4..1f591df 100644 --- a/src/hds.ino +++ b/src/hds.ino @@ -407,7 +407,15 @@ const char *resetReasonStr(esp_reset_reason_t r) { case ESP_RST_DEEPSLEEP: return "deepsleep"; case ESP_RST_BROWNOUT: return "brownout"; case ESP_RST_SDIO: return "sdio"; - default: return "unknown"; + default: { + // Don't collapse unmapped IDF reset codes (e.g. 
CPU_LOCKUP, USB, JTAG on + // newer IDF) to a bare "unknown" -- keep the numeric code so a new/rare + // reason is still attributable. Written once at boot, so a static buffer + // is safe. + static char buf[16]; + snprintf(buf, sizeof(buf), "unknown_%d", (int)r); + return buf; + } } } @@ -416,7 +424,7 @@ void setup() { while (!Serial) // Wait for the Serial port to initialize (typically used in Arduino to ensure the Serial monitor is ready) ; g_resetReason = resetReasonStr(esp_reset_reason()); - Serial.printf("[boot] reset_reason=%s\n", g_resetReason); + Serial.printf("[boot] reset_reason=%s\n", (const char *)g_resetReason); if (!EEPROM.begin(512)) { Serial.println("EEPROM init failed!"); while (1) { @@ -954,14 +962,19 @@ void pureScale() { } // Stall watchdog: a live load cell's raw 24-bit value dithers on every - // conversion (~10/s). If the raw value is byte-identical for >8 s it's frozen - // or railed (0 / 0xFFFFFF) -- the "weight stops being collected" failure + // conversion (the ADS1232/HX711 runs ~10 samples/s at the configured rate). If + // the raw value is byte-identical for >8 s it's frozen or railed (a rail to 0 + // freezes rawValue at the last good value via the driver's data>0 guard, so it + // still reads as "unchanged") -- the "weight stops being collected" failure // (suspected thermal/analog) that an in-firmware ADC power-cycle can't fix. // Surface it (flag + one-shot log) instead of silently streaming a stuck value. // Skipped while a deliberate ADC power-cycle recovery is in progress (raw is - // frozen by definition then, which is not the failure we're detecting), and - // the window is reset on resume. Checked every 250 ms (not every loop) -- the - // ADC only produces ~10 samples/s, so polling faster just burns CPU/heat. + // frozen by definition then); the window is re-seeded on the first 250 ms poll + // after recovery clears (via the t_rawChange==0 sentinel). 
Blind spot: a + // *perpetual* recovery loop (recovery every ~5 s) keeps re-seeding so this flag + // may never trip -- the climbing adc_recovery_count in the status frame is the + // signal for that case. Checked every 250 ms (not every loop): the ADC only + // produces ~10 samples/s, so polling faster just burns CPU/heat. { static long lastRaw = 0x7FFFFFFFL; static unsigned long t_rawChange = 0; // 0 = (re)seed window on next sample @@ -1339,17 +1352,30 @@ void loop() { } // SoC-temperature sampler + peak tracking (diagnosing the suspected thermal - // stall). Runs every 2 s regardless of WiFi/charging; prints a summary every - // 10 s so a serial capture during a stress run shows the temp trend, and feeds - // g_socTempC/Max into the WS status frame. + // stall). Runs every 2 s regardless of WiFi state or power-supply mode + // (USB/battery); prints a summary every 10 s so a serial capture during a + // stress run shows the temp trend, and feeds g_socTempC/Max into the WS + // status frame. { static unsigned long t_tempSample = 0, t_tempLog = 0; unsigned long nowMs = millis(); if (nowMs - t_tempSample >= 2000) { t_tempSample = nowMs; float t = temperatureRead(); - g_socTempC = t; - if (t > g_socTempMaxC) g_socTempMaxC = t; + // temperatureRead() returns NaN if the SoC sensor is unavailable. Don't + // poison g_socTempC/Max (NaN would serialize as invalid JSON and NaN + // comparisons would freeze the peak); keep the last valid value and log + // once so the failure is visible rather than silent. 
+ if (!isnan(t)) { + g_socTempC = t; + if (t > g_socTempMaxC) g_socTempMaxC = t; + } else { + static bool tempFailLogged = false; + if (!tempFailLogged) { + tempFailLogged = true; + Serial.println("[temp] temperatureRead() returned NaN -- SoC sensor unavailable"); + } + } if (nowMs - t_tempLog >= 10000) { t_tempLog = nowMs; Serial.printf("[temp] soc=%.1fC max=%.1fC stalls=%lu last_stall=%lums stall_temp=%.1fC heap=%lu\n", From 710949e303c0bbc6aca54e061e85141ff7662789 Mon Sep 17 00:00:00 2001 From: Jeff Heller Date: Mon, 25 May 2026 12:39:43 -0600 Subject: [PATCH 5/8] =?UTF-8?q?test:=20thermal=5Fload=5Ftest.sh=20?= =?UTF-8?q?=E2=80=94=20overridable=20IP/HOST,=20reboot-aware=20summary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - DURATION/IP/HOST now positional args; warn (don't silently skip) if no USB port. - Telemetry monitor logs reset_reason + adc_recovery_count per line, waits a full status interval after (re)connect, tracks peak temp / stalls / recoveries / reboots across the whole run (so a firmware reset doesn't lose the peak), and prints a SUMMARY line with a PASS/FAIL verdict. Co-Authored-By: Claude Opus 4.7 --- tools/thermal_load_test.sh | 75 ++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/tools/thermal_load_test.sh b/tools/thermal_load_test.sh index 605bea8..ac55123 100755 --- a/tools/thermal_load_test.sh +++ b/tools/thermal_load_test.sh @@ -3,22 +3,28 @@ # - drives: USB 10 Hz binary + WS 10 Hz stream + HTTP/WS churn + mDNS # - NOT driven here: BT (the user's app drives it concurrently) # - monitors: the WS status-frame telemetry (soc_temp_c/max, weight_stalled, -# stall_count, last_stall_ms/temp) every ~60 s, watching for the -# "weight stops being collected" failure and the temp at which it hits. 
+# stall_count, last_stall_temp_c, adc_recovery_count, reset_reason) +# every ~60 s, watching for the "weight stops being collected" +# failure and the temp at which it hits. # Opening USB reboots the scale once (clean baseline); reconnect BT afterward. +# +# Usage: tools/thermal_load_test.sh [DURATION_S] [IP] [HOST] set -u cd "$(dirname "$0")/.." -IP=192.168.10.242 -HOST=hds.local DUR="${1:-3600}" +IP="${2:-192.168.10.242}" +HOST="${3:-hds.local}" PORT="$(ls /dev/cu.*usbserial* 2>/dev/null | head -1)" LOG=/tmp/thermal; rm -rf "$LOG"; mkdir -p "$LOG" ts(){ date +%H:%M:%S; } -echo "[thermal] START $(ts) dur=${DUR}s port=$PORT" +echo "[thermal] START $(ts) dur=${DUR}s ip=$IP host=$HOST port=${PORT:-}" +[ -z "$PORT" ] && echo "[thermal] WARNING: no USB serial port found -- USB 10Hz load DISABLED (WiFi-only)" # 1) USB 10 Hz (opening the port pulses DTR/RTS -> one reboot -> clean baseline) -python3 -u tools/usb_rate_check.py "$PORT" --seconds "$DUR" --mult 1 --boot-wait 8 > "$LOG/usb.log" 2>&1 & -echo "[thermal] USB launched (scale rebooting) @ $(ts)" +if [ -n "$PORT" ]; then + python3 -u tools/usb_rate_check.py "$PORT" --seconds "$DUR" --mult 1 --boot-wait 8 > "$LOG/usb.log" 2>&1 & + echo "[thermal] USB launched (scale rebooting) @ $(ts)" +fi # 2) detect the reboot, then wait for full recovery down=0 @@ -37,7 +43,10 @@ python3 -u tools/ws_drop_repro.py "$IP" --rate 10 --duration "$RDUR" --print-eve python3 -u tools/conn_churn.py "$IP" --http --ws --rate 0.5 --workers 1 --duration "$RDUR" > "$LOG/churn.log" 2>&1 & python3 -u tools/mdns_stress.py --host "$HOST" --rate 1 --duration "$RDUR" --resolver > "$LOG/mdns.log" 2>&1 & -# 4) telemetry monitor: one WS client, events on, log status every ~60 s +# 4) telemetry monitor: one WS client, events on, log status every ~60 s. 
+# Tracks peak temp / stalls / recoveries / reboots ACROSS the whole run (so a +# firmware reset that zeroes the since-boot counters doesn't lose the peak), +# and prints a SUMMARY + RESULT verdict at the end. python3 -u - "$HOST" "$RDUR" > "$LOG/telemetry.log" 2>&1 <<'PY' & import json,sys,time,websocket host=sys.argv[1]; dur=int(sys.argv[2]); end=time.time()+dur @@ -46,39 +55,59 @@ def connect(): try: w.send('{"command":"events","action":"on"}') except Exception: pass return w -ws=connect(); prev_stalls=0 -while time.time()=1 full 5s status interval) for a status frame + t=time.time()+secs + st=None while time.time()prev_stalls else "" - prev_stalls=sc or 0 - print("[%s] soc=%5.1fC max=%5.1fC stalled=%-5s stalls=%s last_stall_ms=%s stall_temp=%s grams=%s chg=%s%s"%( - time.strftime('%H:%M:%S'), st.get('soc_temp_c',-1), st.get('soc_temp_max_c',-1), - st.get('weight_stalled'), sc, st.get('last_stall_ms'), st.get('last_stall_temp_c'), - st.get('grams'), st.get('charging'), flag), flush=True) + soc=st.get('soc_temp_c'); mx=st.get('soc_temp_max_c'); sc=st.get('stall_count',0) or 0 + recov=st.get('adc_recovery_count',0) or 0; rr=st.get('reset_reason','?') + last_reset=rr + if isinstance(mx,(int,float)) and mx>peak: peak=mx + if recov>max_recov: max_recov=recov + # reboot heuristic: since-boot counters or peak dropped vs last frame + if prev_stalls is not None and (sc/dev/null || echo "(USB disabled)" echo "===== churn ====="; tail -3 "$LOG/churn.log" echo "===== mDNS ====="; tail -3 "$LOG/mdns.log" From 7fad485c770cc68ef6caeb24067c44fbf3e5d412 Mon Sep 17 00:00:00 2001 From: Jeff Heller Date: Mon, 25 May 2026 12:51:14 -0600 Subject: [PATCH 6/8] =?UTF-8?q?fix:=20telemetry=20review=20round=202=20?= =?UTF-8?q?=E2=80=94=20stopWatch=20snapshot,=20robustness,=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the iteration-2 review findings on PR #57: - Status frame no longer reads the multi-field stopWatch object 
directly off the AsyncTCP task (CLAUDE.md-forbidden cross-task tear, pre-existing). The loop task now snapshots it into aligned volatiles (g_timerRunning/ g_timerElapsed) that both status frames read. - Widen i_adc_recovery_count uint8_t -> uint32_t and drop the <255 cap so a perpetual-recovery loop (the blind spot the stall watchdog can't see) keeps counting truthfully over a long soak instead of saturating; update the WS format specifier %u -> %lu accordingly. - SoC temp guard: isfinite() instead of !isnan() so +/-inf can't reach the JSON. - Stall watchdog: never store 0 as the t_rawChange timestamp (it is the reseed sentinel) at boot/rollover. - README: document the new status-frame telemetry fields. - thermal_load_test.sh: FAIL (not silent PASS) on sustained loss of status frames or a crashed load generator, and exit non-zero on FAIL so it works as a CI gate. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 29 ++++++++++++++++++++++++++++- include/parameter.h | 19 ++++++++++++++++--- include/websocket.h | 16 ++++++++-------- src/hds.ino | 23 +++++++++++++++-------- tools/thermal_load_test.sh | 34 +++++++++++++++++++++++++++++----- 5 files changed, 96 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index c43d8d5..cef2d61 100644 --- a/README.md +++ b/README.md @@ -182,10 +182,37 @@ Status frame shape: "soft_sleep": false, "events_enabled": true, "rate_hz": 10, - "interval_ms": 100 + "interval_ms": 100, + "soc_temp_c": 33.3, + "soc_temp_max_c": 41.2, + "weight_stalled": false, + "stall_count": 0, + "last_stall_ms": 0, + "last_stall_temp_c": 0.0, + "adc_recovery_count": 0, + "reset_reason": "poweron" } ``` +The trailing fields are diagnostic telemetry (added to investigate a thermal +"weight stops being collected" failure under sustained load): + +- `soc_temp_c` / `soc_temp_max_c` — current and peak ESP32-S3 die temperature + (°C) since boot. `soc_temp_max_c` is `-100` until the first valid sample. 
+- `weight_stalled` — `true` while the load-cell raw value has been frozen/railed + for >8 s (readings have stopped), cleared when they resume. +- `stall_count` — number of stall events since boot; `last_stall_ms` is the + `millis()` of the most recent stall onset (`0` = none yet) and + `last_stall_temp_c` is the die temp at that moment (valid only when + `last_stall_ms != 0`). +- `adc_recovery_count` — number of ADC power-cycle recoveries since boot. A + climbing value is the signal for a perpetual-recovery loop (the case + `weight_stalled` is blind to). +- `reset_reason` — why the SoC last reset (`poweron`, `panic`, `brownout`, + `task_wdt`, …), so a reboot mid-soak is explained. + +These reset on reboot (not persisted to NVS). + For backwards compatibility, WiFi only sends weight snapshots by default. A client must send `events on` before periodic status, local scale button presses, or power-off notifications are emitted. The event stream resets to off when the diff --git a/include/parameter.h b/include/parameter.h index 0e84023..b713c0e 100644 --- a/include/parameter.h +++ b/include/parameter.h @@ -193,9 +193,12 @@ static const unsigned long ZERO_DISPLAY_MISMATCH_TIMEOUT = 1500; static const float ZERO_DISPLAY_MISMATCH_THRESHOLD = 0.5; static const uint8_t ADC_ERROR_RECOVERY_COUNT = 2; static bool b_adc_recovery_active = false; -// volatile: incremented on the main loop (ADC power-cycle recovery) and now also -// read in the WS status frame (which can be built on the AsyncTCP task). -static volatile uint8_t i_adc_recovery_count = 0; +// volatile: written on the main loop -- incremented on each ADC power-cycle +// recovery, reset to 0 by resetAdcRecoveryState() -- and read in the WS status +// frame (which can be built on the AsyncTCP task). uint32_t (not uint8_t) so a +// *perpetual* recovery loop -- the one failure mode the stall watchdog is blind +// to -- keeps counting truthfully over a long soak instead of saturating at 255. 
+static volatile uint32_t i_adc_recovery_count = 0; //bool b_tempDisablePowerOff = true; // Instrumentation for diagnosing the "weight stops being collected" failure @@ -220,6 +223,16 @@ volatile uint32_t g_stallCount = 0; // number of weight-stall events sinc volatile unsigned long g_lastStallMs = 0; // millis() of the last stall onset (0 = none) volatile float g_lastStallTempC = 0.0f; // SoC temp when the last stall began (valid only if g_lastStallMs != 0) +// Snapshot of the stopWatch state, refreshed once per main-loop iteration. The +// WS status frame is built BOTH on the main loop AND on the AsyncTCP task +// (command responses); stopWatch is a multi-field object (running flag + start +// ts + accumulator) also mutated from BLE/USB, so reading it directly off the +// AsyncTCP task can tear (CLAUDE.md). The status frame reads these single +// aligned volatiles instead. g_timerElapsed carries stopWatch.elapsed() in its +// configured resolution (SECONDS) -- it is the WS "timer_seconds" field. 
+volatile bool g_timerRunning = false; +volatile unsigned long g_timerElapsed = 0; + bool b_negativeWeight = false; bool b_weight_quick_zero = false; //Tare后快速显示为0优化 diff --git a/include/websocket.h b/include/websocket.h index 315f242..0816475 100644 --- a/include/websocket.h +++ b/include/websocket.h @@ -183,7 +183,7 @@ void sendWebsocketRateInfo(AsyncWebSocketClient *client, const char *status) { } void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { - client->printf("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"adc_recovery_count\":%u,\"reset_reason\":\"%s\"}", + client->printf("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"adc_recovery_count\":%lu,\"reset_reason\":\"%s\"}", status, FIRMWARE_VER, f_displayedValue, @@ -191,8 +191,8 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { websocketBatteryPercent(), f_batteryVoltage, websocketIsCharging() ? "true" : "false", - stopWatch.isRunning() ? "true" : "false", - (unsigned long)stopWatch.elapsed(), + g_timerRunning ? "true" : "false", + g_timerElapsed, b_u8g2Sleep ? "false" : "true", b_websocketLowPowerEnabled ? 
"true" : "false", b_softSleep ? "true" : "false", @@ -205,7 +205,7 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { (unsigned long)g_stallCount, g_lastStallMs, g_lastStallTempC, - (unsigned)i_adc_recovery_count, + (unsigned long)i_adc_recovery_count, (const char *)g_resetReason); } @@ -219,7 +219,7 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { // without blocking the others. void sendWebsocketStatusAll(const char *status) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; - websocket.printfAll("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"adc_recovery_count\":%u,\"reset_reason\":\"%s\"}", + websocket.printfAll("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"adc_recovery_count\":%lu,\"reset_reason\":\"%s\"}", status, FIRMWARE_VER, f_displayedValue, @@ -227,8 +227,8 @@ void sendWebsocketStatusAll(const char *status) { websocketBatteryPercent(), f_batteryVoltage, websocketIsCharging() ? "true" : "false", - stopWatch.isRunning() ? "true" : "false", - (unsigned long)stopWatch.elapsed(), + g_timerRunning ? 
"true" : "false", + g_timerElapsed, b_u8g2Sleep ? "false" : "true", b_websocketLowPowerEnabled ? "true" : "false", b_softSleep ? "true" : "false", @@ -241,7 +241,7 @@ void sendWebsocketStatusAll(const char *status) { (unsigned long)g_stallCount, g_lastStallMs, g_lastStallTempC, - (unsigned)i_adc_recovery_count, + (unsigned long)i_adc_recovery_count, (const char *)g_resetReason); } diff --git a/src/hds.ino b/src/hds.ino index 1f591df..0116613 100644 --- a/src/hds.ino +++ b/src/hds.ino @@ -986,6 +986,7 @@ void pureScale() { t_rawChange = 0; } else if (millis() - t_stallCheck >= 250) { unsigned long nowMs = millis(); + if (nowMs == 0) nowMs = 1; // 0 is the reseed sentinel for t_rawChange; never store it as a real timestamp (boot/rollover) t_stallCheck = nowMs; long raw = scale.getDebugInfo().rawValue; if (t_rawChange == 0) { @@ -1019,9 +1020,7 @@ void pureScale() { millis() - t_lastScaleRecovery > 5000) { Serial.println("Scale ADC timeout. Power cycling ADC."); b_adc_recovery_active = true; - if (i_adc_recovery_count < 255) { - i_adc_recovery_count++; - } + i_adc_recovery_count++; // uint32_t: counts truthfully, won't wrap in any realistic runtime scale.powerDown(); delay(5); scale.powerUp(); @@ -1346,6 +1345,14 @@ void loop() { // here on the loop task rather than racing peripheral drivers. processWsPendingCmds(); + // Snapshot the multi-field stopWatch into aligned volatiles on the loop task so + // the WS status frame (built on the AsyncTCP task for command responses) never + // reads stopWatch cross-task. Done after the drain above so a just-applied + // timer start/stop/zero is reflected. elapsed() is in the configured + // resolution (SECONDS). 
+ g_timerRunning = stopWatch.isRunning(); + g_timerElapsed = (unsigned long)stopWatch.elapsed(); + if (b_powerOff){ shut_down_now_nobeep(); return; @@ -1362,11 +1369,11 @@ void loop() { if (nowMs - t_tempSample >= 2000) { t_tempSample = nowMs; float t = temperatureRead(); - // temperatureRead() returns NaN if the SoC sensor is unavailable. Don't - // poison g_socTempC/Max (NaN would serialize as invalid JSON and NaN - // comparisons would freeze the peak); keep the last valid value and log - // once so the failure is visible rather than silent. - if (!isnan(t)) { + // temperatureRead() returns NaN if the SoC sensor is unavailable. Reject + // any non-finite value (NaN or +/-inf): NaN serializes as invalid JSON and + // a non-finite compare would freeze the peak. Keep the last valid value and + // log once so the failure is visible rather than silent. + if (isfinite(t)) { g_socTempC = t; if (t > g_socTempMaxC) g_socTempMaxC = t; } else { diff --git a/tools/thermal_load_test.sh b/tools/thermal_load_test.sh index ac55123..f72a51e 100755 --- a/tools/thermal_load_test.sh +++ b/tools/thermal_load_test.sh @@ -68,13 +68,14 @@ def first_status(w, secs): ws=connect() peak=-999.0; total_stalls=0; reboots=0; max_recov=0 prev_stalls=None; prev_max=None; last_reset="?" 
+no_status_streak=0; max_no_status_streak=0 first=True while time.time()peak: peak=mx if recov>max_recov: max_recov=recov # reboot heuristic: since-boot counters or peak dropped vs last frame @@ -89,7 +90,9 @@ while time.time()max_no_status_streak: max_no_status_streak=no_status_streak + print("[%s] NO STATUS FRAME (reconnecting; streak=%d)"%(time.strftime('%H:%M:%S'),no_status_streak), flush=True) try: ws.close() except Exception: pass try: ws=connect(); first=True @@ -97,9 +100,13 @@ while time.time()= 3 +result = "PASS" if (total_stalls==0 and max_recov==0 and reboots==0 and not visibility_lost) else "FAIL" +print("SUMMARY peak_temp=%.1fC total_stalls=%d adc_recoveries=%d reboots=%d max_no_status_streak=%d last_reset=%s RESULT=%s"%( + peak, total_stalls, max_recov, reboots, max_no_status_streak, last_reset, result), flush=True) PY echo "[thermal] load + telemetry running ${RDUR}s @ $(ts)" @@ -111,3 +118,20 @@ echo "===== WS (drops) ====="; tail -12 "$LOG/ws.log" echo "===== USB ====="; tail -6 "$LOG/usb.log" 2>/dev/null || echo "(USB disabled)" echo "===== churn ====="; tail -3 "$LOG/churn.log" echo "===== mDNS ====="; tail -3 "$LOG/mdns.log" + +# Verdict -> exit code. A green run requires BOTH the telemetry monitor's +# RESULT=PASS *and* that every load generator stayed alive: if a driver crashed +# (Traceback in its log) the scale was under-stressed, so the run is invalid even +# if no stall was seen. A missing SUMMARY means the monitor itself died (caught +# by the RESULT=PASS grep failing). This makes the script usable as a CI gate. +fail=0 +if ! 
grep -q "RESULT=PASS" "$LOG/telemetry.log"; then + echo "[thermal] FAIL: telemetry verdict not PASS (stall/reboot/recovery, lost visibility, or monitor crashed)"; fail=1 +fi +for f in usb ws churn mdns; do + if [ -f "$LOG/$f.log" ] && grep -q "Traceback" "$LOG/$f.log"; then + echo "[thermal] FAIL: load generator '$f' crashed -- scale was under-stressed (see $LOG/$f.log)"; fail=1 + fi +done +[ "$fail" -eq 0 ] && echo "[thermal] RESULT=PASS" || echo "[thermal] RESULT=FAIL" +exit "$fail" From f3adb84ea0b74eb9424ea6ca4440d9d45bef4c39 Mon Sep 17 00:00:00 2001 From: Jeff Heller Date: Mon, 25 May 2026 12:56:53 -0600 Subject: [PATCH 7/8] test: harden soak verdict + document pre-existing-bug policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - thermal_load_test.sh: close the silent-PASS holes a review found. A flapping wedge (scale answers one frame per reconnect, resetting the consecutive-miss streak) now fails via a cumulative total_no_status counter, not just max_no_status_streak. Each load generator's PID is captured and waited on individually so a never-started/crashed driver (non-zero exit) fails the run instead of being missed by a Traceback-only grep. A run that never saw soc_temp_max_c (peak stuck at the -999 sentinel) also fails, since the thermal data the test exists to capture is absent. - CLAUDE.md: add "Fixing bugs you find along the way" — pre-existing bugs get fixed in the same change, not deferred. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 4 +++ tools/thermal_load_test.sh | 70 ++++++++++++++++++++++++++------------ 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 966f6bb..223d673 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -147,6 +147,10 @@ This document is meant to evolve with the codebase. 
During a session, if you (Cl If you fix a bug whose symptom is documented in the "When something is broken" table, leave the entry in place — it's still the right "first place to look" for the next person. +## Fixing bugs you find along the way + +Pre-existing bugs get fixed too — "it was already there" is not a reason to defer. When you turn up a bug while working on something else (a review flags it, you read past it, a test surfaces it), fix it as part of the same change; a pre-existing bug is no less bad than a newly introduced one, and the person touching the code is the right person to fix it. The only exception is when the fix is genuinely a large, independent effort — then call it out explicitly and agree on a separate change, rather than silently leaving it in place. + ## Don't - Don't call I²C / SPI / blocking IO from the AsyncTCP task. diff --git a/tools/thermal_load_test.sh b/tools/thermal_load_test.sh index f72a51e..3a5c5df 100755 --- a/tools/thermal_load_test.sh +++ b/tools/thermal_load_test.sh @@ -23,6 +23,7 @@ echo "[thermal] START $(ts) dur=${DUR}s ip=$IP host=$HOST port=${PORT:-}" # 1) USB 10 Hz (opening the port pulses DTR/RTS -> one reboot -> clean baseline) if [ -n "$PORT" ]; then python3 -u tools/usb_rate_check.py "$PORT" --seconds "$DUR" --mult 1 --boot-wait 8 > "$LOG/usb.log" 2>&1 & + USB_PID=$! echo "[thermal] USB launched (scale rebooting) @ $(ts)" fi @@ -40,8 +41,11 @@ RDUR=$((DUR-60)); [ "$RDUR" -lt 120 ] && RDUR=120 # 3) WiFi load python3 -u tools/ws_drop_repro.py "$IP" --rate 10 --duration "$RDUR" --print-every 120 > "$LOG/ws.log" 2>&1 & +WS_PID=$! python3 -u tools/conn_churn.py "$IP" --http --ws --rate 0.5 --workers 1 --duration "$RDUR" > "$LOG/churn.log" 2>&1 & +CHURN_PID=$! python3 -u tools/mdns_stress.py --host "$HOST" --rate 1 --duration "$RDUR" --resolver > "$LOG/mdns.log" 2>&1 & +MDNS_PID=$! # 4) telemetry monitor: one WS client, events on, log status every ~60 s. 
# Tracks peak temp / stalls / recoveries / reboots ACROSS the whole run (so a @@ -68,7 +72,7 @@ def first_status(w, secs): ws=connect() peak=-999.0; total_stalls=0; reboots=0; max_recov=0 prev_stalls=None; prev_max=None; last_reset="?" -no_status_streak=0; max_no_status_streak=0 +no_status_streak=0; max_no_status_streak=0; total_no_status=0 first=True while time.time()max_no_status_streak: max_no_status_streak=no_status_streak - print("[%s] NO STATUS FRAME (reconnecting; streak=%d)"%(time.strftime('%H:%M:%S'),no_status_streak), flush=True) + print("[%s] NO STATUS FRAME (reconnecting; streak=%d total=%d)"%(time.strftime('%H:%M:%S'),no_status_streak,total_no_status), flush=True) try: ws.close() except Exception: pass try: ws=connect(); first=True @@ -100,17 +104,36 @@ while time.time()= 3 -result = "PASS" if (total_stalls==0 and max_recov==0 and reboots==0 and not visibility_lost) else "FAIL" -print("SUMMARY peak_temp=%.1fC total_stalls=%d adc_recoveries=%d reboots=%d max_no_status_streak=%d last_reset=%s RESULT=%s"%( - peak, total_stalls, max_recov, reboots, max_no_status_streak, last_reset, result), flush=True) +# Loss of status frames means the scale stopped answering -- exactly the "weight +# stops" failure being hunted -- so it must FAIL, not silently PASS because the +# stall/reboot counters simply stopped advancing. Two patterns count: a SUSTAINED +# loss (streak >= 3 consecutive misses) AND a FLAPPING wedge that recovers on each +# reconnect (which resets the streak but accumulates total_no_status). A healthy +# hour-long run has ~0 misses (the 58 s sleep leaves a backlog of buffered status +# frames), so a cumulative threshold of 5 tolerates rare network jitter while +# still catching a scale that keeps dropping out. +visibility_lost = (max_no_status_streak >= 3) or (total_no_status >= 5) +# peak stays at its -999 sentinel if no soc_temp_max_c field was ever seen: the +# thermal data this test exists to capture is missing, so don't call it a PASS. 
+no_temp_data = peak < -900 +result = "PASS" if (total_stalls==0 and max_recov==0 and reboots==0 and not visibility_lost and not no_temp_data) else "FAIL" +print("SUMMARY peak_temp=%.1fC total_stalls=%d adc_recoveries=%d reboots=%d max_no_status_streak=%d total_no_status=%d last_reset=%s RESULT=%s"%( + peak, total_stalls, max_recov, reboots, max_no_status_streak, total_no_status, last_reset, result), flush=True) PY +TELE_PID=$! echo "[thermal] load + telemetry running ${RDUR}s @ $(ts)" -wait +# Wait for each child individually so we capture its exit status (a bare `wait` +# discards them). These tools exit 0 on normal completion and non-zero only on a +# startup failure (missing dep / unresolvable host) or a crash/kill -- never on +# "detected drops" -- so a non-zero code reliably means that load generator did +# not do its job and the scale was under-stressed. +rc_usb=0; rc_ws=0; rc_churn=0; rc_mdns=0; rc_tele=0 +[ -n "${USB_PID:-}" ] && { wait "$USB_PID"; rc_usb=$?; } +wait "$WS_PID"; rc_ws=$? +wait "$CHURN_PID"; rc_churn=$? +wait "$MDNS_PID"; rc_mdns=$? +wait "$TELE_PID"; rc_tele=$? echo "[thermal] DONE $(ts)" echo "===== TELEMETRY (last 12 + summary) ====="; tail -12 "$LOG/telemetry.log" echo "----- key events -----"; grep -E "STALL|REBOOT|SUMMARY" "$LOG/telemetry.log" || echo "(none)" @@ -120,18 +143,23 @@ echo "===== churn ====="; tail -3 "$LOG/churn.log" echo "===== mDNS ====="; tail -3 "$LOG/mdns.log" # Verdict -> exit code. A green run requires BOTH the telemetry monitor's -# RESULT=PASS *and* that every load generator stayed alive: if a driver crashed -# (Traceback in its log) the scale was under-stressed, so the run is invalid even -# if no stall was seen. A missing SUMMARY means the monitor itself died (caught -# by the RESULT=PASS grep failing). This makes the script usable as a CI gate. 
+# RESULT=PASS *and* that every load generator stayed alive for the whole run: a +# generator that never started or crashed (non-zero exit) means the scale was +# under-stressed, so the run is invalid even if no stall was seen. A non-zero +# monitor exit or a missing SUMMARY line means the monitor itself died. This +# makes the script usable as a CI gate. fail=0 -if ! grep -q "RESULT=PASS" "$LOG/telemetry.log"; then - echo "[thermal] FAIL: telemetry verdict not PASS (stall/reboot/recovery, lost visibility, or monitor crashed)"; fail=1 +if [ "$rc_tele" -ne 0 ] || ! grep -q "RESULT=PASS" "$LOG/telemetry.log"; then + echo "[thermal] FAIL: telemetry verdict not PASS (stall/reboot/recovery, lost visibility, or monitor died rc=$rc_tele)"; fail=1 fi -for f in usb ws churn mdns; do - if [ -f "$LOG/$f.log" ] && grep -q "Traceback" "$LOG/$f.log"; then - echo "[thermal] FAIL: load generator '$f' crashed -- scale was under-stressed (see $LOG/$f.log)"; fail=1 +check_gen() { # $1=name $2=exit-code + if [ "$2" -ne 0 ]; then + echo "[thermal] FAIL: load generator '$1' exited $2 -- never started or crashed, scale under-stressed (see $LOG/$1.log)"; fail=1 fi -done +} +[ -n "${USB_PID:-}" ] && check_gen usb "$rc_usb" +check_gen ws "$rc_ws" +check_gen churn "$rc_churn" +check_gen mdns "$rc_mdns" [ "$fail" -eq 0 ] && echo "[thermal] RESULT=PASS" || echo "[thermal] RESULT=FAIL" exit "$fail" From 9827c09682c660773feeca4317ca62b3cbe05d90 Mon Sep 17 00:00:00 2001 From: Jeff Heller Date: Mon, 25 May 2026 16:21:25 -0600 Subject: [PATCH 8/8] fix: prevent WS-broadcast OOM crash under connection churn Root-caused from a captured panic backtrace: under sustained multi-client WiFi load (WS connection churn + the 10 Hz weight broadcast), free heap collapses and AsyncWebSocket's printfAll path allocates an AsyncWebSocketMessage per client -> operator new throws std::bad_alloc -> (Arduino-ESP32 is -fno-exceptions) std::terminate() -> abort() -> reboot. 
That OOM-reboot is the "weight stops being collected under load" failure (not thermal -- die temp was 33 C). Decoded stack: operator new -> __cxa_throw -> std::terminate -> abort AsyncWebSocketClient::_queueMessage (AsyncWebSocket.cpp:490) AsyncWebSocket::printfAll sendWebsocketWeightAll (websocket.h) <- loop() 10 Hz broadcast Fix: - Heap-gate every broadcast-to-all helper (weight, status, button, power-off) with wsBroadcastHeapOk(): skip the frame when free heap is below WS_BROADCAST_HEAP_FLOOR (25 KB, above the 15 KB heap watchdog) instead of allocating into an exhausted heap and crashing. Dropping a frame is invisible; the next is <=500 ms away. - Cap each client's outbound queue via -D WS_MAX_QUEUED_MESSAGES=8 (lib default 32) so a backed-up/half-open client can't hoard heap. - Document the footgun in CLAUDE.md (notes + troubleshooting table). Stacked on the scale-telemetry branch (PR #57) whose reset_reason / heap telemetry made this diagnosable. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 2 ++ include/websocket.h | 32 ++++++++++++++++++++++++++++++++ platformio.ini | 4 ++++ 3 files changed, 38 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 223d673..2551e0f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -95,6 +95,7 @@ WiFi and BLE share the same 2.4 GHz radio. The Arduino-ESP32 default is `WIFI_PS - `LittleFS.begin()` is idempotent. - Multiple concurrent WS clients are supported (on-device web UI + a separate app). `cleanupClients()` caps at `DEFAULT_MAX_WS_CLIENTS` (8 on ESP32); shared session state (rate, events) resets only when the last client disconnects (`server->count() == 0`). - Broadcast with `websocket.printfAll(...)`, not a hand-rolled `getClients()` loop: `getClients()` doesn't take the library's client-list mutex, so iterating it on the loop task races a client disconnect on the AsyncTCP task (use-after-free). `printfAll` holds the lock and sends to each client. 
+- **`printfAll` allocates per client and throws `std::bad_alloc` when the heap is exhausted** — and Arduino-ESP32 builds `-fno-exceptions`, so the throw goes straight to `std::terminate()`→`abort()`→reboot. Connection churn (many/half-open WS clients lingering under the 30 s ack timeout) can collapse the heap, so every broadcast-to-all helper (`sendWebsocketWeightAll`, `sendWebsocketStatusAll`, button/power-off) is gated by `wsBroadcastHeapOk()` (`WS_BROADCAST_HEAP_FLOOR`, ~25 KB, above the 15 KB heap watchdog): when heap is below the floor the frame is **skipped**, not allocated. Per-client queues are capped via `-D WS_MAX_QUEUED_MESSAGES=8` (lib default 32) so a backed-up client can't hoard heap. Don't add a new `printfAll` broadcast without the `wsBroadcastHeapOk()` guard. WS frame parsing: only act on complete unfragmented text frames: @@ -135,6 +136,7 @@ Functions and locals are camelCase. Some legacy snake_case remains; don't churn | Boot logs show `LittleFS mount failed` | Run `pio run -t uploadfs` to write the filesystem image — firmware-only flashes don't touch it. | | `pio device monitor` hangs in a non-PTY shell | Use the pyserial snippet in Quick reference. | | `pio` flash takes >60s instead of ~15s | Bad firmware is choking the bootloader handshake. Symptom of a serious bug on the device (WiFi coex, OLED stuck, etc.), not a hardware fault. | +| `reset_reason=panic` / `abort()` + reboot under sustained multi-client WiFi load | Heap-exhaustion OOM in a WS broadcast: `printfAll` → `operator new` throws `bad_alloc`. Broadcasts are heap-gated (`wsBroadcastHeapOk`); look for `[ws] low heap … skip broadcast` on serial and a falling `[health] heap=`. Driven by WS connection churn (half-open clients lingering on the 30 s ack timeout). Not thermal. 
| ## Keeping this file fresh diff --git a/include/websocket.h b/include/websocket.h index 0816475..750af5d 100644 --- a/include/websocket.h +++ b/include/websocket.h @@ -156,8 +156,37 @@ void processWsPendingCmds() { } } +// --- Heap-floor gate for periodic WS broadcasts ------------------------------ +// printfAll() allocates an AsyncWebSocketMessage (a heap buffer) for EVERY +// connected client. Under WebSocket connection churn the heap can collapse, and +// that allocation then throws std::bad_alloc -> std::terminate() -> abort() +// (Arduino-ESP32 builds with -fno-exceptions, so the throw can't be caught) -> +// reboot. That OOM-reboot is the "weight stops being collected under sustained +// multi-client load" failure. Skipping a frame is invisible (the next weight +// frame is <=500 ms away, status <=5 s); crashing is not. The floor sits above +// the 15 KB heap watchdog (wifi_setup.cpp) so broadcasts back off well before a +// reboot is even considered. Every broadcast helper below runs on the main loop, +// so the skip counter needs no synchronization. 
+static const uint32_t WS_BROADCAST_HEAP_FLOOR = 25000; +static uint32_t g_wsBroadcastHeapSkips = 0; +static inline bool wsBroadcastHeapOk() { + if (ESP.getFreeHeap() >= WS_BROADCAST_HEAP_FLOOR) return true; + g_wsBroadcastHeapSkips++; + static unsigned long lastLog = 0; + unsigned long now = millis(); + if (now - lastLog >= 2000) { // rate-limit: broadcasts can be 10 Hz + lastLog = now; + Serial.printf("[ws] low heap %lu < %lu -> skip broadcast (total skips=%lu)\n", + (unsigned long)ESP.getFreeHeap(), + (unsigned long)WS_BROADCAST_HEAP_FLOOR, + (unsigned long)g_wsBroadcastHeapSkips); + } + return false; +} + void sendWebsocketButton(int buttonNumber, int buttonShortPress) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; + if (!wsBroadcastHeapOk()) return; websocket.printfAll("{\"type\":\"button\",\"button\":\"%s\",\"button_number\":%d,\"press\":\"%s\",\"press_code\":%d,\"ms\":%lu}", websocketButtonName(buttonNumber), buttonNumber, @@ -168,6 +197,7 @@ void sendWebsocketButton(int buttonNumber, int buttonShortPress) { void sendWebsocketPowerOff(int i_reason) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; + if (!wsBroadcastHeapOk()) return; websocket.printfAll("{\"type\":\"power\",\"event\":\"power_off\",\"reason\":\"%s\",\"reason_code\":%d,\"ms\":%lu}", websocketPowerOffReason(i_reason), i_reason, @@ -219,6 +249,7 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { // without blocking the others. 
void sendWebsocketStatusAll(const char *status) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; + if (!wsBroadcastHeapOk()) return; websocket.printfAll("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"adc_recovery_count\":%lu,\"reset_reason\":\"%s\"}", status, FIRMWARE_VER, @@ -247,6 +278,7 @@ void sendWebsocketStatusAll(const char *status) { void sendWebsocketWeightAll(float grams, unsigned long ms) { if (!b_wifiEnabled || websocket.count() == 0) return; + if (!wsBroadcastHeapOk()) return; websocket.printfAll("{\"grams\":%.2f,\"ms\":%lu}", grams, ms); } diff --git a/platformio.ini b/platformio.ini index 289a3f1..1192edc 100644 --- a/platformio.ini +++ b/platformio.ini @@ -27,6 +27,10 @@ build_flags = ; -DESP32 -D CONFIG_ASYNC_TCP_RUNNING_CORE=1 -DELEGANTOTA_USE_ASYNC_WEBSERVER=1 + ; Cap each WS client's outbound queue (lib default 32) so a backed-up or + ; half-open client (connection churn) can't hoard heap. Bounds aggregate heap + ; growth; complements the WS_BROADCAST_HEAP_FLOOR gate in include/websocket.h. + -D WS_MAX_QUEUED_MESSAGES=8 !python3 git_rev_macro.py # -D DEBUG