Skip to content

Commit a3abfc7

Browse files
authored
Merge pull request #50 from autonomys/better-error-handling
Add --slack=false and --alert-limit=N options, and integration tests in CI
2 parents 8e71792 + deedfbb commit a3abfc7

File tree

15 files changed

+322
-108
lines changed

15 files changed

+322
-108
lines changed

.cargo/config.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,5 @@ rustflags = [
6666
"-Wclippy::manual_assert",
6767
"-Wclippy::needless_pass_by_value",
6868
"-Wclippy::precedence_bits",
69+
"-Wclippy::let_underscore_untyped",
6970
]

.github/workflows/rust.yml

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ jobs:
9191
]
9292
runs-on: ${{ matrix.os }}
9393
# Don't use the full 6 hours if a test hangs
94-
timeout-minutes: 120
94+
timeout-minutes: 30
9595

9696
steps:
9797
- name: Checkout
@@ -106,6 +106,51 @@ jobs:
106106
run: |
107107
cargo nextest run --locked --all-features
108108
109+
# Runs the chain-alerter process until it sends a startup alert, but with Slack disabled.
110+
# This is useful for testing the chain-alerter process setup and main loop.
111+
cargo-run:
112+
name: cargo-run (${{ matrix.os }})
113+
strategy:
114+
fail-fast: false
115+
matrix:
116+
os: [
117+
"ubuntu-24.04",
118+
"ubuntu-24.04-arm",
119+
"windows-2025",
120+
"windows-11-arm",
121+
"macos-15",
122+
]
123+
runs-on: ${{ matrix.os }}
124+
# Unfortunately, this doesn't work at the step level on Windows.
125+
defaults:
126+
run:
127+
shell: bash
128+
# Don't use the full 6 hours if this integration test hangs
129+
timeout-minutes: 30
130+
131+
steps:
132+
- name: Checkout
133+
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
134+
135+
- name: Install ripgrep from crates.io
136+
uses: baptiste0928/cargo-install@b687c656bda5733207e629b50a22bf68974a0305 # v3.3.2
137+
with:
138+
crate: ripgrep
139+
140+
# Randomly choose between the foundation and labs nodes, then run the integration test,
141+
# checking for the startup alert, but printing all the logs to the console.
142+
- name: cargo run --locked -- --slack=false --alert-limit=1
143+
run: |
144+
if [[ $(($RANDOM % 2)) == 0 ]]; then
145+
echo "Using foundation node"
146+
NODE_URL=wss://rpc.mainnet.subspace.foundation/ws
147+
else
148+
echo "Using labs node"
149+
NODE_URL=wss://rpc-0.mainnet.autonomys.xyz/ws
150+
fi
151+
cargo run --locked -- --slack=false --alert-limit=1 --node-rpc-url="$NODE_URL" \
152+
| rg --fixed-strings --passthru "**Launched and connected to the node**"
153+
109154
# This job checks for incorrectly added dependencies, or dependencies that were made unused by code changes.
110155
cargo-unused-deps:
111156
name: cargo-unused-deps (ubuntu-24.04)
@@ -138,6 +183,7 @@ jobs:
138183
- cargo-clippy
139184
- cargo-build
140185
- cargo-test
186+
- cargo-run
141187
- cargo-unused-deps
142188
steps:
143189
- name: Check job statuses
@@ -148,4 +194,5 @@ jobs:
148194
[[ "${{ needs.cargo-clippy.result }}" == "success" ]] || exit 1
149195
[[ "${{ needs.cargo-build.result }}" == "success" ]] || exit 1
150196
[[ "${{ needs.cargo-test.result }}" == "success" ]] || exit 1
197+
[[ "${{ needs.cargo-run.result }}" == "success" ]] || exit 1
151198
[[ "${{ needs.cargo-unused-deps.result }}" == "success" ]] || exit 1

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ hex-literal = "0.4.1"
1616
humantime = "2.2.0"
1717
hyper-rustls = { version = "0.27.7", default-features = false }
1818
hyper-util = { version = "0.1.16", default-features = false }
19+
rand = "0.9.2"
1920
reqwest = { version = "0.12.22", default-features = false }
2021
rustls = { version = "0.23.31", default-features = false }
2122
scale-value = { version = "0.18.0", default-features = false }

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,10 @@ Other alert kinds are not de-duplicated.
119119
4. Build and run
120120
- `cargo run -- --name "My Test Bot" --icon "warning" --node-rpc-url wss://rpc.mainnet.subspace.foundation/ws`
121121
- public node URLs are [listed in subspace.rs](https://github.com/autonomys/subspace-chain-alerts/blob/ac33ed7d200a1fdc3b92c1919f7b9cfacfba37c6/chain-alerter/src/subspace.rs#L43-L49)
122+
- `--slack=false` will disable Slack message posting entirely, and just log alerts to the terminal.
123+
- `--alert-limit=5` will exit after 5 alerts have been posted, including the startup alert.
122124
- On first observed block, you should see a Slack message in `#chain-alerts-test` summarizing connection and block info.
123-
- All arguments are optional. The default node is localhost, and the default icon is the instance external IP address country flag.
125+
- All arguments are optional. The default node is `localhost`, and the default icon is the instance external IP address country flag (looked up via an online GeoIP service, which can be wrong).
124126
- `RUST_LOG` can be used to filter logs, see:
125127
<https://docs.rs/tracing-subscriber/0.3.19/tracing_subscriber/filter/struct.EnvFilter.html#directives>
126128

chain-alerter/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,4 @@ zeroize.workspace = true
2525

2626
[dev-dependencies]
2727
hex-literal.workspace = true
28+
rand.workspace = true

chain-alerter/src/alerts.rs

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use chrono::Utc;
1515
use std::fmt::{self, Display};
1616
use std::time::Duration;
1717
use tokio::sync::{mpsc, watch};
18+
use tokio::task::JoinHandle;
1819
use tokio::time::sleep;
1920
use tracing::{trace, warn};
2021

@@ -96,6 +97,16 @@ pub struct Alert {
9697
pub mode: BlockCheckMode,
9798
}
9899

100+
impl Display for Alert {
101+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
102+
write!(f, "Mode: {:?}", self.mode)?;
103+
write!(f, "\n{}", self.alert)?;
104+
write!(f, "\nBlock: {}", self.block_info)?;
105+
106+
Ok(())
107+
}
108+
}
109+
99110
impl Alert {
100111
/// Create a new alert.
101112
pub fn new(alert: AlertKind, block_info: BlockInfo, mode: BlockCheckMode) -> Self {
@@ -710,7 +721,11 @@ pub async fn startup_alert(
710721
alert_tx: &mpsc::Sender<Alert>,
711722
block_info: &BlockInfo,
712723
) -> anyhow::Result<()> {
713-
assert!(mode.is_startup(), "should only be called at startup");
724+
// We could check this is only called once, but that's a low priority.
725+
assert!(
726+
mode.is_current(),
727+
"should only be called on the first current block",
728+
);
714729

715730
// TODO:
716731
// - always post this to the test channel, because it's not a real "alert"
@@ -770,27 +785,27 @@ pub async fn check_block(
770785
/// Spawn a task that waits for `MIN_BLOCK_GAP`, then alerts if there was no block received on
771786
/// `latest_block_rx` in that gap.
772787
///
773-
/// Doesn't work on replayed blocks, because the chain has already resumed after a replayed block
774-
/// gap.
788+
/// Fatal errors will be returned from the spawned task's join handle.
775789
///
776-
/// Fatal errors will panic in the spawned task.
790+
/// # Panics
791+
///
792+
/// On replayed blocks, because the chain has already resumed after a replayed block
793+
/// gap.
794+
#[must_use = "the spawned task must be joined"]
777795
pub async fn check_for_block_stall(
778796
mode: BlockCheckMode,
779797
alert_tx: mpsc::Sender<Alert>,
780798
block_info: BlockInfo,
781799
latest_block_rx: watch::Receiver<Option<BlockInfo>>,
782-
) {
800+
) -> JoinHandle<anyhow::Result<()>> {
783801
// It doesn't make sense to check the local clock for stalls on replayed blocks, because we
784802
// already know there's a new current block. It's expensive to spawn a task, so return
785803
// early.
786-
if !mode.is_current() {
787-
return;
788-
}
804+
assert!(mode.is_current(), "should only be called on current blocks");
789805

790806
let old_block_info = block_info;
791807

792-
// There's no need to check the result of the spawned task, panics are impossible, and other
793-
// errors are handled by replaying missed blocks on restart.
808+
// We handle channel errors by restarting all tasks.
794809
tokio::spawn(async move {
795810
// Stall alerts without a resume are alarming, it looks like either the chain or alerter
796811
// has stopped. We've seen a spurious stall at exactly 60 seconds, but only once.
@@ -805,21 +820,22 @@ pub async fn check_for_block_stall(
805820
// There's a new block since we sent our block and spawned our task, so block
806821
// production hasn't stalled. But the latest block also spawned a task, so it
807822
// will alert if there is actually a stall.
808-
return;
823+
return Ok(());
809824
}
810825

811826
let gap = gap_since_time(Utc::now(), old_block_info);
812827

813-
// If we have restarted, the new alerter will replay missed blocks, so we can ignore any
814-
// send errors to dropped channels. This also avoids panics during process shutdown.
815-
let _ = alert_tx
828+
// If there is an error, we will restart, and the new alerter will replay missed blocks.
829+
alert_tx
816830
.send(Alert::new(
817831
AlertKind::BlockProductionStall { gap },
818832
old_block_info,
819833
mode,
820834
))
821-
.await;
822-
});
835+
.await?;
836+
837+
Ok(())
838+
})
823839
}
824840

825841
/// Check an extrinsic for alerts.

chain-alerter/src/alerts/tests.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -162,11 +162,11 @@ async fn test_startup_alert() -> anyhow::Result<()> {
162162

163163
let (block_info, _, _) = fetch_block_info(&subspace_client, None, None).await?;
164164

165-
alerts::startup_alert(BlockCheckMode::Startup, &alert_tx, &block_info).await?;
165+
alerts::startup_alert(BlockCheckMode::Current, &alert_tx, &block_info).await?;
166166
let alert = alert_rx.try_recv().expect("no alert received");
167167
assert_eq!(
168168
alert,
169-
Alert::new(AlertKind::Startup, block_info, BlockCheckMode::Startup),
169+
Alert::new(AlertKind::Startup, block_info, BlockCheckMode::Current),
170170
);
171171

172172
// There should be no other alerts.
@@ -494,10 +494,10 @@ async fn no_expected_test_slot_time_alert() -> anyhow::Result<()> {
494494

495495
naive_slot_time_monitor
496496
.process_block(BlockCheckMode::Replay, &first_block)
497-
.await;
497+
.await?;
498498
naive_slot_time_monitor
499499
.process_block(BlockCheckMode::Replay, &second_block)
500-
.await;
500+
.await?;
501501

502502
alert_rx
503503
.try_recv()
@@ -525,10 +525,10 @@ async fn expected_test_slot_time_alert() -> anyhow::Result<()> {
525525

526526
strict_slot_time_monitor
527527
.process_block(BlockCheckMode::Replay, &first_block)
528-
.await;
528+
.await?;
529529
strict_slot_time_monitor
530530
.process_block(BlockCheckMode::Replay, &second_block)
531-
.await;
531+
.await?;
532532

533533
let alert = alert_rx
534534
.try_recv()
@@ -576,10 +576,10 @@ async fn expected_test_slot_time_alert_but_not_yet() -> anyhow::Result<()> {
576576

577577
strict_slot_time_monitor
578578
.process_block(BlockCheckMode::Replay, &first_block)
579-
.await;
579+
.await?;
580580
strict_slot_time_monitor
581581
.process_block(BlockCheckMode::Replay, &second_block)
582-
.await;
582+
.await?;
583583

584584
alert_rx
585585
.try_recv()

chain-alerter/src/chain_fork_monitor.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -721,12 +721,9 @@ pub async fn add_blocks_to_chain_fork_state(
721721
// TODO: fix this at a lower level in the init, startup, or fork detection code
722722
// instead
723723
if event.needs_alert() && !mode.is_startup() {
724-
// If we have restarted, the new alerter will replay missed blocks, so we
725-
// can ignore any send errors to dropped channels.
726-
// This also avoids panics during process shutdown.
727-
let _ = alert_tx
724+
alert_tx
728725
.send(Alert::from_chain_fork_event(event, block_info, mode))
729-
.await;
726+
.await?;
730727
}
731728

732729
send_best_fork_block(

0 commit comments

Comments
 (0)