From a9791b12a0aa53cc893b6f1413a66dfa3f8989b8 Mon Sep 17 00:00:00 2001 From: Artem Shcherbatiuk Date: Tue, 10 Mar 2026 20:34:45 +0100 Subject: [PATCH 1/8] fix: coordinated ShutdownInitiated after bid close tx broadcast is finished --- gateway/rest/router.go | 2 +- manifest/service.go | 2 +- manifest/watchdog.go | 13 ++++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/gateway/rest/router.go b/gateway/rest/router.go index 6442650a..c656cf6d 100644 --- a/gateway/rest/router.go +++ b/gateway/rest/router.go @@ -110,7 +110,7 @@ func newRouter(log log.Logger, addr sdk.Address, pclient provider.Client, ctxCon authorizeProviderMiddleware, requireOwner, ) - + hostnameRouter := authedRouter.PathPrefix(apclient.HostnamePrefix).Subrouter() hostnameRouter.HandleFunc(apclient.MigratePathPrefix, migrateHandler(log, pclient.Hostname(), pclient.ClusterService())). diff --git a/manifest/service.go b/manifest/service.go index 0b64eba6..e3424c5e 100644 --- a/manifest/service.go +++ b/manifest/service.go @@ -263,7 +263,7 @@ loop: if ev.LeaseID.GetProvider() != s.session.Provider().Address().String() { continue } - s.session.Log().Info("lease won", "lease", ev.LeaseID) + s.session.Log().Info("handling lease", "lease", ev.LeaseID) s.handleLease(ev, true) case *dtypes.EventDeploymentUpdated: s.session.Log().Info("update received", "deployment", ev.ID, "version", ev.Hash) diff --git a/manifest/watchdog.go b/manifest/watchdog.go index f961bab5..585b286d 100644 --- a/manifest/watchdog.go +++ b/manifest/watchdog.go @@ -75,17 +75,20 @@ func (wd *watchdog) run() { ID: mtypes.MakeBidID(wd.leaseID.OrderID(), wd.sess.Provider().Address()), Reason: mtypes.LeaseClosedReasonManifestTimeout, } - return runner.NewResult(wd.sess.Client().Tx().BroadcastMsgs(wd.ctx, []sdk.Msg{msg}, aclient.WithResultCodeAsError())) }) case err = <-wd.lc.ShutdownRequest(): } - wd.lc.ShutdownInitiated(err) if runch != nil { - result := <-runch - if err := result.Error(); err != nil { - wd.log.Error("failed closing bid", "err", err) + select { + case result := <-runch: + if err := result.Error(); err != nil { + wd.log.Error("failed closing bid", "err", err) + } + case err = <-wd.lc.ShutdownRequest(): + go func() { <-runch }() } } + wd.lc.ShutdownInitiated(err) } From 15977fa1097c74c15709523d5f32f765d37cec36 Mon Sep 17 00:00:00 2001 From: Artem Shcherbatiuk Date: Tue, 10 Mar 2026 21:58:22 +0100 Subject: [PATCH 2/8] fix: test watchdog stop while witing for broadcast --- manifest/watchdog.go | 2 +- manifest/watchdog_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/manifest/watchdog.go b/manifest/watchdog.go index 585b286d..34f7de64 100644 --- a/manifest/watchdog.go +++ b/manifest/watchdog.go @@ -87,7 +87,7 @@ func (wd *watchdog) run() { wd.log.Error("failed closing bid", "err", err) } case err = <-wd.lc.ShutdownRequest(): - go func() { <-runch }() + // we need this case to skip waiting on runch when stop() is called } } wd.lc.ShutdownInitiated(err) diff --git a/manifest/watchdog_test.go b/manifest/watchdog_test.go index 34191f18..db019609 100644 --- a/manifest/watchdog_test.go +++ b/manifest/watchdog_test.go @@ -29,6 +29,10 @@ type watchdogTestScaffold struct { } func makeWatchdogTestScaffold(t *testing.T, timeout time.Duration) (*watchdog, *watchdogTestScaffold) { + return makeWatchdogTestScaffoldWithBlocking(t, timeout, nil) +} + +func makeWatchdogTestScaffoldWithBlocking(t *testing.T, timeout time.Duration, blockUntilRelease <-chan struct{}) (*watchdog, *watchdogTestScaffold) { scaffold := &watchdogTestScaffold{} scaffold.parentCh = make(chan struct{}) scaffold.doneCh = make(chan dtypes.DeploymentID, 1) @@ -39,6 +43,9 @@ func makeWatchdogTestScaffold(t *testing.T, timeout time.Duration) (*watchdog, * txClientMock := &clientmocks.TxClient{} txClientMock.On("BroadcastMsgs", mock.Anything, mock.Anything, mock.Anything).Run(func(args mock.Arguments) { + if blockUntilRelease != nil { + <-blockUntilRelease + } scaffold.broadcasts <- args.Get(1).([]sdk.Msg) }).Return(&sdk.Result{}, nil) @@ -72,6 +79,7 @@ func TestWatchdogTimeout(t *testing.T) { msg := msgs[0].(*mvbeta.MsgCloseBid) require.Equal(t, scaffold.leaseID, msg.ID.LeaseID()) + require.Equal(t, mtypes.LeaseClosedReasonManifestTimeout, msg.Reason) deploymentID := testutil.ChannelWaitForValue(t, scaffold.doneCh) require.Equal(t, deploymentID, scaffold.leaseID.DeploymentID()) @@ -121,3 +129,22 @@ func TestWatchdogStopsOnParent(t *testing.T) { deploymentID := testutil.ChannelWaitForValue(t, scaffold.doneCh) require.Equal(t, deploymentID, scaffold.leaseID.DeploymentID()) } + +func TestWatchdogStopWhileWaitingForBroadcast(t *testing.T) { + releaseCh := make(chan struct{}) + wd, scaffold := makeWatchdogTestScaffoldWithBlocking(t, 100*time.Millisecond, releaseCh) + + <-time.After(200 * time.Millisecond) + wd.stop() + + select { + case <-wd.lc.Done(): + case <-time.After(5 * time.Second): + t.Fatal("deadlock: stop() blocked while watchdog was waiting on broadcast") + } + + close(releaseCh) + + deploymentID := testutil.ChannelWaitForValue(t, scaffold.doneCh) + require.Equal(t, deploymentID, scaffold.leaseID.DeploymentID()) +} From 09dbce02c81b3c9137d35c3cdd6abee8b7e79c44 Mon Sep 17 00:00:00 2001 From: Artem Shcherbatiuk Date: Wed, 11 Mar 2026 20:24:44 +0100 Subject: [PATCH 3/8] chore: reverted original log.Info for lease won --- manifest/service.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifest/service.go b/manifest/service.go index e3424c5e..0b64eba6 100644 --- a/manifest/service.go +++ b/manifest/service.go @@ -263,7 +263,7 @@ loop: if ev.LeaseID.GetProvider() != s.session.Provider().Address().String() { continue } - s.session.Log().Info("handling lease", "lease", ev.LeaseID) + s.session.Log().Info("lease won", "lease", ev.LeaseID) s.handleLease(ev, true) case *dtypes.EventDeploymentUpdated: s.session.Log().Info("update received", "deployment", ev.ID, "version", ev.Hash) From b1a78b66d610a22324c0b7fb474bcf318757d63e Mon Sep 17 00:00:00 2001 From: Artem Shcherbatiuk Date: Thu, 12 Mar 2026 15:27:17 +0100 Subject: [PATCH 4/8] chore: commit to the broadcast once started --- manifest/watchdog.go | 25 +++++++++++++++++++------ manifest/watchdog_test.go | 11 ++++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/manifest/watchdog.go b/manifest/watchdog.go index 34f7de64..c88c7de2 100644 --- a/manifest/watchdog.go +++ b/manifest/watchdog.go @@ -25,6 +25,7 @@ type watchdog struct { lc lifecycle.Lifecycle sess session.Session ctx context.Context + cancel context.CancelFunc log log.Logger } @@ -36,13 +37,11 @@ func newWatchdog(sess session.Session, parent <-chan struct{}, done chan<- dtype lc: lifecycle.New(), sess: sess, ctx: ctx, + cancel: cancel, log: sess.Log().With("leaseID", leaseID), } - go func() { - result.lc.WatchChannel(parent) - cancel() - }() + go result.lc.WatchChannel(parent) go func() { <-result.lc.Done() @@ -54,12 +53,22 @@ func newWatchdog(sess session.Session, parent <-chan struct{}, done chan<- dtype return result } +// stop signals the watchdog to exit without closing the bid. +// Called when: (1) manifest received within the timeout, (2) manifest manager is stopped. func (wd *watchdog) stop() { wd.lc.ShutdownAsync(nil) } +// run waits for the manifest timeout, then broadcasts MsgCloseBid. +// +// Rule: once the broadcast is started, we commit to it - it must complete regardless of +// any concurrent stop() or parent shutdown. This prevents leaving an open bid on-chain. +// +// wd.ctx is derived from context.Background() and is only canceled via defer wd.cancel() +// at the end of run(), so the broadcast context is always alive while run() executes. func (wd *watchdog) run() { defer wd.lc.ShutdownCompleted() + defer wd.cancel() var runch <-chan runner.Result var err error @@ -78,16 +87,20 @@ func (wd *watchdog) run() { return runner.NewResult(wd.sess.Client().Tx().BroadcastMsgs(wd.ctx, []sdk.Msg{msg}, aclient.WithResultCodeAsError())) }) case err = <-wd.lc.ShutdownRequest(): + // Manifest received or parent shutdown before timeout - exit without closing the bid. } - if runch != nil { + // ShutdownRequest may arrive while we wait for the broadcast result + // consume it to unblock the sender, but keep looping until runch delivers. + // wd.ctx is not canceled until run() returns, so none of these signals can interrupt the in-flight broadcast. + for runch != nil { select { case result := <-runch: if err := result.Error(); err != nil { wd.log.Error("failed closing bid", "err", err) } + runch = nil case err = <-wd.lc.ShutdownRequest(): - // we need this case to skip waiting on runch when stop() is called } } wd.lc.ShutdownInitiated(err) diff --git a/manifest/watchdog_test.go b/manifest/watchdog_test.go index db019609..88a6ddf7 100644 --- a/manifest/watchdog_test.go +++ b/manifest/watchdog_test.go @@ -137,14 +137,19 @@ func TestWatchdogStopWhileWaitingForBroadcast(t *testing.T) { <-time.After(200 * time.Millisecond) wd.stop() + close(releaseCh) + select { case <-wd.lc.Done(): case <-time.After(5 * time.Second): - t.Fatal("deadlock: stop() blocked while watchdog was waiting on broadcast") + t.Fatal("deadlock: watchdog did not complete after broadcast") } - close(releaseCh) - deploymentID := testutil.ChannelWaitForValue(t, scaffold.doneCh) require.Equal(t, deploymentID, scaffold.leaseID.DeploymentID()) + + broadcasts := testutil.ChannelWaitForValue(t, scaffold.broadcasts) + msgs := broadcasts.([]sdk.Msg) + require.Len(t, msgs, 1) + require.Equal(t, mtypes.LeaseClosedReasonManifestTimeout, msgs[0].(*mvbeta.MsgCloseBid).Reason) } From 85f822633db7d17ced855aff347df7bc938048a7 Mon Sep 17 00:00:00 2001 From: Artem Shcherbatiuk Date: Fri, 13 Mar 2026 17:57:35 +0100 Subject: [PATCH 5/8] fix: added context.WithTimeout to the broadcast message --- cmd/provider-services/cmd/run.go | 2 ++ config.go | 1 + manifest/config.go | 1 + manifest/service.go | 2 +- manifest/watchdog.go | 45 ++++++++++++++-------------- manifest/watchdog_test.go | 50 +++++++++++++++++++++++++++----- service.go | 1 + 7 files changed, 69 insertions(+), 33 deletions(-) diff --git a/cmd/provider-services/cmd/run.go b/cmd/provider-services/cmd/run.go index 12c41b71..48bcbf4e 100644 --- a/cmd/provider-services/cmd/run.go +++ b/cmd/provider-services/cmd/run.go @@ -474,6 +474,7 @@ func doRunCmd(ctx context.Context, cmd *cobra.Command, _ []string) error { deploymentRuntimeClass := viper.GetString(FlagDeploymentRuntimeClass) bidTimeout := viper.GetDuration(FlagBidTimeout) manifestTimeout := viper.GetDuration(FlagManifestTimeout) + broadcastTimeout := viper.GetDuration(FlagTxBroadcastTimeout) metricsListener := viper.GetString(FlagMetricsListener) providerConfig := viper.GetString(FlagProviderConfig) cachedResultMaxAge := viper.GetDuration(FlagCachedResultMaxAge) @@ -576,6 +577,7 @@ func doRunCmd(ctx context.Context, cmd *cobra.Command, _ []string) error { config.DeploymentIngressDomain = deploymentIngressDomain config.BidTimeout = bidTimeout config.ManifestTimeout = manifestTimeout + config.BroadcastTimeout = broadcastTimeout config.MonitorMaxRetries = monitorMaxRetries config.MonitorRetryPeriod = monitorRetryPeriod config.MonitorRetryPeriodJitter = monitorRetryPeriodJitter diff --git a/config.go b/config.go index e2248d27..99091cc9 100644 --- a/config.go +++ b/config.go @@ -21,6 +21,7 @@ type Config struct { BidDeposit sdk.Coin BidTimeout time.Duration ManifestTimeout time.Duration + BroadcastTimeout time.Duration BalanceCheckerCfg BalanceCheckerConfig Attributes attrtypes.Attributes MaxGroupVolumes int diff --git a/manifest/config.go b/manifest/config.go index b909aef8..4cee79d4 100644 --- a/manifest/config.go +++ b/manifest/config.go @@ -5,6 +5,7 @@ import "time" type ServiceConfig struct { HTTPServicesRequireAtLeastOneHost bool ManifestTimeout time.Duration + BroadcastTimeout time.Duration RPCQueryTimeout time.Duration CachedResultMaxAge time.Duration } diff --git a/manifest/service.go b/manifest/service.go index 0b64eba6..752cbf06 100644 --- a/manifest/service.go +++ b/manifest/service.go @@ -351,7 +351,7 @@ func (s *service) handleLease(ev event.LeaseWon, isNew bool) { if isNew && s.config.ManifestTimeout > time.Duration(0) { // Create watchdog if it does not exist AND a manifest has not been received yet if watchdog := s.watchdogs[ev.LeaseID.DeploymentID()]; watchdog == nil { - watchdog = newWatchdog(s.session, s.lc.ShuttingDown(), s.watchdogch, ev.LeaseID, s.config.ManifestTimeout) + watchdog = newWatchdog(s.session, s.lc.ShuttingDown(), s.watchdogch, ev.LeaseID, s.config.ManifestTimeout, s.config.BroadcastTimeout) s.watchdogs[ev.LeaseID.DeploymentID()] = watchdog } } diff --git a/manifest/watchdog.go b/manifest/watchdog.go index c88c7de2..a37cb9b1 100644 --- a/manifest/watchdog.go +++ b/manifest/watchdog.go @@ -20,25 +20,22 @@ import ( ) type watchdog struct { - leaseID mtypes.LeaseID - timeout time.Duration - lc lifecycle.Lifecycle - sess session.Session - ctx context.Context - cancel context.CancelFunc - log log.Logger + leaseID mtypes.LeaseID + timeout time.Duration + broadcastTimeout time.Duration + lc lifecycle.Lifecycle + sess session.Session + log log.Logger } -func newWatchdog(sess session.Session, parent <-chan struct{}, done chan<- dtypes.DeploymentID, leaseID mtypes.LeaseID, timeout time.Duration) *watchdog { - ctx, cancel := context.WithCancel(context.Background()) +func newWatchdog(sess session.Session, parent <-chan struct{}, done chan<- dtypes.DeploymentID, leaseID mtypes.LeaseID, timeout, broadcastTimeout time.Duration) *watchdog { result := &watchdog{ - leaseID: leaseID, - timeout: timeout, - lc: lifecycle.New(), - sess: sess, - ctx: ctx, - cancel: cancel, - log: sess.Log().With("leaseID", leaseID), + leaseID: leaseID, + timeout: timeout, + broadcastTimeout: broadcastTimeout, + lc: lifecycle.New(), + sess: sess, + log: sess.Log().With("leaseID", leaseID), } go result.lc.WatchChannel(parent) @@ -63,12 +60,9 @@ func (wd *watchdog) stop() { // // Rule: once the broadcast is started, we commit to it - it must complete regardless of // any concurrent stop() or parent shutdown. This prevents leaving an open bid on-chain. -// -// wd.ctx is derived from context.Background() and is only canceled via defer wd.cancel() -// at the end of run(), so the broadcast context is always alive while run() executes. +// broadcastTimeout bounds how long we wait for the RPC response to prevent a permanent hang. func (wd *watchdog) run() { defer wd.lc.ShutdownCompleted() - defer wd.cancel() var runch <-chan runner.Result var err error @@ -80,19 +74,22 @@ func (wd *watchdog) run() { wd.log.Info("watchdog closing bid") runch = runner.Do(func() runner.Result { + ctx, cancel := context.WithTimeout(context.Background(), wd.broadcastTimeout) + defer cancel() + msg := &mvbeta.MsgCloseBid{ ID: mtypes.MakeBidID(wd.leaseID.OrderID(), wd.sess.Provider().Address()), Reason: mtypes.LeaseClosedReasonManifestTimeout, } - return runner.NewResult(wd.sess.Client().Tx().BroadcastMsgs(wd.ctx, []sdk.Msg{msg}, aclient.WithResultCodeAsError())) + return runner.NewResult(wd.sess.Client().Tx().BroadcastMsgs(ctx, []sdk.Msg{msg}, aclient.WithResultCodeAsError())) }) case err = <-wd.lc.ShutdownRequest(): // Manifest received or parent shutdown before timeout - exit without closing the bid. } - // ShutdownRequest may arrive while we wait for the broadcast result - // consume it to unblock the sender, but keep looping until runch delivers. - // wd.ctx is not canceled until run() returns, so none of these signals can interrupt the in-flight broadcast. + // ShutdownRequest may arrive while we wait for the broadcast result. + // Consume it to unblock the sender, but keep looping until runch delivers. + // The broadcast context is independent and bounded by broadcastTimeout. for runch != nil { select { case result := <-runch: diff --git a/manifest/watchdog_test.go b/manifest/watchdog_test.go index 88a6ddf7..7a10c807 100644 --- a/manifest/watchdog_test.go +++ b/manifest/watchdog_test.go @@ -1,6 +1,7 @@ package manifest import ( + "context" "testing" "time" @@ -10,6 +11,7 @@ import ( sdk "github.com/cosmos/cosmos-sdk/types" clientmocks "pkg.akt.dev/go/mocks/node/client" + aclient "pkg.akt.dev/go/node/client/v1beta3" dtypes "pkg.akt.dev/go/node/deployment/v1" mtypes "pkg.akt.dev/go/node/market/v1" mvbeta "pkg.akt.dev/go/node/market/v1beta5" @@ -29,10 +31,14 @@ type watchdogTestScaffold struct { } func makeWatchdogTestScaffold(t *testing.T, timeout time.Duration) (*watchdog, *watchdogTestScaffold) { - return makeWatchdogTestScaffoldWithBlocking(t, timeout, nil) + return makeWatchdogTestScaffoldFull(t, timeout, 30*time.Second, nil) } func makeWatchdogTestScaffoldWithBlocking(t *testing.T, timeout time.Duration, blockUntilRelease <-chan struct{}) (*watchdog, *watchdogTestScaffold) { + return makeWatchdogTestScaffoldFull(t, timeout, 30*time.Second, blockUntilRelease) +} + +func makeWatchdogTestScaffoldFull(t *testing.T, timeout, broadcastTimeout time.Duration, blockUntilRelease <-chan struct{}) (*watchdog, *watchdogTestScaffold) { scaffold := &watchdogTestScaffold{} scaffold.parentCh = make(chan struct{}) scaffold.doneCh = make(chan dtypes.DeploymentID, 1) @@ -42,12 +48,19 @@ func makeWatchdogTestScaffoldWithBlocking(t *testing.T, timeout time.Duration, b scaffold.broadcasts = make(chan []sdk.Msg, 1) txClientMock := &clientmocks.TxClient{} - txClientMock.On("BroadcastMsgs", mock.Anything, mock.Anything, mock.Anything).Run(func(args mock.Arguments) { - if blockUntilRelease != nil { - <-blockUntilRelease - } - scaffold.broadcasts <- args.Get(1).([]sdk.Msg) - }).Return(&sdk.Result{}, nil) + txClientMock.EXPECT(). + BroadcastMsgs(mock.Anything, mock.Anything, mock.Anything). + RunAndReturn(func(ctx context.Context, msgs []sdk.Msg, _ ...aclient.BroadcastOption) (any, error) { + if blockUntilRelease != nil { + select { + case <-blockUntilRelease: + case <-ctx.Done(): + return nil, ctx.Err() + } + } + scaffold.broadcasts <- msgs + return &sdk.Result{}, nil + }) scaffold.client = &clientmocks.Client{} scaffold.client.On("Tx").Return(txClientMock) @@ -55,7 +68,7 @@ func makeWatchdogTestScaffoldWithBlocking(t *testing.T, timeout time.Duration, b require.NotNil(t, sess.Client()) - wd := newWatchdog(sess, scaffold.parentCh, scaffold.doneCh, scaffold.leaseID, timeout) + wd := newWatchdog(sess, scaffold.parentCh, scaffold.doneCh, scaffold.leaseID, timeout, broadcastTimeout) return wd, scaffold } @@ -130,6 +143,27 @@ func TestWatchdogStopsOnParent(t *testing.T) { require.Equal(t, deploymentID, scaffold.leaseID.DeploymentID()) } +func TestWatchdogBroadcastTimeout(t *testing.T) { + // Mock blocks forever; broadcast context expires after 10ms → watchdog exits cleanly. + neverRelease := make(chan struct{}) + wd, scaffold := makeWatchdogTestScaffoldFull(t, 100*time.Millisecond, 10*time.Millisecond, neverRelease) + + select { + case <-wd.lc.Done(): + case <-time.After(5 * time.Second): + t.Fatal("watchdog hung after broadcast timeout") + } + + select { + case <-scaffold.broadcasts: + t.Fatal("broadcast should not have completed") + default: + } + + deploymentID := testutil.ChannelWaitForValue(t, scaffold.doneCh) + require.Equal(t, deploymentID, scaffold.leaseID.DeploymentID()) +} + func TestWatchdogStopWhileWaitingForBroadcast(t *testing.T) { releaseCh := make(chan struct{}) wd, scaffold := makeWatchdogTestScaffoldWithBlocking(t, 100*time.Millisecond, releaseCh) diff --git a/service.go b/service.go index 175ed211..c0a3a1aa 100644 --- a/service.go +++ b/service.go @@ -105,6 +105,7 @@ func NewService(ctx context.Context, manifestConfig := manifest.ServiceConfig{ HTTPServicesRequireAtLeastOneHost: !cfg.DeploymentIngressStaticHosts, ManifestTimeout: cfg.ManifestTimeout, + BroadcastTimeout: cfg.BroadcastTimeout, RPCQueryTimeout: cfg.RPCQueryTimeout, CachedResultMaxAge: cfg.CachedResultMaxAge, } From d4c2e18c005083e1374f210e81587a24a2c7968e Mon Sep 17 00:00:00 2001 From: Artem Shcherbatiuk Date: Fri, 13 Mar 2026 18:30:39 +0100 Subject: [PATCH 6/8] fix: added default BroadcastTimeout --- config.go | 1 + 1 file changed, 1 insertion(+) diff --git a/config.go b/config.go index 99091cc9..94e2d65a 100644 --- a/config.go +++ b/config.go @@ -34,6 +34,7 @@ func NewDefaultConfig() Config { return Config{ ClusterWaitReadyDuration: time.Second * 10, BidDeposit: mtypes.DefaultBidMinDeposit, + BroadcastTimeout: 30 * time.Second, BalanceCheckerCfg: BalanceCheckerConfig{ LeaseFundsCheckInterval: 1 * time.Minute, WithdrawalPeriod: 24 * time.Hour, From 914b5b64d78c893ca2c4d5246629aa77253d356c Mon Sep 17 00:00:00 2001 From: Artem Shcherbatiuk Date: Fri, 13 Mar 2026 19:01:39 +0100 Subject: [PATCH 7/8] fix: set cancel timeout to 12s --- cmd/provider-services/cmd/flags.go | 2 +- config.go | 2 +- manifest/watchdog.go | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/provider-services/cmd/flags.go b/cmd/provider-services/cmd/flags.go index 96afee1e..65188bf2 100644 --- a/cmd/provider-services/cmd/flags.go +++ b/cmd/provider-services/cmd/flags.go @@ -226,7 +226,7 @@ func addRunFlags(cmd *cobra.Command) error { return err } - cmd.Flags().Duration(FlagTxBroadcastTimeout, 30*time.Second, "tx broadcast timeout. defaults to 30s") + cmd.Flags().Duration(FlagTxBroadcastTimeout, 12*time.Second, "tx broadcast timeout") if err := viper.BindPFlag(FlagTxBroadcastTimeout, cmd.Flags().Lookup(FlagTxBroadcastTimeout)); err != nil { return err } diff --git a/config.go b/config.go index 94e2d65a..cf938c51 100644 --- a/config.go +++ b/config.go @@ -34,7 +34,7 @@ func NewDefaultConfig() Config { return Config{ ClusterWaitReadyDuration: time.Second * 10, BidDeposit: mtypes.DefaultBidMinDeposit, - BroadcastTimeout: 30 * time.Second, + BroadcastTimeout: 12 * time.Second, BalanceCheckerCfg: BalanceCheckerConfig{ LeaseFundsCheckInterval: 1 * time.Minute, WithdrawalPeriod: 24 * time.Hour, diff --git a/manifest/watchdog.go b/manifest/watchdog.go index a37cb9b1..7ed1f681 100644 --- a/manifest/watchdog.go +++ b/manifest/watchdog.go @@ -98,6 +98,7 @@ func (wd *watchdog) run() { } runch = nil case err = <-wd.lc.ShutdownRequest(): + wd.log.Info("watchdog shutdown requested, waiting for bid close tx to complete") } } wd.lc.ShutdownInitiated(err) From 199c391782c885a969c7303af9a4185119d7aab4 Mon Sep 17 00:00:00 2001 From: Artem Shcherbatiuk Date: Fri, 20 Mar 2026 17:43:19 +0100 Subject: [PATCH 8/8] fix: added broadcastTimeout validation --- cmd/provider-services/cmd/run.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/provider-services/cmd/run.go b/cmd/provider-services/cmd/run.go index 088d6f83..9b0e32b7 100644 --- a/cmd/provider-services/cmd/run.go +++ b/cmd/provider-services/cmd/run.go @@ -577,6 +577,10 @@ func doRunCmd(ctx context.Context, cmd *cobra.Command, _ []string) error { config.DeploymentIngressDomain = deploymentIngressDomain config.BidTimeout = bidTimeout config.ManifestTimeout = manifestTimeout + if broadcastTimeout <= 0 { + logger.Warn("tx-broadcast-timeout must be positive, using default", "invalid", broadcastTimeout, "default", 12*time.Second) + broadcastTimeout = 12 * time.Second + } config.BroadcastTimeout = broadcastTimeout config.MonitorMaxRetries = monitorMaxRetries config.MonitorRetryPeriod = monitorRetryPeriod