Skip to content

Commit 097ed4a

Browse files
committed
Expose details about the worker start timeout on the exception itself,
so that calling code can apply more precise logic when handling the error.
1 parent fea2284 commit 097ed4a

File tree

3 files changed

+39
-7
lines changed

3 files changed

+39
-7
lines changed

distributed/client.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
WorkerPlugin,
9696
_get_plugin_name,
9797
)
98+
from distributed.exceptions import WorkerStartTimeoutError
9899
from distributed.metrics import time
99100
from distributed.objects import HasWhat, SchedulerInfo, WhoHas
100101
from distributed.protocol import to_serialize
@@ -1651,10 +1652,8 @@ def running_workers(info):
16511652

16521653
while running_workers(info) < n_workers:
16531654
if deadline and time() > deadline:
1654-
raise TimeoutError(
1655-
"Only %d/%d workers arrived after %s"
1656-
% (running_workers(info), n_workers, timeout)
1657-
)
1655+
assert timeout is not None
1656+
raise WorkerStartTimeoutError(running_workers(info), n_workers, timeout)
16581657
await asyncio.sleep(0.1)
16591658
info = await self.scheduler.identity()
16601659
self._scheduler_identity = SchedulerInfo(info)

distributed/deploy/cluster.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from distributed.compatibility import PeriodicCallback
1919
from distributed.core import Status
2020
from distributed.deploy.adaptive import Adaptive
21+
from distributed.exceptions import WorkerStartTimeoutError
2122
from distributed.metrics import time
2223
from distributed.objects import SchedulerInfo
2324
from distributed.utils import (
@@ -610,9 +611,8 @@ def running_workers(info):
610611

611612
while n_workers and running_workers(self.scheduler_info) < n_workers:
612613
if deadline and time() > deadline:
613-
raise TimeoutError(
614-
"Only %d/%d workers arrived after %s"
615-
% (running_workers(self.scheduler_info), n_workers, timeout)
614+
raise WorkerStartTimeoutError(
615+
running_workers(self.scheduler_info), n_workers, timeout
616616
)
617617
await asyncio.sleep(0.1)
618618

distributed/exceptions.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
22

3+
from asyncio import TimeoutError
4+
35

46
class Reschedule(Exception):
57
"""Reschedule this task
@@ -13,3 +15,34 @@ class Reschedule(Exception):
1315
load across the cluster has significantly changed since first scheduling
1416
the task.
1517
"""
18+
19+
20+
class WorkerStartTimeoutError(TimeoutError):
    """Raised when the expected number of workers does not start within the timeout period.

    The three constructor arguments are forwarded unchanged to the base
    exception, so they are stored in ``args`` (in the order given) and are
    exposed through the read-only properties below. Calling code can inspect
    those properties instead of parsing the message text.

    Parameters
    ----------
    available_workers
        Number of workers that were available when the timeout was hit.
    expected_workers
        Number of workers that were expected to be available.
    timeout
        The timeout period that elapsed.
    """

    def __init__(
        self, available_workers: int, expected_workers: int, timeout: float
    ) -> None:
        # Store the values positionally in ``args`` instead of as separate
        # attributes: exceptions are rebuilt from ``args`` when copied or
        # pickled, and ``args`` matches this signature one-to-one.
        super().__init__(available_workers, expected_workers, timeout)

    @property
    def available_workers(self) -> int:
        """Number of workers that are available."""
        return self.args[0]

    @property
    def expected_workers(self) -> int:
        """Number of workers that were expected to be available."""
        return self.args[1]

    @property
    def timeout(self) -> float:
        """Timeout period in seconds."""
        return self.args[2]

    def __str__(self) -> str:
        # ``%s`` (not ``%d``/``%f``) for the timeout so the value is rendered
        # exactly as it was passed in.
        return "Only %d/%d workers arrived after %s" % (
            self.available_workers,
            self.expected_workers,
            self.timeout,
        )

0 commit comments

Comments
 (0)