@@ -79,6 +79,11 @@ func (sm *ServiceManager) Reconcile() {
79
79
sm .ServiceInfo = svc
80
80
sm .PrepareMeta ()
81
81
sm .CheckNodeStatus ()
82
+
83
+ // Failed的情况下,如果service或者node有变动,尝试重新调度
84
+ if sm .ServiceInfo .Status == types .ServiceStatusFailed {
85
+ sm .ServiceInfo .Status = types .ServiceStatusNotReady
86
+ }
82
87
}
83
88
84
89
if sm .ServiceInfo .Status == types .ServiceStatusRunning &&
@@ -104,6 +109,12 @@ func (sm *ServiceManager) Reconcile() {
104
109
return
105
110
}
106
111
112
+ // 服务处于failed状态,就不需要再做任何操作
113
+ if sm .ServiceInfo .Status == types .MemoCreateContainerFailed {
114
+ slog .Info ("[Service Manager] Service is failed......" , "ServiceId" , sm .ServiceInfo .ServiceId )
115
+ return
116
+ }
117
+
107
118
// 所有容器都是正常的,就不需要再做任何操作
108
119
if sm .ServiceInfo .Status == types .ServiceStatusRunning && sm .IsContainerAllReady () {
109
120
slog .Info ("[Service Manager] Service is running ok......" , "ServiceId" , sm .ServiceInfo .ServiceId )
@@ -119,6 +130,14 @@ func (sm *ServiceManager) Reconcile() {
119
130
return
120
131
}
121
132
133
+ // 如果有容器失败,就不再继续
134
+ if sm .ServiceInfo .Deployment .Type != types .DeployTypeSchedule && sm .HasFailedContainer () {
135
+ slog .Info ("[Service Manager] container failed, stop dispatch......" , "ServiceId" , sm .ServiceInfo .ServiceId )
136
+ sm .ServiceInfo .Status = types .ServiceStatusFailed
137
+ db .ServiceUpdate (sm .ServiceInfo )
138
+ return
139
+ }
140
+
122
141
// 先选一个容器做删除
123
142
if c , ok := sm .TryToDeleteOne (); ok {
124
143
nodeId := c .NodeId
@@ -291,6 +310,16 @@ func (sm *ServiceManager) HasPendingContainer() bool {
291
310
return false
292
311
}
293
312
313
+ func (sm * ServiceManager ) HasFailedContainer () bool {
314
+ for _ , c := range sm .ServiceInfo .Containers {
315
+ version := parseVersionByContainerId (c .ContainerName )
316
+ if isContainerFailed (c .State ) && version == sm .ServiceInfo .Version {
317
+ return true
318
+ }
319
+ }
320
+ return false
321
+ }
322
+
294
323
func (sm * ServiceManager ) TryToDeleteOne () (* types.ContainerStatus , bool ) {
295
324
296
325
nodeDeployed := make (map [string ]bool )
@@ -305,7 +334,7 @@ func (sm *ServiceManager) TryToDeleteOne() (*types.ContainerStatus, bool) {
305
334
return c , true
306
335
}
307
336
}
308
- if isContainerFailed (c .State ) || isContainerRemoved (c .State ) {
337
+ if isContainerWarning (c .State ) || isContainerRemoved (c .State ) {
309
338
return c , true
310
339
}
311
340
@@ -340,6 +369,7 @@ func (sm *ServiceManager) StartNextContainer() {
340
369
if len (nodes ) == 0 {
341
370
slog .Error ("[Service Manager] Start Service error: No available nodes" , "ServiceId" , sm .ServiceInfo .ServiceId )
342
371
sm .ServiceInfo .Memo = types .MemoNoAvailableNode
372
+ sm .ServiceInfo .Status = types .ServiceStatusFailed
343
373
return
344
374
}
345
375
@@ -348,6 +378,7 @@ func (sm *ServiceManager) StartNextContainer() {
348
378
if nodeId == "" {
349
379
slog .Error ("[Service Manager] Start Service error: No available nodes" , "ServiceId" , sm .ServiceInfo .ServiceId )
350
380
sm .ServiceInfo .Memo = types .MemoNoAvailableNode
381
+ sm .ServiceInfo .Status = types .ServiceStatusFailed
351
382
return
352
383
}
353
384
@@ -441,6 +472,7 @@ func (sm *ServiceManager) UpdateContainerWhenChanged(cs types.ContainerStatus) {
441
472
ct .Env = cs .Env
442
473
ct .Mounts = cs .Mounts
443
474
ct .Ports = cs .Ports
475
+ ct .ErrorMsg = cs .ErrorMsg
444
476
if ct .State == types .ContainerStatusRunning {
445
477
ct .ErrorMsg = ""
446
478
}
0 commit comments