Skip to content

Commit e9f22ef

Browse files
authored
Merge pull request #77 from openpmix/master
Fork Sync: Update from parent repository
2 parents 3483da5 + eb577d4 commit e9f22ef

27 files changed

+197
-1327
lines changed

src/mca/ess/base/ess_base_bootstrap.c

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -223,6 +223,9 @@ int prte_ess_base_bootstrap(void)
223223
if (NULL != cluster) {
224224
free(cluster);
225225
}
226+
if (NULL != ctrlhost) {
227+
free(ctrlhost);
228+
}
226229
if (NULL != dvmnodes) {
227230
free(dvmnodes);
228231
}

src/mca/ess/base/ess_base_std_prted.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -446,8 +446,9 @@ int prte_ess_base_prted_setup(void)
446446
error:
447447
pmix_show_help("help-prte-runtime.txt", "prte_init:startup:internal-failure", true,
448448
error, PRTE_ERROR_NAME(ret), ret);
449-
/* remove our use of the session directory tree */
450-
PMIX_RELEASE(jdata);
449+
if (NULL != jdata) {
450+
PMIX_RELEASE(jdata);
451+
}
451452
return PRTE_ERR_SILENT;
452453
}
453454

src/mca/grpcomm/direct/grpcomm_direct_group.c

Lines changed: 64 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
* Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
99
* Copyright (c) 2014-2017 Research Organization for Information Science
1010
* and Technology (RIST). All rights reserved.
11-
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
11+
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
1212
* $COPYRIGHT$
1313
*
1414
* Additional copyrights may follow
@@ -226,19 +226,43 @@ static void group(int sd, short args, void *cbdata)
226226
PMIx_Info_list_convert(grpinfo, &darray);
227227
info = (pmix_info_t*)darray.array;
228228
ninfo = darray.size;
229-
PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE);
229+
rc = PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE);
230+
if (PMIX_SUCCESS != rc) {
231+
PMIX_ERROR_LOG(rc);
232+
PMIX_DATA_BUFFER_RELEASE(relay);
233+
PMIX_DESTRUCT(&sig);
234+
goto error;
235+
}
230236
if (0 < ninfo) {
231-
PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO);
237+
rc = PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO);
238+
if (PMIX_SUCCESS != rc) {
239+
PMIX_ERROR_LOG(rc);
240+
PMIX_DATA_BUFFER_RELEASE(relay);
241+
PMIX_DESTRUCT(&sig);
242+
goto error;
243+
}
232244
}
233245
PMIX_DATA_ARRAY_DESTRUCT(&darray);
234246

235247
// pack any endpts
236248
PMIx_Info_list_convert(endpts, &darray);
237249
info = (pmix_info_t*)darray.array;
238250
ninfo = darray.size;
239-
PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE);
251+
rc = PMIx_Data_pack(NULL, relay, &ninfo, 1, PMIX_SIZE);
252+
if (PMIX_SUCCESS != rc) {
253+
PMIX_ERROR_LOG(rc);
254+
PMIX_DATA_BUFFER_RELEASE(relay);
255+
PMIX_DESTRUCT(&sig);
256+
goto error;
257+
}
240258
if (0 < ninfo) {
241-
PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO);
259+
rc = PMIx_Data_pack(NULL, relay, info, ninfo, PMIX_INFO);
260+
if (PMIX_SUCCESS != rc) {
261+
PMIX_ERROR_LOG(rc);
262+
PMIX_DATA_BUFFER_RELEASE(relay);
263+
PMIX_DESTRUCT(&sig);
264+
goto error;
265+
}
242266
}
243267
PMIX_DATA_ARRAY_DESTRUCT(&darray);
244268
}
@@ -298,7 +322,7 @@ void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender,
298322
prte_namelist_t *nm;
299323
pmix_data_array_t darray;
300324
pmix_status_t st;
301-
pmix_info_t *info = NULL, *endpts, *grpinfo;
325+
pmix_info_t *info = NULL, *endpts, *grpinfo = NULL;
302326
prte_grpcomm_direct_group_signature_t *sig = NULL;
303327
pmix_data_buffer_t *reply;
304328
prte_grpcomm_group_t *coll;
@@ -386,7 +410,9 @@ void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender,
386410
rc = PMIx_Data_unpack(NULL, buffer, &nendpts, &cnt, PMIX_SIZE);
387411
if (PMIX_SUCCESS != rc) {
388412
PMIX_ERROR_LOG(rc);
389-
PMIX_INFO_FREE(grpinfo, ngrpinfo);
413+
if (NULL != grpinfo) {
414+
PMIX_INFO_FREE(grpinfo, ngrpinfo);
415+
}
390416
PMIX_RELEASE(sig);
391417
return;
392418
}
@@ -396,7 +422,9 @@ void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender,
396422
rc = PMIx_Data_unpack(NULL, buffer, endpts, &cnt, PMIX_INFO);
397423
if (PMIX_SUCCESS != rc) {
398424
PMIX_ERROR_LOG(rc);
399-
PMIX_INFO_FREE(grpinfo, ngrpinfo);
425+
if (NULL != grpinfo) {
426+
PMIX_INFO_FREE(grpinfo, ngrpinfo);
427+
}
400428
PMIX_INFO_FREE(endpts, nendpts);
401429
PMIX_RELEASE(sig);
402430
return;
@@ -619,19 +647,43 @@ void prte_grpcomm_direct_grp_recv(int status, pmix_proc_t *sender,
619647
PMIx_Info_list_convert(coll->grpinfo, &darray);
620648
info = (pmix_info_t*)darray.array;
621649
ninfo = darray.size;
622-
PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE);
650+
rc = PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE);
651+
if (PMIX_SUCCESS != rc) {
652+
PMIX_ERROR_LOG(rc);
653+
PMIX_DATA_BUFFER_RELEASE(reply);
654+
PMIX_RELEASE(sig);
655+
return;
656+
}
623657
if (0 < ninfo) {
624-
PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO);
658+
rc = PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO);
659+
if (PMIX_SUCCESS != rc) {
660+
PMIX_ERROR_LOG(rc);
661+
PMIX_DATA_BUFFER_RELEASE(reply);
662+
PMIX_RELEASE(sig);
663+
return;
664+
}
625665
}
626666
PMIX_DATA_ARRAY_DESTRUCT(&darray);
627667

628668
// pack any endpts
629669
PMIx_Info_list_convert(coll->endpts, &darray);
630670
info = (pmix_info_t*)darray.array;
631671
ninfo = darray.size;
632-
PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE);
672+
rc = PMIx_Data_pack(NULL, reply, &ninfo, 1, PMIX_SIZE);
673+
if (PMIX_SUCCESS != rc) {
674+
PMIX_ERROR_LOG(rc);
675+
PMIX_DATA_BUFFER_RELEASE(reply);
676+
PMIX_RELEASE(sig);
677+
return;
678+
}
633679
if (0 < ninfo) {
634-
PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO);
680+
rc =PMIx_Data_pack(NULL, reply, info, ninfo, PMIX_INFO);
681+
if (PMIX_SUCCESS != rc) {
682+
PMIX_ERROR_LOG(rc);
683+
PMIX_DATA_BUFFER_RELEASE(reply);
684+
PMIX_RELEASE(sig);
685+
return;
686+
}
635687
}
636688
PMIX_DATA_ARRAY_DESTRUCT(&darray);
637689
}

src/mca/odls/base/base.h

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2011-2020 Cisco Systems, Inc. All rights reserved
1313
* Copyright (c) 2013 Los Alamos National Security, LLC. All rights reserved.
1414
* Copyright (c) 2017-2019 Intel, Inc. All rights reserved.
15-
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
15+
* Copyright (c) 2021-2025 Nanook Consulting All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -56,7 +56,6 @@ typedef struct {
5656
char **ev_threads; // event progress thread names
5757
int next_base; // counter to load-level thread use
5858
bool signal_direct_children_only;
59-
pmix_lock_t lock;
6059
char *exec_agent;
6160
} prte_odls_globals_t;
6261

src/mca/odls/base/odls_base_bind.c

Lines changed: 4 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -275,11 +275,10 @@ void prte_odls_base_set(prte_odls_spawn_caddy_t *cd, int write_fd)
275275
hwloc_bitmap_free(cpuset);
276276
/* if we got an error and this wasn't a default binding policy, then report it */
277277
if (rc < 0 && PRTE_BINDING_POLICY_IS_SET(jobdat->map->binding)) {
278-
char *tmp = NULL;
279278
if (errno == ENOSYS) {
280-
msg = "hwloc indicates cpu binding not supported";
279+
msg = strdup("hwloc indicates cpu binding not supported");
281280
} else if (errno == EXDEV) {
282-
msg = "hwloc indicates cpu binding cannot be enforced";
281+
msg = strdup("hwloc indicates cpu binding cannot be enforced");
283282
} else {
284283
pmix_asprintf(&msg, "hwloc_set_cpubind returned \"%s\" for bitmap \"%s\"",
285284
prte_strerror(rc), child->cpuset);
@@ -291,19 +290,13 @@ void prte_odls_base_set(prte_odls_spawn_caddy_t *cd, int write_fd)
291290
"binding generic error",
292291
prte_process_info.nodename, context->app, msg,
293292
__FILE__, __LINE__);
293+
free(msg); // silence static analyzer warning
294294
} else {
295295
send_warn_show_help(write_fd, "help-prte-odls-default.txt",
296296
"not bound", prte_process_info.nodename,
297297
context->app, msg, __FILE__, __LINE__);
298-
if (NULL != tmp) {
299-
free(tmp);
300-
free(msg);
301-
}
302-
return;
303-
}
304-
if (NULL != tmp) {
305-
free(tmp);
306298
free(msg);
299+
return;
307300
}
308301
}
309302

src/mca/odls/base/odls_base_default_fns.c

Lines changed: 26 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1252,14 +1252,15 @@ void prte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
12521252
int j, idx;
12531253
int total_num_local_procs = 0;
12541254
prte_odls_launch_local_t *caddy = (prte_odls_launch_local_t *) cbdata;
1255-
prte_job_t *jobdat;
1255+
prte_job_t *jobdat, *parent;
12561256
pmix_nspace_t job;
12571257
prte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
1258-
bool index_argv;
1258+
bool index_argv, inherit;
12591259
char *msg, **xfer;
12601260
prte_odls_spawn_caddy_t *cd;
12611261
prte_event_base_t *evb;
12621262
prte_schizo_base_module_t *schizo;
1263+
pmix_proc_t *nptr;
12631264
PRTE_HIDE_UNUSED_PARAMS(fd, sd);
12641265

12651266
PMIX_ACQUIRE_OBJECT(caddy);
@@ -1352,6 +1353,20 @@ void prte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
13521353
}
13531354
}
13541355

1356+
// see if we have a parent in case of inheritance
1357+
nptr = NULL;
1358+
prte_get_attribute(&jobdat->attributes, PRTE_JOB_LAUNCH_PROXY, (void **) &nptr, PMIX_PROC);
1359+
if (NULL != nptr) {
1360+
parent = prte_get_job_data_object(nptr->nspace);
1361+
if (NULL != parent) {
1362+
inherit = prte_get_attribute(&parent->attributes, PRTE_JOB_INHERIT, NULL, PMIX_BOOL);
1363+
} else {
1364+
inherit = false;
1365+
}
1366+
} else {
1367+
inherit = false;
1368+
}
1369+
13551370
for (j = 0; j < jobdat->apps->size; j++) {
13561371
app = (prte_app_context_t *) pmix_pointer_array_get_item(jobdat->apps, j);
13571372
if (NULL == app) {
@@ -1395,6 +1410,10 @@ void prte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
13951410
}
13961411

13971412
// process any provided env directives
1413+
if (inherit) {
1414+
// start with the parent's directives
1415+
process_envars(parent, app);
1416+
}
13981417
process_envars(jobdat, app);
13991418

14001419

@@ -2151,6 +2170,11 @@ int prte_odls_base_default_restart_proc(prte_proc_t *child,
21512170
child->rml_uri = NULL;
21522171
}
21532172
app = (prte_app_context_t *) pmix_pointer_array_get_item(jobdat->apps, child->app_idx);
2173+
if (NULL == app) {
2174+
PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
2175+
rc = PRTE_ERR_NOT_FOUND;
2176+
goto CLEANUP;
2177+
}
21542178

21552179
/* setup the path */
21562180
if (PRTE_SUCCESS != (rc = setup_path(app, &wdir))) {

src/mca/odls/base/odls_base_frame.c

Lines changed: 3 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -78,7 +78,6 @@ prte_odls_globals_t prte_odls_globals = {
7878
.ev_threads = NULL,
7979
.next_base = 0,
8080
.signal_direct_children_only = false,
81-
.lock = PMIX_LOCK_STATIC_INIT,
8281
.exec_agent = NULL
8382
};
8483

@@ -126,7 +125,6 @@ void prte_odls_base_harvest_threads(void)
126125
{
127126
int i;
128127

129-
PMIX_ACQUIRE_THREAD(&prte_odls_globals.lock);
130128
if (0 < prte_odls_globals.num_threads) {
131129
/* stop the progress threads */
132130
if (NULL != prte_odls_globals.ev_threads) {
@@ -144,18 +142,15 @@ void prte_odls_base_harvest_threads(void)
144142
prte_odls_globals.ev_threads = NULL;
145143
}
146144
}
147-
PMIX_RELEASE_THREAD(&prte_odls_globals.lock);
148145
}
149146

150147
void prte_odls_base_start_threads(prte_job_t *jdata)
151148
{
152149
int i;
153150
char *tmp;
154151

155-
PMIX_ACQUIRE_THREAD(&prte_odls_globals.lock);
156152
/* only do this once */
157153
if (NULL != prte_odls_globals.ev_threads) {
158-
PMIX_RELEASE_THREAD(&prte_odls_globals.lock);
159154
return;
160155
}
161156

@@ -205,7 +200,6 @@ void prte_odls_base_start_threads(prte_job_t *jdata)
205200
free(tmp);
206201
}
207202
}
208-
PMIX_RELEASE_THREAD(&prte_odls_globals.lock);
209203
}
210204

211205
static int prte_odls_base_close(void)
@@ -230,8 +224,6 @@ static int prte_odls_base_close(void)
230224

231225
prte_odls_base_harvest_threads();
232226

233-
PMIX_DESTRUCT_LOCK(&prte_odls_globals.lock);
234-
235227
return pmix_mca_base_framework_components_close(&prte_odls_base_framework, NULL);
236228
}
237229

@@ -247,9 +239,6 @@ static int prte_odls_base_open(pmix_mca_base_open_flag_t flags)
247239
bool xterm_hold;
248240
sigset_t unblock;
249241

250-
PMIX_CONSTRUCT_LOCK(&prte_odls_globals.lock);
251-
prte_odls_globals.lock.active = false; // start with nobody having the thread
252-
253242
/* initialize the global array of local children */
254243
prte_local_children = PMIX_NEW(pmix_pointer_array_t);
255244
if (PRTE_SUCCESS
@@ -263,12 +252,9 @@ static int prte_odls_base_open(pmix_mca_base_open_flag_t flags)
263252
prte_odls_globals.xtermcmd = NULL;
264253

265254
/* ensure that SIGCHLD is unblocked as we need to capture it */
266-
if (0 != sigemptyset(&unblock)) {
267-
return PRTE_ERROR;
268-
}
269-
if (0 != sigaddset(&unblock, SIGCHLD)) {
270-
return PRTE_ERROR;
271-
}
255+
sigemptyset(&unblock);
256+
sigaddset(&unblock, SIGCHLD);
257+
272258
if (0 != sigprocmask(SIG_UNBLOCK, &unblock, NULL)) {
273259
return PRTE_ERR_NOT_SUPPORTED;
274260
}

src/mca/plm/base/plm_base_launch_support.c

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -191,9 +191,16 @@ void prte_plm_base_allocation_complete(int fd, short args, void *cbdata)
191191
* to map so we can see where the procs would have
192192
* gone - so skip to the mapping state */
193193
if (prte_get_attribute(&caddy->jdata->attributes, PRTE_JOB_DO_NOT_LAUNCH, NULL, PMIX_BOOL)) {
194-
PRTE_ACTIVATE_JOB_STATE(caddy->jdata, PRTE_JOB_STATE_DAEMONS_REPORTED);
195194
node = (prte_node_t*)pmix_pointer_array_get_item(prte_node_pool, 0);
195+
if (NULL == node) {
196+
// should never happen
197+
PRTE_ERROR_LOG(PRTE_ERR_NOT_FOUND);
198+
PRTE_ACTIVATE_JOB_STATE(caddy->jdata, PRTE_JOB_STATE_FAILED_TO_START);
199+
PMIX_RELEASE(caddy);
200+
return;
201+
}
196202
prte_rmaps_base.require_hwtcpus = !prte_hwloc_base_core_cpus(node->topology->topo);
203+
PRTE_ACTIVATE_JOB_STATE(caddy->jdata, PRTE_JOB_STATE_DAEMONS_REPORTED);
197204
} else {
198205
/* move the state machine along */
199206
caddy->jdata->state = PRTE_JOB_STATE_ALLOCATION_COMPLETE;
@@ -1043,6 +1050,10 @@ void prte_plm_base_post_launch(int fd, short args, void *cbdata)
10431050
continue;
10441051
}
10451052
app = (prte_app_context_t*)pmix_pointer_array_get_item(jdata->apps, proc->app_idx);
1053+
if (NULL == app) {
1054+
// should never happen
1055+
continue;
1056+
}
10461057
fprintf(fp, "(rank, host, exe, pid) = (%u, %s, %s, %d)\n",
10471058
proc->name.rank, proc->node->name, app->app, proc->pid);
10481059
}

src/mca/ras/base/ras_base_allocate.c

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -306,7 +306,6 @@ void prte_ras_base_display_cpus(prte_job_t *jdata, char *nodelist)
306306
}
307307
if (0 == strcmp(nptr->name, nodes[j])) {
308308
display_cpus(nptr->topology, jdata, nodes[j]);
309-
moveon = true;
310309
break;
311310
}
312311
if (NULL == nptr->aliases) {

src/mca/ras/pbs/ras_pbs_module.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -195,6 +195,8 @@ static int discover(pmix_list_t *nodelist, char *pbs_jobid)
195195
if (prte_mca_ras_pbs_component.smp_mode) {
196196
/* this cannot happen in smp mode */
197197
pmix_show_help("help-ras-pbs.txt", "smp-multi", true);
198+
fclose(fp);
199+
free(hostname);
198200
return PRTE_ERR_BAD_PARAM;
199201
}
200202
++node->slots;

0 commit comments

Comments
 (0)