Skip to content

Commit 56abcc9

Browse files
authored
Enhance signal handling and improve error logging in script execution (#239)
1 parent 7d21b39 commit 56abcc9

File tree

4 files changed

+100
-38
lines changed

4 files changed

+100
-38
lines changed

internal/common/common.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ func (rc *ReportingCommand) Run() error {
9696
go func() {
9797
sig := <-sigChannel
9898
slog.Info("received signal", slog.String("signal", sig.String()))
99+
// when perfspect receives ctrl-c while in the shell, the shell makes sure to propogate the
100+
// signal to all our children. But when perfspect is run in the background or disowned and
101+
// then receives SIGINT, e.g., from a script, we need to send the signal to our children
102+
util.SignalChildren(syscall.SIGINT)
99103
}()
100104
// get the data we need to generate reports
101105
orderedTargetScriptOutputs, err := rc.retrieveScriptOutputs(localTempDir)

internal/report/table_defs.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2163,10 +2163,11 @@ func powerStatsTableValues(outputs map[string]script.ScriptOutput) []Field {
21632163
}
21642164

21652165
func gaudiStatsTableValues(outputs map[string]script.ScriptOutput) []Field {
2166-
// build fields to match CSV output from hl_smi tool
2167-
fields := []Field{}
21682166
// parse the CSV output
21692167
csvOutput := outputs[script.GaudiStatsScriptName].Stdout
2168+
if csvOutput == "" {
2169+
return []Field{}
2170+
}
21702171
r := csv.NewReader(strings.NewReader(csvOutput))
21712172
rows, err := r.ReadAll()
21722173
if err != nil {
@@ -2177,6 +2178,8 @@ func gaudiStatsTableValues(outputs map[string]script.ScriptOutput) []Field {
21772178
slog.Error("gaudi stats output is not in expected format")
21782179
return []Field{}
21792180
}
2181+
// build fields to match CSV output from hl_smi tool
2182+
fields := []Field{}
21802183
// first row is the header, extract field names
21812184
for _, fieldName := range rows[0] {
21822185
fields = append(fields, Field{Name: strings.TrimSpace(fieldName)})
@@ -2229,7 +2232,7 @@ func instructionMixTableValues(outputs map[string]script.ScriptOutput) []Field {
22292232
var interval int
22302233
lines := strings.Split(outputs[script.InstructionMixScriptName].Stdout, "\n")
22312234
if len(lines) < 4 {
2232-
slog.Error("no data found in instruction mix output")
2235+
slog.Warn("no data found in instruction mix output")
22332236
return []Field{}
22342237
}
22352238
// TIME

internal/script/script.go

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ func RunScripts(myTarget target.Target, scripts []ScriptDefinition, ignoreScript
130130
if len(parallelScripts) > 0 {
131131
// form one master script that calls all the parallel scripts in the background
132132
masterScriptName := "parallel_master.sh"
133-
masterScript, needsElevatedPrivileges := formMasterScript(myTarget, parallelScripts)
133+
masterScript, needsElevatedPrivileges := formMasterScript(myTarget.GetTempDirectory(), parallelScripts)
134134
// write master script to local file
135135
masterScriptPath := path.Join(localTempDirForTarget, masterScriptName)
136136
err = os.WriteFile(masterScriptPath, []byte(masterScript), 0644)
@@ -278,11 +278,10 @@ func scriptNameToFilename(name string) string {
278278

279279
// formMasterScript forms a master script that runs all parallel scripts in the background, waits for them to finish, then prints the output of each script.
280280
// Return values are the master script and a boolean indicating whether the master script requires elevated privileges.
281-
func formMasterScript(myTarget target.Target, parallelScripts []ScriptDefinition) (string, bool) {
281+
func formMasterScript(targetTempDirectory string, parallelScripts []ScriptDefinition) (string, bool) {
282282
// we write the stdout and stderr from each command to temporary files and save the PID of each command
283283
// in a variable named after the script
284284
var masterScript strings.Builder
285-
targetTempDirectory := myTarget.GetTempDirectory()
286285

287286
masterScript.WriteString("#!/bin/bash\n")
288287

@@ -304,11 +303,34 @@ func formMasterScript(myTarget target.Target, parallelScripts []ScriptDefinition
304303
// function to handle SIGINT
305304
masterScript.WriteString("\nhandle_sigint() {\n")
306305
for _, script := range parallelScripts {
307-
masterScript.WriteString(fmt.Sprintf("\tkill -SIGINT $%s_pid\n", sanitizeScriptName(script.Name)))
308-
if script.NeedsKill {
309-
// kill the command started by the script
310-
masterScript.WriteString(fmt.Sprintf("\tkill -SIGKILL $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name)))
311-
masterScript.WriteString(fmt.Sprintf("\t%s_exitcode=137\n", sanitizeScriptName(script.Name))) // 137 is the exit code for SIGKILL
306+
// send SIGINT to the child script, if it is still running
307+
masterScript.WriteString(fmt.Sprintf("\tif ps -p \"$%s_pid\" > /dev/null; then\n", sanitizeScriptName(script.Name)))
308+
masterScript.WriteString(fmt.Sprintf("\t\tkill -SIGINT $%s_pid\n", sanitizeScriptName(script.Name)))
309+
masterScript.WriteString("\tfi\n")
310+
if script.NeedsKill { // this is primarily used for scripts that start commands in the background, some of which (processwatch) doesn't respond to SIGINT as expected
311+
// if the *cmd.pid file exists, check if the process is still running
312+
masterScript.WriteString(fmt.Sprintf("\tif [ -f %s_cmd.pid ]; then\n", sanitizeScriptName(script.Name)))
313+
masterScript.WriteString(fmt.Sprintf("\t\tif ps -p $(cat %s_cmd.pid) > /dev/null; then\n", sanitizeScriptName(script.Name)))
314+
// send SIGINT to the background process first, then SIGKILL if it doesn't respond to SIGINT
315+
masterScript.WriteString(fmt.Sprintf("\t\t\tkill -SIGINT $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name)))
316+
// give the process a chance to respond to SIGINT
317+
masterScript.WriteString("\t\t\tsleep 0.5\n")
318+
// if the background process is still running, send SIGKILL
319+
masterScript.WriteString(fmt.Sprintf("\t\t\tif ps -p $(cat %s_cmd.pid) > /dev/null; then\n", sanitizeScriptName(script.Name)))
320+
masterScript.WriteString(fmt.Sprintf("\t\t\t\tkill -SIGKILL $(cat %s_cmd.pid)\n", sanitizeScriptName(script.Name)))
321+
masterScript.WriteString(fmt.Sprintf("\t\t\t\t%s_exitcode=137\n", sanitizeScriptName(script.Name))) // 137 is the exit code for SIGKILL
322+
masterScript.WriteString("\t\t\telse\n")
323+
// if the background process has exited, set the exit code to 0
324+
masterScript.WriteString(fmt.Sprintf("\t\t\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name)))
325+
masterScript.WriteString("\t\t\tfi\n")
326+
masterScript.WriteString("\t\telse\n")
327+
// if the script itself has exited, set the exit code to 0
328+
masterScript.WriteString(fmt.Sprintf("\t\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name)))
329+
masterScript.WriteString("\t\tfi\n")
330+
masterScript.WriteString("\telse\n")
331+
// if the *cmd.pid file doesn't exist, set the exit code to 1
332+
masterScript.WriteString(fmt.Sprintf("\t\t%s_exitcode=0\n", sanitizeScriptName(script.Name)))
333+
masterScript.WriteString("\tfi\n")
312334
} else {
313335
masterScript.WriteString(fmt.Sprintf("\twait \"$%s_pid\"\n", sanitizeScriptName(script.Name)))
314336
masterScript.WriteString(fmt.Sprintf("\t%s_exitcode=$?\n", sanitizeScriptName(script.Name)))
@@ -402,10 +424,15 @@ func parseMasterScriptOutput(masterScriptOutput string) (scriptOutputs []ScriptO
402424
}
403425
stdout = strings.Join(stdoutLines, "\n")
404426
stderr = strings.Join(stderrLines, "\n")
405-
exitCodeInt, err := strconv.Atoi(exitcode)
406-
if err != nil {
407-
slog.Error("error converting exit code to integer, setting to -100", slog.String("exitcode", exitcode), slog.String("error", err.Error()))
408-
exitCodeInt = -100
427+
exitCodeInt := -100
428+
if exitcode == "" {
429+
slog.Warn("exit code for script not set", slog.String("script", scriptName))
430+
} else {
431+
var err error
432+
exitCodeInt, err = strconv.Atoi(exitcode)
433+
if err != nil {
434+
slog.Warn("error converting exit code to integer", slog.String("exitcode", exitcode), slog.String("error", err.Error()), slog.String("script", scriptName))
435+
}
409436
}
410437
scriptOutputs = append(scriptOutputs, ScriptOutput{
411438
ScriptDefinition: ScriptDefinition{Name: scriptName},

internal/script/script_defs.go

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,8 @@ if [ -d "$cstate_dir" ]; then
302302
done
303303
else
304304
echo "C-state directory not found."
305-
fi`,
305+
fi
306+
`,
306307
},
307308
SpecTurboCoresScriptName: {
308309
Name: SpecTurboCoresScriptName,
@@ -495,8 +496,7 @@ rdmsr 0x1ad # MSR_TURBO_RATIO_LIMIT: Maximum Ratio Limit of Turbo Mode
495496
},
496497
ElcScriptName: {
497498
Name: ElcScriptName,
498-
ScriptTemplate: `
499-
# Script derived from bhs-power-mode script in Intel PCM repository
499+
ScriptTemplate: `# Script derived from bhs-power-mode script in Intel PCM repository
500500
# Run the pcm-tpmi command to determine I/O and compute dies
501501
output=$(pcm-tpmi 2 0x10 -d -b 26:26)
502502
@@ -560,7 +560,7 @@ for die in "${!die_types[@]}"; do
560560
fi
561561
done <<< "$output"
562562
done
563-
`,
563+
`,
564564
Architectures: []string{x86_64},
565565
Families: []string{"6"}, // Intel
566566
Models: []string{"173", "175"}, // GNR, SRF
@@ -654,7 +654,8 @@ echo "" # finish the line
654654
Name: ChaCountScriptName,
655655
ScriptTemplate: `rdmsr 0x396
656656
rdmsr 0x702
657-
rdmsr 0x2FFE`, // uncore client cha count, uncore cha count, uncore cha count spr
657+
rdmsr 0x2FFE
658+
`, // uncore client cha count, uncore cha count, uncore cha count spr
658659
Architectures: []string{x86_64},
659660
Families: []string{"6"}, // Intel
660661
Lkms: []string{"msr"},
@@ -707,7 +708,7 @@ rdmsr 0x2FFE`, // uncore client cha count, uncore cha count, uncore cha count sp
707708
echo -n "IRQ Balance: "
708709
pgrep irqbalance >/dev/null && echo "Enabled" || echo "Disabled"
709710
done
710-
`,
711+
`,
711712
Depends: []string{"lshw"},
712713
Superuser: true,
713714
},
@@ -756,13 +757,15 @@ do
756757
fi
757758
fi
758759
echo "$name|$model|$size|$mountpoint|$fstype|$rqsize|$minio|$fw|$addr|$numa|$curlinkspeed|$curlinkwidth|$maxlinkspeed|$maxlinkwidth"
759-
done`,
760+
done
761+
`,
760762
},
761763
HdparmScriptName: {
762764
Name: HdparmScriptName,
763765
ScriptTemplate: `lsblk -d -r -o NAME -e7 -e1 -n | while read -r device ; do
764766
hdparm -i /dev/"$device"
765-
done`,
767+
done
768+
`,
766769
Superuser: true,
767770
},
768771
DfScriptName: {
@@ -859,7 +862,8 @@ for i in "${pmu_counters[@]}"; do
859862
fi
860863
# print the full list of PMU values
861864
echo "Values: ${pmu_values[$i]}"
862-
done`,
865+
done
866+
`,
863867
Superuser: true,
864868
Architectures: []string{x86_64},
865869
Families: []string{"6"}, // Intel
@@ -897,7 +901,8 @@ if [ $needed_num_huge_pages -gt $orig_num_huge_pages ]; then
897901
echo $needed_num_huge_pages > /proc/sys/vm/nr_hugepages
898902
fi
899903
mlc --loaded_latency
900-
echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`,
904+
echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages
905+
`,
901906
Architectures: []string{x86_64},
902907
Superuser: true,
903908
Lkms: []string{"msr"},
@@ -917,7 +922,8 @@ if [ $needed_num_huge_pages -gt $orig_num_huge_pages ]; then
917922
echo $needed_num_huge_pages > /proc/sys/vm/nr_hugepages
918923
fi
919924
mlc --bandwidth_matrix
920-
echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`,
925+
echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages
926+
`,
921927
Architectures: []string{x86_64},
922928
Superuser: true,
923929
Lkms: []string{"msr"},
@@ -930,7 +936,8 @@ echo $orig_num_huge_pages > /proc/sys/vm/nr_hugepages`,
930936
for method in $methods; do
931937
printf "%s " "$method"
932938
stress-ng --cpu 0 -t 1 --cpu-method "$method" --metrics-brief 2>&1 | tail -1 | awk '{print $9}'
933-
done`,
939+
done
940+
`,
934941
Superuser: false,
935942
Depends: []string{"stress-ng"},
936943
Sequential: true,
@@ -1005,7 +1012,8 @@ interleaved_core_list=$(IFS=,; echo "${interleaved_cores[*]}")
10051012
num_cores_per_socket=$( lscpu | grep 'Core(s) per socket:' | head -1 | awk '{print $4}' )
10061013
10071014
# Run the avx-turbo benchmark
1008-
avx-turbo --min-threads=1 --max-threads=$num_cores_per_socket --test scalar_iadd,avx128_fma,avx256_fma,avx512_fma --iters=100000 --cpuids=$interleaved_core_list`,
1015+
avx-turbo --min-threads=1 --max-threads=$num_cores_per_socket --test scalar_iadd,avx128_fma,avx256_fma,avx512_fma --iters=100000 --cpuids=$interleaved_core_list
1016+
`,
10091017
Superuser: true,
10101018
Lkms: []string{"msr"},
10111019
Depends: []string{"avx-turbo"},
@@ -1079,7 +1087,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
10791087
fi
10801088
mpstat -u -T -I SCPU -P ALL $interval $count &
10811089
echo $! > {{.ScriptName}}_cmd.pid
1082-
wait`,
1090+
wait
1091+
`,
10831092
Superuser: true,
10841093
Lkms: []string{},
10851094
Depends: []string{"mpstat"},
@@ -1094,7 +1103,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
10941103
fi
10951104
S_TIME_FORMAT=ISO iostat -d -t $interval $count | sed '/^loop/d' &
10961105
echo $! > {{.ScriptName}}_cmd.pid
1097-
wait`,
1106+
wait
1107+
`,
10981108
Superuser: true,
10991109
Lkms: []string{},
11001110
Depends: []string{"iostat"},
@@ -1109,7 +1119,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
11091119
fi
11101120
sar -r $interval $count &
11111121
echo $! > {{.ScriptName}}_cmd.pid
1112-
wait`,
1122+
wait
1123+
`,
11131124
Superuser: true,
11141125
Lkms: []string{},
11151126
Depends: []string{"sar", "sadc"},
@@ -1124,7 +1135,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
11241135
fi
11251136
sar -n DEV $interval $count &
11261137
echo $! > {{.ScriptName}}_cmd.pid
1127-
wait`,
1138+
wait
1139+
`,
11281140
Superuser: true,
11291141
Lkms: []string{},
11301142
Depends: []string{"sar", "sadc"},
@@ -1140,7 +1152,8 @@ if [ $duration -ne 0 ] && [ $interval -ne 0 ]; then
11401152
fi
11411153
turbostat -S -s PkgWatt,RAMWatt -q -i $interval $count | awk '{ print strftime("%H:%M:%S"), $0 }' &
11421154
echo $! > {{.ScriptName}}_cmd.pid
1143-
wait`,
1155+
wait
1156+
`,
11441157
Superuser: true,
11451158
Lkms: []string{"msr"},
11461159
Depends: []string{"turbostat"},
@@ -1173,21 +1186,35 @@ done
11731186
11741187
processwatch -c $arg_sampling_rate $arg_pid $arg_interval $arg_count $arg_filter &
11751188
echo $! > {{.ScriptName}}_cmd.pid
1176-
wait`,
1189+
wait
1190+
`,
11771191
Superuser: true,
11781192
Lkms: []string{"msr"},
11791193
Depends: []string{"processwatch"},
11801194
NeedsKill: true,
11811195
},
11821196
GaudiStatsScriptName: {
11831197
Name: GaudiStatsScriptName,
1184-
ScriptTemplate: `hl-smi --query-aip=timestamp,name,temperature.aip,module_id,utilization.aip,memory.total,memory.free,memory.used,power.draw --format=csv,nounits -l {{.Interval}} &
1185-
echo $! > {{.ScriptName}}_cmd.pid
1186-
sleep {{.Duration}}
1187-
kill -SIGINT $(cat {{.ScriptName}}_cmd.pid)`,
1198+
ScriptTemplate: `
1199+
# if the hl-smi program is in the path
1200+
if command -v hl-smi &> /dev/null; then
1201+
hl-smi --query-aip=timestamp,name,temperature.aip,module_id,utilization.aip,memory.total,memory.free,memory.used,power.draw --format=csv,nounits -l {{.Interval}} &
1202+
echo $! > {{.ScriptName}}_cmd.pid
1203+
# if duration is set, sleep for the duration then kill the process
1204+
if [ {{.Duration}} -ne 0 ]; then
1205+
sleep {{.Duration}}
1206+
kill -SIGINT $(cat {{.ScriptName}}_cmd.pid)
1207+
fi
1208+
wait
1209+
else
1210+
echo "hl-smi not found in the path" >&2
1211+
exit 1
1212+
fi
1213+
`,
11881214
Superuser: true,
11891215
NeedsKill: true,
11901216
},
1217+
// profile (flamegraph) scripts
11911218
ProfileJavaScriptName: {
11921219
Name: ProfileJavaScriptName,
11931220
ScriptTemplate: `interval={{.Interval}}
@@ -1287,6 +1314,7 @@ rm -f "$perf_fp_data" "$perf_dwarf_data" "$perf_dwarf_folded" "$perf_fp_folded"
12871314
Superuser: true,
12881315
Depends: []string{"perf", "stackcollapse-perf.pl"},
12891316
},
1317+
// lock analysis scripts
12901318
ProfileKernelLockScriptName: {
12911319
Name: ProfileKernelLockScriptName,
12921320
ScriptTemplate: `frequency={{.Frequency}}

0 commit comments

Comments
 (0)