Skip to content

Commit f6f6373

Browse files
authored
collect and report Gaudi device telemetry (#217)
1 parent 20ea2ae commit f6f6373

File tree

4 files changed

+135
-3
lines changed

4 files changed

+135
-3
lines changed

cmd/telemetry/telemetry.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ var (
5858
flagMemory bool
5959
flagPower bool
6060
flagInstrMix bool
61+
flagGaudi bool
6162

6263
flagInstrMixPid int
6364
flagInstrMixFilter []string
@@ -77,6 +78,7 @@ const (
7778
flagMemoryName = "memory"
7879
flagPowerName = "power"
7980
flagInstrMixName = "instrmix"
81+
flagGaudiName = "gaudi"
8082

8183
flagInstrMixPidName = "instrmix-pid"
8284
flagInstrMixFilterName = "instrmix-filter"
@@ -93,6 +95,7 @@ var categories = []common.Category{
9395
{FlagName: flagNetworkName, FlagVar: &flagNetwork, DefaultValue: false, Help: "monitor network", TableNames: []string{report.NetworkStatsTableName}},
9496
{FlagName: flagMemoryName, FlagVar: &flagMemory, DefaultValue: false, Help: "monitor memory", TableNames: []string{report.MemoryStatsTableName}},
9597
{FlagName: flagPowerName, FlagVar: &flagPower, DefaultValue: false, Help: "monitor power", TableNames: []string{report.PowerStatsTableName}},
98+
{FlagName: flagGaudiName, FlagVar: &flagGaudi, DefaultValue: false, Help: "monitor gaudi", TableNames: []string{report.GaudiStatsTableName}},
9699
}
97100

98101
func init() {

internal/report/html.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1175,6 +1175,73 @@ func instructionMixTableHTMLRenderer(tableValues TableValues, targetname string)
11751175
return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig)
11761176
}
11771177

1178+
func renderGaudiStatsChart(tableValues TableValues, chartStatFieldName string, titleText string, yAxisText string, suggestedMax string) string {
1179+
data := [][]float64{}
1180+
datasetNames := []string{}
1181+
// timestamp is in the first field
1182+
// find the module_id field index
1183+
moduleIdFieldIdx, err := getFieldIndex("module_id", tableValues)
1184+
if err != nil {
1185+
slog.Error("no gaudi module_id field found")
1186+
return ""
1187+
}
1188+
// find the chartStatFieldName field index
1189+
chartStatFieldIndex, err := getFieldIndex(chartStatFieldName, tableValues)
1190+
if err != nil {
1191+
slog.Error("no gaudi chartStatFieldName field found")
1192+
return ""
1193+
}
1194+
// group the data points by module_id
1195+
moduleStat := make(map[string][]float64)
1196+
for i := 0; i < len(tableValues.Fields[0].Values); i++ {
1197+
moduleId := tableValues.Fields[moduleIdFieldIdx].Values[i]
1198+
val, err := strconv.ParseFloat(tableValues.Fields[chartStatFieldIndex].Values[i], 64)
1199+
if err != nil {
1200+
slog.Error("error parsing utilization", slog.String("error", err.Error()))
1201+
return ""
1202+
}
1203+
if _, ok := moduleStat[moduleId]; !ok {
1204+
moduleStat[moduleId] = []float64{}
1205+
}
1206+
moduleStat[moduleId] = append(moduleStat[moduleId], val)
1207+
}
1208+
// sort the module ids
1209+
var moduleIds []string
1210+
for moduleId := range moduleStat {
1211+
moduleIds = append(moduleIds, moduleId)
1212+
}
1213+
sort.Strings(moduleIds)
1214+
// build the data
1215+
for _, moduleId := range moduleIds {
1216+
if len(moduleStat[moduleId]) > 0 {
1217+
data = append(data, moduleStat[moduleId])
1218+
datasetNames = append(datasetNames, "module "+moduleId)
1219+
}
1220+
}
1221+
chartConfig := chartTemplateStruct{
1222+
ID: fmt.Sprintf("%s%d", tableValues.Name, rand.Intn(10000)),
1223+
XaxisText: "Time",
1224+
YaxisText: yAxisText,
1225+
TitleText: titleText,
1226+
DisplayTitle: "true",
1227+
DisplayLegend: "true",
1228+
AspectRatio: "2",
1229+
SuggestedMin: "0",
1230+
SuggestedMax: suggestedMax,
1231+
}
1232+
return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig)
1233+
}
1234+
1235+
func gaudiStatsTableHTMLRenderer(tableValues TableValues, targetName string) string {
1236+
out := ""
1237+
out += renderGaudiStatsChart(tableValues, "utilization.aip [%]", "Utilization", "% Utilization", "100")
1238+
out += renderGaudiStatsChart(tableValues, "memory.free [MiB]", "Memory Free", "Memory (MiB)", "0")
1239+
out += renderGaudiStatsChart(tableValues, "memory.used [MiB]", "Memory Used", "Memory (MiB)", "0")
1240+
out += renderGaudiStatsChart(tableValues, "power.draw [W]", "Power", "Watts", "0")
1241+
out += renderGaudiStatsChart(tableValues, "temperature.aip [C]", "Temperature", "Temperature (C)", "0")
1242+
return out
1243+
}
1244+
11781245
func codePathFrequencyTableHTMLRenderer(tableValues TableValues, targetName string) string {
11791246
out := `<style>
11801247

internal/report/table_defs.go

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,13 @@ const (
116116
// telemetry table names
117117
CPUUtilizationTableName = "CPU Utilization"
118118
AverageCPUUtilizationTableName = "Average CPU Utilization"
119+
InstructionMixTableName = "Instruction Mix"
119120
IRQRateTableName = "IRQ Rate"
120121
DriveStatsTableName = "Drive Stats"
121122
NetworkStatsTableName = "Network Stats"
122123
MemoryStatsTableName = "Memory Stats"
123124
PowerStatsTableName = "Power Stats"
124-
InstructionMixTableName = "Instruction Mix"
125+
GaudiStatsTableName = "Gaudi Stats"
125126
// config table names
126127
ConfigurationTableName = "Configuration"
127128
// flamegraph table names
@@ -707,8 +708,17 @@ var tableDefinitions = map[string]TableDefinition{
707708
script.InstructionMixScriptName,
708709
},
709710
FieldsFunc: instructionMixTableValues,
710-
HTMLTableRendererFunc: instructionMixTableHTMLRenderer,
711-
},
711+
HTMLTableRendererFunc: instructionMixTableHTMLRenderer},
712+
GaudiStatsTableName: {
713+
Name: GaudiStatsTableName,
714+
MenuLabel: GaudiStatsTableName,
715+
HasRows: true,
716+
ScriptNames: []string{
717+
script.GaudiStatsScriptName,
718+
},
719+
NoDataFound: "No Gaudi stats data found. Gaudi devices and the hl-smi tool must be installed on the target system to collect Gaudi stats.",
720+
FieldsFunc: gaudiStatsTableValues,
721+
HTMLTableRendererFunc: gaudiStatsTableHTMLRenderer},
712722
//
713723
// flamegraph tables
714724
//
@@ -2152,6 +2162,48 @@ func powerStatsTableValues(outputs map[string]script.ScriptOutput) []Field {
21522162
return fields
21532163
}
21542164

2165+
func gaudiStatsTableValues(outputs map[string]script.ScriptOutput) []Field {
2166+
// build fields to match CSV output from hl_smi tool
2167+
fields := []Field{}
2168+
// parse the CSV output
2169+
csvOutput := outputs[script.GaudiStatsScriptName].Stdout
2170+
r := csv.NewReader(strings.NewReader(csvOutput))
2171+
rows, err := r.ReadAll()
2172+
if err != nil {
2173+
slog.Error(err.Error())
2174+
return []Field{}
2175+
}
2176+
if len(rows) < 2 {
2177+
slog.Error("gaudi stats output is not in expected format")
2178+
return []Field{}
2179+
}
2180+
// first row is the header, extract field names
2181+
for _, fieldName := range rows[0] {
2182+
fields = append(fields, Field{Name: strings.TrimSpace(fieldName)})
2183+
}
2184+
// values start in 2nd row
2185+
for _, row := range rows[1:] {
2186+
for i := range fields {
2187+
// reformat the timestamp field to only include the time
2188+
if i == 0 {
2189+
// parse the timestamp field's value
2190+
rowTime, err := time.Parse("Mon Jan 2 15:04:05 MST 2006", row[i])
2191+
if err != nil {
2192+
err = fmt.Errorf("unable to parse Gaudi telemetry timestamp: %s", row[i])
2193+
slog.Error(err.Error())
2194+
return []Field{}
2195+
}
2196+
// reformat the timestamp field's value to include time only
2197+
timestamp := rowTime.Format("15:04:05")
2198+
fields[i].Values = append(fields[i].Values, timestamp)
2199+
} else {
2200+
fields[i].Values = append(fields[i].Values, strings.TrimSpace(row[i]))
2201+
}
2202+
}
2203+
}
2204+
return fields
2205+
}
2206+
21552207
func codePathFrequencyTableValues(outputs map[string]script.ScriptOutput) []Field {
21562208
fields := []Field{
21572209
{Name: "System Paths", Values: []string{systemFoldedFromOutput(outputs)}},

internal/script/script_defs.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ const (
113113
GaudiFirmwareScriptName = "gaudi firmware"
114114
GaudiNumaScriptName = "gaudi numa"
115115
InstructionMixScriptName = "instruction mix"
116+
GaudiStatsScriptName = "gaudi stats"
116117
)
117118

118119
const (
@@ -1163,6 +1164,15 @@ wait`,
11631164
Depends: []string{"processwatch"},
11641165
NeedsKill: true,
11651166
},
1167+
GaudiStatsScriptName: {
1168+
Name: GaudiStatsScriptName,
1169+
ScriptTemplate: `hl-smi --query-aip=timestamp,name,temperature.aip,module_id,utilization.aip,memory.total,memory.free,memory.used,power.draw --format=csv,nounits -l {{.Interval}} &
1170+
echo $! > {{.ScriptName}}_cmd.pid
1171+
sleep {{.Duration}}
1172+
kill -SIGINT $(cat {{.ScriptName}}_cmd.pid)`,
1173+
Superuser: true,
1174+
NeedsKill: true,
1175+
},
11661176
ProfileJavaScriptName: {
11671177
Name: ProfileJavaScriptName,
11681178
ScriptTemplate: `interval={{.Interval}}

0 commit comments

Comments
 (0)