Skip to content

Commit ce4ecb4

Browse files
authored
Merge pull request #134 from intel/gcpc4
support for GCP C4 instances
2 parents fa54070 + 4da3a94 commit ce4ecb4

File tree

6 files changed

+155
-59
lines changed

6 files changed

+155
-59
lines changed

cmd/metrics/event_defs.go

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ func LoadEventGroups(eventDefinitionOverridePath string, metadata Metadata) (gro
4141
uarch := strings.ToLower(strings.Split(metadata.Microarchitecture, "_")[0])
4242
// use alternate events/metrics when TMA fixed counters are not supported
4343
alternate := ""
44-
if (uarch == "icx" || uarch == "spr" || uarch == "emr") && !metadata.SupportsFixedTMA {
44+
if (uarch == "icx" || uarch == "spr" || uarch == "emr") && !metadata.SupportsFixedTMA { // AWS VM instances
4545
alternate = "_nofixedtma"
4646
}
4747
eventFileName := fmt.Sprintf("%s%s.txt", uarch, alternate)
@@ -132,20 +132,32 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool {
132132
slog.Debug("Fixed counter TMA not supported on target", slog.String("event", event.Name))
133133
return false
134134
}
135-
// short-circuit for cpu events
136-
if event.Device == "cpu" && !strings.HasPrefix(event.Name, "OCR") {
135+
// PEBS events (not supported on GCP c4 VMs)
136+
pebsEventNames := []string{"INT_MISC.UNKNOWN_BRANCH_CYCLES", "UOPS_RETIRED.MS"}
137+
if !metadata.SupportsPEBS && util.StringInList(event.Name, pebsEventNames) {
138+
slog.Debug("PEBS events not supported on target", slog.String("event", event.Name))
139+
return false
140+
}
141+
// short-circuit for cpu events that aren't off-core response events
142+
if event.Device == "cpu" && !(strings.HasPrefix(event.Name, "OCR") || strings.HasPrefix(event.Name, "OFFCORE_REQUESTS_OUTSTANDING")) {
137143
return true
138144
}
139-
// short-circuit off-core response events
140-
if event.Device == "cpu" &&
141-
strings.HasPrefix(event.Name, "OCR") &&
142-
metadata.SupportsUncore {
143-
if flagScope == scopeProcess || flagScope == scopeCgroup {
145+
// off-core response events
146+
if event.Device == "cpu" && (strings.HasPrefix(event.Name, "OCR") || strings.HasPrefix(event.Name, "OFFCORE_REQUESTS_OUTSTANDING")) {
147+
if !metadata.SupportsOCR {
148+
slog.Debug("Off-core response events not supported on target", slog.String("event", event.Name))
149+
return false
150+
} else if flagScope == scopeProcess || flagScope == scopeCgroup {
144151
slog.Debug("Off-core response events not supported in process or cgroup scope", slog.String("event", event.Name))
145152
return false
146153
}
147154
return true
148155
}
156+
// uncore events
157+
if !metadata.SupportsUncore && strings.HasPrefix(event.Name, "UNC") {
158+
slog.Debug("Uncore events not supported on target", slog.String("event", event.Name))
159+
return false
160+
}
149161
// exclude uncore events when
150162
// - their corresponding device is not found
151163
// - not in system-wide collection scope
@@ -176,7 +188,6 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool {
176188
slog.Debug("ref-cycles not supported on target", slog.String("event", event.Name))
177189
return false
178190
}
179-
180191
// no cstate and power events when collecting at process or cgroup scope
181192
if (flagScope == scopeProcess || flagScope == scopeCgroup) &&
182193
(strings.Contains(event.Name, "cstate_") || strings.Contains(event.Name, "power/energy")) {

cmd/metrics/event_frame.go

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,13 @@ func parseEvents(rawEvents [][]byte, eventGroupDefinitions []GroupDefinition) (e
121121
for _, rawEvent := range rawEvents {
122122
var event Event
123123
if event, err = parseEventJSON(rawEvent); err != nil {
124-
err = fmt.Errorf("failed to parse perf event: %v", err)
125-
return
124+
if strings.Contains(err.Error(), "unrecognized event format") {
125+
slog.Error(err.Error(), slog.String("event", string(rawEvent)))
126+
return
127+
} else {
128+
slog.Warn(err.Error(), slog.String("event", string(rawEvent)))
129+
event.Value = math.NaN()
130+
}
126131
}
127132
if event.Event != previousEvent {
128133
eventIdx++
@@ -347,10 +352,17 @@ func parseEventJSON(rawEvent []byte) (event Event, err error) {
347352
err = fmt.Errorf("unrecognized event format: \"%s\"", rawEvent)
348353
return
349354
}
355+
if event.CounterValue == "<not supported>" {
356+
err = fmt.Errorf("event not supported: \"%s\"", rawEvent)
357+
return
358+
}
359+
if event.CounterValue == "<not counted>" {
360+
err = fmt.Errorf("event not counted: \"%s\"", rawEvent)
361+
return
362+
}
350363
if event.Value, err = strconv.ParseFloat(event.CounterValue, 64); err != nil {
351-
event.Value = math.NaN()
352-
err = nil
353-
slog.Debug("failed to parse event value", slog.String("event", string(rawEvent)))
364+
err = fmt.Errorf("failed to parse event value as float: \"%s\"", rawEvent)
365+
return
354366
}
355367
return
356368
}

cmd/metrics/metadata.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ type Metadata struct {
3939
SupportsFixedTMA bool
4040
SupportsRefCycles bool
4141
SupportsUncore bool
42+
SupportsPEBS bool
43+
SupportsOCR bool
4244
ThreadsPerCore int
4345
TSC int
4446
TSCFrequencyHz int
@@ -161,13 +163,41 @@ func LoadMetadata(myTarget target.Target, noRoot bool, perfPath string, localTem
161163
}
162164
slowFuncChannel <- err
163165
}()
166+
// PEBS
167+
go func() {
168+
var err error
169+
var output string
170+
if metadata.SupportsPEBS, output, err = getSupportsPEBS(myTarget, noRoot, perfPath, localTempDir); err != nil {
171+
err = fmt.Errorf("failed to determine if 'PEBS' is supported: %v", err)
172+
} else {
173+
if !metadata.SupportsPEBS {
174+
slog.Warn("'PEBS' events not supported", slog.String("output", output))
175+
}
176+
}
177+
slowFuncChannel <- err
178+
}()
179+
// Offcore response
180+
go func() {
181+
var err error
182+
var output string
183+
if metadata.SupportsOCR, output, err = getSupportsOCR(myTarget, noRoot, perfPath, localTempDir); err != nil {
184+
err = fmt.Errorf("failed to determine if 'OCR' is supported: %v", err)
185+
} else {
186+
if !metadata.SupportsOCR {
187+
slog.Warn("'OCR' events not supported", slog.String("output", output))
188+
}
189+
}
190+
slowFuncChannel <- err
191+
}()
164192
defer func() {
165193
var errs []error
166194
errs = append(errs, <-slowFuncChannel)
167195
errs = append(errs, <-slowFuncChannel)
168196
errs = append(errs, <-slowFuncChannel)
169197
errs = append(errs, <-slowFuncChannel)
170198
errs = append(errs, <-slowFuncChannel)
199+
errs = append(errs, <-slowFuncChannel)
200+
errs = append(errs, <-slowFuncChannel)
171201
for _, errInside := range errs {
172202
if errInside != nil {
173203
slog.Error("error loading metadata", slog.String("error", errInside.Error()), slog.String("target", myTarget.GetName()))
@@ -218,6 +248,8 @@ func (md Metadata) String() string {
218248
"Fixed TMA slot supported: %t, "+
219249
"ref-cycles supported: %t, "+
220250
"Uncore supported: %t, "+
251+
"PEBS supported: %t, "+
252+
"OCR supported: %t, "+
221253
"PMU Driver version: %s, "+
222254
"Kernel version: %s, ",
223255
md.ModelName,
@@ -234,6 +266,8 @@ func (md Metadata) String() string {
234266
md.SupportsFixedTMA,
235267
md.SupportsRefCycles,
236268
md.SupportsUncore,
269+
md.SupportsPEBS,
270+
md.SupportsOCR,
237271
md.PMUDriverVersion,
238272
md.KernelVersion)
239273
for deviceName, deviceIds := range md.UncoreDeviceIDs {
@@ -355,6 +389,42 @@ func getSupportsRefCycles(myTarget target.Target, noRoot bool, perfPath string,
355389
return
356390
}
357391

392+
// getSupportsPEBS - checks if PEBS events are supported on the target.
393+
// On some VMs, e.g. GCP C4, PEBS events are not supported and perf returns '<not supported>'
394+
// Events that use MSR 0x3F7 are PEBS events. We use the INT_MISC.UNKNOWN_BRANCH_CYCLES event since
395+
// it is a PEBS event that is used in the EMR metrics.
396+
func getSupportsPEBS(myTarget target.Target, noRoot bool, perfPath string, localTempDir string) (supported bool, output string, err error) {
397+
scriptDef := script.ScriptDefinition{
398+
Name: "perf stat pebs",
399+
Script: perfPath + " stat -a -e cpu/event=0xad,umask=0x40,period=1000003,name='INT_MISC.UNKNOWN_BRANCH_CYCLES'/ sleep 1",
400+
Superuser: !noRoot,
401+
}
402+
scriptOutput, err := script.RunScript(myTarget, scriptDef, localTempDir)
403+
if err != nil {
404+
err = fmt.Errorf("failed to determine if pebs is supported: %s, %d, %v", scriptOutput.Stderr, scriptOutput.Exitcode, err)
405+
return
406+
}
407+
supported = !strings.Contains(scriptOutput.Stderr, "<not supported>")
408+
return
409+
}
410+
411+
// getSupportsOCR - checks if off-core response (OCR) events are supported on the target.
412+
// On some VMs, e.g. GCP C4, offcore response events are not supported and perf returns '<not supported>'
413+
func getSupportsOCR(myTarget target.Target, noRoot bool, perfPath string, localTempDir string) (supported bool, output string, err error) {
414+
scriptDef := script.ScriptDefinition{
415+
Name: "perf stat ocr",
416+
Script: perfPath + " stat -a -e cpu/event=0x2a,umask=0x01,offcore_rsp=0x104004477,name='OCR.READS_TO_CORE.LOCAL_DRAM'/ sleep 1",
417+
Superuser: !noRoot,
418+
}
419+
scriptOutput, err := script.RunScript(myTarget, scriptDef, localTempDir)
420+
if err != nil {
421+
err = fmt.Errorf("failed to determine if ocr is supported: %s, %d, %v", scriptOutput.Stderr, scriptOutput.Exitcode, err)
422+
return
423+
}
424+
supported = !strings.Contains(scriptOutput.Stderr, "<not supported>")
425+
return
426+
}
427+
358428
// getSupportsFixedTMA - checks if the fixed TMA counter events are
359429
// supported by perf.
360430
//

cmd/metrics/metric_defs.go

Lines changed: 44 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ type MetricDefinition struct {
3434
// definition file. When the override path argument is empty, the function will load metrics from
3535
// the file associated with the platform's architecture found in the provided metadata. When
3636
// a list of metric names is provided, only those metric definitions will be loaded.
37-
func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics []string, uncollectableEvents []string, metadata Metadata) (metrics []MetricDefinition, err error) {
37+
func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics []string, metadata Metadata) (metrics []MetricDefinition, err error) {
3838
var bytes []byte
3939
if metricDefinitionOverridePath != "" {
4040
if bytes, err = os.ReadFile(metricDefinitionOverridePath); err != nil {
@@ -56,20 +56,6 @@ func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics
5656
if err = json.Unmarshal(bytes, &metricsInFile); err != nil {
5757
return
5858
}
59-
// remove "metric_" prefix from metric names
60-
for i := range metricsInFile {
61-
metricsInFile[i].Name = strings.TrimPrefix(metricsInFile[i].Name, "metric_")
62-
}
63-
// remove metrics from list that use uncollectable events
64-
for _, uncollectableEvent := range uncollectableEvents {
65-
for i := 0; i < len(metricsInFile); i++ {
66-
if strings.Contains(metricsInFile[i].Expression, uncollectableEvent) {
67-
slog.Debug("removing metric that uses uncollectable event", slog.String("metric", metricsInFile[i].Name), slog.String("event", uncollectableEvent))
68-
metricsInFile = append(metricsInFile[:i], metricsInFile[i+1:]...)
69-
i--
70-
}
71-
}
72-
}
7359
// if a list of metric names provided, reduce list to match
7460
if len(selectedMetrics) > 0 {
7561
// confirm provided metric names are valid (included in metrics defined in file)
@@ -102,7 +88,7 @@ func LoadMetricDefinitions(metricDefinitionOverridePath string, selectedMetrics
10288
// ConfigureMetrics prepares metrics for use by the evaluator, by e.g., replacing
10389
// metric constants with known values and aligning metric variables to perf event
10490
// groups
105-
func ConfigureMetrics(metrics []MetricDefinition, evaluatorFunctions map[string]govaluate.ExpressionFunction, metadata Metadata) (err error) {
91+
func ConfigureMetrics(loadedMetrics []MetricDefinition, uncollectableEvents []string, evaluatorFunctions map[string]govaluate.ExpressionFunction, metadata Metadata) (metrics []MetricDefinition, err error) {
10692
// get constants as strings
10793
tscFreq := fmt.Sprintf("%f", float64(metadata.TSCFrequencyHz))
10894
tsc := fmt.Sprintf("%f", float64(metadata.TSC))
@@ -112,54 +98,70 @@ func ConfigureMetrics(metrics []MetricDefinition, evaluatorFunctions map[string]
11298
hyperThreadingOn := fmt.Sprintf("%t", metadata.ThreadsPerCore > 1)
11399
threadsPerCore := fmt.Sprintf("%f", float64(metadata.ThreadsPerCore))
114100
// configure each metric
115-
for metricIdx := range metrics {
101+
for metricIdx := range loadedMetrics {
102+
tmpMetric := loadedMetrics[metricIdx]
103+
// abbreviate event names in metric expressions to match abbreviations used in uncollectableEvents
104+
tmpMetric.Expression = abbreviateEventName(tmpMetric.Expression)
105+
tmpMetric.ExpressionTxn = abbreviateEventName(tmpMetric.ExpressionTxn)
106+
// skip metrics that use uncollectable events
107+
foundUncollectable := false
108+
for _, uncollectableEvent := range uncollectableEvents {
109+
if strings.Contains(tmpMetric.Expression, uncollectableEvent) {
110+
slog.Warn("removing metric that uses uncollectable event", slog.String("metric", tmpMetric.Name), slog.String("event", uncollectableEvent))
111+
foundUncollectable = true
112+
break
113+
}
114+
}
115+
if foundUncollectable {
116+
continue
117+
}
116118
// swap in per-txn metric definition if transaction rate is provided
117-
if flagTransactionRate != 0 && metrics[metricIdx].ExpressionTxn != "" {
118-
metrics[metricIdx].Expression = metrics[metricIdx].ExpressionTxn
119-
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate))
120-
metrics[metricIdx].Name = metrics[metricIdx].NameTxn
119+
if flagTransactionRate != 0 && tmpMetric.ExpressionTxn != "" {
120+
tmpMetric.Expression = tmpMetric.ExpressionTxn
121+
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TXN]", fmt.Sprintf("%f", flagTransactionRate))
122+
tmpMetric.Name = tmpMetric.NameTxn
121123
}
124+
// remove "metric_" prefix from metric names
125+
tmpMetric.Name = strings.TrimPrefix(tmpMetric.Name, "metric_")
122126
// transform if/else to ?/:
123127
var transformed string
124-
if transformed, err = transformConditional(metrics[metricIdx].Expression); err != nil {
128+
if transformed, err = transformConditional(tmpMetric.Expression); err != nil {
125129
return
126130
}
127-
if transformed != metrics[metricIdx].Expression {
128-
slog.Debug("transformed metric", slog.String("original", metrics[metricIdx].Name), slog.String("transformed", transformed))
129-
metrics[metricIdx].Expression = transformed
131+
if transformed != tmpMetric.Expression {
132+
slog.Debug("transformed metric", slog.String("original", tmpMetric.Name), slog.String("transformed", transformed))
133+
tmpMetric.Expression = transformed
130134
}
131135
// replace constants with their values
132-
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[SYSTEM_TSC_FREQ]", tscFreq)
133-
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[TSC]", tsc)
134-
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CORES_PER_SOCKET]", coresPerSocket)
135-
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CHAS_PER_SOCKET]", chasPerSocket)
136-
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[SOCKET_COUNT]", socketCount)
137-
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[HYPERTHREADING_ON]", hyperThreadingOn)
138-
metrics[metricIdx].Expression = strings.ReplaceAll(metrics[metricIdx].Expression, "[CONST_THREAD_COUNT]", threadsPerCore)
139-
// abbreviate event names
140-
metrics[metricIdx].Expression = abbreviateEventName(metrics[metricIdx].Expression)
141-
metrics[metricIdx].ExpressionTxn = abbreviateEventName(metrics[metricIdx].ExpressionTxn)
136+
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[SYSTEM_TSC_FREQ]", tscFreq)
137+
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[TSC]", tsc)
138+
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CORES_PER_SOCKET]", coresPerSocket)
139+
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CHAS_PER_SOCKET]", chasPerSocket)
140+
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[SOCKET_COUNT]", socketCount)
141+
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[HYPERTHREADING_ON]", hyperThreadingOn)
142+
tmpMetric.Expression = strings.ReplaceAll(tmpMetric.Expression, "[CONST_THREAD_COUNT]", threadsPerCore)
142143
// get a list of the variables in the expression
143-
metrics[metricIdx].Variables = make(map[string]int)
144+
tmpMetric.Variables = make(map[string]int)
144145
expressionIdx := 0
145146
for {
146-
startVar := strings.IndexRune(metrics[metricIdx].Expression[expressionIdx:], '[')
147+
startVar := strings.IndexRune(tmpMetric.Expression[expressionIdx:], '[')
147148
if startVar == -1 { // no more vars in this expression
148149
break
149150
}
150-
endVar := strings.IndexRune(metrics[metricIdx].Expression[expressionIdx:], ']')
151+
endVar := strings.IndexRune(tmpMetric.Expression[expressionIdx:], ']')
151152
if endVar == -1 {
152-
err = fmt.Errorf("didn't find end of variable indicator (]) in expression: %s", metrics[metricIdx].Expression[expressionIdx:])
153+
err = fmt.Errorf("didn't find end of variable indicator (]) in expression: %s", tmpMetric.Expression[expressionIdx:])
153154
return
154155
}
155156
// add the variable name to the map, set group index to -1 to indicate it has not yet been determined
156-
metrics[metricIdx].Variables[metrics[metricIdx].Expression[expressionIdx:][startVar+1:endVar]] = -1
157+
tmpMetric.Variables[tmpMetric.Expression[expressionIdx:][startVar+1:endVar]] = -1
157158
expressionIdx += endVar + 1
158159
}
159-
if metrics[metricIdx].Evaluable, err = govaluate.NewEvaluableExpressionWithFunctions(metrics[metricIdx].Expression, evaluatorFunctions); err != nil {
160-
slog.Error("failed to create evaluable expression for metric", slog.String("error", err.Error()), slog.String("metric name", metrics[metricIdx].Name), slog.String("metric expression", metrics[metricIdx].Expression))
160+
if tmpMetric.Evaluable, err = govaluate.NewEvaluableExpressionWithFunctions(tmpMetric.Expression, evaluatorFunctions); err != nil {
161+
slog.Error("failed to create evaluable expression for metric", slog.String("error", err.Error()), slog.String("metric name", tmpMetric.Name), slog.String("metric expression", tmpMetric.Expression))
161162
return
162163
}
164+
metrics = append(metrics, tmpMetric)
163165
}
164166
return
165167
}

cmd/metrics/metrics.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -883,15 +883,16 @@ func prepareMetrics(targetContext *targetContext, localTempDir string, channelEr
883883
return
884884
}
885885
// load metric definitions
886-
if targetContext.metricDefinitions, err = LoadMetricDefinitions(flagMetricFilePath, flagMetricsList, uncollectableEvents, targetContext.metadata); err != nil {
886+
var loadedMetrics []MetricDefinition
887+
if loadedMetrics, err = LoadMetricDefinitions(flagMetricFilePath, flagMetricsList, targetContext.metadata); err != nil {
887888
err = fmt.Errorf("failed to load metric definitions: %w", err)
888889
_ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error()))
889890
targetContext.err = err
890891
channelError <- targetError{target: myTarget, err: err}
891892
return
892893
}
893894
// configure metrics
894-
if err = ConfigureMetrics(targetContext.metricDefinitions, GetEvaluatorFunctions(), targetContext.metadata); err != nil {
895+
if targetContext.metricDefinitions, err = ConfigureMetrics(loadedMetrics, uncollectableEvents, GetEvaluatorFunctions(), targetContext.metadata); err != nil {
895896
err = fmt.Errorf("failed to configure metrics: %w", err)
896897
_ = statusUpdate(myTarget.GetName(), fmt.Sprintf("Error: %s", err.Error()))
897898
targetContext.err = err

cmd/metrics/summary.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ func (m *metricsFromCSV) getStats() (stats map[string]metricStats, err error) {
199199
sum := 0.0
200200
for _, row := range m.rows {
201201
val := row.metrics[metricName]
202-
if math.IsNaN(val) {
202+
if math.IsNaN(val) || math.IsInf(val, 0) {
203203
continue
204204
}
205205
if math.IsNaN(min) { // min was initialized to NaN

0 commit comments

Comments
 (0)