@@ -34,7 +34,6 @@ import (
3434 "github.com/k8stopologyawareschedwg/podfingerprint"
3535
3636 "github.com/openshift-kni/debug-tools/pkg/pfpstatus/record"
37- "github.com/openshift-kni/numaresources-operator/internal/mustgather"
3837)
3938
4039//go:embed README.md
@@ -57,10 +56,6 @@ type diff struct {
5756 foundOnSchedOnly []podfingerprint.NamespacedName
5857}
5958
60- func isEmpty (a []record.RecordedStatus ) bool {
61- return len (a ) == 0
62- }
63-
6459func printPrettyString (sts []diff ) {
6560 var sb strings.Builder
6661
@@ -88,37 +83,26 @@ func printPrettyString(sts []diff) {
8883 fmt .Println (sb .String ())
8984}
9085
91- // refineSchedListToMap filters out synced fingerprints and in case of duplicates keeps the newest
92- // record based on the RecordTime. Returns a statuses map whose key is fingerprintExpected and
93- // whose value is the newest record .
94- func refineSchedListToMap (statuses []record.RecordedStatus ) map [string ]record.RecordedStatus {
86+ // refineListToMap filters out synced fingerprints where relevant and in case of duplicates keeps the newest
87+ // record based on the RecordTime. Returns a statuses map with the newest records, whose key is fingerprintExpected
88+ // for lists that have both fingerprints (i.e. from scheduler) and, key is FingerprintComputed otherwise .
89+ func refineListToMap (statuses []record.RecordedStatus ) map [string ]record.RecordedStatus {
9590 refined := make (map [string ]record.RecordedStatus , len (statuses ))
9691 for _ , s := range statuses {
9792 // we know that the last status report is the final one, but in this tool we don't
9893 // care about the last one only, instead we care more to see the trend of reported
9994 // pfp Statuses from the scheduler side so we skip the synced ones while continue
10095 // to process the others even if it is known that they eventually synced
101- if s .FingerprintExpected == s .FingerprintComputed {
96+ if s .FingerprintExpected != "" && s . FingerprintExpected == s .FingerprintComputed {
10297 continue
10398 }
10499
105- key := s .FingerprintExpected
106- if existing , ok := refined [key ]; ok {
107- if s .RecordTime .Before (existing .RecordTime ) {
108- continue
109- }
100+ key := s .FingerprintExpected // processing statuses list from scheduler
101+ if key == "" {
102+ // processing statuses list from RTE
103+ key = s .FingerprintComputed
110104 }
111- refined [key ] = s
112- }
113- return refined
114- }
115105
116- // refineRTEListToMap filters out duplicates and keeps the newest record based on the RecordTime.
117- // Returns a statuses map whose key is fingerprintComputed and whose value is the newest record.
118- func refineRTEListToMap (statuses []record.RecordedStatus ) map [string ]record.RecordedStatus {
119- refined := make (map [string ]record.RecordedStatus , len (statuses ))
120- for _ , s := range statuses {
121- key := s .FingerprintComputed
122106 if existing , ok := refined [key ]; ok {
123107 if s .RecordTime .Before (existing .RecordTime ) {
124108 continue
@@ -129,18 +113,6 @@ func refineRTEListToMap(statuses []record.RecordedStatus) map[string]record.Reco
129113 return refined
130114}
131115
132- // getDiff gets two lists of podfingerprint.NamespacedName `a` and `b` and returns two lists of podfingerprint.NamespacedName.
133- // The first list contains the elements of `a` that are not in `b`, and the second list contains the elements of `b` that are not in `a`.
134- func getDiff (a []podfingerprint.NamespacedName , b []podfingerprint.NamespacedName ) ([]podfingerprint.NamespacedName , []podfingerprint.NamespacedName ) {
135- aSet := sets .New [podfingerprint.NamespacedName ](a ... )
136- bSet := sets .New [podfingerprint.NamespacedName ](b ... )
137-
138- aOnly := aSet .Difference (bSet )
139- bOnly := bSet .Difference (aSet )
140-
141- return aOnly .UnsortedList (), bOnly .UnsortedList ()
142- }
143-
144116// getDifference gets two maps of record.RecordedStatus and returns a list of differences between
145117// the two with respect to the expected fingerprint and the pods used to compute it on each of RTE
146118// and Secondary-Scheduler. The comman factor between the two lists is the fingerprint that is reported
@@ -162,8 +134,10 @@ func getDifference(refinedSchedStatuses map[string]record.RecordedStatus, refine
162134 }
163135
164136 // no need to check for pod duplications, this cannot happen in real kubernetes
165-
166- foundOnRTEOnly , foundOnSchedOnly := getDiff (rteStatus .Pods , schedStatus .Pods )
137+ rteSet := sets .New [podfingerprint.NamespacedName ](rteStatus .Pods ... )
138+ schedSet := sets .New [podfingerprint.NamespacedName ](schedStatus .Pods ... )
139+ foundOnRTEOnly := rteSet .Difference (schedSet ).UnsortedList ()
140+ foundOnSchedOnly := schedSet .Difference (rteSet ).UnsortedList ()
167141
168142 if len (foundOnRTEOnly ) == 0 && len (foundOnSchedOnly ) == 0 {
169143 klog .InfoS ("scheduler is synced with RTE" ,
@@ -183,25 +157,25 @@ func getDifference(refinedSchedStatuses map[string]record.RecordedStatus, refine
183157 foundOnSchedOnly : foundOnSchedOnly ,
184158 }
185159 result = append (result , currentDiff )
186- klog .InfoS ("scheduler is off-sync with RTE" , "podfingerprint" , key )
187160 }
161+
188162 return result
189163}
190164
191165// verifyPFPSync verifies if the PFP statuses from RTE and scheduler are in sync, and prints out the difference between them if any.
192166func verifyPFPSync (fromRTE []record.RecordedStatus , fromSched []record.RecordedStatus ) error {
193- if isEmpty (fromRTE ) || isEmpty (fromSched ) { // should never happen
167+ if len (fromRTE ) == 0 || len (fromSched ) == 0 { // should never happen
194168 return fmt .Errorf ("sync check is skipped, recorded statuses are missing: RTEStatusListLength=%d, schedulerStatusListLength=%d" , len (fromRTE ), len (fromSched ))
195169 }
196170
197171 //make map from sched where the expectedPFP is the key, makes sure no duplication and uses the freshest report
198- refinedSchedStatuses := refineSchedListToMap (fromSched )
172+ refinedSchedStatuses := refineListToMap (fromSched )
199173 if len (refinedSchedStatuses ) == 0 {
200174 klog .InfoS ("all scheduler PFP status trace is synced with RTE!" )
201175 return nil
202176 }
203177
204- refinedRTEStatuses := refineRTEListToMap (fromRTE )
178+ refinedRTEStatuses := refineListToMap (fromRTE )
205179
206180 diffStatuses := getDifference (refinedSchedStatuses , refinedRTEStatuses )
207181 if len (diffStatuses ) == 0 {
@@ -264,14 +238,71 @@ func singlePairPFPSyncCheck(rteFilePath string, schedulerFilePath string) error
264238 return nil
265239}
266240
241+ func findFiles (mustgatherDir string , suffix string ) (map [string ]string , error ) {
242+ foundDir , err := getDirPath (mustgatherDir , suffix )
243+ if err != nil {
244+ return map [string ]string {}, err
245+ }
246+
247+ filesMap , err := getFilesPaths (foundDir )
248+ if err != nil {
249+ return map [string ]string {}, err
250+ }
251+
252+ return filesMap , nil
253+ }
254+
255+ func getDirPath (mustgatherDir string , dirSuffix string ) (string , error ) {
256+ foundPaths := []string {}
257+ err := filepath .Walk (mustgatherDir , func (path string , info os.FileInfo , err error ) error {
258+ if info .IsDir () && strings .HasSuffix (path , dirSuffix ) {
259+ foundPaths = append (foundPaths , path )
260+ }
261+ return nil
262+ })
263+
264+ if err != nil {
265+ return "" , fmt .Errorf ("failed to find directory %s: %v" , dirSuffix , err )
266+ }
267+
268+ if len (foundPaths ) == 0 {
269+ return "" , fmt .Errorf ("no directory found ending with %s" , dirSuffix )
270+ }
271+
272+ if len (foundPaths ) > 1 {
273+ return "" , fmt .Errorf ("multiple directories found ending with %s" , dirSuffix )
274+ }
275+
276+ return foundPaths [0 ], nil
277+ }
278+
279+ func getFilesPaths (mustgatherDir string ) (map [string ]string , error ) {
280+ foundFiles := map [string ]string {}
281+ mustgatherDir = filepath .Clean (mustgatherDir )
282+
283+ content , err := os .ReadDir (mustgatherDir )
284+ if err != nil {
285+ return foundFiles , fmt .Errorf ("failed to read directory: %v" , err )
286+ }
287+ for _ , content := range content {
288+ if content .IsDir () {
289+ continue
290+ }
291+
292+ foundFiles [content .Name ()] = filepath .Join (mustgatherDir , content .Name ())
293+ }
294+
295+ return foundFiles , nil
296+ }
297+
267298func mustGatherPFPSyncCheck (mustGatherDirPath string ) error {
268- schedNodeFiles , err := mustgather . FindFilesUnderDir (mustGatherDirPath , "/pfpstatus/scheduler" )
299+ schedNodeFiles , err := findFiles (mustGatherDirPath , "/pfpstatus/scheduler" )
269300 if err != nil {
270301 klog .InfoS ("failed to find scheduler node files; skipping sync check" , "error" , err )
271302 return nil
272303 }
273304
274- rteNodeFiles , err := mustgather . FindFilesUnderDir (mustGatherDirPath , "/pfpstatus/resource-topology-exporters" )
305+ rteNodeFiles , err := findFiles (mustGatherDirPath , "/pfpstatus/resource-topology-exporters" )
275306 if err != nil {
276307 klog .InfoS ("failed to find RTE node files; skipping sync check" , "error" , err )
277308 return nil
0 commit comments