diff --git a/pkg/inventory/software/collector_darwin.go b/pkg/inventory/software/collector_darwin.go index f2275784aad83c..d2c9f1eb28e540 100644 --- a/pkg/inventory/software/collector_darwin.go +++ b/pkg/inventory/software/collector_darwin.go @@ -15,7 +15,6 @@ import ( "os/exec" "path/filepath" "regexp" - "strconv" "strings" "sync" "time" @@ -368,66 +367,6 @@ func getPublisherFromInfoPlist(bundlePath string) string { return getPublisherFromPlistData(plistData) } -// pkgInfo contains information about a package installation from pkgutil -type pkgInfo struct { - // PkgID is the package identifier (e.g., "com.microsoft.Word") - PkgID string - // Volume is the install volume (e.g., "/") - Volume string - // InstallTime is the installation timestamp - InstallTime string -} - -// getPkgInfo queries the macOS package receipt database to find which PKG installed -// a specific file or directory. This uses `pkgutil --file-info` which is the official -// way to link applications to their installer receipts. -// -// Parameters: -// - path: The path to query (e.g., "/Applications/Numbers.app") -// -// Returns: -// - *pkgInfo: Package information if the path was installed by a PKG, nil otherwise -// -// Note: Returns nil for apps installed via drag-and-drop (no PKG receipt) or -// Mac App Store apps (receipt stored inside the app bundle, not in pkgutil database). 
-func getPkgInfo(path string) *pkgInfo { - // Run pkgutil --file-info to query which package installed this path - cmd := exec.Command("pkgutil", "--file-info", path) - output, err := cmd.Output() - if err != nil { - // No package owns this path (drag-and-drop install or error) - return nil - } - - // Parse the output which looks like: - // volume: / - // path: Applications/Numbers.app - // pkgid: com.apple.pkg.Numbers - // pkg-version: 14.0 - // install-time: 1654432493 - info := &pkgInfo{} - for _, line := range strings.Split(string(output), "\n") { - line = strings.TrimSpace(line) - if strings.HasPrefix(line, "pkgid: ") { - info.PkgID = strings.TrimPrefix(line, "pkgid: ") - } else if strings.HasPrefix(line, "volume: ") { - info.Volume = strings.TrimPrefix(line, "volume: ") - } else if strings.HasPrefix(line, "install-time: ") { - // Convert Unix timestamp to ISO 8601 format for cross-platform consistency - timestampStr := strings.TrimPrefix(line, "install-time: ") - if unixTime, err := strconv.ParseInt(timestampStr, 10, 64); err == nil { - info.InstallTime = time.Unix(unixTime, 0).Format(time.RFC3339) - } - } - } - - // Only return if we found a package ID - if info.PkgID != "" { - return info - } - return nil -} - // entryWithPath pairs an Entry with its bundle path for parallel processing. // When plistData is non-nil, the worker uses it to compute publisher without reading the file. 
type entryWithPath struct { diff --git a/pkg/inventory/software/collector_darwin_apps.go b/pkg/inventory/software/collector_darwin_apps.go index bea169c1aba996..9b8f92b2ceba7c 100644 --- a/pkg/inventory/software/collector_darwin_apps.go +++ b/pkg/inventory/software/collector_darwin_apps.go @@ -13,7 +13,6 @@ import ( "path/filepath" "runtime" "strings" - "sync" "time" ) @@ -69,47 +68,26 @@ func getLocalUsers() ([]string, []*Warning) { return users, warnings } -// appPkgLookup holds info needed for parallel pkgutil lookup +// appPkgLookup holds info needed for PKG install source lookup type appPkgLookup struct { entry *Entry appPath string } -// populatePkgInfoParallel queries pkgutil for multiple apps in parallel -// Uses a worker pool to limit concurrent pkgutil processes -func populatePkgInfoParallel(items []appPkgLookup) { - const maxWorkers = 10 // Limit concurrent pkgutil processes - +// populatePkgInfoFromIndex looks up PKG install source for each app using +// the BOM-derived reverse index, avoiding per-app subprocess spawning. +func populatePkgInfoFromIndex(items []appPkgLookup) { if len(items) == 0 { return } - jobs := make(chan *appPkgLookup, len(items)) + idx := getGlobalAppToPkgIndex() for i := range items { - jobs <- &items[i] - } - close(jobs) - - var wg sync.WaitGroup - workerCount := maxWorkers - if len(items) < maxWorkers { - workerCount = len(items) - } - - for i := 0; i < workerCount; i++ { - wg.Add(1) - go func() { - defer wg.Done() - for item := range jobs { - if pkgInfo := getPkgInfo(item.appPath); pkgInfo != nil { - item.entry.InstallSource = installSourcePkg - item.entry.PkgID = pkgInfo.PkgID - } - } - }() + if pkgID := idx.lookupPkgForApp(items[i].appPath); pkgID != "" { + items[i].entry.InstallSource = installSourcePkg + items[i].entry.PkgID = pkgID + } } - - wg.Wait() } // Collect scans the /Applications directory recursively for installed applications. 
@@ -220,10 +198,8 @@ func (c *applicationsCollector) Collect() ([]*Entry, []*Warning, error) { if _, err := os.Stat(masReceiptPath); err == nil { source = softwareTypeMAS installSource = installSourceMAS - } else { - // Not a MAS app or system app - will need to check pkgutil later (in parallel) - needsPkgLookup = true } + needsPkgLookup = true } // Determine architecture @@ -267,9 +243,8 @@ func (c *applicationsCollector) Collect() ([]*Entry, []*Warning, error) { } } - // Populate PKG info in parallel for non-MAS apps - // This queries pkgutil --file-info to determine if the app was installed via PKG - populatePkgInfoParallel(itemsForPkgLookup) + // Populate PKG install source from BOM-derived reverse index (zero subprocesses) + populatePkgInfoFromIndex(itemsForPkgLookup) // Populate publisher info in parallel using Info.plist extraction populatePublishersParallel(itemsForPublisher) diff --git a/pkg/inventory/software/collector_darwin_pkg.go b/pkg/inventory/software/collector_darwin_pkg.go index e1707594ef14ae..1395201d8895fa 100644 --- a/pkg/inventory/software/collector_darwin_pkg.go +++ b/pkg/inventory/software/collector_darwin_pkg.go @@ -8,15 +8,19 @@ package software import ( + "bufio" + "context" + "fmt" "os" "os/exec" "path/filepath" "runtime" + "sort" "strings" "sync" "time" - "golang.org/x/sync/singleflight" + log "github.com/DataDog/datadog-agent/pkg/util/log" ) // pkgReceiptsCollector collects software from PKG installer receipts @@ -24,211 +28,257 @@ import ( // by the applicationsCollector (apps in /Applications), to avoid confusing duplicates. type pkgReceiptsCollector struct{} -// pkgFilesCacheEntry holds a cached file list with its timestamp for TTL checking -type pkgFilesCacheEntry struct { - Files []string - Timestamp time.Time +// pkgSummary stores compact derived facts from lsbom directory listing output. +// It intentionally avoids retaining full file lists to reduce memory usage. 
type pkgSummary struct {
	// HasApplicationsApp is true when pkg payload contains an app bundle under /Applications.
	// It is set for every payload line that isApplicationsAppPath accepts.
	HasApplicationsApp bool
	// HasNonAppPayload is true when pkg payload includes directories outside /Applications app bundles.
	// It is set for every payload line that isApplicationsAppPath rejects.
	HasNonAppPayload bool
	// TopLevelPaths stores deduplicated top-level install directories derived from pkg payload directory paths.
	// At most maxTopLevelPathsPerPkg distinct paths are collected per package.
	TopLevelPaths []string
}

const (
	// defaultBomCacheTTL is the ttl given to the global BOM cache; entries at
	// least this old are treated as missing and fetched again.
	defaultBomCacheTTL = 1 * time.Hour
	// defaultBomCacheMaxEntries is the maxEntries given to the global BOM cache;
	// once reached, the entry with the oldest timestamp is evicted per insert.
	defaultBomCacheMaxEntries = 512
	// lsbomBatchTimeout bounds the single `sh -c` process that runs lsbom for
	// a whole batch of BOM files.
	lsbomBatchTimeout = 60 * time.Second
	// lsbomSingleTimeout bounds one direct lsbom invocation.
	lsbomSingleTimeout = 30 * time.Second
	// lsbomScannerMaxTokenSize is the maximum line length the bufio.Scanner
	// reading lsbom output will accept.
	lsbomScannerMaxTokenSize = 2 * 1024 * 1024
	// maxTopLevelPathsPerPkg caps the number of distinct top-level paths
	// recorded for one package.
	maxTopLevelPathsPerPkg = 128
	// bomDelimiterPrefix and bomDelimiterSuffix wrap a BOM path in the marker
	// line that separates the output of consecutive lsbom runs in a batch.
	bomDelimiterPrefix = "===BOM:"
	bomDelimiterSuffix = "==="
)

// bomCacheEntry stores cached raw directory lines from lsbom for a single BOM file.
type bomCacheEntry struct {
	// Lines holds the lsbom output lines exactly as they were read.
	Lines []string
	// Timestamp is compared against the cache ttl to decide expiry and is
	// used to pick the oldest entry when evicting.
	Timestamp time.Time
}

// bomCache caches raw lsbom -sd output lines keyed by BOM file path.
// The summary (pkgSummary) is derived per-receipt from cached lines + prefixPath,
// so the same BOM data can serve receipts with different install prefixes.
+type bomCache struct { + mu sync.Mutex + entries map[string]*bomCacheEntry + ttl time.Duration + maxEntries int +} -// Global cache instance for pkgutil --files results var ( - globalPkgFilesCache *pkgFilesCache - globalCacheOnce sync.Once + globalBomCache *bomCache + globalBomCacheOnce sync.Once ) -// getGlobalPkgFilesCache returns the global singleton cache instance -// The cache persists across collection runs within the same process lifetime -func getGlobalPkgFilesCache() *pkgFilesCache { - globalCacheOnce.Do(func() { - globalPkgFilesCache = &pkgFilesCache{ - cache: make(map[string]*pkgFilesCacheEntry), - ttl: defaultPkgFilesCacheTTL, +func getGlobalBomCache() *bomCache { + globalBomCacheOnce.Do(func() { + globalBomCache = &bomCache{ + entries: make(map[string]*bomCacheEntry), + ttl: defaultBomCacheTTL, + maxEntries: defaultBomCacheMaxEntries, } }) - return globalPkgFilesCache + return globalBomCache } -// get retrieves cached file list for a package, or fetches it if not cached or expired -func (c *pkgFilesCache) get(pkgID string) []string { +// getBomLines returns cached lsbom lines for the given BOM paths. +// Uncached or expired entries are fetched in a single batched shell subprocess. 
+func (c *bomCache) getBomLines(bomPaths []string) map[string][]string { + c.mu.Lock() + defer c.mu.Unlock() + now := time.Now() + result := make(map[string][]string, len(bomPaths)) + var uncached []string - // Check cache with read lock - c.mu.RLock() - entry, ok := c.cache[pkgID] - if ok && entry != nil { - // Check if entry is still valid (not expired) - age := now.Sub(entry.Timestamp) - if age < c.ttl { - // Cache hit - entry is valid - files := entry.Files - c.mu.RUnlock() - return files - } - // Entry exists but is expired - will fetch new data below - } - c.mu.RUnlock() - - // Not in cache or expired: use singleflight so only one goroutine runs pkgutil per pkgID - v, err, _ := c.sfGroup.Do(pkgID, func() (interface{}, error) { - files := fetchPkgFiles(pkgID) - c.mu.Lock() - c.cache[pkgID] = &pkgFilesCacheEntry{ - Files: files, - Timestamp: time.Now(), - } - c.mu.Unlock() - return files, nil - }) - if err != nil { - return nil + for _, bp := range bomPaths { + if entry, ok := c.entries[bp]; ok && now.Sub(entry.Timestamp) < c.ttl { + result[bp] = entry.Lines + } else { + uncached = append(uncached, bp) + } } - return v.([]string) -} - -// prefetch fetches pkgutil --files for multiple packages in parallel -// Uses a worker pool to limit concurrent pkgutil processes -func (c *pkgFilesCache) prefetch(pkgIDs []string) { - const maxWorkers = 10 // Limit concurrent pkgutil processes - if len(pkgIDs) == 0 { - return + if len(uncached) == 0 { + return result } - // Create a channel for work items - jobs := make(chan string, len(pkgIDs)) - for _, pkgID := range pkgIDs { - jobs <- pkgID + fetched := batchLsbom(uncached) + + // Evict expired entries before inserting new ones + for key, entry := range c.entries { + if now.Sub(entry.Timestamp) >= c.ttl { + delete(c.entries, key) + } } - close(jobs) - // Start worker pool - var wg sync.WaitGroup - workerCount := maxWorkers - if len(pkgIDs) < maxWorkers { - workerCount = len(pkgIDs) + for _, bp := range uncached { + lines := 
fetched[bp] + if lines == nil { + lines = []string{} + } + + // Evict oldest if at capacity + if len(c.entries) >= c.maxEntries { + c.evictOldestLocked() + } + + c.entries[bp] = &bomCacheEntry{Lines: lines, Timestamp: now} + result[bp] = lines } - for i := 0; i < workerCount; i++ { - wg.Add(1) - go func() { - defer wg.Done() - for pkgID := range jobs { - c.get(pkgID) // This will fetch and cache if not already cached - } - }() + return result +} + +func (c *bomCache) evictOldestLocked() { + var oldestKey string + var oldestTime time.Time + first := true + for key, entry := range c.entries { + if first || entry.Timestamp.Before(oldestTime) { + oldestKey = key + oldestTime = entry.Timestamp + first = false + } + } + if !first { + delete(c.entries, oldestKey) } +} - wg.Wait() +// shellUnsafeChars contains characters that could enable command injection inside +// single-quoted shell strings. BOM paths containing any of these are handled via +// a dedicated exec.Command instead of the batched shell script. +const shellUnsafeChars = "'\"`$;|&(){}!\\#~<>?\n\r\x00" + +// isSafeForShell reports whether a path can be safely embedded in a single-quoted +// shell argument without risk of command injection. +func isSafeForShell(path string) bool { + return !strings.ContainsAny(path, shellUnsafeChars) } -// fetchPkgFiles runs pkgutil --files and returns the list of files -func fetchPkgFiles(pkgID string) []string { - cmd := exec.Command("pkgutil", "--files", pkgID) - output, err := cmd.Output() +// singleLsbom runs lsbom -sd for one BOM file using exec.Command directly, +// bypassing the shell. This is injection-safe for any filename. 
+func singleLsbom(bomPath string) []string { + ctx, cancel := context.WithTimeout(context.Background(), lsbomSingleTimeout) + defer cancel() + + cmd := exec.CommandContext(ctx, "lsbom", "-sd", bomPath) + stdout, err := cmd.StdoutPipe() if err != nil { - return nil + return []string{} + } + if err := cmd.Start(); err != nil { + return []string{} } - var files []string - for _, line := range strings.Split(string(output), "\n") { - line = strings.TrimSpace(line) - if line != "" { - files = append(files, line) - } + var lines []string + scanner := bufio.NewScanner(stdout) + scanner.Buffer(make([]byte, 0, 64*1024), lsbomScannerMaxTokenSize) + for scanner.Scan() { + lines = append(lines, scanner.Text()) + } + if err := cmd.Wait(); err != nil { + log.Warnf("lsbom failed for %s: %v", bomPath, err) } - return files + return lines } -// pkgInstalledAppFromCache checks if a package installed an application bundle -// using cached file list instead of calling pkgutil again -func pkgInstalledAppFromCache(files []string) bool { - for _, line := range files { - // Check if this line represents an .app bundle - // We look for .app in the path and verify it's a bundle (not just a file with .app in name) - if strings.Contains(line, ".app") { - // Get the first path component to check if it's the app bundle itself - // or if it's inside an Applications directory - parts := strings.Split(line, "/") - - // Case 1: Direct app bundle (InstallPrefixPath = "Applications") - // e.g., "Google Chrome.app" or "Google Chrome.app/Contents" - if strings.HasSuffix(parts[0], ".app") { - return true - } +// batchLsbomShell runs a single shell subprocess that invokes lsbom -sd for multiple +// BOM files, producing delimited output. Only safe paths should be passed here. 
+func batchLsbomShell(bomPaths []string) map[string][]string { + if len(bomPaths) == 0 { + return nil + } + + var script strings.Builder + for _, bp := range bomPaths { + fmt.Fprintf(&script, "printf '===BOM:%s===\\n'; lsbom -sd '%s' 2>/dev/null; ", bp, bp) + } + + ctx, cancel := context.WithTimeout(context.Background(), lsbomBatchTimeout) + defer cancel() - // Case 2: App inside Applications folder (InstallPrefixPath = "/") - // e.g., "Applications/Numbers.app" or "Applications/Numbers.app/Contents" - if len(parts) >= 2 && parts[0] == "Applications" && strings.HasSuffix(parts[1], ".app") { - return true + cmd := exec.CommandContext(ctx, "sh", "-c", script.String()) + stdout, err := cmd.StdoutPipe() + if err != nil { + return nil + } + if err := cmd.Start(); err != nil { + return nil + } + + result := make(map[string][]string, len(bomPaths)) + var currentBom string + scanner := bufio.NewScanner(stdout) + scanner.Buffer(make([]byte, 0, 64*1024), lsbomScannerMaxTokenSize) + for scanner.Scan() { + line := scanner.Text() + if strings.HasPrefix(line, bomDelimiterPrefix) && strings.HasSuffix(line, bomDelimiterSuffix) { + currentBom = line[len(bomDelimiterPrefix) : len(line)-len(bomDelimiterSuffix)] + if result[currentBom] == nil { + result[currentBom] = []string{} } + continue + } + if currentBom != "" { + result[currentBom] = append(result[currentBom], line) } } - return false + if err := cmd.Wait(); err != nil { + log.Warnf("batched lsbom shell failed: %v", err) + } + return result } -// isLikelyFile checks if a path looks like a file (not a directory). -// Files typically have extensions or are in known executable locations. 
-func isLikelyFile(path string) bool { - // Common file extensions - fileExtensions := []string{ - ".so", ".dylib", ".a", ".o", // Libraries - ".py", ".pyc", ".pyo", ".pyd", // Python - ".rb", ".pl", ".sh", ".bash", // Scripts - ".json", ".yaml", ".yml", ".xml", ".plist", // Config - ".txt", ".md", ".rst", ".html", ".css", ".js", // Text/Web - ".png", ".jpg", ".jpeg", ".gif", ".ico", ".icns", // Images - ".app", ".framework", ".bundle", ".kext", // macOS bundles - ".pkg", ".dmg", ".zip", ".tar", ".gz", // Archives - ".conf", ".cfg", ".ini", ".log", // Config/logs - ".h", ".c", ".cpp", ".m", ".swift", // Source - ".strings", ".nib", ".xib", ".storyboard", // macOS resources +// batchLsbom fetches lsbom -sd output for all given BOM paths. +// Safe paths are batched into a single shell subprocess for performance. +// Paths with shell metacharacters are handled via individual exec.Command calls. +func batchLsbom(bomPaths []string) map[string][]string { + if len(bomPaths) == 0 { + return nil } - // Check for file extension - for _, ext := range fileExtensions { - if strings.HasSuffix(path, ext) { - return true + var safePaths, unsafePaths []string + for _, bp := range bomPaths { + if isSafeForShell(bp) { + safePaths = append(safePaths, bp) + } else { + unsafePaths = append(unsafePaths, bp) } } - // Check if it's in a bin directory (executables often have no extension) - parts := strings.Split(path, "/") - for i, part := range parts { - if part == "bin" && i < len(parts)-1 { - // The item after "bin" is likely an executable - return true - } + result := batchLsbomShell(safePaths) + if result == nil { + result = make(map[string][]string, len(unsafePaths)) } - - // Check for common executable names without extensions - lastPart := parts[len(parts)-1] - if !strings.Contains(lastPart, ".") && len(lastPart) > 0 { - // Files in certain directories are likely files, not directories - for _, part := range parts { - if part == "bin" || part == "lib" || part == "share" || part == 
"include" { - return true - } - } + for _, bp := range unsafePaths { + result[bp] = singleLsbom(bp) } + return result +} - return false +// isApplicationsAppPath reports whether a pkg file path belongs to an app bundle in Applications. +func isApplicationsAppPath(path string) bool { + parts := strings.Split(path, "/") + if len(parts) == 0 { + return false + } + if strings.HasSuffix(parts[0], ".app") { + return true + } + return len(parts) >= 2 && parts[0] == "Applications" && strings.HasSuffix(parts[1], ".app") } -// getPkgTopLevelPathsFromCache extracts top-level directories from cached file list -func getPkgTopLevelPathsFromCache(files []string, prefixPath string) []string { - // Normalize prefix path for building absolute paths +// topLevelPathFromLine derives a representative top-level install path from a pkg payload line. +func topLevelPathFromLine(line, prefixPath string) string { + parts := strings.Split(line, "/") + if len(parts) == 0 { + return "" + } + var basePrefix string if prefixPath == "" || prefixPath == "/" { basePrefix = "" @@ -238,87 +288,108 @@ func getPkgTopLevelPathsFromCache(files []string, prefixPath string) []string { basePrefix = "/" + prefixPath } - // Collect file parent directories at appropriate depth - dirSet := make(map[string]bool) - - for _, line := range files { - // Only process files (items with extensions or in known file locations) - if !isLikelyFile(line) { - continue + var topLevelDir string + switch parts[0] { + case "usr": + if len(parts) >= 3 { + topLevelDir = "/" + parts[0] + "/" + parts[1] + "/" + parts[2] } - - // Get path components - parts := strings.Split(line, "/") - if len(parts) == 0 { - continue + case "Library": + if len(parts) >= 2 { + topLevelDir = "/" + parts[0] + "/" + parts[1] } + case "opt": + if len(parts) >= 2 { + topLevelDir = "/" + parts[0] + "/" + parts[1] + } + case "Applications": + if len(parts) >= 2 { + topLevelDir = "/" + parts[0] + "/" + parts[1] + } + case "System", "private", "var": + if 
len(parts) >= 3 { + topLevelDir = "/" + parts[0] + "/" + parts[1] + "/" + parts[2] + } else if len(parts) >= 2 { + topLevelDir = "/" + parts[0] + "/" + parts[1] + } + default: + if basePrefix != "" && basePrefix != "/" { + topLevelDir = basePrefix + "/" + parts[0] + } else if len(parts) >= 1 { + topLevelDir = "/" + parts[0] + } + } - // Determine the meaningful top-level directory based on path structure - // We want to capture the "application directory" level, not every nested dir - var topLevelDir string + if topLevelDir == "" || topLevelDir == "/" { + return "" + } + return strings.ReplaceAll(topLevelDir, "//", "/") +} - switch parts[0] { - case "usr": - // For /usr paths, capture at the 3rd level (e.g., /usr/local/bin, /usr/local/ykman) - if len(parts) >= 3 { - topLevelDir = "/" + parts[0] + "/" + parts[1] + "/" + parts[2] - } - case "Library": - // For /Library, capture at 2nd level (e.g., /Library/LaunchDaemons) - if len(parts) >= 2 { - topLevelDir = "/" + parts[0] + "/" + parts[1] - } - case "opt": - // For /opt, capture the application directory (e.g., /opt/datadog-agent) - if len(parts) >= 2 { - topLevelDir = "/" + parts[0] + "/" + parts[1] - } - case "Applications": - // For /Applications, capture the app bundle (e.g., /Applications/Chrome.app) - if len(parts) >= 2 { - topLevelDir = "/" + parts[0] + "/" + parts[1] - } - case "System", "private", "var": - // For system paths, capture at 3rd level - if len(parts) >= 3 { - topLevelDir = "/" + parts[0] + "/" + parts[1] + "/" + parts[2] - } else if len(parts) >= 2 { - topLevelDir = "/" + parts[0] + "/" + parts[1] - } - default: - // For paths with a prefix (e.g., "Applications" prefix), combine with first component - if basePrefix != "" && basePrefix != "/" { - topLevelDir = basePrefix + "/" + parts[0] - } else if len(parts) >= 1 { - topLevelDir = "/" + parts[0] - } - } +// updatePkgSummaryFromLine updates summary flags and top-level path set from one directory line. 
+// Input lines come from lsbom -sd, so every line is a directory path prefixed with "./". +func updatePkgSummaryFromLine(summary *pkgSummary, topLevelSet map[string]struct{}, line, prefixPath string) { + line = strings.TrimSpace(line) + line = strings.TrimPrefix(line, "./") + if line == "" || line == "." { + return + } - // Clean up and add to set - if topLevelDir != "" && topLevelDir != "/" { - topLevelDir = strings.ReplaceAll(topLevelDir, "//", "/") - dirSet[topLevelDir] = true - } + appPath := isApplicationsAppPath(line) + if appPath { + summary.HasApplicationsApp = true + } else { + summary.HasNonAppPayload = true } - // Convert map to sorted slice - paths := make([]string, 0, len(dirSet)) - for path := range dirSet { - paths = append(paths, path) + topLevelDir := topLevelPathFromLine(line, prefixPath) + if topLevelDir == "" { + return + } + if _, exists := topLevelSet[topLevelDir]; exists || len(topLevelSet) < maxTopLevelPathsPerPkg { + topLevelSet[topLevelDir] = struct{}{} } +} - // Sort for consistent output - if len(paths) > 1 { - for i := 0; i < len(paths)-1; i++ { - for j := i + 1; j < len(paths); j++ { - if paths[i] > paths[j] { - paths[i], paths[j] = paths[j], paths[i] - } - } - } +// buildPkgSummaryFromLines builds a compact package summary from lsbom -sd output lines. +func buildPkgSummaryFromLines(lines []string, prefixPath string) pkgSummary { + summary := pkgSummary{} + topLevelSet := make(map[string]struct{}) + for _, line := range lines { + updatePkgSummaryFromLine(&summary, topLevelSet, line, prefixPath) } + summary.TopLevelPaths = sortedPathsFromSet(topLevelSet) + return summary +} - return paths +// sortedPathsFromSet converts a path set into a lexicographically sorted slice. 
+func sortedPathsFromSet(paths map[string]struct{}) []string { + if len(paths) == 0 { + return nil + } + out := make([]string, 0, len(paths)) + for path := range paths { + out = append(out, path) + } + sort.Strings(out) + return out +} + +// shouldSkipPkgFromSummary applies baseline pkg suppression semantics for app-backed software. +func shouldSkipPkgFromSummary(summary pkgSummary) bool { + return summary.HasApplicationsApp +} + +// filterGenericSystemPaths removes overly generic install roots from summarized path output. +func filterGenericSystemPaths(paths []string) []string { + filtered := make([]string, 0, len(paths)) + for _, path := range paths { + if path == "/etc" || path == "/var" || path == "/tmp" || path == "/System" { + continue + } + filtered = append(filtered, path) + } + return filtered } // pkgReceiptInfo holds parsed info from a PKG receipt plist @@ -327,6 +398,199 @@ type pkgReceiptInfo struct { version string installDate string prefixPath string + bomPath string +} + +// buildEntryFromReceipt builds a software entry for one receipt using a pre-computed summary. +// Returns nil when the receipt should be skipped by representation rules. 
+func buildEntryFromReceipt(receipt pkgReceiptInfo, summary pkgSummary, is64Bit bool) *Entry { + if shouldSkipPkgFromSummary(summary) { + return nil + } + + var installPath string + if receipt.prefixPath != "" && receipt.prefixPath != "/" { + if !strings.HasPrefix(receipt.prefixPath, "/") { + installPath = "/" + receipt.prefixPath + } else { + installPath = receipt.prefixPath + } + } else { + installPath = "N/A" + } + + installPaths := filterGenericSystemPaths(summary.TopLevelPaths) + + if installPath != "N/A" && len(installPaths) > 0 { + hasPathsOutside := false + installPathWithSlash := installPath + "/" + for _, p := range installPaths { + if !strings.HasPrefix(p, installPathWithSlash) && p != installPath { + hasPathsOutside = true + break + } + } + if !hasPathsOutside { + installPaths = nil + } + } else if installPath == "N/A" && len(installPaths) > 0 { + if len(installPaths) == 1 { + installPath = installPaths[0] + installPaths = nil + } else { + installPath = "" + } + } + + status := statusInstalled + var brokenReason string + if installPath != "" && installPath != "N/A" { + if _, err := os.Stat(installPath); os.IsNotExist(err) { + status = statusBroken + brokenReason = "install path not found: " + installPath + } + } else if len(installPaths) > 0 { + for _, p := range installPaths { + if _, err := os.Stat(p); os.IsNotExist(err) { + status = statusBroken + brokenReason = "install path not found: " + p + break + } + } + } + + return &Entry{ + DisplayName: receipt.packageID, + Version: receipt.version, + InstallDate: receipt.installDate, + Source: softwareTypePkg, + ProductCode: receipt.packageID, + Status: status, + BrokenReason: brokenReason, + Is64Bit: is64Bit, + InstallPath: installPath, + InstallPaths: installPaths, + } +} + +// appToPkgIndex maps absolute app paths (e.g., "/Applications/zoom.us.app") to the +// package identifier that installed them, derived from BOM data. 
This replaces the +// expensive per-app `pkgutil --file-info` subprocess calls in applicationsCollector. +type appToPkgIndex struct { + mu sync.Mutex + index map[string]string // appPath → pkgID + built bool +} + +var ( + globalAppToPkgIndex *appToPkgIndex + globalAppToPkgIndexOnce sync.Once +) + +func getGlobalAppToPkgIndex() *appToPkgIndex { + globalAppToPkgIndexOnce.Do(func() { + globalAppToPkgIndex = &appToPkgIndex{} + }) + return globalAppToPkgIndex +} + +// lookupPkgForApp returns the package ID that installed the given app path, or "" if unknown. +// On first call, it reads all PKG receipts and BOM data to build the reverse index. +func (idx *appToPkgIndex) lookupPkgForApp(appPath string) string { + idx.mu.Lock() + defer idx.mu.Unlock() + + if !idx.built { + idx.index = buildAppToPkgMap() + idx.built = true + } + return idx.index[appPath] +} + +// buildAppToPkgMap reads all PKG receipts and their BOM data (via the global cache) +// to build a mapping from absolute app paths to package IDs. 
+func buildAppToPkgMap() map[string]string { + receiptsDir := "/var/db/receipts" + dirEntries, err := os.ReadDir(receiptsDir) + if err != nil { + return nil + } + + type receiptBom struct { + pkgID string + prefixPath string + bomPath string + } + + var items []receiptBom + bomPathSet := make(map[string]bool) + + for _, de := range dirEntries { + if !strings.HasSuffix(de.Name(), ".plist") { + continue + } + plistData, err := readPlistFile(filepath.Join(receiptsDir, de.Name())) + if err != nil { + continue + } + pkgID := plistData["PackageIdentifier"] + if pkgID == "" { + continue + } + prefixPath := plistData["InstallPrefixPath"] + if prefixPath == "" { + prefixPath = plistData["InstallLocation"] + } + bomPath := filepath.Join(receiptsDir, strings.TrimSuffix(de.Name(), ".plist")+".bom") + items = append(items, receiptBom{pkgID: pkgID, prefixPath: prefixPath, bomPath: bomPath}) + bomPathSet[bomPath] = true + } + + bomPaths := make([]string, 0, len(bomPathSet)) + for bp := range bomPathSet { + bomPaths = append(bomPaths, bp) + } + + cache := getGlobalBomCache() + bomLines := cache.getBomLines(bomPaths) + + result := make(map[string]string) + for _, item := range items { + lines := bomLines[item.bomPath] + prefix := item.prefixPath + if prefix == "" || prefix == "/" { + prefix = "" + } else if !strings.HasPrefix(prefix, "/") { + prefix = "/" + prefix + } + + for _, line := range lines { + line = strings.TrimSpace(line) + line = strings.TrimPrefix(line, "./") + if line == "" || line == "." 
{ + continue + } + + // Build absolute path and check if it's an .app in /Applications + var absPath string + if prefix == "" { + absPath = "/" + line + } else { + absPath = prefix + "/" + line + } + + if !strings.HasSuffix(absPath, ".app") { + continue + } + // Only index top-level .app bundles (not nested .app inside other .app) + dir := filepath.Dir(absPath) + if dir == "/Applications" || strings.HasPrefix(dir, "/Applications/") || + strings.HasPrefix(dir, "/Users/") { + result[absPath] = item.pkgID + } + } + } + return result } // Collect reads PKG installer receipts from /var/db/receipts @@ -348,16 +612,13 @@ func (c *pkgReceiptsCollector) Collect() ([]*Entry, []*Warning, error) { dirEntries, err := os.ReadDir(receiptsDir) if err != nil { - // Not an error if receipts directory doesn't exist if os.IsNotExist(err) { return entries, warnings, nil } return nil, nil, err } - // First pass: Read all receipt plists and collect package IDs var receipts []pkgReceiptInfo - var pkgIDsToFetch []string for _, dirEntry := range dirEntries { if !strings.HasSuffix(dirEntry.Name(), ".plist") { @@ -371,130 +632,53 @@ func (c *pkgReceiptsCollector) Collect() ([]*Entry, []*Warning, error) { continue } - // Get package identifier as both display name and product code packageID := plistData["PackageIdentifier"] if packageID == "" { continue } - // Skip Mac App Store receipts - these correspond to MAS apps which are - // already captured by applicationsCollector with richer metadata if strings.HasSuffix(packageID, "_MASReceipt") { continue } - // Get install prefix path from receipt prefixPath := plistData["InstallPrefixPath"] if prefixPath == "" { prefixPath = plistData["InstallLocation"] } + bomPath := filepath.Join(receiptsDir, strings.TrimSuffix(dirEntry.Name(), ".plist")+".bom") + receipts = append(receipts, pkgReceiptInfo{ packageID: packageID, version: plistData["PackageVersion"], installDate: plistData["InstallDate"], prefixPath: prefixPath, + bomPath: bomPath, }) - 
pkgIDsToFetch = append(pkgIDsToFetch, packageID) } - // Prefetch all pkgutil --files results in parallel - // Use global cache that persists across collection runs - cache := getGlobalPkgFilesCache() - cache.prefetch(pkgIDsToFetch) - - // Determine architecture is64Bit := runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64" - // Second pass: Process receipts using cached data - for _, receipt := range receipts { - files := cache.get(receipt.packageID) - - // Skip packages that installed applications to /Applications - // These are already captured by applicationsCollector - if pkgInstalledAppFromCache(files) { - continue - } - - // Determine install_path for backward compatibility - var installPath string - if receipt.prefixPath != "" && receipt.prefixPath != "/" { - if !strings.HasPrefix(receipt.prefixPath, "/") { - installPath = "/" + receipt.prefixPath - } else { - installPath = receipt.prefixPath - } - } else { - installPath = "N/A" - } - - // Get top-level installation directories from cached file list - installPaths := getPkgTopLevelPathsFromCache(files, receipt.prefixPath) - - // Filter out generic system directories - filteredPaths := make([]string, 0, len(installPaths)) - for _, p := range installPaths { - if p == "/etc" || p == "/var" || p == "/tmp" || p == "/System" { - continue - } - filteredPaths = append(filteredPaths, p) - } - installPaths = filteredPaths - - // Determine which path field(s) to include - if installPath != "N/A" && len(installPaths) > 0 { - hasPathsOutside := false - installPathWithSlash := installPath + "/" - for _, p := range installPaths { - if !strings.HasPrefix(p, installPathWithSlash) && p != installPath { - hasPathsOutside = true - break - } - } - if !hasPathsOutside { - installPaths = nil - } - } else if installPath == "N/A" && len(installPaths) > 0 { - if len(installPaths) == 1 { - installPath = installPaths[0] - installPaths = nil - } else { - installPath = "" - } + // Collect unique BOM paths + bomPaths := make([]string, 
0, len(receipts)) + seen := make(map[string]bool, len(receipts)) + for _, r := range receipts { + if !seen[r.bomPath] { + bomPaths = append(bomPaths, r.bomPath) + seen[r.bomPath] = true } + } - // Check if the installation location still exists - status := statusInstalled - var brokenReason string - if installPath != "" && installPath != "N/A" { - if _, err := os.Stat(installPath); os.IsNotExist(err) { - status = statusBroken - brokenReason = "install path not found: " + installPath - } - } else if len(installPaths) > 0 { - for _, p := range installPaths { - if _, err := os.Stat(p); os.IsNotExist(err) { - status = statusBroken - brokenReason = "install path not found: " + p - break - } - } - } + // Fetch all BOM data in one batch (cache hit = 0 subprocesses, miss = 1 subprocess) + cache := getGlobalBomCache() + bomLines := cache.getBomLines(bomPaths) - entry := &Entry{ - DisplayName: receipt.packageID, - Version: receipt.version, - InstallDate: receipt.installDate, - Source: softwareTypePkg, - ProductCode: receipt.packageID, - Status: status, - BrokenReason: brokenReason, - Is64Bit: is64Bit, - InstallPath: installPath, - InstallPaths: installPaths, + for _, receipt := range receipts { + lines := bomLines[receipt.bomPath] + summary := buildPkgSummaryFromLines(lines, receipt.prefixPath) + if entry := buildEntryFromReceipt(receipt, summary, is64Bit); entry != nil { + entries = append(entries, entry) } - - entries = append(entries, entry) } return entries, warnings, nil diff --git a/pkg/inventory/software/collector_darwin_pkg_test.go b/pkg/inventory/software/collector_darwin_pkg_test.go new file mode 100644 index 00000000000000..ab566fc5014982 --- /dev/null +++ b/pkg/inventory/software/collector_darwin_pkg_test.go @@ -0,0 +1,202 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). 
+// Copyright 2016-present Datadog, Inc. + +//go:build darwin + +package software + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBuildPkgSummaryFromLines_AppOnly(t *testing.T) { + lines := []string{ + "./Applications/Google Chrome.app", + "./Applications/Google Chrome.app/Contents", + "./Applications/Google Chrome.app/Contents/MacOS", + } + + summary := buildPkgSummaryFromLines(lines, "/") + + assert.True(t, summary.HasApplicationsApp) + assert.False(t, summary.HasNonAppPayload) + assert.Contains(t, summary.TopLevelPaths, "/Applications/Google Chrome.app") + assert.True(t, shouldSkipPkgFromSummary(summary), "app-only packages should be skipped") +} + +func TestBuildPkgSummaryFromLines_MixedPayloadSkipped(t *testing.T) { + lines := []string{ + "./Applications/Datadog Agent.app", + "./Applications/Datadog Agent.app/Contents", + "./opt/datadog-agent", + "./opt/datadog-agent/bin", + "./opt/datadog-agent/etc", + } + + summary := buildPkgSummaryFromLines(lines, "/") + + assert.True(t, summary.HasApplicationsApp) + assert.True(t, summary.HasNonAppPayload) + assert.Contains(t, summary.TopLevelPaths, "/Applications/Datadog Agent.app") + assert.Contains(t, summary.TopLevelPaths, "/opt/datadog-agent") + assert.True(t, shouldSkipPkgFromSummary(summary), "baseline semantics skip packages that include Applications app payload") +} + +func TestBuildPkgSummaryFromLines_NonAppOnlyKept(t *testing.T) { + lines := []string{ + "./opt/example", + "./opt/example/bin", + "./usr/local/bin", + } + + summary := buildPkgSummaryFromLines(lines, "/") + + assert.False(t, summary.HasApplicationsApp) + assert.True(t, summary.HasNonAppPayload) + assert.Contains(t, summary.TopLevelPaths, "/opt/example") + assert.Contains(t, summary.TopLevelPaths, "/usr/local/bin") + assert.False(t, shouldSkipPkgFromSummary(summary), "non-app packages should be kept") +} + +func TestBuildPkgSummaryFromLines_PrefixApplicationsApp(t 
*testing.T) { + lines := []string{ + "./Pages.app", + "./Pages.app/Contents", + } + + summary := buildPkgSummaryFromLines(lines, "Applications") + + assert.True(t, summary.HasApplicationsApp) + assert.False(t, summary.HasNonAppPayload) + assert.Contains(t, summary.TopLevelPaths, "/Applications/Pages.app") + assert.True(t, shouldSkipPkgFromSummary(summary), "applications-prefix app-only package should be skipped") +} + +func TestBomCache_HitsWithinTTL(t *testing.T) { + c := &bomCache{ + entries: make(map[string]*bomCacheEntry), + ttl: time.Hour, + maxEntries: 100, + } + + // Seed cache manually + c.entries["/var/db/receipts/com.example.bom"] = &bomCacheEntry{ + Lines: []string{"./opt/example", "./opt/example/bin"}, + Timestamp: time.Now(), + } + + result := c.getBomLines([]string{"/var/db/receipts/com.example.bom"}) + require.Len(t, result["/var/db/receipts/com.example.bom"], 2) + assert.Equal(t, "./opt/example", result["/var/db/receipts/com.example.bom"][0]) +} + +func TestBomCache_ExpiredEntryRefetches(t *testing.T) { + c := &bomCache{ + entries: make(map[string]*bomCacheEntry), + ttl: 5 * time.Millisecond, + maxEntries: 100, + } + + c.entries["/var/db/receipts/com.example.bom"] = &bomCacheEntry{ + Lines: []string{"./stale"}, + Timestamp: time.Now().Add(-time.Second), + } + + // After TTL, getBomLines should call batchLsbom for the expired key. + // Since the BOM file doesn't exist, we'll get empty lines back. 
+ result := c.getBomLines([]string{"/var/db/receipts/com.example.bom"}) + lines := result["/var/db/receipts/com.example.bom"] + assert.NotContains(t, lines, "./stale", "expired entry should not return stale data") +} + +func TestBomCache_EvictsWhenFull(t *testing.T) { + c := &bomCache{ + entries: make(map[string]*bomCacheEntry), + ttl: time.Hour, + maxEntries: 2, + } + + oldest := time.Now().Add(-10 * time.Minute) + c.entries["/bom/a"] = &bomCacheEntry{Lines: []string{}, Timestamp: oldest} + c.entries["/bom/b"] = &bomCacheEntry{Lines: []string{}, Timestamp: time.Now()} + + // Inserting a third should evict the oldest (/bom/a) + c.getBomLines([]string{"/bom/c"}) + + assert.LessOrEqual(t, len(c.entries), 2) + _, hasA := c.entries["/bom/a"] + assert.False(t, hasA, "oldest entry should be evicted") +} + +func TestBatchLsbom_EmptyInput(t *testing.T) { + result := batchLsbom(nil) + assert.Nil(t, result) + + result = batchLsbom([]string{}) + assert.Nil(t, result) +} + +func TestBuildEntryFromReceipt_SkipsAppPackage(t *testing.T) { + receipt := pkgReceiptInfo{ + packageID: "com.google.Chrome", + version: "1.0", + installDate: "2026-01-01", + prefixPath: "/", + bomPath: "/var/db/receipts/com.google.Chrome.bom", + } + summary := pkgSummary{ + HasApplicationsApp: true, + HasNonAppPayload: false, + TopLevelPaths: []string{"/Applications/Google Chrome.app"}, + } + + entry := buildEntryFromReceipt(receipt, summary, true) + assert.Nil(t, entry, "app packages should be skipped") +} + +func TestBuildEntryFromReceipt_KeepsNonAppPackage(t *testing.T) { + receipt := pkgReceiptInfo{ + packageID: "com.example.tool", + version: "2.0", + installDate: "2026-01-01", + prefixPath: "/", + bomPath: "/var/db/receipts/com.example.tool.bom", + } + summary := pkgSummary{ + HasApplicationsApp: false, + HasNonAppPayload: true, + TopLevelPaths: []string{"/opt/example"}, + } + + entry := buildEntryFromReceipt(receipt, summary, true) + assert.NotNil(t, entry) + assert.Equal(t, "com.example.tool", 
entry.DisplayName) + assert.Equal(t, "2.0", entry.Version) + assert.Equal(t, softwareTypePkg, entry.Source) +} + +func TestFilterGenericSystemPaths(t *testing.T) { + paths := []string{ + "/etc", + "/var", + "/tmp", + "/System", + "/opt/datadog-agent", + "/usr/local/bin", + } + + filtered := filterGenericSystemPaths(paths) + + assert.NotContains(t, filtered, "/etc") + assert.NotContains(t, filtered, "/var") + assert.NotContains(t, filtered, "/tmp") + assert.NotContains(t, filtered, "/System") + assert.Contains(t, filtered, "/opt/datadog-agent") + assert.Contains(t, filtered, "/usr/local/bin") +}