From b188c9c9839f1061ae9dad09257d3993bbb67af5 Mon Sep 17 00:00:00 2001
From: mann1x <20623405+mann1x@users.noreply.github.com>
Date: Mon, 22 Apr 2024 20:08:51 +0200
Subject: [PATCH 01/12] CpuSet support for Windows

---
 common/common.cpp | 487 +++++++++++++++++++++++++++++++++++++++++++++-
 common/common.h   |  24 ++-
 ggml.c            |  47 +++++
 3 files changed, 550 insertions(+), 8 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index cf69535e2d1f5..c53749befb8ab 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -32,8 +32,13 @@
 #endif
 #include <locale>
 #include <windows.h>
+#include <bitset>
+#include <tlhelp32.h>
 #include <fcntl.h>
 #include <io.h>
+#include <algorithm>
+#include <memory>
+#include <vector>
 #else
 #include <sys/ioctl.h>
 #include <sys/stat.h>
@@ -72,8 +77,96 @@
 using json = nlohmann::ordered_json;
 
+#if defined(_WIN32)
+std::vector<CPU_SET_INFORMATION> cpuset;
+std::vector<CPU_SET_INFORMATION> cpuset_best;
+std::vector<CPU_SET_INFORMATION> cpuset_worst;
+
+bool cpuset_enable = false;
+bool cpuset_smt = false;
+
+int32_t numPhysicalCores = -1;
+int32_t PhysicalCores = std::thread::hardware_concurrency();
+
+//
+// CPUSET logging
+//
+
+#define CPUSET_DEBUG 1
+#if (CPUSET_DEBUG >= 1)
+#define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define CPUSET_PRINT_DEBUG(...)
+#endif
+
+int32_t get_pos_procMask(ULONG_PTR procMask) {
+    std::bitset<64> bMask = procMask;
+    int32_t thisPos = 0;
+    for (int32_t i = 0; i < 64; ++i) {
+        if (bMask[i] == 1) {
+            return i;
+        }
+    }
+    return thisPos;
+}
+
+int32_t get_count_procMask(ULONG_PTR procMask) {
+    std::bitset<64> bMask = procMask;
+    return bMask.count();
+}
+
+bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
+    return lhs.SchedulingClass > rhs.SchedulingClass;
+}
+
+bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
+    return lhs.SchedulingClass < rhs.SchedulingClass;
+}
+
+ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) {
+    std::bitset<64> bMask;
+    std::vector<CPU_SET_INFORMATION> _cpuset;
+    int32_t bVal = 0;
+    int32_t assigned_t = 0;
+    int32_t llcache = -1;
+
+    if (direction == BEST_CORES) {
+        _cpuset = cpuset_best;
+    } else {
+        _cpuset = cpuset_worst;
+    }
+    CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
+    for (auto index : _cpuset) {
+        bVal = 0;
+        if (index.LogicalProcessorIndex != 0 &&
+            ((cpuset_smt && index.Threads > 1) || !cpuset_smt) &&
+            index.EfficiencyClass == 0 &&
+            ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1)
+            ) {
+            if (lltraversal == 0) {
+                CPUSET_PRINT_DEBUG("cache for lltraversal %d pre llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+                llcache = index.LastLevelCacheIndex;
+                CPUSET_PRINT_DEBUG("cache for lltraversal %d pos llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+            }
+            bVal = 1;
+            assigned_t++;
+            CPUSET_PRINT_DEBUG("Assigned LogicalCoreIndex: %d lltraversal %d llcache %d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
+        }
+        bMask[index.LogicalProcessorIndex] = bVal;
+        CPUSET_PRINT_DEBUG("Index: %d b:%d smt=%d thrds=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads);
+        if (req_threads > 0) {
+            if (assigned_t >= req_threads) {
+                break;
+            }
+        }
+    }
+    return bMask.to_ullong();
+}
+#endif
+
 int32_t get_num_physical_cores() {
-#ifdef __linux__
+#ifdef __linux__ // __x86_64__ && __linux__
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@@ -90,7 +183,7 @@ int32_t get_num_physical_cores() {
     if (!siblings.empty()) {
         return static_cast<int32_t>(siblings.size());
     }
-#elif defined(__APPLE__) && defined(__MACH__)
+#elif defined(__APPLE__) && defined(__MACH__) // __APPLE__ && __MACH__
     int32_t num_physical_cores;
     size_t len = sizeof(num_physical_cores);
     int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
@@ -101,12 +194,148 @@ int32_t get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) // _WIN32
+    if (numPhysicalCores > 0) {
+        return numPhysicalCores;
+    }
+    unsigned int d_threads = std::thread::hardware_concurrency();
+
+    HMODULE h = GetModuleHandleW(L"kernel32.dll");
+    if (NULL != h) {
+        if (NULL != GetProcAddress(h, "GetSystemCpuSetInformation")){
+            CPUSET_PRINT_DEBUG("Windows SystemCpuSetInformation is available\n");
+            cpuset_enable = true;
+        }
+    }
+    numPhysicalCores = d_threads > 0 ? (d_threads <= 4 ? d_threads : d_threads / 2) : 4;
+    if (d_threads < 4 || d_threads > 64 || !cpuset_enable) {
+        return numPhysicalCores;
+    }
+    ULONG bufferSize;
+    ULONG bufferSizeLogical;
+    HANDLE curProc = GetCurrentProcess();
+
+    GetSystemCpuSetInformation(nullptr, 0, &bufferSize, curProc, 0);
+    GetLogicalProcessorInformation(nullptr, &bufferSizeLogical);
+
+    auto buffer = std::make_unique<uint8_t[]>(bufferSize);
+    auto bufferLogical = std::make_unique<uint8_t[]>(bufferSizeLogical);
+
+    if(!GetSystemCpuSetInformation(reinterpret_cast<PSYSTEM_CPU_SET_INFORMATION>(buffer.get()), bufferSize, &bufferSize, curProc, 0))
+    {
+
+        CPUSET_PRINT_DEBUG("Failure GetSystemCpuSetInformation, fallback\n");
+        cpuset_enable = false;
+        return numPhysicalCores;
+    }
+    uint8_t* cpuSetPtr = buffer.get();
+
+    GetLogicalProcessorInformation(reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION>(bufferLogical.get()), &bufferSizeLogical);
+    uint8_t* logicalPtr = bufferLogical.get();
+
+    uint32_t numLogicalCores = 0;
+
+    CPUSET_PRINT_DEBUG("\nCPUSET GetSystemCpuSetInformation:\n");
+
+    for (ULONG cpuSetSize = 0; cpuSetSize < bufferSize; )
+    {
+        auto nextCPUSet = reinterpret_cast<PSYSTEM_CPU_SET_INFORMATION>(cpuSetPtr);
+
+        if (nextCPUSet->Type == CPU_SET_INFORMATION_TYPE::CpuSetInformation)
+        {
+            CPU_SET_INFORMATION _cpuset;
+            _cpuset.LogicalProcessorIndex = nextCPUSet->CpuSet.LogicalProcessorIndex;
+            _cpuset.CoreIndex = nextCPUSet->CpuSet.CoreIndex;
+            _cpuset.Id = nextCPUSet->CpuSet.Id;
+            _cpuset.Group = nextCPUSet->CpuSet.Group;
+            _cpuset.LastLevelCacheIndex = nextCPUSet->CpuSet.LastLevelCacheIndex;
+            _cpuset.NumaNodeIndex = nextCPUSet->CpuSet.NumaNodeIndex;
+            _cpuset.EfficiencyClass = nextCPUSet->CpuSet.EfficiencyClass;
+            _cpuset.SchedulingClass = nextCPUSet->CpuSet.SchedulingClass;
+            cpuset.push_back(_cpuset);
+            numLogicalCores++;
+        }
+        // Should not happen but it's a fail safe; break, since continuing would never advance the pointer
+        if (numLogicalCores > d_threads) break;
+
+        cpuSetPtr += nextCPUSet->Size;
+        cpuSetSize += nextCPUSet->Size;
+    }
+
+    int32_t physicalCount = 0;
+    int32_t thisLogical = 0;
+    int32_t coreThreadsNum = 1;
+
+    for (ULONG logicalSize = 0; logicalSize < bufferSizeLogical; )
+    {
+        auto nextLogical = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION>(logicalPtr);
+
+
+        if (nextLogical->ProcessorCore.Flags == 1 && nextLogical->Cache.Associativity <= 2) {
+            switch (nextLogical->Relationship) {
+                case LOGICAL_PROCESSOR_RELATIONSHIP::RelationProcessorCore:
+                CPUSET_PRINT_DEBUG("Physical Count: %u\n", physicalCount);
+                CPUSET_PRINT_DEBUG("Cache.Associativity: %d\n", nextLogical->Cache.Associativity);
+                CPUSET_PRINT_DEBUG("Cache.Level: %d\n", nextLogical->Cache.Level);
+                CPUSET_PRINT_DEBUG("Cache.Type: %d\n", nextLogical->Cache.Type);
CPUSET_PRINT_DEBUG("Core Flags: %d\n", nextLogical->ProcessorCore.Flags); + coreThreadsNum = get_count_procMask(nextLogical->ProcessorMask); + CPUSET_PRINT_DEBUG("LogicalCore: %d is Physical with %d [%d]thread(s)\n", get_pos_procMask(nextLogical->ProcessorMask), get_count_procMask(nextLogical->ProcessorMask), coreThreadsNum); + if (coreThreadsNum > 1) cpuset_smt = true; + cpuset[get_pos_procMask(nextLogical->ProcessorMask)].Threads = coreThreadsNum; + + for (int32_t thread = 1; thread < coreThreadsNum;) { + CPUSET_PRINT_DEBUG("LogicalCore: %u is a thread\n", get_pos_procMask(nextLogical->ProcessorMask)+thread); + cpuset[get_pos_procMask(nextLogical->ProcessorMask)+thread].Threads = 1; + thread++; + } + + break; + } + } + + logicalSize += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + logicalPtr += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + + } + cpuset_best = cpuset; + cpuset_worst = cpuset; + std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best); + std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); + + physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1)); + + CPUSET_PRINT_DEBUG("\n\nLPhysicalCount: %d\n\n", physicalCount); + + physicalCount = physicalCount <= 0 ? numLogicalCores : physicalCount; + + CPUSET_PRINT_DEBUG("\n\nLPhysicalCount2: %d\n\n", physicalCount); + + CPUSET_PRINT_DEBUG("\n\nLogical Processors Summary\n\n"); + + for (uint32_t _logicalCore = 0; _logicalCore < numLogicalCores;) + { + CPUSET_PRINT_DEBUG("\nLogical: %u\n", _logicalCore); + CPUSET_PRINT_DEBUG("Threads: %u\n", cpuset[_logicalCore].Threads); + CPUSET_PRINT_DEBUG("Id: %u\n", cpuset[_logicalCore].Id); + CPUSET_PRINT_DEBUG("Group: %u\n", cpuset[numLogicalCores].Group); + CPUSET_PRINT_DEBUG("LastLevelCacheIndex: %u\n", cpuset[_logicalCore].LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("NumaNodeIndex: %u\n", cpuset[_logicalCore].NumaNodeIndex); + CPUSET_PRINT_DEBUG("LogicalProcessorIndex: %u\n", cpuset[_logicalCore].LogicalProcessorIndex); + CPUSET_PRINT_DEBUG("EfficiencyClass: %u\n", cpuset[_logicalCore].EfficiencyClass); + CPUSET_PRINT_DEBUG("SchedulingClass: %u\n", cpuset[_logicalCore].SchedulingClass); + _logicalCore++; + } + + + CPUSET_PRINT_DEBUG("\n\n \n\n"); + CPUSET_PRINT_DEBUG("Total Physical: %u\n", physicalCount); + CPUSET_PRINT_DEBUG("Total Logical: %u\n", numLogicalCores); + return physicalCount; #endif unsigned int n_threads = std::thread::hardware_concurrency(); - return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; -} + return n_threads > 0 ? (n_threads <= 4 ? 
+
 
 #if defined(__x86_64__) && defined(__linux__)
 #include <pthread.h>
 
@@ -156,7 +385,168 @@ static int count_math_cpus(int cpu_count) {
     return result;
 }
 
-#endif // __x86_64__ && __linux__
+#elif defined(_WIN32)
+
+#define STATUS_ACCESS_DENIED   ((NTSTATUS)0xC0000022L)
+#define STATUS_SUCCESS         ((NTSTATUS)0)
+
+typedef enum _SYSTEM_INFORMATION_CLASS {
+    SystemAllowedCpuSetsInformation = 168,
+    SystemCpuSetInformation = 175,
+    SystemCpuSetTagInformation = 176,
+} SYSTEM_INFORMATION_CLASS;
+
+typedef enum _PROCESSINFOCLASS {
+    ProcessDefaultCpuSetsInformation = 66,
+    ProcessAllowedCpuSetsInformation = 67,
+} PROCESSINFOCLASS;
+
+extern "C"
+NTSTATUS
+NTAPI
+NtQuerySystemInformationEx(
+    _In_ SYSTEM_INFORMATION_CLASS SystemInformationClass,
+    _In_reads_bytes_(InputBufferLength) PVOID InputBuffer,
+    _In_ ULONG InputBufferLength,
+    _Out_writes_bytes_opt_(SystemInformationLength) PVOID SystemInformation,
+    _In_ ULONG SystemInformationLength,
+    _Out_opt_ PULONG ReturnLength
+);
+
+
+extern "C"
+NTSTATUS
+NTAPI
+NtQueryInformationProcess(
+    _In_ HANDLE ProcessHandle,
+    _In_ PROCESSINFOCLASS ProcessInformationClass,
+    _Out_writes_bytes_opt_(ProcessInformationLength) PVOID ProcessInformation,
+    _In_ ULONG ProcessInformationLength,
+    _Out_opt_ PULONG ReturnLength
+);
+
+int32_t setCpuAffinity(std::bitset<64> cpuMask) {
+    DWORD_PTR processAffinityMask;
+    DWORD_PTR systemAffinityMask;
+    int32_t coreSelected = get_count_procMask(cpuMask.to_ullong());
+    HANDLE hToken = nullptr;
+
+    BOOL bToken = ::OpenProcessToken(::GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken);
+    if (!bToken) {
+        CPUSET_PRINT_DEBUG("Could not access process main ALL\n");
+    }
+
+    HANDLE hProcess = ::OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId());
+    if (!hProcess) {
+        CPUSET_PRINT_DEBUG("Could not access process for Affinity\n");
+    }
+
+    if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
+        CPUSET_PRINT_DEBUG("Could not get affinity for Process\n");
+    }
+
+    std::bitset<64> processMask = processAffinityMask;
+    CPUSET_PRINT_DEBUG("Process Mask: %s\n", processMask.to_string().c_str());
+    std::bitset<64> systemMask = systemAffinityMask;
+    CPUSET_PRINT_DEBUG("System Mask: %s\n", systemMask.to_string().c_str());
+    std::bitset<64> reqMask = cpuMask;
+    CPUSET_PRINT_DEBUG("Requested Mask: %s\n", reqMask.to_string().c_str());
+
+    // Set process affinity
+    if (!SetProcessAffinityMask(hProcess, cpuMask.to_ullong() & systemAffinityMask)) {
+        CPUSET_PRINT_DEBUG("Could not set affinity for Process\n");
+    } else {
+        coreSelected = get_count_procMask(cpuMask.to_ullong() & systemAffinityMask);
+        CPUSET_PRINT_DEBUG("Affinity SET for Process\n");
+    }
+
+    if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
+        CPUSET_PRINT_DEBUG("Could not get affinity for Process\n");
+    }
+    std::bitset<64> newprocessMask = processAffinityMask;
+    CPUSET_PRINT_DEBUG("New Proc Mask: %s\n", newprocessMask.to_string().c_str());
+
+    HANDLE hThread = GetCurrentThread();
+    // Get the thread ID of this thread
+    DWORD tid = (DWORD)GetThreadId(hThread);
+
+    // Enumerate all threads in the process
+    THREADENTRY32 te;
+    HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
+    if (hSnapshot != INVALID_HANDLE_VALUE) {
+        te.dwSize = sizeof(THREADENTRY32);
+        Thread32First(hSnapshot, &te);
+        if (Thread32Next(hSnapshot, &te)) {
+            do {
+                // Check if the thread is part of this process
+                if (te.th32OwnerProcessID == GetProcessId(hProcess)) {
+                    // Set
thread affinity + if (!SetThreadAffinityMask(hThread, cpuMask.to_ullong() & systemAffinityMask)) { + CPUSET_PRINT_DEBUG("Could not set affinity for Main Process Thread\n"); + } + } + } while( Thread32Next(hSnapshot, &te ) ); + } + CloseHandle(hSnapshot); + } + + if (hProcess) + ::CloseHandle(hProcess); + if (hThread) + ::CloseHandle(hThread); + + HANDLE hProcess2 = ::OpenProcess(PROCESS_ALL_ACCESS, FALSE, GetCurrentProcessId()); + + if (hProcess2) { + PROCESS_POWER_THROTTLING_STATE PowerThrottling; + RtlZeroMemory(&PowerThrottling, sizeof(PowerThrottling)); + PowerThrottling.Version = PROCESS_POWER_THROTTLING_CURRENT_VERSION; + + PowerThrottling.ControlMask = PROCESS_POWER_THROTTLING_IGNORE_TIMER_RESOLUTION; + PowerThrottling.StateMask = 0; + PowerThrottling.StateMask = PROCESS_POWER_THROTTLING_IGNORE_TIMER_RESOLUTION; + + SetProcessInformation(hProcess2, + ProcessPowerThrottling, + &PowerThrottling, + sizeof(PowerThrottling)); + + RtlZeroMemory(&PowerThrottling, sizeof(PowerThrottling)); + PowerThrottling.ControlMask = PROCESS_POWER_THROTTLING_EXECUTION_SPEED; + PowerThrottling.StateMask = 0; + PowerThrottling.StateMask = PROCESS_POWER_THROTTLING_EXECUTION_SPEED; + SetProcessInformation(hProcess2, + ProcessPowerThrottling, + &PowerThrottling, + sizeof(PowerThrottling)); + + MEMORY_PRIORITY_INFORMATION MemPrio; + ZeroMemory(&MemPrio, sizeof(MemPrio)); + MemPrio.MemoryPriority = MEMORY_PRIORITY_NORMAL; + + SetProcessInformation(hProcess2, + ProcessMemoryPriority, + &MemPrio, + sizeof(MemPrio)); + + ::CloseHandle(hProcess2); + } + + return coreSelected; +} + +ULONG set_procMask(int direction = 0 , int32_t req_threads = 0, int lltraversal = 0 ) { + std::bitset<64> bMask; + + bMask = generate_Mask(direction, req_threads, lltraversal); + + numPhysicalCores = get_count_procMask(bMask.to_ullong()); + + CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str()); + return bMask.to_ullong(); +} + +#endif // _WIN32 /** * Returns number of CPUs on system that are useful for math. 
@@ -177,10 +567,28 @@ int get_math_cpu_count() { } } } + +#elif defined(_WIN32) + int32_t _numPhysical = get_num_physical_cores(); + if (cpuset_enable) { + // Initial Affinity set + setCpuAffinity(set_procMask(WORST_CORES, 0, 1)); + } + return _numPhysical; #endif return get_num_physical_cores(); } +#if defined(_WIN32) +int get_math_cpu_count(int32_t req_threads, int cpuset_order, int lltraversal) { + int32_t _numPhysical = get_num_physical_cores(); + if (cpuset_enable) { + _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal)); + } + return _numPhysical; +} +#endif + void process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -245,15 +653,48 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.seed = std::stoul(argv[i]); return true; } + if (arg == "-llct") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_lltraversal = 1; } + else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_lltraversal = 0; } + else { invalid_param = true; } +#endif + return true; + } + if (arg == "-bco") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_order = BEST_CORES; } + else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_order = WORST_CORES; } + else { invalid_param = true; } +#endif + return true; + } if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; return true; } +#if defined(_WIN32) + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = numPhysicalCores; + } +#else params.n_threads = std::stoi(argv[i]); if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); } +#endif return true; } if (arg == "-tb" || arg == "--threads-batch") { @@ -262,8 +703,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_batch = std::stoi(argv[i]); +#if defined(_WIN32) + if (params.n_threads_batch <= 0 || params.n_threads_batch > numPhysicalCores) { + params.n_threads_batch = numPhysicalCores; +#else if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); +#endif } return true; } @@ -273,8 +719,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_draft = std::stoi(argv[i]); +#if defined(_WIN32) + if (params.n_threads_draft <= 0 || params.n_threads_draft > numPhysicalCores) { + params.n_threads_draft = numPhysicalCores; +#else if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); +#endif } return true; } @@ -284,8 +735,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_batch_draft = std::stoi(argv[i]); +#if defined(_WIN32) + if (params.n_threads_batch_draft <= 0 || params.n_threads_batch_draft > numPhysicalCores) { + params.n_threads_batch_draft = numPhysicalCores; +#else if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); +#endif } return true; } @@ -1281,6 +1737,7 @@ bool 
gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.kv_overrides.push_back(kvo);
         return true;
     }
+
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1325,6 +1782,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
     }
 
+#if defined(_WIN32)
+    params.n_threads = get_math_cpu_count(params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
+    CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
+#endif
+
     if (invalid_param) {
         throw std::invalid_argument("error: invalid parameter for argument: " + arg);
     }
@@ -1486,6 +1948,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_supports_mmap()) {
         printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+#if defined(_WIN32)
+    printf("  -bco                  change the order of the selected cores from the best to worst (default: worst to best)\n");
+    printf("  -llct                 allow the core selection to traverse the last level cache (default: disabled)\n");
+#endif
     printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
     printf("                          - distribute: spread execution evenly over all nodes\n");
    printf("                          - isolate: only spawn threads on CPUs on the node that execution started on\n");
@@ -2679,7 +3145,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+#if defined(_WIN32)
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count());
+    fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order);
+    fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal);
+#else
     fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+#endif
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
diff --git a/common/common.h b/common/common.h
index cca44268e6df5..773913c7342e4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -39,6 +39,26 @@ extern char const *LLAMA_BUILD_TARGET;
 
 struct llama_control_vector_load_info;
 
+#ifdef _WIN32
+struct CPU_SET_INFORMATION
+{
+    int32_t LogicalProcessorIndex;
+    int32_t Id;
+    int32_t Group;
+    int32_t CoreIndex;
+    int32_t LastLevelCacheIndex;
+    int32_t NumaNodeIndex;
+    int32_t EfficiencyClass;
+    int32_t SchedulingClass;
+    int32_t Priority;
+    int32_t Threads;
+};
+
+#endif
+
+static const int BEST_CORES = 0;
+static const int WORST_CORES = 1;
+
 int get_math_cpu_count();
 int32_t get_num_physical_cores();
 
@@ -53,6 +73,8 @@ struct gpt_params {
     int32_t n_threads_draft       = -1;
     int32_t n_threads_batch       = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
+    int32_t cpuset_lltraversal    = 0;
+    int32_t cpuset_order          = WORST_CORES;
     int32_t n_predict             = -1; // new tokens to predict
     int32_t n_ctx                 = 512; // context size
     int32_t n_batch               = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -321,4 +343,4 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);

From: mann1x <20623405+mann1x@users.noreply.github.com>
Date: Mon, 22 Apr 2024 20:12:15 +0200
Subject: [PATCH 02/12] Remove debug flag

---
 common/common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/common.cpp b/common/common.cpp index c53749befb8ab..f447aa9ada724 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency(); // CPUSET logging // -#define CPUSET_DEBUG 1 +#define CPUSET_DEBUG 0 #if (CPUSET_DEBUG >= 1) #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__) #else From f9b42b8cd8a35411174c2952c2beb665f3f34a68 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Wed, 24 Apr 2024 21:50:01 +0200 Subject: [PATCH 03/12] Added new options and some fixes --- common/common.cpp | 150 ++++++++++++++++++++++++++++++++++++++-------- common/common.h | 8 ++- ggml.c | 4 +- 3 files changed, 131 insertions(+), 31 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index f447aa9ada724..a62d67cb07957 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency(); // CPUSET logging // -#define CPUSET_DEBUG 0 +#define CPUSET_DEBUG 1 #if (CPUSET_DEBUG >= 1) #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__) #else @@ -124,13 +124,52 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con return lhs.SchedulingClass < rhs.SchedulingClass; } -ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) { +ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { std::bitset<64> bMask; std::vector _cpuset; int32_t bVal = 0; int32_t assigned_t = 0; int32_t llcache = -1; + DWORD_PTR processAffinityMask; + DWORD_PTR systemAffinityMask; + HANDLE hToken = nullptr; + bool gotsystemMask = true; + + BOOL bToken = ::OpenProcessToken(::GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken); + if (!bToken) { + CPUSET_PRINT_DEBUG("Could not access OpenProcessToken from generate_Mask\n"); + } + + HANDLE hProcess = ::OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId()); + if (!hProcess) { + CPUSET_PRINT_DEBUG("Could not access OpenProcess for Affinity\n"); + gotsystemMask = false; + } + + if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) { + CPUSET_PRINT_DEBUG("Could not get GetProcessAffinityMask for Process\n"); + gotsystemMask = false; + } + + if (hProcess) + ::CloseHandle(hProcess); + + if (cpuMask != 0) { + std::bitset<64> reqMask = cpuMask; + CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); + if (gotsystemMask) { + std::bitset<64> systemMask = systemAffinityMask; + CPUSET_PRINT_DEBUG("System Mask: %s\n", systemMask.to_string().c_str()); + std::bitset<64> newprocessMask = reqMask & systemMask; + CPUSET_PRINT_DEBUG("New Proc Mask: %s\n", newprocessMask.to_string().c_str()); + bMask = reqMask & systemMask; + } else{ + bMask = cpuMask; + } + return bMask.to_ullong(); + } + if (direction == BEST_CORES) { _cpuset = cpuset_best; } else { @@ -139,27 +178,25 @@ ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) { CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); for (auto index : _cpuset) { bVal = 0; - if (index.LogicalProcessorIndex != 0 && - ((cpuset_smt && index.Threads > 1) || !cpuset_smt) && + if ((index.LogicalProcessorIndex != 0 || allowcz) && + ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && index.EfficiencyClass == 0 && - ((llcache == index.LastLevelCacheIndex && 
lltraversal == 0) || llcache == -1) + ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) ) { if (lltraversal == 0) { - CPUSET_PRINT_DEBUG("cache for lltraversal %d pre llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); llcache = index.LastLevelCacheIndex; - CPUSET_PRINT_DEBUG("cache for lltraversal %d pos llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); } bVal = 1; + } + if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;} + if(bVal == 1) { assigned_t++; - CPUSET_PRINT_DEBUG("Assigned LogicalCoreIndex: %d lltraversal %d llcache %d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); } bMask[index.LogicalProcessorIndex] = bVal; - CPUSET_PRINT_DEBUG("Index: %d b:%d smt=%d thrds=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads); - if (req_threads > 0) { - if (assigned_t >= req_threads) { - break; - } - } + CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc); } return bMask.to_ullong(); } @@ -262,7 +299,6 @@ int32_t get_num_physical_cores() { cpuSetSize += nextCPUSet->Size; } - int32_t physicalCount = 0; int32_t thisLogical = 0; int32_t coreThreadsNum = 1; @@ -274,7 +310,6 @@ int32_t get_num_physical_cores() { if (nextLogical->ProcessorCore.Flags == 1 && nextLogical->Cache.Associativity <= 2) { switch (nextLogical->Relationship) { case LOGICAL_PROCESSOR_RELATIONSHIP::RelationProcessorCore: - CPUSET_PRINT_DEBUG("Physical Count: %u\n", physicalCount); CPUSET_PRINT_DEBUG("Cache.Associativity: %d\n", nextLogical->Cache.Associativity); CPUSET_PRINT_DEBUG("Cache.Level: %d\n", nextLogical->Cache.Level); CPUSET_PRINT_DEBUG("Cache.Type: %d\n", nextLogical->Cache.Type); @@ -303,15 +338,16 @@ int32_t get_num_physical_cores() { std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best); std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); - physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1)); + int32_t physicalCount = 0; + physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0)); - CPUSET_PRINT_DEBUG("\n\nLPhysicalCount: %d\n\n", physicalCount); + CPUSET_PRINT_DEBUG("\n\n1st PhysicalCount: %d\n\n", physicalCount); physicalCount = physicalCount <= 0 ? 
numLogicalCores : physicalCount; - CPUSET_PRINT_DEBUG("\n\nLPhysicalCount2: %d\n\n", physicalCount); + CPUSET_PRINT_DEBUG("\n\n2nd PhysicalCount2: %d\n\n", physicalCount); - CPUSET_PRINT_DEBUG("\n\nLogical Processors Summary\n\n"); + CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n"); for (uint32_t _logicalCore = 0; _logicalCore < numLogicalCores;) { @@ -535,10 +571,10 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) { return coreSelected; } -ULONG set_procMask(int direction = 0 , int32_t req_threads = 0, int lltraversal = 0 ) { +ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { std::bitset<64> bMask; - bMask = generate_Mask(direction, req_threads, lltraversal); + bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); numPhysicalCores = get_count_procMask(bMask.to_ullong()); @@ -580,10 +616,10 @@ int get_math_cpu_count() { } #if defined(_WIN32) -int get_math_cpu_count(int32_t req_threads, int cpuset_order, int lltraversal) { +int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { int32_t _numPhysical = get_num_physical_cores(); if (cpuset_enable) { - _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal)); + _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal, allowtc, allowcz, cpuMask)); } return _numPhysical; } @@ -653,6 +689,61 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.seed = std::stoul(argv[i]); return true; } + if (arg == "-acz") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowzero = 1; } + else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowzero = 0; } + else { invalid_param = true; } +#endif + return true; + } + if (arg == "-atc") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowthreads = 1; } + else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowthreads = 0; } + else { invalid_param = true; } +#endif + return true; + } + if (arg == "-ccm") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + std::size_t pos{}; + int64_t cpuMask = 0; + bool valid_bitmask = false; + try + { + const int64_t ll{std::stoll(value, &pos)}; + cpuMask = ll; + valid_bitmask = true; + } + catch (std::invalid_argument const& ex) + { + fprintf(stderr, "%s\n", ex.what()); + } + catch (std::out_of_range const& ex) + { + fprintf(stderr, "%s\n", ex.what()); + } + if (valid_bitmask && cpuMask != 0) { params.cpuset_cpumask = cpuMask; } + else { invalid_param = true; } +#endif + return true; + } if (arg == "-llct") { if (++i >= argc) { invalid_param = true; @@ -695,6 +786,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.n_threads = std::thread::hardware_concurrency(); } #endif + params.n_threads_auto = false; return true; } if (arg == "-tb" || arg == "--threads-batch") { @@ -1783,8 +1875,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, 
gpt_params & params) { } #if defined(_WIN32) - params.n_threads = get_math_cpu_count(params.n_threads, params.cpuset_order, params.cpuset_lltraversal); - CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal); + params.n_threads = get_math_cpu_count(params.n_threads_auto ? 0 : params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); + CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%lli\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); #endif if (invalid_param) { @@ -1951,6 +2043,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #if defined(_WIN32) printf(" -bco change the order of the selected cores from the best to worst (default: worst to best)\n"); printf(" -llct allow the core selection to traverse the last level cache (default: disabled)\n"); + printf(" -acz allow the core selection to pick the core 0 as well (default: disabled)\n"); + printf(" -atc allow the core selection to pick non physical, threaded, cores (default: disabled)\n"); + printf(" -ccm specify a custom CPU Affinity bitmask in hex for the core selection (default: disabled)\n"); #endif printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); printf(" - distribute: spread execution evenly over all nodes\n"); @@ -3149,6 +3244,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count()); fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order); fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal); + fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero); + fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads); + fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask); #else fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); diff --git a/common/common.h b/common/common.h index 773913c7342e4..5ba823acead4a 100644 --- a/common/common.h +++ b/common/common.h @@ -56,8 +56,8 @@ struct CPU_SET_INFORMATION #endif -static const int BEST_CORES = 0; -static const int WORST_CORES = 1; +static const int32_t BEST_CORES = 0; +static const int32_t WORST_CORES = 1; int get_math_cpu_count(); int32_t get_num_physical_cores(); @@ -73,8 +73,12 @@ struct gpt_params { int32_t n_threads_draft = -1; int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch_draft = -1; + bool n_threads_auto = true; int32_t cpuset_lltraversal = 0; int32_t cpuset_order = WORST_CORES; + int64_t cpuset_cpumask = 0; + int32_t cpuset_allowzero = 0; + int32_t cpuset_allowthreads = 0; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) diff --git a/ggml.c b/ggml.c index 90584e18b4959..b5b11ca16090e 100644 --- a/ggml.c +++ b/ggml.c @@ -78,11 +78,10 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); -#if defined(_WIN32) HANDLE hToken; DWORD_PTR processAffinityMask; DWORD_PTR systemAffinityMask; 
- + BOOL bToken = OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken); if (bToken) { @@ -122,7 +121,6 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo if (hProcess2) CloseHandle(hProcess2); -#endif if (handle == NULL) { return EAGAIN; From 63cd3dc251563e0aa15dc66a61f7e1affd6ed011 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:27:50 +0200 Subject: [PATCH 04/12] Initial support for Linux --- common/common.cpp | 385 ++++++++++++++++++++++++++++++++++------------ common/common.h | 8 +- 2 files changed, 289 insertions(+), 104 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index a62d67cb07957..666de14bea501 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -77,7 +77,7 @@ using json = nlohmann::ordered_json; -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__linux__) && defined(__x86_64__)) std::vector cpuset; std::vector cpuset_best; std::vector cpuset_worst; @@ -92,13 +92,25 @@ int32_t PhysicalCores = std::thread::hardware_concurrency(); // CPUSET logging // -#define CPUSET_DEBUG 1 +#define CPUSET_DEBUG 0 #if (CPUSET_DEBUG >= 1) #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__) #else #define CPUSET_PRINT_DEBUG(...) #endif +bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) { + return lhs.SchedulingClass > rhs.SchedulingClass; +} + +bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) { + return lhs.SchedulingClass < rhs.SchedulingClass; +} + +#endif + +#if defined(_WIN32) + int32_t get_pos_procMask(ULONG_PTR procMask) { std::bitset<64> bMask = procMask; int32_t thisPos = 0; @@ -116,14 +128,6 @@ int32_t get_count_procMask(ULONG_PTR procMask) { return bMask.count(); } -bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) { - return lhs.SchedulingClass > rhs.SchedulingClass; -} - -bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) { - return lhs.SchedulingClass < rhs.SchedulingClass; -} - ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { std::bitset<64> bMask; std::vector _cpuset; @@ -202,12 +206,250 @@ ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, } #endif +#if defined(__x86_64__) && defined(__linux__) +#include + +int32_t setCpuAffinity(std::bitset<64> cpuMask) { + int32_t coreSelected = cpuMask.count(); + + cpu_set_t mask; + CPU_ZERO(&mask); + + for (int32_t i = 0; i < 64; ++i) { + if (cpuMask[i] == 1) { + CPUSET_PRINT_DEBUG("Setting CPU %d\n", i); + CPU_SET(i, &mask); + } + } + + if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) { + CPUSET_PRINT_DEBUG("setCpuAffinity sched_setaffinity error\n"); + } + if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { + CPUSET_PRINT_DEBUG("setCpuAffinity pthread_setaffinity_np error\n"); + } + + return coreSelected; +} + +uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { + std::bitset<64> bMask; + std::vector _cpuset; + int32_t bVal = 0; + int32_t assigned_t = 0; + int32_t llcache = -1; + + if (cpuMask != 0) { + std::bitset<64> reqMask = cpuMask; + CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); + bMask = cpuMask; + return bMask.to_ullong(); + } + + if (direction == BEST_CORES) { + _cpuset = cpuset_best; + } else { + 
_cpuset = cpuset_worst; + } + CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); + for (auto index : _cpuset) { + bVal = 0; + if ((index.LogicalProcessorIndex != 0 || allowcz) && + ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && + index.EfficiencyClass == 0 && + ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) + ) { + if (lltraversal == 0) { + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + llcache = index.LastLevelCacheIndex; + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + } + bVal = 1; + } + if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;} + if(bVal == 1) { + assigned_t++; + CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); + } + bMask[index.LogicalProcessorIndex] = bVal; + CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc); + } + return bMask.to_ullong(); +} + +static void cpuid(unsigned leaf, unsigned subleaf, + unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) { + __asm__("movq\t%%rbx,%%rsi\n\t" + "cpuid\n\t" + "xchgq\t%%rbx,%%rsi" + : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx) + : "0"(leaf), "2"(subleaf)); +} + +static int pin_cpu(int cpu) { + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); +} + +static bool is_hybrid_cpu(void) { + unsigned eax, ebx, ecx, edx; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + return !!(edx & (1u << 15)); +} + +static bool is_running_on_efficiency_core(void) { + unsigned eax, ebx, ecx, edx; + cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx); + int intel_atom = 0x20; + int core_type = (eax & 0xff000000u) >> 24; + return core_type == intel_atom; +} + +static int count_math_cpus(int cpu_count) { + int result = 0; + for (int cpu = 0; cpu < cpu_count; ++cpu) { + if (pin_cpu(cpu)) { + return -1; + } + if (is_running_on_efficiency_core()) { + continue; // efficiency cores harm lockstep threading + } + ++cpu; // hyperthreading isn't useful for linear algebra + ++result; + } + return result; +} + +uint64_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { + std::bitset<64> bMask; + + bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); + + numPhysicalCores = bMask.count(); + + CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str()); + return bMask.to_ullong(); +} + +#endif + int32_t get_num_physical_cores() { -#ifdef __linux__ // __x86_64__ && __linux__ +#if defined(__linux__) && defined(__x86_64__) // __x86_64__ && __linux__ + if (numPhysicalCores > 0) { + return numPhysicalCores; + } // enumerate the set of thread siblings, num entries is num cores + fprintf(stderr, "physical cpus count\n"); std::unordered_set siblings; + int32_t cursize = 0; + cpu_set_t mask; + CPU_ZERO(&mask); + bool is_hybrid = is_hybrid_cpu(); + bool is_hybrid_core = false; + std::vector _cpuset; + int32_t numLogicalCores = 0; + for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { - 
std::ifstream thread_siblings("/sys/devices/system/cpu" + fprintf(stderr, "physical cpu check %d\n", cpu); + std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + + std::to_string(cpu) + "/topology/thread_siblings"); + if (!thread_siblings.is_open()) { + break; // no more cpus + } + is_hybrid_core = false; + if (is_hybrid) { + if (pin_cpu(cpu) == 0) { + if (is_running_on_efficiency_core()) is_hybrid_core = true; + } + } + numLogicalCores++; + + CPU_SET_INFORMATION _cpuset; + _cpuset.LogicalProcessorIndex = cpu; + _cpuset.CoreIndex = cpu; + _cpuset.Id = cpu; + _cpuset.Group = 0; + _cpuset.LastLevelCacheIndex = 0; + _cpuset.NumaNodeIndex = 0; + _cpuset.EfficiencyClass = is_hybrid_core ? 1 : 0; + _cpuset.Threads = 1; + + std::ifstream cppc_tag("/sys/devices/system/cpu/cpu" + + std::to_string(cpu) + "/acpi_cppc/highest_perf"); + if (!cppc_tag.is_open()) { + _cpuset.SchedulingClass = 256-cpu; + } else { + std::string line; + if (std::getline(cppc_tag, line)) { + int32_t _thistag = std::stoi(line); + _cpuset.SchedulingClass = _thistag; + } + } + + if (is_hybrid_core) continue; + std::string line; + if (std::getline(thread_siblings, line)) { + cursize = static_cast(siblings.size()); + siblings.insert(line); + if (static_cast(siblings.size()) > cursize ) { + _cpuset.Threads = 2; + CPU_SET(cpu, &mask); + fprintf(stderr, "physical cpu %u: %s\n", cpu, line.c_str()); + } else { + cpuset_smt = true; + } + } + cpuset.push_back(_cpuset); + } + if (!siblings.empty()) { + cpuset_enable = true; + if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) { + fprintf(stdout, "sched_setaffinity error\n"); + } + if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { + fprintf(stdout, "pthread_setaffinity_np error\n"); + } + fprintf(stderr, "physical cpus %li\n", siblings.size()); + + cpuset_best = cpuset; + cpuset_worst = cpuset; + std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best); + std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); + + //int32_t physicalCount = 0; + int32_t physicalCount = static_cast(siblings.size()); + //physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0)); + + CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n"); + + for (int32_t _logicalCore = 0; _logicalCore < numLogicalCores;) + { + CPUSET_PRINT_DEBUG("\nLogical: %u\n", _logicalCore); + CPUSET_PRINT_DEBUG("Threads: %u\n", cpuset[_logicalCore].Threads); + CPUSET_PRINT_DEBUG("Id: %u\n", cpuset[_logicalCore].Id); + CPUSET_PRINT_DEBUG("Group: %u\n", cpuset[numLogicalCores].Group); + CPUSET_PRINT_DEBUG("LastLevelCacheIndex: %u\n", cpuset[_logicalCore].LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("NumaNodeIndex: %u\n", cpuset[_logicalCore].NumaNodeIndex); + CPUSET_PRINT_DEBUG("LogicalProcessorIndex: %u\n", cpuset[_logicalCore].LogicalProcessorIndex); + CPUSET_PRINT_DEBUG("EfficiencyClass: %u\n", cpuset[_logicalCore].EfficiencyClass); + CPUSET_PRINT_DEBUG("SchedulingClass: %u\n", cpuset[_logicalCore].SchedulingClass); + _logicalCore++; + } + + CPUSET_PRINT_DEBUG("\n\n \n\n"); + CPUSET_PRINT_DEBUG("Total Physical: %d\n", physicalCount); + CPUSET_PRINT_DEBUG("Total Logical: %u\n", numLogicalCores); + + numPhysicalCores = physicalCount; + return physicalCount; + } +#elif defined(__linux__) // __linux__ +// enumerate the set of thread siblings, num entries is num cores + std::unordered_set siblings; + for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { + std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + 
"/topology/thread_siblings"); if (!thread_siblings.is_open()) { break; // no more cpus @@ -289,6 +531,7 @@ int32_t get_num_physical_cores() { _cpuset.NumaNodeIndex = nextCPUSet->CpuSet.NumaNodeIndex; _cpuset.EfficiencyClass = nextCPUSet->CpuSet.EfficiencyClass; _cpuset.SchedulingClass = nextCPUSet->CpuSet.SchedulingClass; + _cpuset.Threads = 1; cpuset.push_back(_cpuset); numLogicalCores++; } @@ -370,58 +613,10 @@ int32_t get_num_physical_cores() { return physicalCount; #endif unsigned int n_threads = std::thread::hardware_concurrency(); - return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;} - - -#if defined(__x86_64__) && defined(__linux__) -#include - -static void cpuid(unsigned leaf, unsigned subleaf, - unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) { - __asm__("movq\t%%rbx,%%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx,%%rsi" - : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx) - : "0"(leaf), "2"(subleaf)); -} - -static int pin_cpu(int cpu) { - cpu_set_t mask; - CPU_ZERO(&mask); - CPU_SET(cpu, &mask); - return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); -} - -static bool is_hybrid_cpu(void) { - unsigned eax, ebx, ecx, edx; - cpuid(7, 0, &eax, &ebx, &ecx, &edx); - return !!(edx & (1u << 15)); + return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; } -static bool is_running_on_efficiency_core(void) { - unsigned eax, ebx, ecx, edx; - cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx); - int intel_atom = 0x20; - int core_type = (eax & 0xff000000u) >> 24; - return core_type == intel_atom; -} - -static int count_math_cpus(int cpu_count) { - int result = 0; - for (int cpu = 0; cpu < cpu_count; ++cpu) { - if (pin_cpu(cpu)) { - return -1; - } - if (is_running_on_efficiency_core()) { - continue; // efficiency cores harm lockstep threading - } - ++cpu; // hyperthreading isn't useful for linear algebra - ++result; - } - return result; -} - -#elif defined(_WIN32) +#if defined(_WIN32) #define STATUS_ACCESS_DENIED ((NTSTATUS)0xC0000022L) #define STATUS_SUCCESS ((NTSTATUS)0) @@ -437,30 +632,6 @@ typedef enum _PROCESSINFOCLASS { ProcessAllowedCpuSetsInformation = 67, } PROCESSINFOCLASS; -extern "C" -NTSTATUS -NTAPI -NtQuerySystemInformationEx( - _In_ SYSTEM_INFORMATION_CLASS SystemInformationClass, - _In_reads_bytes_(InputBufferLength) PVOID InputBuffer, - _In_ ULONG InputBufferLength, - _Out_writes_bytes_opt_(SystemInformationLength) PVOID SystemInformation, - _In_ ULONG SystemInformationLength, - _Out_opt_ PULONG ReturnLength -); - - -extern "C" -NTSTATUS -NTAPI -NtQueryInformationProcess( - _In_ HANDLE ProcessHandle, - _In_ PROCESSINFOCLASS ProcessInformationClass, - _Out_writes_bytes_opt_(ProcessInformationLength) PVOID ProcessInformation, - _In_ ULONG ProcessInformationLength, - _Out_opt_ PULONG ReturnLength -); - int32_t setCpuAffinity(std::bitset<64> cpuMask) { DWORD_PTR processAffinityMask; DWORD_PTR systemAffinityMask; @@ -571,7 +742,7 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) { return coreSelected; } -ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { +ULONG set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { std::bitset<64> bMask; bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); @@ -588,7 +759,7 @@ ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltr * Returns number 
of CPUs on system that are useful for math. */ int get_math_cpu_count() { -#if defined(__x86_64__) && defined(__linux__) +#if defined(__x86_164__) && defined(__linux__) int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); if (cpu_count < 1) { return get_num_physical_cores(); @@ -604,7 +775,7 @@ int get_math_cpu_count() { } } -#elif defined(_WIN32) +#elif defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) int32_t _numPhysical = get_num_physical_cores(); if (cpuset_enable) { // Initial Affinity set @@ -615,7 +786,7 @@ int get_math_cpu_count() { return get_num_physical_cores(); } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { int32_t _numPhysical = get_num_physical_cores(); if (cpuset_enable) { @@ -694,7 +865,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowzero = 1; } else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowzero = 0; } @@ -707,7 +878,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowthreads = 1; } else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowthreads = 0; } @@ -720,7 +891,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); std::size_t pos{}; int64_t cpuMask = 0; @@ -749,7 +920,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_lltraversal = 1; } else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_lltraversal = 0; } @@ -762,7 +933,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_order = BEST_CORES; } else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_order = WORST_CORES; } @@ -775,7 +946,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) params.n_threads = std::stoi(argv[i]); if (params.n_threads <= 0) { params.n_threads = numPhysicalCores; @@ -795,7 +966,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } 
params.n_threads_batch = std::stoi(argv[i]); -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) if (params.n_threads_batch <= 0 || params.n_threads_batch > numPhysicalCores) { params.n_threads_batch = numPhysicalCores; #else @@ -811,7 +982,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_draft = std::stoi(argv[i]); -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) if (params.n_threads_draft <= 0 || params.n_threads_draft > numPhysicalCores) { params.n_threads_draft = numPhysicalCores; #else @@ -827,7 +998,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_batch_draft = std::stoi(argv[i]); -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) if (params.n_threads_batch_draft <= 0 || params.n_threads_batch_draft > numPhysicalCores) { params.n_threads_batch_draft = numPhysicalCores; #else @@ -1874,10 +2045,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) params.n_threads = get_math_cpu_count(params.n_threads_auto ? 0 : params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); +#endif +#if defined(_WIN32) CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%lli\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); #endif +#if defined(__x86_64__) && defined(__linux__) + CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%li\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); +#endif if (invalid_param) { throw std::invalid_argument("error: invalid parameter for argument: " + arg); @@ -2040,7 +2216,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { if (llama_supports_mmap()) { printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) printf(" -bco change the order of the selected cores from the best to worst (default: worst to best)\n"); printf(" -llct allow the core selection to traverse the last level cache (default: disabled)\n"); printf(" -acz allow the core selection to pick the core 0 as well (default: disabled)\n"); @@ -3240,13 +3416,18 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count()); fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order); fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal); fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero); fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads); +#if defined(_WIN32) fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask); +#endif +#if defined(__x86_64__) && defined(__linux__) + 
fprintf(stream, "ccm: %li # default: none\n", params.cpuset_cpumask); +#endif #else fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); diff --git a/common/common.h b/common/common.h index 5ba823acead4a..aee4de284604b 100644 --- a/common/common.h +++ b/common/common.h @@ -39,7 +39,7 @@ extern char const *LLAMA_BUILD_TARGET; struct llama_control_vector_load_info; -#ifdef _WIN32 +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) struct CPU_SET_INFORMATION { int32_t LogicalProcessorIndex; @@ -54,8 +54,12 @@ struct CPU_SET_INFORMATION int32_t Threads; }; -#endif +bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs); +bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs); + +int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +#endif static const int32_t BEST_CORES = 0; static const int32_t WORST_CORES = 1; From a3e75fe48143e05cb8021173a7d0788c86527484 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Fri, 26 Apr 2024 08:56:35 +0200 Subject: [PATCH 05/12] Fixes --- common/common.cpp | 78 ++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 55 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 666de14bea501..309552b1a6dd1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -127,14 +127,17 @@ int32_t get_count_procMask(ULONG_PTR procMask) { std::bitset<64> bMask = procMask; return bMask.count(); } +#endif -ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) +uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { std::bitset<64> bMask; std::vector _cpuset; int32_t bVal = 0; int32_t assigned_t = 0; int32_t llcache = -1; +#if defined(_WIN32) DWORD_PTR processAffinityMask; DWORD_PTR systemAffinityMask; HANDLE hToken = nullptr; @@ -174,6 +177,15 @@ ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, return bMask.to_ullong(); } +#else + if (cpuMask != 0) { + std::bitset<64> reqMask = cpuMask; + CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); + bMask = cpuMask; + return bMask.to_ullong(); + } +#endif + if (direction == BEST_CORES) { _cpuset = cpuset_best; } else { @@ -232,51 +244,6 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) { return coreSelected; } -uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { - std::bitset<64> bMask; - std::vector _cpuset; - int32_t bVal = 0; - int32_t assigned_t = 0; - int32_t llcache = -1; - - if (cpuMask != 0) { - std::bitset<64> reqMask = cpuMask; - CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); - bMask = cpuMask; - return bMask.to_ullong(); - } - - if (direction == BEST_CORES) { - _cpuset = cpuset_best; - } else { - _cpuset = cpuset_worst; - } - CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); - for (auto index : _cpuset) { - bVal = 0; - if ((index.LogicalProcessorIndex != 0 || allowcz) && - ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && - index.EfficiencyClass == 
0 && - ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) - ) { - if (lltraversal == 0) { - CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); - llcache = index.LastLevelCacheIndex; - CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); - } - bVal = 1; - } - if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;} - if(bVal == 1) { - assigned_t++; - CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); - } - bMask[index.LogicalProcessorIndex] = bVal; - CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc); - } - return bMask.to_ullong(); -} - static void cpuid(unsigned leaf, unsigned subleaf, unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) { __asm__("movq\t%%rbx,%%rsi\n\t" @@ -341,7 +308,7 @@ int32_t get_num_physical_cores() { return numPhysicalCores; } // enumerate the set of thread siblings, num entries is num cores - fprintf(stderr, "physical cpus count\n"); + CPUSET_PRINT_DEBUG("Start: get_num_physical_cores\n"); std::unordered_set siblings; int32_t cursize = 0; cpu_set_t mask; @@ -352,7 +319,7 @@ int32_t get_num_physical_cores() { int32_t numLogicalCores = 0; for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { - fprintf(stderr, "physical cpu check %d\n", cpu); + CPUSET_PRINT_DEBUG("Check for Logical CPU: %d\n", cpu); std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); if (!thread_siblings.is_open()) { @@ -396,7 +363,7 @@ int32_t get_num_physical_cores() { if (static_cast(siblings.size()) > cursize ) { _cpuset.Threads = 2; CPU_SET(cpu, &mask); - fprintf(stderr, "physical cpu %u: %s\n", cpu, line.c_str()); + CPUSET_PRINT_DEBUG("CPU %u is physical, siblings: %s\n", cpu, line.c_str()); } else { cpuset_smt = true; } @@ -406,21 +373,22 @@ int32_t get_num_physical_cores() { if (!siblings.empty()) { cpuset_enable = true; if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) { - fprintf(stdout, "sched_setaffinity error\n"); + CPUSET_PRINT_DEBUG("sched_setaffinity error\n"); } if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { - fprintf(stdout, "pthread_setaffinity_np error\n"); + CPUSET_PRINT_DEBUG("pthread_setaffinity_np error\n"); } - fprintf(stderr, "physical cpus %li\n", siblings.size()); + fprintf(stderr, "get_num_physical_cores Physical CPU count: %li\n", siblings.size()); cpuset_best = cpuset; cpuset_worst = cpuset; std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best); std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); - //int32_t physicalCount = 0; - int32_t physicalCount = static_cast(siblings.size()); - //physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0)); + int32_t physicalCount = 0; + //int32_t physicalCount = static_cast(siblings.size()); + std::bitset<64> bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0); + physicalCount = bMask.count(); CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n"); From f7d2c0a5cda3dd58dc8aedfc78a1c498d79047f8 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:09:17 
+0200 Subject: [PATCH 06/12] Added set thread affinity for Linux --- common/common.cpp | 10 ++++++++-- ggml.c | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 309552b1a6dd1..963f5ccb8dfde 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -231,6 +231,8 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) { if (cpuMask[i] == 1) { CPUSET_PRINT_DEBUG("Setting CPU %d\n", i); CPU_SET(i, &mask); + } else { + CPU_CLR(i, &mask); } } @@ -328,7 +330,10 @@ int32_t get_num_physical_cores() { is_hybrid_core = false; if (is_hybrid) { if (pin_cpu(cpu) == 0) { - if (is_running_on_efficiency_core()) is_hybrid_core = true; + if (is_running_on_efficiency_core()) { + is_hybrid_core = true; + CPUSET_PRINT_DEBUG("Logical CPU is Hybrid: %d\n", cpu); + } } } numLogicalCores++; @@ -365,6 +370,7 @@ int32_t get_num_physical_cores() { CPU_SET(cpu, &mask); CPUSET_PRINT_DEBUG("CPU %u is physical, siblings: %s\n", cpu, line.c_str()); } else { + CPU_CLR(cpu, &mask); cpuset_smt = true; } } @@ -378,7 +384,7 @@ int32_t get_num_physical_cores() { if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { CPUSET_PRINT_DEBUG("pthread_setaffinity_np error\n"); } - fprintf(stderr, "get_num_physical_cores Physical CPU count: %li\n", siblings.size()); + CPUSET_PRINT_DEBUG("get_num_physical_cores Physical CPU count: %li\n", siblings.size()); cpuset_best = cpuset; cpuset_worst = cpuset; diff --git a/ggml.c b/ggml.c index b5b11ca16090e..ef3200b09e2d4 100644 --- a/ggml.c +++ b/ggml.c @@ -18708,6 +18708,33 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl }; const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); +#if defined(__x86_64__) && defined(__linux__) + cpu_set_t procMask; + cpu_set_t threadMask; + //fprintf(stdout, "sched_getaffinity init\n"); + if (sched_getaffinity(0, sizeof(cpu_set_t), &procMask) == -1) { + fprintf(stderr, "ggml_thread_create sched_getaffinity error\n"); + } else { + int result = pthread_setaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &procMask); + if (result !=0) fprintf(stderr, "ggml_thread_create pthread_setaffinity_np: %d", result); + //printf("Set returned by sched_getaffinity() contained:\n"); + //for (size_t k = 0; k < CPU_SETSIZE; k++) + //if (CPU_ISSET(k, &procMask)) + //printf(" CPU %zu\n", k); + } + /* + int s; + s = pthread_getaffinity_np(workers[j].thrd, sizeof(threadMask), &threadMask); + if (s != 0) { + fprintf(stderr, "ggml_thread_create pthread_getaffinity_np: %d\n", s); + } else { + printf("Set returned by pthread_getaffinity_np() contained:\n"); + for (size_t l = 0; l < CPU_SETSIZE; l++) + if (CPU_ISSET(l, &threadMask)) + printf(" CPU %zu\n", l); + } + */ +#endif GGML_ASSERT(rc == 0); UNUSED(rc); } From d55ae1513c88945eb1696a341ec86c36e1dfffbc Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 12:17:05 +0200 Subject: [PATCH 07/12] Added one worker thread per core on Windows --- ggml.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index ef3200b09e2d4..0d97d135bf871 100644 --- a/ggml.c +++ b/ggml.c @@ -74,13 +74,13 @@ static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { typedef HANDLE pthread_t; typedef DWORD thread_ret_t; -static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { - (void) unused; +static int pthread_create(pthread_t * 
out, int32_t thread, thread_ret_t(*func)(void *), void * arg) { HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); HANDLE hToken; - DWORD_PTR processAffinityMask; - DWORD_PTR systemAffinityMask; + ULONG_PTR processAffinityMask; + ULONG_PTR systemAffinityMask; + ULONG newprocessAffinityMask; BOOL bToken = OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken); if (bToken) { @@ -88,7 +88,21 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo HANDLE hProcess = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId()); if (hProcess) { if (GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) { - SetThreadAffinityMask(handle, processAffinityMask); + int32_t posCore = 0; + for (int32_t i = 0; i < 64; ++i) { + if (processAffinityMask & ((1ULL) << i) ) { + //fprintf(stderr, "Check thread %d for core %d poscore %d\n", thread, i, posCore); + if (posCore+1 == thread) { + //fprintf(stderr, "Thread %d is assigned to core %d\n", thread, i); + } else { + newprocessAffinityMask = newprocessAffinityMask | (0ULL << i-1); + //fprintf(stderr, "Thread %d is NOT assigned to core %d\n", thread, i); + break; + } + posCore++; + } + } + SetThreadAffinityMask(handle, newprocessAffinityMask); } } if (hProcess) @@ -139,7 +153,7 @@ static int pthread_join(pthread_t thread, void * unused) { } static int sched_yield (void) { - Sleep (0); + Sleep(0); return 0; } #else @@ -18706,8 +18720,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl .shared = &state_shared, .ec = GGML_STATUS_SUCCESS, }; - +#if defined(_WIN32) + const int rc = ggml_thread_create(&workers[j].thrd, j, ggml_graph_compute_thread, &workers[j]); +#else const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); +#endif #if defined(__x86_64__) && defined(__linux__) cpu_set_t procMask; cpu_set_t threadMask; From b01716a653458e6fdddfc4d3924b3809dd6b3eb2 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 13:00:33 +0200 Subject: [PATCH 08/12] Added worker threads sticking to a single core for Linux --- ggml.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index 0d97d135bf871..0994d01ff4fdf 100644 --- a/ggml.c +++ b/ggml.c @@ -18727,17 +18727,43 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl #endif #if defined(__x86_64__) && defined(__linux__) cpu_set_t procMask; - cpu_set_t threadMask; + cpu_set_t newprocessAffinityMask; + CPU_ZERO(&newprocessAffinityMask); + //fprintf(stderr, "\nThread %d checking\n\n", j); //fprintf(stdout, "sched_getaffinity init\n"); if (sched_getaffinity(0, sizeof(cpu_set_t), &procMask) == -1) { fprintf(stderr, "ggml_thread_create sched_getaffinity error\n"); } else { - int result = pthread_setaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &procMask); - if (result !=0) fprintf(stderr, "ggml_thread_create pthread_setaffinity_np: %d", result); - //printf("Set returned by sched_getaffinity() contained:\n"); - //for (size_t k = 0; k < CPU_SETSIZE; k++) - //if (CPU_ISSET(k, &procMask)) - //printf(" CPU %zu\n", k); + int posCore = 0; + for (int32_t i = 0; i < 64; ++i) { + if (CPU_ISSET(i, &procMask) ) { + //fprintf(stderr, "Check thread %d for core %d poscore %d\n", thread, i, posCore); + if ((posCore+1) == j) { + CPU_SET(i, &newprocessAffinityMask); + 
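// posCore counts set bits in the inherited process mask: worker j is pinned to the j-th permitted core, so each compute thread gets a dedicated core +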
//fprintf(stderr, "\nThread %d is assigned to core %d\n\n", j, i); + break; + } else { + CPU_CLR(i, &newprocessAffinityMask); + //fprintf(stderr, "Thread %d is NOT assigned to core %d\n\n", j, i); + } + posCore++; + } + } + int result = pthread_setaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &newprocessAffinityMask); + if (result !=0) fprintf(stderr, "\n\nggml_thread_create pthread_setaffinity_np for thread %d\n", j); + /* + printf("Set returned by sched_getaffinity() contained:\n"); + cpu_set_t nprocMask; + CPU_ZERO(&nprocMask); + for (size_t k = 0; k < CPU_SETSIZE; k++) + if (CPU_ISSET(k, &procMask)) + printf(" CPU %zu\n", k); + pthread_getaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &nprocMask); + printf("Set returned by pthread_getaffinity_np() contained:\n"); + for (size_t k = 0; k < CPU_SETSIZE; k++) + if (CPU_ISSET(k, &nprocMask)) + printf(" CPU %zu\n", k); + */ } /* int s; From 49c1657821526a47451d90c8adc7487409f0e980 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 15:41:00 +0200 Subject: [PATCH 09/12] Fixes --- common/common.cpp | 8 ++++---- ggml.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 963f5ccb8dfde..8bd6eb44ab203 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -138,8 +138,8 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers int32_t llcache = -1; #if defined(_WIN32) - DWORD_PTR processAffinityMask; - DWORD_PTR systemAffinityMask; + ULONG_PTR processAffinityMask; + ULONG_PTR systemAffinityMask; HANDLE hToken = nullptr; bool gotsystemMask = true; @@ -607,8 +607,8 @@ typedef enum _PROCESSINFOCLASS { } PROCESSINFOCLASS; int32_t setCpuAffinity(std::bitset<64> cpuMask) { - DWORD_PTR processAffinityMask; - DWORD_PTR systemAffinityMask; + ULONG_PTR processAffinityMask; + ULONG_PTR systemAffinityMask; int32_t coreSelected = get_count_procMask(cpuMask.to_ullong()); HANDLE hToken = nullptr; diff --git a/ggml.c b/ggml.c index 0994d01ff4fdf..68a9a709cf5ca 100644 --- a/ggml.c +++ b/ggml.c @@ -95,7 +95,7 @@ static int pthread_create(pthread_t * out, int32_t thread, thread_ret_t(*func)(v if (posCore+1 == thread) { //fprintf(stderr, "Thread %d is assigned to core %d\n", thread, i); } else { - newprocessAffinityMask = newprocessAffinityMask | (0ULL << i-1); + newprocessAffinityMask = newprocessAffinityMask | ((0ULL) << (i-1)); //fprintf(stderr, "Thread %d is NOT assigned to core %d\n", thread, i); break; } From fa125a10bbd1f0a750ff33d9887b3c7fbca063ef Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:30:03 +0200 Subject: [PATCH 10/12] Fix typo --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 8bd6eb44ab203..2e4a3befb839e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -733,7 +733,7 @@ ULONG set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltra * Returns number of CPUs on system that are useful for math. 
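* With CpuSet support enabled this reflects the cores actually selected for math work, not the raw std::thread::hardware_concurrency() value.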
*/ int get_math_cpu_count() { -#if defined(__x86_164__) && defined(__linux__) +#if defined(__x86_64__) && defined(__linux__) int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); if (cpu_count < 1) { return get_num_physical_cores(); From e5672d33cb26c0a6f15f2804885fdbcf34e0d416 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:55:45 +0200 Subject: [PATCH 11/12] Fixes --- common/common.cpp | 4 +--- common/common.h | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 2e4a3befb839e..5a2bcbb03e7f6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -176,13 +176,11 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers } return bMask.to_ullong(); } - #else if (cpuMask != 0) { std::bitset<64> reqMask = cpuMask; CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); - bMask = cpuMask; - return bMask.to_ullong(); + return reqMask.to_ullong(); } #endif diff --git a/common/common.h b/common/common.h index aee4de284604b..ca53a57509ace 100644 --- a/common/common.h +++ b/common/common.h @@ -60,6 +60,13 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); #endif +#if defined(__x86_64__) && defined(__linux__) +#include +int32_t setCpuAffinity(std::bitset<64> cpuMask); +uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +uint64_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +#endif + static const int32_t BEST_CORES = 0; static const int32_t WORST_CORES = 1; From 063e201b020b8903f9467c00018b86e5a174b2cc Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sun, 28 Apr 2024 22:46:12 +0200 Subject: [PATCH 12/12] Fixes, Linux support over 64 CPUs, Core 0 enabled at 6 cores and below --- common/common.cpp | 128 ++++++++++++++++++++++++++++------------------ common/common.h | 9 ++-- 2 files changed, 83 insertions(+), 54 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 5a2bcbb03e7f6..759adad9713e0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency(); // CPUSET logging // -#define CPUSET_DEBUG 0 +#define CPUSET_DEBUG 1 #if (CPUSET_DEBUG >= 1) #define CPUSET_PRINT_DEBUG(...) 
printf(__VA_ARGS__) #else @@ -127,9 +127,7 @@ int32_t get_count_procMask(ULONG_PTR procMask) { std::bitset<64> bMask = procMask; return bMask.count(); } -#endif -#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { std::bitset<64> bMask; std::vector<CPU_SET_INFORMATION> _cpuset; @@ -137,7 +135,6 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers int32_t assigned_t = 0; int32_t llcache = -1; -#if defined(_WIN32) ULONG_PTR processAffinityMask; ULONG_PTR systemAffinityMask; HANDLE hToken = nullptr; @@ -176,13 +173,6 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers } return bMask.to_ullong(); } -#else - if (cpuMask != 0) { - std::bitset<64> reqMask = cpuMask; - CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); - return reqMask.to_ullong(); - } -#endif if (direction == BEST_CORES) { _cpuset = cpuset_best; @@ -192,7 +182,7 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); for (auto index : _cpuset) { bVal = 0; - if ((index.LogicalProcessorIndex != 0 || allowcz) && + if ((index.LogicalProcessorIndex != 0 || allowcz == 1) && ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && index.EfficiencyClass == 0 && ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) @@ -214,33 +204,73 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers } return bMask.to_ullong(); } -#endif -#if defined(__x86_64__) && defined(__linux__) +#elif defined(__x86_64__) && defined(__linux__) #include -int32_t setCpuAffinity(std::bitset<64> cpuMask) { - int32_t coreSelected = cpuMask.count(); +cpu_set_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { + cpu_set_t bMask; + CPU_ZERO(&bMask); + std::vector<CPU_SET_INFORMATION> _cpuset; + int32_t bVal = 0; + int32_t assigned_t = 0; + int32_t llcache = -1; + std::bitset<64> reqMask = cpuMask; - cpu_set_t mask; - CPU_ZERO(&mask); + if (cpuMask != 0) { + CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); + } - for (int32_t i = 0; i < 64; ++i) { - if (cpuMask[i] == 1) { - CPUSET_PRINT_DEBUG("Setting CPU %d\n", i); - CPU_SET(i, &mask); + if (direction == BEST_CORES) { + _cpuset = cpuset_best; + } else { + _cpuset = cpuset_worst; + } + CPUSET_PRINT_DEBUG("\ngenerate_Mask: dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); + for (auto index : _cpuset) { + bVal = 0; + if ((index.LogicalProcessorIndex != 0 || allowcz == 1) && + ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && + index.EfficiencyClass == 0 && + ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) + ) { + if (lltraversal == 0) { + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + llcache = index.LastLevelCacheIndex; + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + } + bVal = 1; + } + if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;} + if (cpuMask != 0) { + bVal = 1; + if
(reqMask[index.LogicalProcessorIndex] == 0) { + bVal = 0; + } + } + if(bVal == 1) { + assigned_t++; + CPU_SET(index.LogicalProcessorIndex, &bMask); + CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); } else { - CPU_CLR(i, &mask); + CPU_CLR(index.LogicalProcessorIndex, &bMask); } + CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc); } return bMask; } + +int32_t setCpuAffinity(cpu_set_t bMask) { + const cpu_set_t cpuMask = bMask; + int32_t coreSelected = CPU_COUNT(&cpuMask); - if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) { + if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuMask) == -1) { CPUSET_PRINT_DEBUG("setCpuAffinity sched_setaffinity error\n"); } - if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { + if (pthread_setaffinity_np(pthread_self(), sizeof(cpuMask), &cpuMask) == -1) { CPUSET_PRINT_DEBUG("setCpuAffinity pthread_setaffinity_np error\n"); - } - + } + return coreSelected; } @@ -289,15 +319,16 @@ static int count_math_cpus(int cpu_count) { return result; } -uint64_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { - std::bitset<64> bMask; - + bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); - numPhysicalCores = bMask.count(); +cpu_set_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { + cpu_set_t bMask; + CPU_ZERO(&bMask); + bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); + numPhysicalCores = CPU_COUNT(&bMask); - CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str()); - return bMask.to_ullong(); + CPUSET_PRINT_DEBUG("Generated Mask Count CPU: %d\n", numPhysicalCores); + + return bMask; } #endif @@ -318,7 +349,7 @@ int32_t get_num_physical_cores() { std::vector<CPU_SET_INFORMATION> _cpuset; int32_t numLogicalCores = 0; - for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { + for (uint32_t cpu=0; cpu < 1024; ++cpu) { CPUSET_PRINT_DEBUG("Check for Logical CPU: %d\n", cpu); std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); @@ -390,9 +421,8 @@ int32_t get_num_physical_cores() { std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); int32_t physicalCount = 0; - //int32_t physicalCount = static_cast<int32_t>(siblings.size()); - std::bitset<64> bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0); - physicalCount = bMask.count(); + cpu_set_t bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0); + physicalCount = CPU_COUNT(&bMask); CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n"); @@ -554,7 +584,7 @@ int32_t get_num_physical_cores() { std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); int32_t physicalCount = 0; - physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0)); + physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 2, 0)); CPUSET_PRINT_DEBUG("\n\n1st PhysicalCount: %d\n\n", physicalCount); @@ -731,7 +761,14 @@ ULONG set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltra * Returns number of CPUs on system that are useful for math.
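* The CpuSet path chains three helpers: generate_Mask() applies the core selection policy (efficiency class, SMT, last-level cache, core 0), set_procMask() materializes it as the platform mask type, and setCpuAffinity() applies it to the process.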
*/ int get_math_cpu_count() { -#if defined(__x86_64__) && defined(__linux__) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) + int32_t _numPhysical = get_num_physical_cores(); + if (cpuset_enable) { + // Initial Affinity set + setCpuAffinity(set_procMask(WORST_CORES, 0, 1, 0, 0)); + } + return _numPhysical; +#elif defined(__linux__) int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); if (cpu_count < 1) { return get_num_physical_cores(); @@ -746,14 +783,6 @@ int get_math_cpu_count() { } } } - -#elif defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) - int32_t _numPhysical = get_num_physical_cores(); - if (cpuset_enable) { - // Initial Affinity set - setCpuAffinity(set_procMask(WORST_CORES, 0, 1)); - } - return _numPhysical; #endif return get_num_physical_cores(); } @@ -762,6 +791,7 @@ int get_math_cpu_count() { int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { int32_t _numPhysical = get_num_physical_cores(); if (cpuset_enable) { + if (_numPhysical < 7 && allowcz == 2) allowcz = 1; _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal, allowtc, allowcz, cpuMask)); } return _numPhysical; @@ -2191,7 +2221,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) printf(" -bco change the order of the selected cores from the best to worst (default: worst to best)\n"); printf(" -llct allow the core selection to traverse the last level cache (default: disabled)\n"); - printf(" -acz allow the core selection to pick the core 0 as well (default: disabled)\n"); + printf(" -acz allow the core selection to pick the core 0 as well (default: disabled for more than 6 cores)\n"); printf(" -atc allow the core selection to pick non physical, threaded, cores (default: disabled)\n"); printf(" -ccm specify a custom CPU Affinity bitmask in hex for the core selection (default: disabled)\n"); #endif @@ -3392,7 +3422,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count()); fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order); fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal); - fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero); + fprintf(stream, "acz: %d # default: auto\n", params.cpuset_allowzero); fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads); #if defined(_WIN32) fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask); diff --git a/common/common.h b/common/common.h index ca53a57509ace..dc7fcffb5f1d5 100644 --- a/common/common.h +++ b/common/common.h @@ -61,10 +61,9 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); #endif #if defined(__x86_64__) && defined(__linux__) -#include -int32_t setCpuAffinity(std::bitset<64> cpuMask); -uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); -uint64_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +int32_t setCpuAffinity(cpu_set_t cpuMask); +cpu_set_t generate_Mask(int32_t direction, int32_t req_threads, int32_t 
lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +cpu_set_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); #endif static const int32_t BEST_CORES = 0; @@ -88,7 +87,7 @@ struct gpt_params { int32_t cpuset_lltraversal = 0; int32_t cpuset_order = WORST_CORES; int64_t cpuset_cpumask = 0; - int32_t cpuset_allowzero = 0; + int32_t cpuset_allowzero = 2; int32_t cpuset_allowthreads = 0; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size
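// Illustrative invocation, not from the patch (binary and model names are placeholders; hex digits as accepted by the -ccm handler): ./main -m model.gguf -t 8 -bco on -llct on -ccm FE, where FE hex is 11111110 binary: logical CPUs 1-7 are selected and core 0 stays free.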