From b188c9c9839f1061ae9dad09257d3993bbb67af5 Mon Sep 17 00:00:00 2001
From: mann1x <20623405+mann1x@users.noreply.github.com>
Date: Mon, 22 Apr 2024 20:08:51 +0200
Subject: [PATCH 01/12] CpuSet support for Windows

---
 common/common.cpp | 487 +++++++++++++++++++++++++++++++++++++++++++++-
 common/common.h   |  24 ++-
 ggml.c            |  47 +++++
 3 files changed, 550 insertions(+), 8 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index cf69535e2d1f5..c53749befb8ab 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -32,8 +32,13 @@
 #endif
 #include <locale>
 #include <windows.h>
+#include <bitset>
+#include <tlhelp32.h>
 #include <fcntl.h>
 #include <io.h>
+#include <algorithm>
+#include <memory>
+#include <vector>
 #else
 #include <sys/ioctl.h>
 #include <sys/stat.h>
@@ -72,8 +77,96 @@
 using json = nlohmann::ordered_json;
 
+#if defined(_WIN32)
+std::vector<CPU_SET_INFORMATION> cpuset;
+std::vector<CPU_SET_INFORMATION> cpuset_best;
+std::vector<CPU_SET_INFORMATION> cpuset_worst;
+
+bool cpuset_enable = false;
+bool cpuset_smt = false;
+
+int32_t numPhysicalCores = -1;
+int32_t PhysicalCores = std::thread::hardware_concurrency();
+
+//
+// CPUSET logging
+//
+
+#define CPUSET_DEBUG 1
+#if (CPUSET_DEBUG >= 1)
+#define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define CPUSET_PRINT_DEBUG(...)
+#endif
+
+int32_t get_pos_procMask(ULONG_PTR procMask) {
+    std::bitset<64> bMask = procMask;
+    int32_t thisPos = 0;
+    for (int32_t i = 0; i < 64; ++i) {
+        if (bMask[i] == 1) {
+            return i;
+        }
+    }
+    return thisPos;
+}
+
+int32_t get_count_procMask(ULONG_PTR procMask) {
+    std::bitset<64> bMask = procMask;
+    return bMask.count();
+}
+
+bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
+    return lhs.SchedulingClass > rhs.SchedulingClass;
+}
+
+bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) {
+    return lhs.SchedulingClass < rhs.SchedulingClass;
+}
+
+ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) {
+    std::bitset<64> bMask;
+    std::vector<CPU_SET_INFORMATION> _cpuset;
+    int32_t bVal = 0;
+    int32_t assigned_t = 0;
+    int32_t llcache = -1;
+
+    if (direction == BEST_CORES) {
+        _cpuset = cpuset_best;
+    } else {
+        _cpuset = cpuset_worst;
+    }
+    CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache);
+    for (auto index : _cpuset) {
+        bVal = 0;
+        if (index.LogicalProcessorIndex != 0 &&
+            ((cpuset_smt && index.Threads > 1) || !cpuset_smt) &&
+            index.EfficiencyClass == 0 &&
+            ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1)
+            ) {
+            if (lltraversal == 0) {
+                CPUSET_PRINT_DEBUG("cache for lltraversal %d pre llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+                llcache = index.LastLevelCacheIndex;
+                CPUSET_PRINT_DEBUG("cache for lltraversal %d pos llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex);
+            }
+            bVal = 1;
+            assigned_t++;
+            CPUSET_PRINT_DEBUG("Assigned LogicalCoreIndex: %d lltraversal %d llcache %d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex);
+        }
+        bMask[index.LogicalProcessorIndex] = bVal;
+        CPUSET_PRINT_DEBUG("Index: %d b:%d smt=%d thrds=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads);
+        if (req_threads > 0) {
+            if (assigned_t >= req_threads) {
+                break;
+            }
+        }
+    }
+    return bMask.to_ullong();
+}
+#endif
+
 int32_t get_num_physical_cores() {
-#ifdef __linux__
+#ifdef __linux__ // __x86_64__ && __linux__
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@@ -90,7 +183,7 @@ int32_t get_num_physical_cores() {
     if (!siblings.empty()) {
         return static_cast<int32_t>(siblings.size());
     }
-#elif defined(__APPLE__) && defined(__MACH__)
+#elif defined(__APPLE__) && defined(__MACH__) // __APPLE__ && __MACH__
     int32_t num_physical_cores;
     size_t len = sizeof(num_physical_cores);
     int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
@@ -101,12 +194,148 @@ int32_t get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) // _WIN32
+    if (numPhysicalCores > 0) {
+        return numPhysicalCores;
+    }
+    unsigned int d_threads = std::thread::hardware_concurrency();
+
+    HMODULE h = GetModuleHandleW(L"kernel32.dll");
+    if (NULL != h) {
+        if (NULL != GetProcAddress(h, "GetSystemCpuSetInformation")){
+            CPUSET_PRINT_DEBUG("Windows SystemCpuSetInformation is available\n");
+            cpuset_enable = true;
+        }
+    }
+    numPhysicalCores = d_threads > 0 ? (d_threads <= 4 ? d_threads : d_threads / 2) : 4;
+    if (d_threads < 4 || d_threads > 64 || !cpuset_enable) {
+        return numPhysicalCores;
+    }
+    ULONG bufferSize;
+    ULONG bufferSizeLogical;
+    HANDLE curProc = GetCurrentProcess();
+
+    GetSystemCpuSetInformation(nullptr, 0, &bufferSize, curProc, 0);
+    GetLogicalProcessorInformation(nullptr, &bufferSizeLogical);
+
+    auto buffer = std::make_unique<uint8_t[]>(bufferSize);
+    auto bufferLogical = std::make_unique<uint8_t[]>(bufferSizeLogical);
+
+    if(!GetSystemCpuSetInformation(reinterpret_cast<PSYSTEM_CPU_SET_INFORMATION>(buffer.get()), bufferSize, &bufferSize, curProc, 0))
+    {
+
+        CPUSET_PRINT_DEBUG("Failure GetSystemCpuSetInformation, fallback\n");
+        cpuset_enable = false;
+        return numPhysicalCores;
+    }
+    uint8_t* cpuSetPtr = buffer.get();
+
+    GetLogicalProcessorInformation(reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION>(bufferLogical.get()), &bufferSizeLogical);
+    uint8_t* logicalPtr = bufferLogical.get();
+
+    uint32_t numLogicalCores = 0;
+
+    CPUSET_PRINT_DEBUG("\nCPUSET GetSystemCpuSetInformation:\n");
+
+    for (ULONG cpuSetSize = 0; cpuSetSize < bufferSize; )
+    {
+        auto nextCPUSet = reinterpret_cast<PSYSTEM_CPU_SET_INFORMATION>(cpuSetPtr);
+
+        if (nextCPUSet->Type == CPU_SET_INFORMATION_TYPE::CpuSetInformation)
+        {
+            CPU_SET_INFORMATION _cpuset;
+            _cpuset.LogicalProcessorIndex = nextCPUSet->CpuSet.LogicalProcessorIndex;
+            _cpuset.CoreIndex = nextCPUSet->CpuSet.CoreIndex;
+            _cpuset.Id = nextCPUSet->CpuSet.Id;
+            _cpuset.Group = nextCPUSet->CpuSet.Group;
+            _cpuset.LastLevelCacheIndex = nextCPUSet->CpuSet.LastLevelCacheIndex;
+            _cpuset.NumaNodeIndex = nextCPUSet->CpuSet.NumaNodeIndex;
+            _cpuset.EfficiencyClass = nextCPUSet->CpuSet.EfficiencyClass;
+            _cpuset.SchedulingClass = nextCPUSet->CpuSet.SchedulingClass;
+            cpuset.push_back(_cpuset);
+            numLogicalCores++;
+        }
+        // Should not happen but it's a fail safe; break, since continuing would never advance the pointer
+        if (numLogicalCores > d_threads) break;
+
+        cpuSetPtr += nextCPUSet->Size;
+        cpuSetSize += nextCPUSet->Size;
+    }
+
+    int32_t physicalCount = 0;
+    int32_t thisLogical = 0;
+    int32_t coreThreadsNum = 1;
+
+    for (ULONG logicalSize = 0; logicalSize < bufferSizeLogical; )
+    {
+        auto nextLogical = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION>(logicalPtr);
+
+
+        if (nextLogical->ProcessorCore.Flags == 1 && nextLogical->Cache.Associativity <= 2) {
+            switch (nextLogical->Relationship) {
+                case LOGICAL_PROCESSOR_RELATIONSHIP::RelationProcessorCore:
+                CPUSET_PRINT_DEBUG("Physical Count: %u\n", physicalCount);
+                CPUSET_PRINT_DEBUG("Cache.Associativity: %d\n", nextLogical->Cache.Associativity);
+                CPUSET_PRINT_DEBUG("Cache.Level: %d\n", nextLogical->Cache.Level);
+                CPUSET_PRINT_DEBUG("Cache.Type: %d\n", nextLogical->Cache.Type);
CPUSET_PRINT_DEBUG("Core Flags: %d\n", nextLogical->ProcessorCore.Flags); + coreThreadsNum = get_count_procMask(nextLogical->ProcessorMask); + CPUSET_PRINT_DEBUG("LogicalCore: %d is Physical with %d [%d]thread(s)\n", get_pos_procMask(nextLogical->ProcessorMask), get_count_procMask(nextLogical->ProcessorMask), coreThreadsNum); + if (coreThreadsNum > 1) cpuset_smt = true; + cpuset[get_pos_procMask(nextLogical->ProcessorMask)].Threads = coreThreadsNum; + + for (int32_t thread = 1; thread < coreThreadsNum;) { + CPUSET_PRINT_DEBUG("LogicalCore: %u is a thread\n", get_pos_procMask(nextLogical->ProcessorMask)+thread); + cpuset[get_pos_procMask(nextLogical->ProcessorMask)+thread].Threads = 1; + thread++; + } + + break; + } + } + + logicalSize += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + logicalPtr += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + + } + cpuset_best = cpuset; + cpuset_worst = cpuset; + std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best); + std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); + + physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1)); + + CPUSET_PRINT_DEBUG("\n\nLPhysicalCount: %d\n\n", physicalCount); + + physicalCount = physicalCount <= 0 ? numLogicalCores : physicalCount; + + CPUSET_PRINT_DEBUG("\n\nLPhysicalCount2: %d\n\n", physicalCount); + + CPUSET_PRINT_DEBUG("\n\nLogical Processors Summary\n\n"); + + for (uint32_t _logicalCore = 0; _logicalCore < numLogicalCores;) + { + CPUSET_PRINT_DEBUG("\nLogical: %u\n", _logicalCore); + CPUSET_PRINT_DEBUG("Threads: %u\n", cpuset[_logicalCore].Threads); + CPUSET_PRINT_DEBUG("Id: %u\n", cpuset[_logicalCore].Id); + CPUSET_PRINT_DEBUG("Group: %u\n", cpuset[numLogicalCores].Group); + CPUSET_PRINT_DEBUG("LastLevelCacheIndex: %u\n", cpuset[_logicalCore].LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("NumaNodeIndex: %u\n", cpuset[_logicalCore].NumaNodeIndex); + CPUSET_PRINT_DEBUG("LogicalProcessorIndex: %u\n", cpuset[_logicalCore].LogicalProcessorIndex); + CPUSET_PRINT_DEBUG("EfficiencyClass: %u\n", cpuset[_logicalCore].EfficiencyClass); + CPUSET_PRINT_DEBUG("SchedulingClass: %u\n", cpuset[_logicalCore].SchedulingClass); + _logicalCore++; + } + + + CPUSET_PRINT_DEBUG("\n\n \n\n"); + CPUSET_PRINT_DEBUG("Total Physical: %u\n", physicalCount); + CPUSET_PRINT_DEBUG("Total Logical: %u\n", numLogicalCores); + return physicalCount; #endif unsigned int n_threads = std::thread::hardware_concurrency(); - return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; -} + return n_threads > 0 ? (n_threads <= 4 ? 
+
 
 #if defined(__x86_64__) && defined(__linux__)
 #include <pthread.h>
 
@@ -156,7 +385,168 @@ static int count_math_cpus(int cpu_count) {
     return result;
 }
 
-#endif // __x86_64__ && __linux__
+#elif defined(_WIN32)
+
+#define STATUS_ACCESS_DENIED   ((NTSTATUS)0xC0000022L)
+#define STATUS_SUCCESS         ((NTSTATUS)0)
+
+typedef enum _SYSTEM_INFORMATION_CLASS {
+    SystemAllowedCpuSetsInformation = 168,
+    SystemCpuSetInformation = 175,
+    SystemCpuSetTagInformation = 176,
+} SYSTEM_INFORMATION_CLASS;
+
+typedef enum _PROCESSINFOCLASS {
+    ProcessDefaultCpuSetsInformation = 66,
+    ProcessAllowedCpuSetsInformation = 67,
+} PROCESSINFOCLASS;
+
+extern "C"
+NTSTATUS
+NTAPI
+NtQuerySystemInformationEx(
+    _In_ SYSTEM_INFORMATION_CLASS SystemInformationClass,
+    _In_reads_bytes_(InputBufferLength) PVOID InputBuffer,
+    _In_ ULONG InputBufferLength,
+    _Out_writes_bytes_opt_(SystemInformationLength) PVOID SystemInformation,
+    _In_ ULONG SystemInformationLength,
+    _Out_opt_ PULONG ReturnLength
+);
+
+
+extern "C"
+NTSTATUS
+NTAPI
+NtQueryInformationProcess(
+    _In_ HANDLE ProcessHandle,
+    _In_ PROCESSINFOCLASS ProcessInformationClass,
+    _Out_writes_bytes_opt_(ProcessInformationLength) PVOID ProcessInformation,
+    _In_ ULONG ProcessInformationLength,
+    _Out_opt_ PULONG ReturnLength
+);
+
+int32_t setCpuAffinity(std::bitset<64> cpuMask) {
+    DWORD_PTR processAffinityMask;
+    DWORD_PTR systemAffinityMask;
+    int32_t coreSelected = get_count_procMask(cpuMask.to_ullong());
+    HANDLE hToken = nullptr;
+
+    BOOL bToken = ::OpenProcessToken(::GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken);
+    if (!bToken) {
+        CPUSET_PRINT_DEBUG("Could not access process main ALL\n");
+    }
+
+    HANDLE hProcess = ::OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId());
+    if (!hProcess) {
+        CPUSET_PRINT_DEBUG("Could not access process for Affinity\n");
+    }
+
+    if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
+        CPUSET_PRINT_DEBUG("Could not get affinity for Process\n");
+    }
+
+    std::bitset<64> processMask = processAffinityMask;
+    CPUSET_PRINT_DEBUG("Process Mask: %s\n", processMask.to_string().c_str());
+    std::bitset<64> systemMask = systemAffinityMask;
+    CPUSET_PRINT_DEBUG("System Mask: %s\n", systemMask.to_string().c_str());
+    std::bitset<64> reqMask = cpuMask;
+    CPUSET_PRINT_DEBUG("Requested Mask: %s\n", reqMask.to_string().c_str());
+
+    // Set process affinity
+    if (!SetProcessAffinityMask(hProcess, cpuMask.to_ullong() & systemAffinityMask)) {
+        CPUSET_PRINT_DEBUG("Could not set affinity for Process\n");
+    } else {
+        coreSelected = get_count_procMask(cpuMask.to_ullong() & systemAffinityMask);
+        CPUSET_PRINT_DEBUG("Affinity SET for Process\n");
+    }
+
+    if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) {
+        CPUSET_PRINT_DEBUG("Could not get affinity for Process\n");
+    }
+    std::bitset<64> newprocessMask = processAffinityMask;
+    CPUSET_PRINT_DEBUG("New Proc Mask: %s\n", newprocessMask.to_string().c_str());
+
+    HANDLE hThread = GetCurrentThread();
+    // Get the thread ID of this thread
+    DWORD tid = (DWORD)GetThreadId(hThread);
+
+    // Enumerate all threads in the process
+    THREADENTRY32 te;
+    HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
+    if (hSnapshot != INVALID_HANDLE_VALUE) {
+        te.dwSize = sizeof(THREADENTRY32);
+        Thread32First(hSnapshot, &te);
+        if (Thread32Next(hSnapshot, &te)) {
+            do {
+                // Check if the thread is part of this process
+                if (te.th32OwnerProcessID == GetProcessId(hProcess)) {
+                    // Set
thread affinity + if (!SetThreadAffinityMask(hThread, cpuMask.to_ullong() & systemAffinityMask)) { + CPUSET_PRINT_DEBUG("Could not set affinity for Main Process Thread\n"); + } + } + } while( Thread32Next(hSnapshot, &te ) ); + } + CloseHandle(hSnapshot); + } + + if (hProcess) + ::CloseHandle(hProcess); + if (hThread) + ::CloseHandle(hThread); + + HANDLE hProcess2 = ::OpenProcess(PROCESS_ALL_ACCESS, FALSE, GetCurrentProcessId()); + + if (hProcess2) { + PROCESS_POWER_THROTTLING_STATE PowerThrottling; + RtlZeroMemory(&PowerThrottling, sizeof(PowerThrottling)); + PowerThrottling.Version = PROCESS_POWER_THROTTLING_CURRENT_VERSION; + + PowerThrottling.ControlMask = PROCESS_POWER_THROTTLING_IGNORE_TIMER_RESOLUTION; + PowerThrottling.StateMask = 0; + PowerThrottling.StateMask = PROCESS_POWER_THROTTLING_IGNORE_TIMER_RESOLUTION; + + SetProcessInformation(hProcess2, + ProcessPowerThrottling, + &PowerThrottling, + sizeof(PowerThrottling)); + + RtlZeroMemory(&PowerThrottling, sizeof(PowerThrottling)); + PowerThrottling.ControlMask = PROCESS_POWER_THROTTLING_EXECUTION_SPEED; + PowerThrottling.StateMask = 0; + PowerThrottling.StateMask = PROCESS_POWER_THROTTLING_EXECUTION_SPEED; + SetProcessInformation(hProcess2, + ProcessPowerThrottling, + &PowerThrottling, + sizeof(PowerThrottling)); + + MEMORY_PRIORITY_INFORMATION MemPrio; + ZeroMemory(&MemPrio, sizeof(MemPrio)); + MemPrio.MemoryPriority = MEMORY_PRIORITY_NORMAL; + + SetProcessInformation(hProcess2, + ProcessMemoryPriority, + &MemPrio, + sizeof(MemPrio)); + + ::CloseHandle(hProcess2); + } + + return coreSelected; +} + +ULONG set_procMask(int direction = 0 , int32_t req_threads = 0, int lltraversal = 0 ) { + std::bitset<64> bMask; + + bMask = generate_Mask(direction, req_threads, lltraversal); + + numPhysicalCores = get_count_procMask(bMask.to_ullong()); + + CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str()); + return bMask.to_ullong(); +} + +#endif // _WIN32 /** * Returns number of CPUs on system that are useful for math. 
@@ -177,10 +567,28 @@ int get_math_cpu_count() { } } } + +#elif defined(_WIN32) + int32_t _numPhysical = get_num_physical_cores(); + if (cpuset_enable) { + // Initial Affinity set + setCpuAffinity(set_procMask(WORST_CORES, 0, 1)); + } + return _numPhysical; #endif return get_num_physical_cores(); } +#if defined(_WIN32) +int get_math_cpu_count(int32_t req_threads, int cpuset_order, int lltraversal) { + int32_t _numPhysical = get_num_physical_cores(); + if (cpuset_enable) { + _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal)); + } + return _numPhysical; +} +#endif + void process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -245,15 +653,48 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.seed = std::stoul(argv[i]); return true; } + if (arg == "-llct") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_lltraversal = 1; } + else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_lltraversal = 0; } + else { invalid_param = true; } +#endif + return true; + } + if (arg == "-bco") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_order = BEST_CORES; } + else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_order = WORST_CORES; } + else { invalid_param = true; } +#endif + return true; + } if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; return true; } +#if defined(_WIN32) + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = numPhysicalCores; + } +#else params.n_threads = std::stoi(argv[i]); if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); } +#endif return true; } if (arg == "-tb" || arg == "--threads-batch") { @@ -262,8 +703,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_batch = std::stoi(argv[i]); +#if defined(_WIN32) + if (params.n_threads_batch <= 0 || params.n_threads_batch > numPhysicalCores) { + params.n_threads_batch = numPhysicalCores; +#else if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); +#endif } return true; } @@ -273,8 +719,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_draft = std::stoi(argv[i]); +#if defined(_WIN32) + if (params.n_threads_draft <= 0 || params.n_threads_draft > numPhysicalCores) { + params.n_threads_draft = numPhysicalCores; +#else if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); +#endif } return true; } @@ -284,8 +735,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_batch_draft = std::stoi(argv[i]); +#if defined(_WIN32) + if (params.n_threads_batch_draft <= 0 || params.n_threads_batch_draft > numPhysicalCores) { + params.n_threads_batch_draft = numPhysicalCores; +#else if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); +#endif } return true; } @@ -1281,6 +1737,7 @@ bool 
gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.kv_overrides.push_back(kvo);
         return true;
     }
+
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1325,6 +1782,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
     }
 
+#if defined(_WIN32)
+    params.n_threads = get_math_cpu_count(params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
+    CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal);
+#endif
+
     if (invalid_param) {
         throw std::invalid_argument("error: invalid parameter for argument: " + arg);
     }
@@ -1486,6 +1948,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_supports_mmap()) {
         printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+#if defined(_WIN32)
+    printf("  -bco                  change the order of the selected cores from the best to worst (default: worst to best)\n");
+    printf("  -llct                 allow the core selection to traverse the last level cache (default: disabled)\n");
+#endif
     printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
     printf("                          - distribute: spread execution evenly over all nodes\n");
    printf("                          - isolate: only spawn threads on CPUs on the node that execution started on\n");
@@ -2679,7 +3145,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+#if defined(_WIN32)
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count());
+    fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order);
+    fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal);
+#else
     fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+#endif
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
diff --git a/common/common.h b/common/common.h
index cca44268e6df5..773913c7342e4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -39,6 +39,26 @@ extern char const *LLAMA_BUILD_TARGET;
 
 struct llama_control_vector_load_info;
 
+#ifdef _WIN32
+struct CPU_SET_INFORMATION
+{
+    int32_t LogicalProcessorIndex;
+    int32_t Id;
+    int32_t Group;
+    int32_t CoreIndex;
+    int32_t LastLevelCacheIndex;
+    int32_t NumaNodeIndex;
+    int32_t EfficiencyClass;
+    int32_t SchedulingClass;
+    int32_t Priority;
+    int32_t Threads;
+};
+
+#endif
+
+static const int BEST_CORES = 0;
+static const int WORST_CORES = 1;
+
 int get_math_cpu_count();
 int32_t get_num_physical_cores();
 
@@ -53,6 +73,8 @@ struct gpt_params {
     int32_t n_threads_draft       = -1;
     int32_t n_threads_batch       = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
+    int32_t cpuset_lltraversal    = 0;
+    int32_t cpuset_order          = WORST_CORES;
     int32_t n_predict             = -1; // new tokens to predict
     int32_t n_ctx                 = 512; // context size
     int32_t n_batch               = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -321,4 +343,4 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);

From: mann1x <20623405+mann1x@users.noreply.github.com>
Date: Mon, 22 Apr 2024 20:12:15 +0200
Subject: [PATCH 02/12] Remove debug flag

---
 common/common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/common/common.cpp b/common/common.cpp index c53749befb8ab..f447aa9ada724 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency(); // CPUSET logging // -#define CPUSET_DEBUG 1 +#define CPUSET_DEBUG 0 #if (CPUSET_DEBUG >= 1) #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__) #else From f9b42b8cd8a35411174c2952c2beb665f3f34a68 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Wed, 24 Apr 2024 21:50:01 +0200 Subject: [PATCH 03/12] Added new options and some fixes --- common/common.cpp | 150 ++++++++++++++++++++++++++++++++++++++-------- common/common.h | 8 ++- ggml.c | 4 +- 3 files changed, 131 insertions(+), 31 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index f447aa9ada724..a62d67cb07957 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency(); // CPUSET logging // -#define CPUSET_DEBUG 0 +#define CPUSET_DEBUG 1 #if (CPUSET_DEBUG >= 1) #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__) #else @@ -124,13 +124,52 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con return lhs.SchedulingClass < rhs.SchedulingClass; } -ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) { +ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { std::bitset<64> bMask; std::vector _cpuset; int32_t bVal = 0; int32_t assigned_t = 0; int32_t llcache = -1; + DWORD_PTR processAffinityMask; + DWORD_PTR systemAffinityMask; + HANDLE hToken = nullptr; + bool gotsystemMask = true; + + BOOL bToken = ::OpenProcessToken(::GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken); + if (!bToken) { + CPUSET_PRINT_DEBUG("Could not access OpenProcessToken from generate_Mask\n"); + } + + HANDLE hProcess = ::OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId()); + if (!hProcess) { + CPUSET_PRINT_DEBUG("Could not access OpenProcess for Affinity\n"); + gotsystemMask = false; + } + + if (!GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) { + CPUSET_PRINT_DEBUG("Could not get GetProcessAffinityMask for Process\n"); + gotsystemMask = false; + } + + if (hProcess) + ::CloseHandle(hProcess); + + if (cpuMask != 0) { + std::bitset<64> reqMask = cpuMask; + CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); + if (gotsystemMask) { + std::bitset<64> systemMask = systemAffinityMask; + CPUSET_PRINT_DEBUG("System Mask: %s\n", systemMask.to_string().c_str()); + std::bitset<64> newprocessMask = reqMask & systemMask; + CPUSET_PRINT_DEBUG("New Proc Mask: %s\n", newprocessMask.to_string().c_str()); + bMask = reqMask & systemMask; + } else{ + bMask = cpuMask; + } + return bMask.to_ullong(); + } + if (direction == BEST_CORES) { _cpuset = cpuset_best; } else { @@ -139,27 +178,25 @@ ULONG generate_Mask(int direction, int32_t req_threads, int lltraversal) { CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); for (auto index : _cpuset) { bVal = 0; - if (index.LogicalProcessorIndex != 0 && - ((cpuset_smt && index.Threads > 1) || !cpuset_smt) && + if ((index.LogicalProcessorIndex != 0 || allowcz) && + ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && index.EfficiencyClass == 0 && - ((llcache == index.LastLevelCacheIndex && 
lltraversal == 0) || llcache == -1) + ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) ) { if (lltraversal == 0) { - CPUSET_PRINT_DEBUG("cache for lltraversal %d pre llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); llcache = index.LastLevelCacheIndex; - CPUSET_PRINT_DEBUG("cache for lltraversal %d pos llcache %d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); } bVal = 1; + } + if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;} + if(bVal == 1) { assigned_t++; - CPUSET_PRINT_DEBUG("Assigned LogicalCoreIndex: %d lltraversal %d llcache %d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); } bMask[index.LogicalProcessorIndex] = bVal; - CPUSET_PRINT_DEBUG("Index: %d b:%d smt=%d thrds=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads); - if (req_threads > 0) { - if (assigned_t >= req_threads) { - break; - } - } + CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc); } return bMask.to_ullong(); } @@ -262,7 +299,6 @@ int32_t get_num_physical_cores() { cpuSetSize += nextCPUSet->Size; } - int32_t physicalCount = 0; int32_t thisLogical = 0; int32_t coreThreadsNum = 1; @@ -274,7 +310,6 @@ int32_t get_num_physical_cores() { if (nextLogical->ProcessorCore.Flags == 1 && nextLogical->Cache.Associativity <= 2) { switch (nextLogical->Relationship) { case LOGICAL_PROCESSOR_RELATIONSHIP::RelationProcessorCore: - CPUSET_PRINT_DEBUG("Physical Count: %u\n", physicalCount); CPUSET_PRINT_DEBUG("Cache.Associativity: %d\n", nextLogical->Cache.Associativity); CPUSET_PRINT_DEBUG("Cache.Level: %d\n", nextLogical->Cache.Level); CPUSET_PRINT_DEBUG("Cache.Type: %d\n", nextLogical->Cache.Type); @@ -303,15 +338,16 @@ int32_t get_num_physical_cores() { std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best); std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); - physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1)); + int32_t physicalCount = 0; + physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0)); - CPUSET_PRINT_DEBUG("\n\nLPhysicalCount: %d\n\n", physicalCount); + CPUSET_PRINT_DEBUG("\n\n1st PhysicalCount: %d\n\n", physicalCount); physicalCount = physicalCount <= 0 ? 
numLogicalCores : physicalCount; - CPUSET_PRINT_DEBUG("\n\nLPhysicalCount2: %d\n\n", physicalCount); + CPUSET_PRINT_DEBUG("\n\n2nd PhysicalCount2: %d\n\n", physicalCount); - CPUSET_PRINT_DEBUG("\n\nLogical Processors Summary\n\n"); + CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n"); for (uint32_t _logicalCore = 0; _logicalCore < numLogicalCores;) { @@ -535,10 +571,10 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) { return coreSelected; } -ULONG set_procMask(int direction = 0 , int32_t req_threads = 0, int lltraversal = 0 ) { +ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { std::bitset<64> bMask; - bMask = generate_Mask(direction, req_threads, lltraversal); + bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); numPhysicalCores = get_count_procMask(bMask.to_ullong()); @@ -580,10 +616,10 @@ int get_math_cpu_count() { } #if defined(_WIN32) -int get_math_cpu_count(int32_t req_threads, int cpuset_order, int lltraversal) { +int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { int32_t _numPhysical = get_num_physical_cores(); if (cpuset_enable) { - _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal)); + _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal, allowtc, allowcz, cpuMask)); } return _numPhysical; } @@ -653,6 +689,61 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.seed = std::stoul(argv[i]); return true; } + if (arg == "-acz") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowzero = 1; } + else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowzero = 0; } + else { invalid_param = true; } +#endif + return true; + } + if (arg == "-atc") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowthreads = 1; } + else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowthreads = 0; } + else { invalid_param = true; } +#endif + return true; + } + if (arg == "-ccm") { + if (++i >= argc) { + invalid_param = true; + return true; + } +#if defined(_WIN32) + std::string value(argv[i]); + std::size_t pos{}; + int64_t cpuMask = 0; + bool valid_bitmask = false; + try + { + const int64_t ll{std::stoll(value, &pos)}; + cpuMask = ll; + valid_bitmask = true; + } + catch (std::invalid_argument const& ex) + { + fprintf(stderr, "%s\n", ex.what()); + } + catch (std::out_of_range const& ex) + { + fprintf(stderr, "%s\n", ex.what()); + } + if (valid_bitmask && cpuMask != 0) { params.cpuset_cpumask = cpuMask; } + else { invalid_param = true; } +#endif + return true; + } if (arg == "-llct") { if (++i >= argc) { invalid_param = true; @@ -695,6 +786,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.n_threads = std::thread::hardware_concurrency(); } #endif + params.n_threads_auto = false; return true; } if (arg == "-tb" || arg == "--threads-batch") { @@ -1783,8 +1875,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, 
gpt_params & params) { } #if defined(_WIN32) - params.n_threads = get_math_cpu_count(params.n_threads, params.cpuset_order, params.cpuset_lltraversal); - CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal); + params.n_threads = get_math_cpu_count(params.n_threads_auto ? 0 : params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); + CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%lli\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); #endif if (invalid_param) { @@ -1951,6 +2043,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #if defined(_WIN32) printf(" -bco change the order of the selected cores from the best to worst (default: worst to best)\n"); printf(" -llct allow the core selection to traverse the last level cache (default: disabled)\n"); + printf(" -acz allow the core selection to pick the core 0 as well (default: disabled)\n"); + printf(" -atc allow the core selection to pick non physical, threaded, cores (default: disabled)\n"); + printf(" -ccm specify a custom CPU Affinity bitmask in hex for the core selection (default: disabled)\n"); #endif printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); printf(" - distribute: spread execution evenly over all nodes\n"); @@ -3149,6 +3244,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count()); fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order); fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal); + fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero); + fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads); + fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask); #else fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); diff --git a/common/common.h b/common/common.h index 773913c7342e4..5ba823acead4a 100644 --- a/common/common.h +++ b/common/common.h @@ -56,8 +56,8 @@ struct CPU_SET_INFORMATION #endif -static const int BEST_CORES = 0; -static const int WORST_CORES = 1; +static const int32_t BEST_CORES = 0; +static const int32_t WORST_CORES = 1; int get_math_cpu_count(); int32_t get_num_physical_cores(); @@ -73,8 +73,12 @@ struct gpt_params { int32_t n_threads_draft = -1; int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch_draft = -1; + bool n_threads_auto = true; int32_t cpuset_lltraversal = 0; int32_t cpuset_order = WORST_CORES; + int64_t cpuset_cpumask = 0; + int32_t cpuset_allowzero = 0; + int32_t cpuset_allowthreads = 0; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) diff --git a/ggml.c b/ggml.c index 90584e18b4959..b5b11ca16090e 100644 --- a/ggml.c +++ b/ggml.c @@ -78,11 +78,10 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); -#if defined(_WIN32) HANDLE hToken; DWORD_PTR processAffinityMask; DWORD_PTR systemAffinityMask; 
- + BOOL bToken = OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken); if (bToken) { @@ -122,7 +121,6 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo if (hProcess2) CloseHandle(hProcess2); -#endif if (handle == NULL) { return EAGAIN; From 63cd3dc251563e0aa15dc66a61f7e1affd6ed011 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Thu, 25 Apr 2024 22:27:50 +0200 Subject: [PATCH 04/12] Initial support for Linux --- common/common.cpp | 385 ++++++++++++++++++++++++++++++++++------------ common/common.h | 8 +- 2 files changed, 289 insertions(+), 104 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index a62d67cb07957..666de14bea501 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -77,7 +77,7 @@ using json = nlohmann::ordered_json; -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__linux__) && defined(__x86_64__)) std::vector cpuset; std::vector cpuset_best; std::vector cpuset_worst; @@ -92,13 +92,25 @@ int32_t PhysicalCores = std::thread::hardware_concurrency(); // CPUSET logging // -#define CPUSET_DEBUG 1 +#define CPUSET_DEBUG 0 #if (CPUSET_DEBUG >= 1) #define CPUSET_PRINT_DEBUG(...) printf(__VA_ARGS__) #else #define CPUSET_PRINT_DEBUG(...) #endif +bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) { + return lhs.SchedulingClass > rhs.SchedulingClass; +} + +bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) { + return lhs.SchedulingClass < rhs.SchedulingClass; +} + +#endif + +#if defined(_WIN32) + int32_t get_pos_procMask(ULONG_PTR procMask) { std::bitset<64> bMask = procMask; int32_t thisPos = 0; @@ -116,14 +128,6 @@ int32_t get_count_procMask(ULONG_PTR procMask) { return bMask.count(); } -bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) { - return lhs.SchedulingClass > rhs.SchedulingClass; -} - -bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs) { - return lhs.SchedulingClass < rhs.SchedulingClass; -} - ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { std::bitset<64> bMask; std::vector _cpuset; @@ -202,12 +206,250 @@ ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, } #endif +#if defined(__x86_64__) && defined(__linux__) +#include + +int32_t setCpuAffinity(std::bitset<64> cpuMask) { + int32_t coreSelected = cpuMask.count(); + + cpu_set_t mask; + CPU_ZERO(&mask); + + for (int32_t i = 0; i < 64; ++i) { + if (cpuMask[i] == 1) { + CPUSET_PRINT_DEBUG("Setting CPU %d\n", i); + CPU_SET(i, &mask); + } + } + + if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) { + CPUSET_PRINT_DEBUG("setCpuAffinity sched_setaffinity error\n"); + } + if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { + CPUSET_PRINT_DEBUG("setCpuAffinity pthread_setaffinity_np error\n"); + } + + return coreSelected; +} + +uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { + std::bitset<64> bMask; + std::vector _cpuset; + int32_t bVal = 0; + int32_t assigned_t = 0; + int32_t llcache = -1; + + if (cpuMask != 0) { + std::bitset<64> reqMask = cpuMask; + CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); + bMask = cpuMask; + return bMask.to_ullong(); + } + + if (direction == BEST_CORES) { + _cpuset = cpuset_best; + } else { + 
_cpuset = cpuset_worst; + } + CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); + for (auto index : _cpuset) { + bVal = 0; + if ((index.LogicalProcessorIndex != 0 || allowcz) && + ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && + index.EfficiencyClass == 0 && + ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) + ) { + if (lltraversal == 0) { + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + llcache = index.LastLevelCacheIndex; + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + } + bVal = 1; + } + if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;} + if(bVal == 1) { + assigned_t++; + CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); + } + bMask[index.LogicalProcessorIndex] = bVal; + CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc); + } + return bMask.to_ullong(); +} + +static void cpuid(unsigned leaf, unsigned subleaf, + unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) { + __asm__("movq\t%%rbx,%%rsi\n\t" + "cpuid\n\t" + "xchgq\t%%rbx,%%rsi" + : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx) + : "0"(leaf), "2"(subleaf)); +} + +static int pin_cpu(int cpu) { + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); +} + +static bool is_hybrid_cpu(void) { + unsigned eax, ebx, ecx, edx; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + return !!(edx & (1u << 15)); +} + +static bool is_running_on_efficiency_core(void) { + unsigned eax, ebx, ecx, edx; + cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx); + int intel_atom = 0x20; + int core_type = (eax & 0xff000000u) >> 24; + return core_type == intel_atom; +} + +static int count_math_cpus(int cpu_count) { + int result = 0; + for (int cpu = 0; cpu < cpu_count; ++cpu) { + if (pin_cpu(cpu)) { + return -1; + } + if (is_running_on_efficiency_core()) { + continue; // efficiency cores harm lockstep threading + } + ++cpu; // hyperthreading isn't useful for linear algebra + ++result; + } + return result; +} + +uint64_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { + std::bitset<64> bMask; + + bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); + + numPhysicalCores = bMask.count(); + + CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str()); + return bMask.to_ullong(); +} + +#endif + int32_t get_num_physical_cores() { -#ifdef __linux__ // __x86_64__ && __linux__ +#if defined(__linux__) && defined(__x86_64__) // __x86_64__ && __linux__ + if (numPhysicalCores > 0) { + return numPhysicalCores; + } // enumerate the set of thread siblings, num entries is num cores + fprintf(stderr, "physical cpus count\n"); std::unordered_set siblings; + int32_t cursize = 0; + cpu_set_t mask; + CPU_ZERO(&mask); + bool is_hybrid = is_hybrid_cpu(); + bool is_hybrid_core = false; + std::vector _cpuset; + int32_t numLogicalCores = 0; + for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { - 
std::ifstream thread_siblings("/sys/devices/system/cpu" + fprintf(stderr, "physical cpu check %d\n", cpu); + std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + + std::to_string(cpu) + "/topology/thread_siblings"); + if (!thread_siblings.is_open()) { + break; // no more cpus + } + is_hybrid_core = false; + if (is_hybrid) { + if (pin_cpu(cpu) == 0) { + if (is_running_on_efficiency_core()) is_hybrid_core = true; + } + } + numLogicalCores++; + + CPU_SET_INFORMATION _cpuset; + _cpuset.LogicalProcessorIndex = cpu; + _cpuset.CoreIndex = cpu; + _cpuset.Id = cpu; + _cpuset.Group = 0; + _cpuset.LastLevelCacheIndex = 0; + _cpuset.NumaNodeIndex = 0; + _cpuset.EfficiencyClass = is_hybrid_core ? 1 : 0; + _cpuset.Threads = 1; + + std::ifstream cppc_tag("/sys/devices/system/cpu/cpu" + + std::to_string(cpu) + "/acpi_cppc/highest_perf"); + if (!cppc_tag.is_open()) { + _cpuset.SchedulingClass = 256-cpu; + } else { + std::string line; + if (std::getline(cppc_tag, line)) { + int32_t _thistag = std::stoi(line); + _cpuset.SchedulingClass = _thistag; + } + } + + if (is_hybrid_core) continue; + std::string line; + if (std::getline(thread_siblings, line)) { + cursize = static_cast(siblings.size()); + siblings.insert(line); + if (static_cast(siblings.size()) > cursize ) { + _cpuset.Threads = 2; + CPU_SET(cpu, &mask); + fprintf(stderr, "physical cpu %u: %s\n", cpu, line.c_str()); + } else { + cpuset_smt = true; + } + } + cpuset.push_back(_cpuset); + } + if (!siblings.empty()) { + cpuset_enable = true; + if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) { + fprintf(stdout, "sched_setaffinity error\n"); + } + if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { + fprintf(stdout, "pthread_setaffinity_np error\n"); + } + fprintf(stderr, "physical cpus %li\n", siblings.size()); + + cpuset_best = cpuset; + cpuset_worst = cpuset; + std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best); + std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); + + //int32_t physicalCount = 0; + int32_t physicalCount = static_cast(siblings.size()); + //physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0)); + + CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n"); + + for (int32_t _logicalCore = 0; _logicalCore < numLogicalCores;) + { + CPUSET_PRINT_DEBUG("\nLogical: %u\n", _logicalCore); + CPUSET_PRINT_DEBUG("Threads: %u\n", cpuset[_logicalCore].Threads); + CPUSET_PRINT_DEBUG("Id: %u\n", cpuset[_logicalCore].Id); + CPUSET_PRINT_DEBUG("Group: %u\n", cpuset[numLogicalCores].Group); + CPUSET_PRINT_DEBUG("LastLevelCacheIndex: %u\n", cpuset[_logicalCore].LastLevelCacheIndex); + CPUSET_PRINT_DEBUG("NumaNodeIndex: %u\n", cpuset[_logicalCore].NumaNodeIndex); + CPUSET_PRINT_DEBUG("LogicalProcessorIndex: %u\n", cpuset[_logicalCore].LogicalProcessorIndex); + CPUSET_PRINT_DEBUG("EfficiencyClass: %u\n", cpuset[_logicalCore].EfficiencyClass); + CPUSET_PRINT_DEBUG("SchedulingClass: %u\n", cpuset[_logicalCore].SchedulingClass); + _logicalCore++; + } + + CPUSET_PRINT_DEBUG("\n\n \n\n"); + CPUSET_PRINT_DEBUG("Total Physical: %d\n", physicalCount); + CPUSET_PRINT_DEBUG("Total Logical: %u\n", numLogicalCores); + + numPhysicalCores = physicalCount; + return physicalCount; + } +#elif defined(__linux__) // __linux__ +// enumerate the set of thread siblings, num entries is num cores + std::unordered_set siblings; + for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { + std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + 
"/topology/thread_siblings"); if (!thread_siblings.is_open()) { break; // no more cpus @@ -289,6 +531,7 @@ int32_t get_num_physical_cores() { _cpuset.NumaNodeIndex = nextCPUSet->CpuSet.NumaNodeIndex; _cpuset.EfficiencyClass = nextCPUSet->CpuSet.EfficiencyClass; _cpuset.SchedulingClass = nextCPUSet->CpuSet.SchedulingClass; + _cpuset.Threads = 1; cpuset.push_back(_cpuset); numLogicalCores++; } @@ -370,58 +613,10 @@ int32_t get_num_physical_cores() { return physicalCount; #endif unsigned int n_threads = std::thread::hardware_concurrency(); - return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;} - - -#if defined(__x86_64__) && defined(__linux__) -#include - -static void cpuid(unsigned leaf, unsigned subleaf, - unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) { - __asm__("movq\t%%rbx,%%rsi\n\t" - "cpuid\n\t" - "xchgq\t%%rbx,%%rsi" - : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx) - : "0"(leaf), "2"(subleaf)); -} - -static int pin_cpu(int cpu) { - cpu_set_t mask; - CPU_ZERO(&mask); - CPU_SET(cpu, &mask); - return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); -} - -static bool is_hybrid_cpu(void) { - unsigned eax, ebx, ecx, edx; - cpuid(7, 0, &eax, &ebx, &ecx, &edx); - return !!(edx & (1u << 15)); + return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; } -static bool is_running_on_efficiency_core(void) { - unsigned eax, ebx, ecx, edx; - cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx); - int intel_atom = 0x20; - int core_type = (eax & 0xff000000u) >> 24; - return core_type == intel_atom; -} - -static int count_math_cpus(int cpu_count) { - int result = 0; - for (int cpu = 0; cpu < cpu_count; ++cpu) { - if (pin_cpu(cpu)) { - return -1; - } - if (is_running_on_efficiency_core()) { - continue; // efficiency cores harm lockstep threading - } - ++cpu; // hyperthreading isn't useful for linear algebra - ++result; - } - return result; -} - -#elif defined(_WIN32) +#if defined(_WIN32) #define STATUS_ACCESS_DENIED ((NTSTATUS)0xC0000022L) #define STATUS_SUCCESS ((NTSTATUS)0) @@ -437,30 +632,6 @@ typedef enum _PROCESSINFOCLASS { ProcessAllowedCpuSetsInformation = 67, } PROCESSINFOCLASS; -extern "C" -NTSTATUS -NTAPI -NtQuerySystemInformationEx( - _In_ SYSTEM_INFORMATION_CLASS SystemInformationClass, - _In_reads_bytes_(InputBufferLength) PVOID InputBuffer, - _In_ ULONG InputBufferLength, - _Out_writes_bytes_opt_(SystemInformationLength) PVOID SystemInformation, - _In_ ULONG SystemInformationLength, - _Out_opt_ PULONG ReturnLength -); - - -extern "C" -NTSTATUS -NTAPI -NtQueryInformationProcess( - _In_ HANDLE ProcessHandle, - _In_ PROCESSINFOCLASS ProcessInformationClass, - _Out_writes_bytes_opt_(ProcessInformationLength) PVOID ProcessInformation, - _In_ ULONG ProcessInformationLength, - _Out_opt_ PULONG ReturnLength -); - int32_t setCpuAffinity(std::bitset<64> cpuMask) { DWORD_PTR processAffinityMask; DWORD_PTR systemAffinityMask; @@ -571,7 +742,7 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) { return coreSelected; } -ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { +ULONG set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { std::bitset<64> bMask; bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); @@ -588,7 +759,7 @@ ULONG set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltr * Returns number 
of CPUs on system that are useful for math. */ int get_math_cpu_count() { -#if defined(__x86_64__) && defined(__linux__) +#if defined(__x86_164__) && defined(__linux__) int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); if (cpu_count < 1) { return get_num_physical_cores(); @@ -604,7 +775,7 @@ int get_math_cpu_count() { } } -#elif defined(_WIN32) +#elif defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) int32_t _numPhysical = get_num_physical_cores(); if (cpuset_enable) { // Initial Affinity set @@ -615,7 +786,7 @@ int get_math_cpu_count() { return get_num_physical_cores(); } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { int32_t _numPhysical = get_num_physical_cores(); if (cpuset_enable) { @@ -694,7 +865,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowzero = 1; } else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowzero = 0; } @@ -707,7 +878,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_allowthreads = 1; } else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_allowthreads = 0; } @@ -720,7 +891,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); std::size_t pos{}; int64_t cpuMask = 0; @@ -749,7 +920,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_lltraversal = 1; } else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_lltraversal = 0; } @@ -762,7 +933,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) std::string value(argv[i]); if (value == "1" || value == "on" || value == "true" || value == "True") { params.cpuset_order = BEST_CORES; } else if (value == "0" || value == "off" || value == "false" || value == "False") { params.cpuset_order = WORST_CORES; } @@ -775,7 +946,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) params.n_threads = std::stoi(argv[i]); if (params.n_threads <= 0) { params.n_threads = numPhysicalCores; @@ -795,7 +966,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } 
params.n_threads_batch = std::stoi(argv[i]); -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) if (params.n_threads_batch <= 0 || params.n_threads_batch > numPhysicalCores) { params.n_threads_batch = numPhysicalCores; #else @@ -811,7 +982,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_draft = std::stoi(argv[i]); -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) if (params.n_threads_draft <= 0 || params.n_threads_draft > numPhysicalCores) { params.n_threads_draft = numPhysicalCores; #else @@ -827,7 +998,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.n_threads_batch_draft = std::stoi(argv[i]); -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) if (params.n_threads_batch_draft <= 0 || params.n_threads_batch_draft > numPhysicalCores) { params.n_threads_batch_draft = numPhysicalCores; #else @@ -1874,10 +2045,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) params.n_threads = get_math_cpu_count(params.n_threads_auto ? 0 : params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); +#endif +#if defined(_WIN32) CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%lli\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); #endif +#if defined(__x86_64__) && defined(__linux__) + CPUSET_PRINT_DEBUG("Using %d threads order=%d llcache=%d acm=%d acz=%d cpumask=%li\n", params.n_threads, params.cpuset_order, params.cpuset_lltraversal, params.cpuset_allowthreads, params.cpuset_allowzero, params.cpuset_cpumask); +#endif if (invalid_param) { throw std::invalid_argument("error: invalid parameter for argument: " + arg); @@ -2040,7 +2216,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { if (llama_supports_mmap()) { printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) printf(" -bco change the order of the selected cores from the best to worst (default: worst to best)\n"); printf(" -llct allow the core selection to traverse the last level cache (default: disabled)\n"); printf(" -acz allow the core selection to pick the core 0 as well (default: disabled)\n"); @@ -3240,13 +3416,18 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); -#if defined(_WIN32) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count()); fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order); fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal); fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero); fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads); +#if defined(_WIN32) fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask); +#endif +#if defined(__x86_64__) && defined(__linux__) + 
fprintf(stream, "ccm: %li # default: none\n", params.cpuset_cpumask); +#endif #else fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); diff --git a/common/common.h b/common/common.h index 5ba823acead4a..aee4de284604b 100644 --- a/common/common.h +++ b/common/common.h @@ -39,7 +39,7 @@ extern char const *LLAMA_BUILD_TARGET; struct llama_control_vector_load_info; -#ifdef _WIN32 +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) struct CPU_SET_INFORMATION { int32_t LogicalProcessorIndex; @@ -54,8 +54,12 @@ struct CPU_SET_INFORMATION int32_t Threads; }; -#endif +bool cpuset_sorter_best(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs); +bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION const& rhs); + +int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +#endif static const int32_t BEST_CORES = 0; static const int32_t WORST_CORES = 1; From a3e75fe48143e05cb8021173a7d0788c86527484 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Fri, 26 Apr 2024 08:56:35 +0200 Subject: [PATCH 05/12] Fixes --- common/common.cpp | 78 ++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 55 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 666de14bea501..309552b1a6dd1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -127,14 +127,17 @@ int32_t get_count_procMask(ULONG_PTR procMask) { std::bitset<64> bMask = procMask; return bMask.count(); } +#endif -ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) +uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { std::bitset<64> bMask; std::vector _cpuset; int32_t bVal = 0; int32_t assigned_t = 0; int32_t llcache = -1; +#if defined(_WIN32) DWORD_PTR processAffinityMask; DWORD_PTR systemAffinityMask; HANDLE hToken = nullptr; @@ -174,6 +177,15 @@ ULONG generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, return bMask.to_ullong(); } +#else + if (cpuMask != 0) { + std::bitset<64> reqMask = cpuMask; + CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); + bMask = cpuMask; + return bMask.to_ullong(); + } +#endif + if (direction == BEST_CORES) { _cpuset = cpuset_best; } else { @@ -232,51 +244,6 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) { return coreSelected; } -uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { - std::bitset<64> bMask; - std::vector _cpuset; - int32_t bVal = 0; - int32_t assigned_t = 0; - int32_t llcache = -1; - - if (cpuMask != 0) { - std::bitset<64> reqMask = cpuMask; - CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); - bMask = cpuMask; - return bMask.to_ullong(); - } - - if (direction == BEST_CORES) { - _cpuset = cpuset_best; - } else { - _cpuset = cpuset_worst; - } - CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); - for (auto index : _cpuset) { - bVal = 0; - if ((index.LogicalProcessorIndex != 0 || allowcz) && - ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && - index.EfficiencyClass == 
0 && - ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) - ) { - if (lltraversal == 0) { - CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); - llcache = index.LastLevelCacheIndex; - CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); - } - bVal = 1; - } - if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;} - if(bVal == 1) { - assigned_t++; - CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); - } - bMask[index.LogicalProcessorIndex] = bVal; - CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc); - } - return bMask.to_ullong(); -} - static void cpuid(unsigned leaf, unsigned subleaf, unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) { __asm__("movq\t%%rbx,%%rsi\n\t" @@ -341,7 +308,7 @@ int32_t get_num_physical_cores() { return numPhysicalCores; } // enumerate the set of thread siblings, num entries is num cores - fprintf(stderr, "physical cpus count\n"); + CPUSET_PRINT_DEBUG("Start: get_num_physical_cores\n"); std::unordered_set siblings; int32_t cursize = 0; cpu_set_t mask; @@ -352,7 +319,7 @@ int32_t get_num_physical_cores() { int32_t numLogicalCores = 0; for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { - fprintf(stderr, "physical cpu check %d\n", cpu); + CPUSET_PRINT_DEBUG("Check for Logical CPU: %d\n", cpu); std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); if (!thread_siblings.is_open()) { @@ -396,7 +363,7 @@ int32_t get_num_physical_cores() { if (static_cast(siblings.size()) > cursize ) { _cpuset.Threads = 2; CPU_SET(cpu, &mask); - fprintf(stderr, "physical cpu %u: %s\n", cpu, line.c_str()); + CPUSET_PRINT_DEBUG("CPU %u is physical, siblings: %s\n", cpu, line.c_str()); } else { cpuset_smt = true; } @@ -406,21 +373,22 @@ int32_t get_num_physical_cores() { if (!siblings.empty()) { cpuset_enable = true; if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) { - fprintf(stdout, "sched_setaffinity error\n"); + CPUSET_PRINT_DEBUG("sched_setaffinity error\n"); } if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { - fprintf(stdout, "pthread_setaffinity_np error\n"); + CPUSET_PRINT_DEBUG("pthread_setaffinity_np error\n"); } - fprintf(stderr, "physical cpus %li\n", siblings.size()); + fprintf(stderr, "get_num_physical_cores Physical CPU count: %li\n", siblings.size()); cpuset_best = cpuset; cpuset_worst = cpuset; std::sort(cpuset_best.begin(), cpuset_best.end(), &cpuset_sorter_best); std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); - //int32_t physicalCount = 0; - int32_t physicalCount = static_cast(siblings.size()); - //physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0)); + int32_t physicalCount = 0; + //int32_t physicalCount = static_cast(siblings.size()); + std::bitset<64> bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0); + physicalCount = bMask.count(); CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n"); From f7d2c0a5cda3dd58dc8aedfc78a1c498d79047f8 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Fri, 26 Apr 2024 15:09:17 
+0200 Subject: [PATCH 06/12] Added set thread affinity for Linux --- common/common.cpp | 10 ++++++++-- ggml.c | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 309552b1a6dd1..963f5ccb8dfde 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -231,6 +231,8 @@ int32_t setCpuAffinity(std::bitset<64> cpuMask) { if (cpuMask[i] == 1) { CPUSET_PRINT_DEBUG("Setting CPU %d\n", i); CPU_SET(i, &mask); + } else { + CPU_CLR(i, &mask); } } @@ -328,7 +330,10 @@ int32_t get_num_physical_cores() { is_hybrid_core = false; if (is_hybrid) { if (pin_cpu(cpu) == 0) { - if (is_running_on_efficiency_core()) is_hybrid_core = true; + if (is_running_on_efficiency_core()) { + is_hybrid_core = true; + CPUSET_PRINT_DEBUG("Logical CPU is Hybrid: %d\n", cpu); + } } } numLogicalCores++; @@ -365,6 +370,7 @@ int32_t get_num_physical_cores() { CPU_SET(cpu, &mask); CPUSET_PRINT_DEBUG("CPU %u is physical, siblings: %s\n", cpu, line.c_str()); } else { + CPU_CLR(cpu, &mask); cpuset_smt = true; } } @@ -378,7 +384,7 @@ int32_t get_num_physical_cores() { if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { CPUSET_PRINT_DEBUG("pthread_setaffinity_np error\n"); } - fprintf(stderr, "get_num_physical_cores Physical CPU count: %li\n", siblings.size()); + CPUSET_PRINT_DEBUG("get_num_physical_cores Physical CPU count: %li\n", siblings.size()); cpuset_best = cpuset; cpuset_worst = cpuset; diff --git a/ggml.c b/ggml.c index b5b11ca16090e..ef3200b09e2d4 100644 --- a/ggml.c +++ b/ggml.c @@ -18708,6 +18708,33 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl }; const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); +#if defined(__x86_64__) && defined(__linux__) + cpu_set_t procMask; + cpu_set_t threadMask; + //fprintf(stdout, "sched_getaffinity init\n"); + if (sched_getaffinity(0, sizeof(cpu_set_t), &procMask) == -1) { + fprintf(stderr, "ggml_thread_create sched_getaffinity error\n"); + } else { + int result = pthread_setaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &procMask); + if (result !=0) fprintf(stderr, "ggml_thread_create pthread_setaffinity_np: %d", result); + //printf("Set returned by sched_getaffinity() contained:\n"); + //for (size_t k = 0; k < CPU_SETSIZE; k++) + //if (CPU_ISSET(k, &procMask)) + //printf(" CPU %zu\n", k); + } + /* + int s; + s = pthread_getaffinity_np(workers[j].thrd, sizeof(threadMask), &threadMask); + if (s != 0) { + fprintf(stderr, "ggml_thread_create pthread_getaffinity_np: %d\n", s); + } else { + printf("Set returned by pthread_getaffinity_np() contained:\n"); + for (size_t l = 0; l < CPU_SETSIZE; l++) + if (CPU_ISSET(l, &threadMask)) + printf(" CPU %zu\n", l); + } + */ +#endif GGML_ASSERT(rc == 0); UNUSED(rc); } From d55ae1513c88945eb1696a341ec86c36e1dfffbc Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 12:17:05 +0200 Subject: [PATCH 07/12] Added one worker thread per core on Windows --- ggml.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index ef3200b09e2d4..0d97d135bf871 100644 --- a/ggml.c +++ b/ggml.c @@ -74,13 +74,13 @@ static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { typedef HANDLE pthread_t; typedef DWORD thread_ret_t; -static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { - (void) unused; +static int pthread_create(pthread_t * 
out, int32_t thread, thread_ret_t(*func)(void *), void * arg) { HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); HANDLE hToken; - DWORD_PTR processAffinityMask; - DWORD_PTR systemAffinityMask; + ULONG_PTR processAffinityMask; + ULONG_PTR systemAffinityMask; + ULONG newprocessAffinityMask; BOOL bToken = OpenProcessToken(GetCurrentProcess(), TOKEN_ALL_ACCESS, &hToken); if (bToken) { @@ -88,7 +88,21 @@ static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(vo HANDLE hProcess = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SET_INFORMATION, FALSE, GetCurrentProcessId()); if (hProcess) { if (GetProcessAffinityMask(hProcess, &processAffinityMask, &systemAffinityMask)) { - SetThreadAffinityMask(handle, processAffinityMask); + int32_t posCore = 0; + for (int32_t i = 0; i < 64; ++i) { + if (processAffinityMask & ((1ULL) << i) ) { + //fprintf(stderr, "Check thread %d for core %d poscore %d\n", thread, i, posCore); + if (posCore+1 == thread) { + //fprintf(stderr, "Thread %d is assigned to core %d\n", thread, i); + } else { + newprocessAffinityMask = newprocessAffinityMask | (0ULL << i-1); + //fprintf(stderr, "Thread %d is NOT assigned to core %d\n", thread, i); + break; + } + posCore++; + } + } + SetThreadAffinityMask(handle, newprocessAffinityMask); } } if (hProcess) @@ -139,7 +153,7 @@ static int pthread_join(pthread_t thread, void * unused) { } static int sched_yield (void) { - Sleep (0); + Sleep(0); return 0; } #else @@ -18706,8 +18720,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl .shared = &state_shared, .ec = GGML_STATUS_SUCCESS, }; - +#if defined(_WIN32) + const int rc = ggml_thread_create(&workers[j].thrd, j, ggml_graph_compute_thread, &workers[j]); +#else const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); +#endif #if defined(__x86_64__) && defined(__linux__) cpu_set_t procMask; cpu_set_t threadMask; From b01716a653458e6fdddfc4d3924b3809dd6b3eb2 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 13:00:33 +0200 Subject: [PATCH 08/12] Added worker threads sticking to a single core for Linux --- ggml.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index 0d97d135bf871..0994d01ff4fdf 100644 --- a/ggml.c +++ b/ggml.c @@ -18727,17 +18727,43 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl #endif #if defined(__x86_64__) && defined(__linux__) cpu_set_t procMask; - cpu_set_t threadMask; + cpu_set_t newprocessAffinityMask; + CPU_ZERO(&newprocessAffinityMask); + //fprintf(stderr, "\nThread %d checking\n\n", j); //fprintf(stdout, "sched_getaffinity init\n"); if (sched_getaffinity(0, sizeof(cpu_set_t), &procMask) == -1) { fprintf(stderr, "ggml_thread_create sched_getaffinity error\n"); } else { - int result = pthread_setaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &procMask); - if (result !=0) fprintf(stderr, "ggml_thread_create pthread_setaffinity_np: %d", result); - //printf("Set returned by sched_getaffinity() contained:\n"); - //for (size_t k = 0; k < CPU_SETSIZE; k++) - //if (CPU_ISSET(k, &procMask)) - //printf(" CPU %zu\n", k); + int posCore = 0; + for (int32_t i = 0; i < 64; ++i) { + if (CPU_ISSET(i, &procMask) ) { + //fprintf(stderr, "Check thread %d for core %d poscore %d\n", thread, i, posCore); + if ((posCore+1) == j) { + CPU_SET(i, &newprocessAffinityMask); + 
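// posCore counts set bits in the inherited process mask: worker j is pinned to the j-th permitted core, so each compute thread gets a dedicated core +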
//fprintf(stderr, "\nThread %d is assigned to core %d\n\n", j, i); + break; + } else { + CPU_CLR(i, &newprocessAffinityMask); + //fprintf(stderr, "Thread %d is NOT assigned to core %d\n\n", j, i); + } + posCore++; + } + } + int result = pthread_setaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &newprocessAffinityMask); + if (result !=0) fprintf(stderr, "\n\nggml_thread_create pthread_setaffinity_np for thread %d\n", j); + /* + printf("Set returned by sched_getaffinity() contained:\n"); + cpu_set_t nprocMask; + CPU_ZERO(&nprocMask); + for (size_t k = 0; k < CPU_SETSIZE; k++) + if (CPU_ISSET(k, &procMask)) + printf(" CPU %zu\n", k); + pthread_getaffinity_np(workers[j].thrd, sizeof(cpu_set_t), &nprocMask); + printf("Set returned by pthread_getaffinity_np() contained:\n"); + for (size_t k = 0; k < CPU_SETSIZE; k++) + if (CPU_ISSET(k, &nprocMask)) + printf(" CPU %zu\n", k); + */ } /* int s; From 49c1657821526a47451d90c8adc7487409f0e980 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 15:41:00 +0200 Subject: [PATCH 09/12] Fixes --- common/common.cpp | 8 ++++---- ggml.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 963f5ccb8dfde..8bd6eb44ab203 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -138,8 +138,8 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers int32_t llcache = -1; #if defined(_WIN32) - DWORD_PTR processAffinityMask; - DWORD_PTR systemAffinityMask; + ULONG_PTR processAffinityMask; + ULONG_PTR systemAffinityMask; HANDLE hToken = nullptr; bool gotsystemMask = true; @@ -607,8 +607,8 @@ typedef enum _PROCESSINFOCLASS { } PROCESSINFOCLASS; int32_t setCpuAffinity(std::bitset<64> cpuMask) { - DWORD_PTR processAffinityMask; - DWORD_PTR systemAffinityMask; + ULONG_PTR processAffinityMask; + ULONG_PTR systemAffinityMask; int32_t coreSelected = get_count_procMask(cpuMask.to_ullong()); HANDLE hToken = nullptr; diff --git a/ggml.c b/ggml.c index 0994d01ff4fdf..68a9a709cf5ca 100644 --- a/ggml.c +++ b/ggml.c @@ -95,7 +95,7 @@ static int pthread_create(pthread_t * out, int32_t thread, thread_ret_t(*func)(v if (posCore+1 == thread) { //fprintf(stderr, "Thread %d is assigned to core %d\n", thread, i); } else { - newprocessAffinityMask = newprocessAffinityMask | (0ULL << i-1); + newprocessAffinityMask = newprocessAffinityMask | ((0ULL) << (i-1)); //fprintf(stderr, "Thread %d is NOT assigned to core %d\n", thread, i); break; } From fa125a10bbd1f0a750ff33d9887b3c7fbca063ef Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:30:03 +0200 Subject: [PATCH 10/12] Fix typo --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 8bd6eb44ab203..2e4a3befb839e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -733,7 +733,7 @@ ULONG set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltra * Returns number of CPUs on system that are useful for math. 
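* With CpuSet support enabled this reflects the cores actually selected for math work, not the raw std::thread::hardware_concurrency() value.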
*/ int get_math_cpu_count() { -#if defined(__x86_164__) && defined(__linux__) +#if defined(__x86_64__) && defined(__linux__) int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); if (cpu_count < 1) { return get_num_physical_cores(); From e5672d33cb26c0a6f15f2804885fdbcf34e0d416 Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sat, 27 Apr 2024 20:55:45 +0200 Subject: [PATCH 11/12] Fixes --- common/common.cpp | 4 +--- common/common.h | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 2e4a3befb839e..5a2bcbb03e7f6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -176,13 +176,11 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers } return bMask.to_ullong(); } - #else if (cpuMask != 0) { std::bitset<64> reqMask = cpuMask; CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); - bMask = cpuMask; - return bMask.to_ullong(); + return reqMask.to_ullong(); } #endif diff --git a/common/common.h b/common/common.h index aee4de284604b..ca53a57509ace 100644 --- a/common/common.h +++ b/common/common.h @@ -60,6 +60,13 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); #endif +#if defined(__x86_64__) && defined(__linux__) +#include +int32_t setCpuAffinity(std::bitset<64> cpuMask); +uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +uint64_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +#endif + static const int32_t BEST_CORES = 0; static const int32_t WORST_CORES = 1; From 063e201b020b8903f9467c00018b86e5a174b2cc Mon Sep 17 00:00:00 2001 From: mann1x <20623405+mann1x@users.noreply.github.com> Date: Sun, 28 Apr 2024 22:46:12 +0200 Subject: [PATCH 12/12] Fixes, Linux support over 64 CPUs, Core 0 enabled at 6 cores and below --- common/common.cpp | 128 ++++++++++++++++++++++++++++------------------ common/common.h | 9 ++-- 2 files changed, 83 insertions(+), 54 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 5a2bcbb03e7f6..759adad9713e0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -92,7 +92,7 @@ int32_t PhysicalCores = std::thread::hardware_concurrency(); // CPUSET logging // -#define CPUSET_DEBUG 0 +#define CPUSET_DEBUG 1 #if (CPUSET_DEBUG >= 1) #define CPUSET_PRINT_DEBUG(...) 
printf(__VA_ARGS__) #else @@ -127,9 +127,7 @@ int32_t get_count_procMask(ULONG_PTR procMask) { std::bitset<64> bMask = procMask; return bMask.count(); } -#endif -#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { std::bitset<64> bMask; std::vector<CPU_SET_INFORMATION> _cpuset; @@ -137,7 +135,6 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers int32_t assigned_t = 0; int32_t llcache = -1; -#if defined(_WIN32) ULONG_PTR processAffinityMask; ULONG_PTR systemAffinityMask; HANDLE hToken = nullptr; @@ -176,13 +173,6 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers } return bMask.to_ullong(); } -#else - if (cpuMask != 0) { - std::bitset<64> reqMask = cpuMask; - CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); - return reqMask.to_ullong(); - } -#endif if (direction == BEST_CORES) { _cpuset = cpuset_best; @@ -192,7 +182,7 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers CPUSET_PRINT_DEBUG("\ngenerate_Mask dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); for (auto index : _cpuset) { bVal = 0; - if ((index.LogicalProcessorIndex != 0 || allowcz) && + if ((index.LogicalProcessorIndex != 0 || allowcz == 1) && ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && index.EfficiencyClass == 0 && ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) @@ -214,33 +204,73 @@ uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltravers } return bMask.to_ullong(); } -#endif -#if defined(__x86_64__) && defined(__linux__) +#elif defined(__x86_64__) && defined(__linux__) #include -int32_t setCpuAffinity(std::bitset<64> cpuMask) { - int32_t coreSelected = cpuMask.count(); +cpu_set_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { + cpu_set_t bMask; + CPU_ZERO(&bMask); + std::vector<CPU_SET_INFORMATION> _cpuset; + int32_t bVal = 0; + int32_t assigned_t = 0; + int32_t llcache = -1; + std::bitset<64> reqMask = cpuMask; - cpu_set_t mask; - CPU_ZERO(&mask); + if (cpuMask != 0) { + CPUSET_PRINT_DEBUG("Custom cpuMask: %s\n", reqMask.to_string().c_str()); + } - for (int32_t i = 0; i < 64; ++i) { - if (cpuMask[i] == 1) { - CPUSET_PRINT_DEBUG("Setting CPU %d\n", i); - CPU_SET(i, &mask); + if (direction == BEST_CORES) { + _cpuset = cpuset_best; + } else { + _cpuset = cpuset_worst; + } + CPUSET_PRINT_DEBUG("\ngenerate_Mask: dir=%d req_threads=%d lltraversal=%d llcache=%d\n", direction, req_threads, lltraversal, llcache); + for (auto index : _cpuset) { + bVal = 0; + if ((index.LogicalProcessorIndex != 0 || allowcz == 1) && + ((cpuset_smt && index.Threads > 1) || !cpuset_smt || allowtc) && + index.EfficiencyClass == 0 && + ((llcache == index.LastLevelCacheIndex && lltraversal == 0) || llcache == -1 || lltraversal == 1) + ) { + if (lltraversal == 0) { + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pre llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + llcache = index.LastLevelCacheIndex; + CPUSET_PRINT_DEBUG("### cache for lltraversal %d pos llcache=%d now_cache=%u\n", lltraversal, llcache, index.LastLevelCacheIndex); + } + bVal = 1; + } + if (req_threads > 0 && assigned_t >= req_threads) { bVal = 0;} + if (cpuMask != 0) { + bVal = 1; + if
(reqMask[index.LogicalProcessorIndex] == 0) { + bVal = 0; + } + } + if(bVal == 1) { + assigned_t++; + CPU_SET(index.LogicalProcessorIndex, &bMask); + CPUSET_PRINT_DEBUG("--> Assigned LogicalCoreIndex: %d lltraversal=%d llcache=%d now_cache=%u\n", index.LogicalProcessorIndex, lltraversal, llcache, index.LastLevelCacheIndex); } else { - CPU_CLR(i, &mask); + CPU_CLR(index.LogicalProcessorIndex, &bMask); } + CPUSET_PRINT_DEBUG("LogicalCoreIndex: %d b:%d smt=%d thrds=%d lltraversal=%d acz=%d atc=%d\n", index.LogicalProcessorIndex, bVal, cpuset_smt, index.Threads, lltraversal, allowcz, allowtc); } return bMask; } + +int32_t setCpuAffinity(cpu_set_t bMask) { + const cpu_set_t cpuMask = bMask; + int32_t coreSelected = CPU_COUNT(&cpuMask); - if (sched_setaffinity(0, sizeof(cpu_set_t), &mask) == -1) { + if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuMask) == -1) { CPUSET_PRINT_DEBUG("setCpuAffinity sched_setaffinity error\n"); } - if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) == -1) { + if (pthread_setaffinity_np(pthread_self(), sizeof(cpuMask), &cpuMask) == -1) { CPUSET_PRINT_DEBUG("setCpuAffinity pthread_setaffinity_np error\n"); - } - + } + return coreSelected; } @@ -289,15 +319,16 @@ static int count_math_cpus(int cpu_count) { return result; } -uint64_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { - std::bitset<64> bMask; - + bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); - numPhysicalCores = bMask.count(); +cpu_set_t set_procMask(int32_t direction = 0 , int32_t req_threads = 0, int32_t lltraversal = 0, int32_t allowtc = 0, int32_t allowcz = 0, int64_t cpuMask = 0) { + cpu_set_t bMask; + CPU_ZERO(&bMask); + bMask = generate_Mask(direction, req_threads, lltraversal, allowtc, allowcz, cpuMask); + numPhysicalCores = CPU_COUNT(&bMask); - CPUSET_PRINT_DEBUG("Generated Mask: %s\n", bMask.to_string().c_str()); - return bMask.to_ullong(); + CPUSET_PRINT_DEBUG("Generated Mask Count CPU: %d\n", numPhysicalCores); + + return bMask; } #endif @@ -318,7 +349,7 @@ int32_t get_num_physical_cores() { std::vector<CPU_SET_INFORMATION> _cpuset; int32_t numLogicalCores = 0; - for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { + for (uint32_t cpu=0; cpu < 1024; ++cpu) { CPUSET_PRINT_DEBUG("Check for Logical CPU: %d\n", cpu); std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); @@ -390,9 +421,8 @@ int32_t get_num_physical_cores() { std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); int32_t physicalCount = 0; - //int32_t physicalCount = static_cast<int32_t>(siblings.size()); - std::bitset<64> bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0); - physicalCount = bMask.count(); + cpu_set_t bMask = generate_Mask(WORST_CORES, 0, 1, 0, 1, 0); + physicalCount = CPU_COUNT(&bMask); CPUSET_PRINT_DEBUG("\n\n### Logical Processors Summary ###\n\n"); @@ -554,7 +584,7 @@ int32_t get_num_physical_cores() { std::sort(cpuset_worst.begin(), cpuset_worst.end(), &cpuset_sorter_worst); int32_t physicalCount = 0; - physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 1, 0)); + physicalCount = get_count_procMask(generate_Mask(WORST_CORES, 0, 1, 0, 2, 0)); CPUSET_PRINT_DEBUG("\n\n1st PhysicalCount: %d\n\n", physicalCount); @@ -731,7 +761,14 @@ ULONG set_procMask(int32_t direction = 0, int32_t req_threads = 0, int32_t lltra * Returns number of CPUs on system that are useful for math.
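* The CpuSet path chains three helpers: generate_Mask() applies the core selection policy (efficiency class, SMT, last-level cache, core 0), set_procMask() materializes it as the platform mask type, and setCpuAffinity() applies it to the process.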
*/ int get_math_cpu_count() { -#if defined(__x86_64__) && defined(__linux__) +#if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) + int32_t _numPhysical = get_num_physical_cores(); + if (cpuset_enable) { + // Initial Affinity set + setCpuAffinity(set_procMask(WORST_CORES, 0, 1, 0, 0)); + } + return _numPhysical; +#elif defined(__linux__) int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); if (cpu_count < 1) { return get_num_physical_cores(); @@ -746,14 +783,6 @@ int get_math_cpu_count() { } } } - -#elif defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) - int32_t _numPhysical = get_num_physical_cores(); - if (cpuset_enable) { - // Initial Affinity set - setCpuAffinity(set_procMask(WORST_CORES, 0, 1)); - } - return _numPhysical; #endif return get_num_physical_cores(); } @@ -762,6 +791,7 @@ int get_math_cpu_count() { int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask) { int32_t _numPhysical = get_num_physical_cores(); if (cpuset_enable) { + if (_numPhysical < 7 && allowcz == 2) allowcz = 1; _numPhysical = setCpuAffinity(set_procMask(cpuset_order, req_threads, lltraversal, allowtc, allowcz, cpuMask)); } return _numPhysical; @@ -2191,7 +2221,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #if defined(_WIN32) || (defined(__x86_64__) && defined(__linux__)) printf(" -bco change the order of the selected cores from the best to worst (default: worst to best)\n"); printf(" -llct allow the core selection to traverse the last level cache (default: disabled)\n"); - printf(" -acz allow the core selection to pick the core 0 as well (default: disabled)\n"); + printf(" -acz allow the core selection to pick the core 0 as well (default: disabled for more than 6 cores)\n"); printf(" -atc allow the core selection to pick non physical, threaded, cores (default: disabled)\n"); printf(" -ccm specify a custom CPU Affinity bitmask in hex for the core selection (default: disabled)\n"); #endif @@ -3392,7 +3422,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "threads: %d # default: %u\n", params.n_threads, get_math_cpu_count()); fprintf(stream, "bco: %d # default: 0\n", params.cpuset_order); fprintf(stream, "llct: %d # default: 0\n", params.cpuset_lltraversal); - fprintf(stream, "acz: %d # default: 0\n", params.cpuset_allowzero); + fprintf(stream, "acz: %d # default: auto\n", params.cpuset_allowzero); fprintf(stream, "atc: %d # default: 0\n", params.cpuset_allowthreads); #if defined(_WIN32) fprintf(stream, "ccm: %lli # default: none\n", params.cpuset_cpumask); diff --git a/common/common.h b/common/common.h index ca53a57509ace..dc7fcffb5f1d5 100644 --- a/common/common.h +++ b/common/common.h @@ -61,10 +61,9 @@ bool cpuset_sorter_worst(CPU_SET_INFORMATION const& lhs, CPU_SET_INFORMATION con int get_math_cpu_count(int32_t req_threads, int32_t cpuset_order, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); #endif #if defined(__x86_64__) && defined(__linux__) -#include -int32_t setCpuAffinity(std::bitset<64> cpuMask); -uint64_t generate_Mask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); -uint64_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +int32_t setCpuAffinity(cpu_set_t cpuMask); +cpu_set_t generate_Mask(int32_t direction, int32_t req_threads, int32_t 
lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); +cpu_set_t set_procMask(int32_t direction, int32_t req_threads, int32_t lltraversal, int32_t allowtc, int32_t allowcz, int64_t cpuMask); #endif static const int32_t BEST_CORES = 0; @@ -88,7 +87,7 @@ struct gpt_params { int32_t cpuset_lltraversal = 0; int32_t cpuset_order = WORST_CORES; int64_t cpuset_cpumask = 0; - int32_t cpuset_allowzero = 0; + int32_t cpuset_allowzero = 2; int32_t cpuset_allowthreads = 0; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size
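// Illustrative invocation, not from the patch (binary and model names are placeholders; hex digits as accepted by the -ccm handler): ./main -m model.gguf -t 8 -bco on -llct on -ccm FE, where FE hex is 11111110 binary: logical CPUs 1-7 are selected and core 0 stays free.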