@@ -695,7 +695,7 @@ class RunQueue {
695695
696696static std::atomic<uint32_t > next_tag{1 };
697697
698- template <typename Environment, bool kIsHybrid >
698+ template <typename Environment>
699699class ThreadPoolTempl : public onnxruntime ::concurrency::ExtendedThreadPoolInterface {
700700 private:
701701 struct PerThread ;
@@ -767,29 +767,6 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
767767 typedef std::function<void ()> Task;
768768 typedef RunQueue<Task, Tag, 1024 > Queue;
769769
770- // Class for waiting w/ exponential backoff.
771- // Template argument is maximum number of spins in backoff loop.
772- template <unsigned kMaxBackoff >
773- class ThreadPoolWaiter {
774- // Current number if spins in backoff loop
775- unsigned pause_time_;
776-
777- public:
778- void wait () {
779- // If kMaxBackoff is zero don't do any pausing.
780- if constexpr (kMaxBackoff == 1 ) {
781- onnxruntime::concurrency::SpinPause ();
782- } else if constexpr (kMaxBackoff > 1 ) {
783- // Exponential backoff
784- unsigned pause_time = pause_time_ + 1U ;
785- for (unsigned i = 0 ; i < pause_time; ++i) {
786- onnxruntime::concurrency::SpinPause ();
787- }
788- pause_time_ = (pause_time * 2U ) % kMaxBackoff ;
789- }
790- }
791- };
792-
793770 ThreadPoolTempl (const CHAR_TYPE* name, int num_threads, bool allow_spinning, Environment& env,
794771 const ThreadOptions& thread_options)
795772 : profiler_(num_threads, name),
@@ -931,9 +908,8 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
931908 // finish dispatch work. This avoids new tasks being started
932909 // concurrently with us attempting to end the parallel section.
933910 if (ps.dispatch_q_idx != -1 ) {
934- ThreadPoolWaiter<4 > waiter{};
935911 while (!ps.dispatch_done .load (std::memory_order_acquire)) {
936- waiter. wait ();
912+ onnxruntime::concurrency::SpinPause ();
937913 }
938914 }
939915
@@ -955,17 +931,15 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
955931
956932 // Wait for the dispatch task's own work...
957933 if (ps.dispatch_q_idx > -1 ) {
958- ThreadPoolWaiter<kIsHybrid ? 0 : 1 > waiter{};
959934 while (!ps.work_done .load (std::memory_order_acquire)) {
960- waiter. wait ();
935+ onnxruntime::concurrency::SpinPause ();
961936 }
962937 }
963938
964939 // ...and wait for any other tasks not revoked to finish their work
965940 auto tasks_to_wait_for = tasks_started - ps.tasks_revoked ;
966- ThreadPoolWaiter<kIsHybrid ? 0 : 1 > waiter{};
967941 while (ps.tasks_finished < tasks_to_wait_for) {
968- waiter. wait ();
942+ onnxruntime::concurrency::SpinPause ();
969943 }
970944
971945 // Clear status to allow the ThreadPoolParallelSection to be
@@ -1283,10 +1257,9 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
12831257 // Increase the worker count if needed. Each worker will pick up
12841258 // loops to execute from the current parallel section.
12851259 std::function<void (unsigned )> worker_fn = [&ps](unsigned par_idx) {
1286- ThreadPoolWaiter<kIsHybrid ? 4 : 0 > waiter{};
12871260 while (ps.active ) {
12881261 if (ps.current_loop .load () == nullptr ) {
1289- waiter. wait ();
1262+ onnxruntime::concurrency::SpinPause ();
12901263 } else {
12911264 ps.workers_in_loop ++;
12921265 ThreadPoolLoop* work_item = ps.current_loop ;
@@ -1307,9 +1280,8 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
13071280
13081281 // Wait for workers to exit the loop
13091282 ps.current_loop = 0 ;
1310- ThreadPoolWaiter<kIsHybrid ? 1 : 4 > waiter{};
13111283 while (ps.workers_in_loop ) {
1312- waiter. wait ();
1284+ onnxruntime::concurrency::SpinPause ();
13131285 }
13141286 profiler_.LogEnd (ThreadPoolProfiler::WAIT);
13151287 }
@@ -1560,30 +1532,13 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
15601532
15611533 assert (td.GetStatus () == WorkerData::ThreadStatus::Spinning);
15621534
1563- // The exact value of spin_count and steal_count are arbitrary and
1564- // were experimentally determined. These numbers yielded the best
1565- // performance across a range of workloads and
1566- // machines. Generally, the goal of tuning spin_count is to make
1567- // the number as small as possible while ensuring there is enough
1568- // slack so that if each core is doing the same amount of work it
1569- // won't sleep before they have all finished. The idea here is
1570- // that in pipelined workloads, it won't sleep during each stage
1571- // if it's done a bit faster than its neighbors, but that if there
1572- // are non-equal sizes of work distributed, it won't take too long
1573- // to reach sleep giving power (and thus frequency/performance) to
1574- // its neighbors. Since hybrid has P/E cores, a lower value is
1575- // chosen. On hybrid systems, even with equal sized workloads
1576- // distributed the compute time won't stay synced. Typically in
1577- // the hybrid case the P cores finish first (and are thus waiting)
1578- // which is essentially a priority inversion.
1579- constexpr int pref_spin_count = kIsHybrid ? 5000 : 10000 ;
1580- const int spin_count = allow_spinning_ ? pref_spin_count : 0 ;
1581- constexpr int steal_count = pref_spin_count / (kIsHybrid ? 25 : 100 );
1535+ constexpr int log2_spin = 20 ;
1536+ const int spin_count = allow_spinning_ ? (1ull << log2_spin) : 0 ;
1537+ const int steal_count = spin_count / 100 ;
15821538
15831539 SetDenormalAsZero (set_denormal_as_zero_);
15841540 profiler_.LogThreadId (thread_id);
15851541
1586- ThreadPoolWaiter<kIsHybrid ? 1 : 8 > waiter{};
15871542 while (!should_exit) {
15881543 Task t = q.PopFront ();
15891544 if (!t) {
@@ -1599,7 +1554,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
15991554 if (spin_loop_status_.load (std::memory_order_relaxed) == SpinLoopStatus::kIdle ) {
16001555 break ;
16011556 }
1602- waiter. wait ();
1557+ onnxruntime::concurrency::SpinPause ();
16031558 }
16041559
16051560 // Attempt to block
0 commit comments