Skip to content

Commit e7641bf

Browse files
committed
Revert "Revert "Merge pull request stan-dev#1212 from stan-dev/feature/faster-ad-tls-v5""
This reverts commit db1a002.
1 parent f49f224 commit e7641bf

18 files changed

+362
-145
lines changed

Jenkinsfile

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ pipeline {
4949
skipDefaultCheckout()
5050
preserveStashes(buildCount: 7)
5151
}
52+
environment {
53+
STAN_NUM_THREADS = '4'
54+
}
5255
stages {
5356
stage('Kill previous builds') {
5457
when {
@@ -225,6 +228,17 @@ pipeline {
225228
runTestsWin("test/unit")
226229
}
227230
}
231+
stage('Windows Threading') {
232+
agent { label 'windows' }
233+
steps {
234+
deleteDirWin()
235+
unstash 'MathSetup'
236+
bat "echo CXX=${env.CXX} -Werror > make/local"
237+
bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
238+
runTestsWin("test/unit -f thread")
239+
runTestsWin("test/unit -f map_rect")
240+
}
241+
}
228242
}
229243
}
230244
stage('Additional merge tests') {
@@ -236,7 +250,7 @@ pipeline {
236250
deleteDir()
237251
unstash 'MathSetup'
238252
sh "echo CXX=${GCC} >> make/local"
239-
sh "echo CPPFLAGS=-DSTAN_THREADS >> make/local"
253+
sh "echo CXXFLAGS=-DSTAN_THREADS >> make/local"
240254
runTests("test/unit")
241255
}
242256
post { always { retry(3) { deleteDir() } } }

stan/math/prim/mat/functor/map_rect.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <stan/math/prim/arr/err/check_matching_sizes.hpp>
55
#include <stan/math/prim/mat/fun/dims.hpp>
66
#include <stan/math/prim/mat/fun/typedefs.hpp>
7+
#include <stan/math/prim/scal/meta/return_type.hpp>
78

89
#define STAN_REGISTER_MAP_RECT(CALLID, FUNCTOR)
910

stan/math/prim/mat/functor/map_rect_concurrent.hpp

Lines changed: 3 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,13 @@
33

44
#include <stan/math/prim/mat/fun/typedefs.hpp>
55

6-
#include <stan/math/prim/mat/functor/map_rect_reduce.hpp>
7-
#include <stan/math/prim/mat/functor/map_rect_combine.hpp>
6+
#include <stan/math/prim/scal/meta/return_type.hpp>
87
#include <stan/math/prim/scal/err/invalid_argument.hpp>
98
#include <boost/lexical_cast.hpp>
109

10+
#include <cstdlib>
1111
#include <vector>
1212
#include <thread>
13-
#include <future>
14-
#include <cstdlib>
1513

1614
namespace stan {
1715
namespace math {
@@ -77,72 +75,7 @@ map_rect_concurrent(
7775
const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
7876
job_params,
7977
const std::vector<std::vector<double>>& x_r,
80-
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
81-
typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
82-
typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;
83-
84-
const int num_jobs = job_params.size();
85-
const vector_d shared_params_dbl = value_of(shared_params);
86-
std::vector<std::future<std::vector<matrix_d>>> futures;
87-
88-
auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
89-
const int end = start + size;
90-
std::vector<matrix_d> chunk_f_out;
91-
chunk_f_out.reserve(size);
92-
for (int i = start; i != end; i++)
93-
chunk_f_out.push_back(ReduceF()(
94-
shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
95-
return chunk_f_out;
96-
};
97-
98-
int num_threads = get_num_threads(num_jobs);
99-
int num_jobs_per_thread = num_jobs / num_threads;
100-
futures.emplace_back(
101-
std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));
102-
103-
#ifdef STAN_THREADS
104-
if (num_threads > 1) {
105-
const int num_big_threads = num_jobs % num_threads;
106-
const int first_big_thread = num_threads - num_big_threads;
107-
for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
108-
i < num_threads; ++i, job_start += job_size) {
109-
job_size = i >= first_big_thread ? num_jobs_per_thread + 1
110-
: num_jobs_per_thread;
111-
futures.emplace_back(
112-
std::async(std::launch::async, execute_chunk, job_start, job_size));
113-
}
114-
}
115-
#endif
116-
117-
// collect results
118-
std::vector<int> world_f_out;
119-
world_f_out.reserve(num_jobs);
120-
matrix_d world_output(0, 0);
121-
122-
int offset = 0;
123-
for (std::size_t i = 0; i < futures.size(); ++i) {
124-
const std::vector<matrix_d>& chunk_result = futures[i].get();
125-
if (i == 0)
126-
world_output.resize(chunk_result[0].rows(),
127-
num_jobs * chunk_result[0].cols());
128-
129-
for (const auto& job_result : chunk_result) {
130-
const int num_job_outputs = job_result.cols();
131-
world_f_out.push_back(num_job_outputs);
132-
133-
if (world_output.cols() < offset + num_job_outputs)
134-
world_output.conservativeResize(Eigen::NoChange,
135-
2 * (offset + num_job_outputs));
136-
137-
world_output.block(0, offset, world_output.rows(), num_job_outputs)
138-
= job_result;
139-
140-
offset += num_job_outputs;
141-
}
142-
}
143-
CombineF combine(shared_params, job_params);
144-
return combine(world_output, world_f_out);
145-
}
78+
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);
14679

14780
} // namespace internal
14881
} // namespace math

stan/math/rev/core.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <stan/math/rev/core/build_vari_array.hpp>
66
#include <stan/math/rev/core/chainable_alloc.hpp>
77
#include <stan/math/rev/core/chainablestack.hpp>
8+
#include <stan/math/rev/core/init_chainablestack.hpp>
89
#include <stan/math/rev/core/ddv_vari.hpp>
910
#include <stan/math/rev/core/dv_vari.hpp>
1011
#include <stan/math/rev/core/dvd_vari.hpp>

stan/math/rev/core/autodiffstackstorage.hpp

Lines changed: 95 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -7,42 +7,98 @@
77
namespace stan {
88
namespace math {
99

10+
// Internal macro used to modify global pointer definition to the
11+
// global AD instance.
12+
#ifdef STAN_THREADS
13+
// Whenever STAN_THREADS is set a TLS keyword is used. For reasons
14+
// explained below we use the GNU compiler extension __thread if
15+
// supported by the compiler while the generic thread_local C++11
16+
// keyword is used otherwise.
17+
#ifdef __GNUC__
18+
#define STAN_THREADS_DEF __thread
19+
#else
20+
#define STAN_THREADS_DEF thread_local
21+
#endif
22+
#else
23+
// In case STAN_THREADS is not set, then no modifier is needed.
24+
#define STAN_THREADS_DEF
25+
#endif
26+
1027
/**
11-
* Provides a thread_local singleton if needed. Read warnings below!
12-
* For performance reasons the singleton is a global static for the
13-
* case of no threading which is returned by a function. This design
14-
* should allow the compiler to apply necessary inlining to get
15-
* maximal performance. However, this design suffers from "the static
16-
* init order fiasco"[0]. Anywhere this is used, we must be
17-
* absolutely positive that it doesn't matter when the singleton will
18-
* get initialized relative to other static variables. In exchange,
19-
* we get a more performant singleton pattern for the non-threading
20-
* case. In the threading case we use the defacto standard C++11
21-
* singleton pattern relying on a function wrapping a static local
22-
* variable. This standard pattern is expected to be well supported
23-
* by the major compilers (as its standard), but it does incur some
24-
* performance penalty. There has been some discussion on this; see
25-
* [1] and [2] and the discussions those PRs link to as well.
28+
* This struct always provides access to the autodiff stack using
29+
* the singleton pattern. Read warnings below!
30+
*
31+
* The singleton <code>instance_</code> is a global static pointer,
32+
* which is thread local (TLS) if the STAN_THREADS preprocess variable
33+
* is defined.
2634
*
27-
* These are thread_local only if the user asks for it with
28-
* -DSTAN_THREADS. This is primarily because Apple clang compilers
29-
* before 2016 don't support thread_local and the additional
30-
* performance cost. We have proposed removing support for those[3],
31-
* and at that time we should evaluate the performance of a switch to
32-
* thread_local. If there is no loss in performance, we can remove
33-
* this ifdef.
35+
* The use of a pointer is motivated by performance reasons for the
36+
* threading case. When a TLS is used, initialization with a constant
37+
* expression at compile time is required for fast access to the
38+
* TLS. As the autodiff storage struct is non-POD, its initialization
39+
* is a dynamic expression at compile time. These dynamic expressions
40+
* are wrapped, in the TLS case, by a TLS wrapper function which slows
41+
* down its access. Using a pointer instead allows to initialize at
42+
* compile time to <code>nullptr</code>, which is a compile time
43+
* constant. In this case, the compiler avoids the use of a TLS
44+
* wrapper function.
45+
*
46+
* For performance reasons we use the __thread keyword on compilers
47+
* which support it. The __thread keyword is a GNU compiler-specific
48+
* (gcc, clang, Intel) extension which requires initialization with a
49+
* compile time constant expression. The C++11 keyword thread_local
50+
* does allow for constant and dynamic initialization of the
51+
* TLS. Thus, only the __thread keyword gurantees that constant
52+
* initialization and it's implied speedup, is used.
53+
*
54+
* The initialzation of the AD instance at run-time is handled by the
55+
* lifetime of a AutodiffStackSingleton object. More specifically, the
56+
* first instance of the AutodiffStackSingleton object will initialize
57+
* the AD instance and take ownership (it is the only one instance
58+
* with the private member own_instance_ being true). Thus, whenever
59+
* the first instance of the AutodiffStackSingleton object gets
60+
* destructed, the AD tape will be destructed as well. Within
61+
* stan-math the initialization of the AD instance for the main thread
62+
* of the program is handled by instantiating the singleton once in
63+
* the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
64+
* then all created child threads must instantiate a
65+
* AutodiffStackSingleton object within the child thread before
66+
* accessing the AD system in order to initialize the TLS AD tape
67+
* within the child thread.
68+
*
69+
* The design of a globally held (optionally TLS) pointer, which is
70+
* globally initialized, allows the compiler to apply necessary
71+
* inlining to get maximal performance. However, the design suffers
72+
* from "the static init order fiasco"[0]. Whenever the static init
73+
* order fiasco occurs, the C++ client of the library may instantiate
74+
* a AutodiffStackSingleton object at the adequate code position prior
75+
* to any AD tape access to ensure proper initialization order. In
76+
* exchange, we get a more performant singleton pattern with automatic
77+
* initialization of the AD stack for the main thread. There has been
78+
* some discussion on earlier designs using the Mayer singleton
79+
* approach; see [1] and [2] and the discussions those PRs link to as
80+
* well.
3481
*
3582
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
3683
* [1] https://github.com/stan-dev/math/pull/840
3784
* [2] https://github.com/stan-dev/math/pull/826
3885
* [3]
3986
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
87+
* [4] https://github.com/stan-dev/math/pull/1135
4088
*/
4189
template <typename ChainableT, typename ChainableAllocT>
4290
struct AutodiffStackSingleton {
4391
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
4492
AutodiffStackSingleton_t;
4593

94+
AutodiffStackSingleton() : own_instance_(init()) {}
95+
~AutodiffStackSingleton() {
96+
if (own_instance_) {
97+
delete instance_;
98+
instance_ = nullptr;
99+
}
100+
}
101+
46102
struct AutodiffStackStorage {
47103
AutodiffStackStorage &operator=(const AutodiffStackStorage &) = delete;
48104

@@ -57,30 +113,32 @@ struct AutodiffStackSingleton {
57113
std::vector<size_t> nested_var_alloc_stack_starts_;
58114
};
59115

60-
AutodiffStackSingleton() = delete;
61116
explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
62117
AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;
63118

64-
static inline AutodiffStackStorage &instance() {
65-
#ifdef STAN_THREADS
66-
thread_local static AutodiffStackStorage instance_;
67-
#endif
68-
return instance_;
119+
static inline constexpr AutodiffStackStorage &instance() {
120+
return *instance_;
69121
}
70122

71-
#ifndef STAN_THREADS
72-
73123
private:
74-
static AutodiffStackStorage instance_;
75-
#endif
124+
static bool init() {
125+
if (!instance_) {
126+
instance_ = new AutodiffStackStorage();
127+
return true;
128+
}
129+
return false;
130+
}
131+
132+
static STAN_THREADS_DEF AutodiffStackStorage *instance_;
133+
const bool own_instance_;
76134
};
77135

78-
#ifndef STAN_THREADS
79136
template <typename ChainableT, typename ChainableAllocT>
80-
typename AutodiffStackSingleton<ChainableT,
81-
ChainableAllocT>::AutodiffStackStorage
82-
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
83-
#endif
137+
STAN_THREADS_DEF
138+
typename AutodiffStackSingleton<ChainableT,
139+
ChainableAllocT>::AutodiffStackStorage
140+
*AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
141+
= nullptr;
84142

85143
} // namespace math
86144
} // namespace stan
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#ifndef STAN_MATH_REV_CORE_INIT_CHAINABLESTACK_HPP
2+
#define STAN_MATH_REV_CORE_INIT_CHAINABLESTACK_HPP
3+
4+
#include <stan/math/rev/core/chainablestack.hpp>
5+
6+
namespace stan {
7+
namespace math {
8+
namespace {
9+
10+
/**
11+
* Initializes the AD stack for the main process. See
12+
* autodiffstackstorage.hpp for more explanations.
13+
*/
14+
ChainableStack global_stack_instance_init;
15+
} // namespace
16+
} // namespace math
17+
} // namespace stan
18+
#endif

stan/math/rev/mat.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
#include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
6969
#include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
7070
#include <stan/math/rev/mat/functor/integrate_dae.hpp>
71+
#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
7172
#include <stan/math/rev/mat/functor/map_rect_reduce.hpp>
7273

7374
#endif

0 commit comments

Comments
 (0)