7
7
namespace stan {
8
8
namespace math {
9
9
10
+ // Internal macro used to modify global pointer definition to the
11
+ // global AD instance.
12
+ #ifdef STAN_THREADS
13
+ // Whenever STAN_THREADS is set a TLS keyword is used. For reasons
14
+ // explained below we use the GNU compiler extension __thread if
15
+ // supported by the compiler while the generic thread_local C++11
16
+ // keyword is used otherwise.
17
+ #ifdef __GNUC__
18
+ #define STAN_THREADS_DEF __thread
19
+ #else
20
+ #define STAN_THREADS_DEF thread_local
21
+ #endif
22
+ #else
23
+ // In case STAN_THREADS is not set, then no modifier is needed.
24
+ #define STAN_THREADS_DEF
25
+ #endif
26
+
10
27
/* *
11
- * Provides a thread_local singleton if needed. Read warnings below!
12
- * For performance reasons the singleton is a global static for the
13
- * case of no threading which is returned by a function. This design
14
- * should allow the compiler to apply necessary inlining to get
15
- * maximal performance. However, this design suffers from "the static
16
- * init order fiasco"[0]. Anywhere this is used, we must be
17
- * absolutely positive that it doesn't matter when the singleton will
18
- * get initialized relative to other static variables. In exchange,
19
- * we get a more performant singleton pattern for the non-threading
20
- * case. In the threading case we use the defacto standard C++11
21
- * singleton pattern relying on a function wrapping a static local
22
- * variable. This standard pattern is expected to be well supported
23
- * by the major compilers (as its standard), but it does incur some
24
- * performance penalty. There has been some discussion on this; see
25
- * [1] and [2] and the discussions those PRs link to as well.
28
+ * This struct always provides access to the autodiff stack using
29
+ * the singleton pattern. Read warnings below!
30
+ *
31
+ * The singleton <code>instance_</code> is a global static pointer,
32
+ * which is thread local (TLS) if the STAN_THREADS preprocessor variable
33
+ * is defined.
26
34
*
27
- * These are thread_local only if the user asks for it with
28
- * -DSTAN_THREADS. This is primarily because Apple clang compilers
29
- * before 2016 don't support thread_local and the additional
30
- * performance cost. We have proposed removing support for those[3],
31
- * and at that time we should evaluate the performance of a switch to
32
- * thread_local. If there is no loss in performance, we can remove
33
- * this ifdef.
35
+ * The use of a pointer is motivated by performance reasons for the
36
+ * threading case. When a TLS is used, initialization with a constant
37
+ * expression at compile time is required for fast access to the
38
+ * TLS. As the autodiff storage struct is non-POD, its initialization
39
+ * is dynamic rather than a compile time constant. These dynamic initializers
40
+ * are wrapped, in the TLS case, by a TLS wrapper function which slows
41
+ * down its access. Using a pointer instead allows initialization at
42
+ * compile time to <code>nullptr</code>, which is a compile time
43
+ * constant. In this case, the compiler avoids the use of a TLS
44
+ * wrapper function.
45
+ *
46
+ * For performance reasons we use the __thread keyword on compilers
47
+ * which support it. The __thread keyword is a GNU compiler-specific
48
+ * (gcc, clang, Intel) extension which requires initialization with a
49
+ * compile time constant expression. The C++11 keyword thread_local
50
+ * does allow for constant and dynamic initialization of the
51
+ * TLS. Thus, only the __thread keyword guarantees that constant
52
+ * initialization, and its implied speedup, is used.
53
+ *
54
+ * The initialization of the AD instance at run time is handled by the
55
+ * lifetime of an AutodiffStackSingleton object. More specifically, the
56
+ * first instance of the AutodiffStackSingleton object will initialize
57
+ * the AD instance and take ownership (it is the only instance
58
+ * with the private member own_instance_ being true). Thus, whenever
59
+ * the first instance of the AutodiffStackSingleton object gets
60
+ * destructed, the AD tape will be destructed as well. Within
61
+ * stan-math the initialization of the AD instance for the main thread
62
+ * of the program is handled by instantiating the singleton once in
63
+ * the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
64
+ * then all created child threads must instantiate an
65
+ * AutodiffStackSingleton object within the child thread before
66
+ * accessing the AD system in order to initialize the TLS AD tape
67
+ * within the child thread.
68
+ *
69
+ * The design of a globally held (optionally TLS) pointer, which is
70
+ * globally initialized, allows the compiler to apply necessary
71
+ * inlining to get maximal performance. However, the design suffers
72
+ * from "the static init order fiasco"[0]. Whenever the static init
73
+ * order fiasco occurs, the C++ client of the library may instantiate
74
+ * an AutodiffStackSingleton object at the appropriate code position prior
75
+ * to any AD tape access to ensure proper initialization order. In
76
+ * exchange, we get a more performant singleton pattern with automatic
77
+ * initialization of the AD stack for the main thread. There has been
78
+ * some discussion on earlier designs using the Meyers singleton
79
+ * approach; see [1] and [2] and the discussions those PRs link to as
80
+ * well.
34
81
*
35
82
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
36
83
* [1] https://github.com/stan-dev/math/pull/840
37
84
* [2] https://github.com/stan-dev/math/pull/826
38
85
* [3]
39
86
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
87
+ * [4] https://github.com/stan-dev/math/pull/1135
40
88
*/
41
89
template <typename ChainableT, typename ChainableAllocT>
42
90
struct AutodiffStackSingleton {
43
91
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
44
92
AutodiffStackSingleton_t;
45
93
94
+ AutodiffStackSingleton () : own_instance_(init()) {}
95
+ ~AutodiffStackSingleton () {
96
+ if (own_instance_) {
97
+ delete instance_;
98
+ instance_ = nullptr ;
99
+ }
100
+ }
101
+
46
102
struct AutodiffStackStorage {
47
103
AutodiffStackStorage &operator =(const AutodiffStackStorage &) = delete ;
48
104
@@ -57,30 +113,32 @@ struct AutodiffStackSingleton {
57
113
std::vector<size_t > nested_var_alloc_stack_starts_;
58
114
};
59
115
60
- AutodiffStackSingleton () = delete ;
61
116
explicit AutodiffStackSingleton (AutodiffStackSingleton_t const &) = delete;
62
117
AutodiffStackSingleton &operator =(const AutodiffStackSingleton_t &) = delete ;
63
118
64
- static inline AutodiffStackStorage &instance () {
65
- #ifdef STAN_THREADS
66
- thread_local static AutodiffStackStorage instance_;
67
- #endif
68
- return instance_;
119
+ static inline constexpr AutodiffStackStorage &instance () {
120
+ return *instance_;
69
121
}
70
122
71
- #ifndef STAN_THREADS
72
-
73
123
private:
74
- static AutodiffStackStorage instance_;
75
- #endif
124
+ static bool init () {
125
+ if (!instance_) {
126
+ instance_ = new AutodiffStackStorage ();
127
+ return true ;
128
+ }
129
+ return false ;
130
+ }
131
+
132
+ static STAN_THREADS_DEF AutodiffStackStorage *instance_;
133
+ const bool own_instance_;
76
134
};
77
135
78
- #ifndef STAN_THREADS
79
136
template <typename ChainableT, typename ChainableAllocT>
80
- typename AutodiffStackSingleton<ChainableT,
81
- ChainableAllocT>::AutodiffStackStorage
82
- AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
83
- #endif
137
+ STAN_THREADS_DEF
138
+ typename AutodiffStackSingleton<ChainableT,
139
+ ChainableAllocT>::AutodiffStackStorage
140
+ *AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
141
+ = nullptr ;
84
142
85
143
} // namespace math
86
144
} // namespace stan
0 commit comments