| /* |
| * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch. |
| */ |
| |
| //===----------------------------------------------------------------------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is dual licensed under the MIT and the University of Illinois Open |
| // Source Licenses. See LICENSE.txt for details. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef KMP_DISPATCH_H |
| #define KMP_DISPATCH_H |
| |
| /* ------------------------------------------------------------------------ */ |
| /* ------------------------------------------------------------------------ */ |
| |
// Raise the targeted Windows version on 32-bit x86 builds so that
// InterlockedExchange64 is supported.
// NOTE(review): the earlier wording said "from XP to Vista", but 0x0502 is the
// Windows Server 2003 level (Vista would be 0x0600) -- confirm which level is
// actually intended before touching the value.
#if defined(_M_IX86) && defined(_WIN32_WINNT)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif
| |
| #include "kmp.h" |
| #include "kmp_error.h" |
| #include "kmp_i18n.h" |
| #include "kmp_itt.h" |
| #include "kmp_stats.h" |
| #include "kmp_str.h" |
| #if KMP_OS_WINDOWS && KMP_ARCH_X86 |
| #include <float.h> |
| #endif |
| |
| #if OMPT_SUPPORT |
| #include "ompt-internal.h" |
| #include "ompt-specific.h" |
| #endif |
| |
| /* ------------------------------------------------------------------------ */ |
| /* ------------------------------------------------------------------------ */ |
#if KMP_USE_HIER_SCHED
// Hierarchical scheduling data structures; only forward-declared here because
// the structures below merely hold pointers to them.
template <typename T> struct kmp_hier_t;
template <typename T> struct kmp_hier_top_unit_t;
#endif // KMP_USE_HIER_SCHED

// Private (per-thread) and shared (per-team) dispatch buffers, defined further
// down in this header. Declared up front so the function declarations that
// follow can name them.
template <typename T> struct dispatch_private_info_template;
template <typename T> struct dispatch_shared_info_template;
| |
| template <typename T> |
| extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, |
| dispatch_private_info_template<T> *pr, |
| enum sched_type schedule, T lb, T ub, |
| typename traits_t<T>::signed_t st, |
| #if USE_ITT_BUILD |
| kmp_uint64 *cur_chunk, |
| #endif |
| typename traits_t<T>::signed_t chunk, |
| T nproc, T unit_id); |
| template <typename T> |
| extern int __kmp_dispatch_next_algorithm( |
| int gtid, dispatch_private_info_template<T> *pr, |
| dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb, |
| T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id); |
| |
| void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref); |
| void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref); |
| |
| #if KMP_STATIC_STEAL_ENABLED |
| |
| // replaces dispatch_private_info{32,64} structures and |
| // dispatch_private_info{32,64}_t types |
| template <typename T> struct dispatch_private_infoXX_template { |
| typedef typename traits_t<T>::unsigned_t UT; |
| typedef typename traits_t<T>::signed_t ST; |
| UT count; // unsigned |
| T ub; |
| /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ |
| T lb; |
| ST st; // signed |
| UT tc; // unsigned |
| T static_steal_counter; // for static_steal only; maybe better to put after ub |
| |
| /* parm[1-4] are used in different ways by different scheduling algorithms */ |
| |
| // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) |
| // a) parm3 is properly aligned and |
| // b) all parm1-4 are in the same cache line. |
| // Because of parm1-4 are used together, performance seems to be better |
| // if they are in the same line (not measured though). |
| |
| struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4 |
| T parm1; |
| T parm2; |
| T parm3; |
| T parm4; |
| }; |
| |
| UT ordered_lower; // unsigned |
| UT ordered_upper; // unsigned |
| #if KMP_OS_WINDOWS |
| T last_upper; |
| #endif /* KMP_OS_WINDOWS */ |
| }; |
| |
| #else /* KMP_STATIC_STEAL_ENABLED */ |
| |
| // replaces dispatch_private_info{32,64} structures and |
| // dispatch_private_info{32,64}_t types |
| template <typename T> struct dispatch_private_infoXX_template { |
| typedef typename traits_t<T>::unsigned_t UT; |
| typedef typename traits_t<T>::signed_t ST; |
| T lb; |
| T ub; |
| ST st; // signed |
| UT tc; // unsigned |
| |
| T parm1; |
| T parm2; |
| T parm3; |
| T parm4; |
| |
| UT count; // unsigned |
| |
| UT ordered_lower; // unsigned |
| UT ordered_upper; // unsigned |
| #if KMP_OS_WINDOWS |
| T last_upper; |
| #endif /* KMP_OS_WINDOWS */ |
| }; |
| #endif /* KMP_STATIC_STEAL_ENABLED */ |
| |
| template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template { |
| // duplicate alignment here, otherwise size of structure is not correct in our |
| // compiler |
| union KMP_ALIGN_CACHE private_info_tmpl { |
| dispatch_private_infoXX_template<T> p; |
| dispatch_private_info64_t p64; |
| } u; |
| enum sched_type schedule; /* scheduling algorithm */ |
| kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */ |
| kmp_uint32 ordered_bumped; |
| // to retain the structure size after making order |
| kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3]; |
| dispatch_private_info *next; /* stack of buffers for nest of serial regions */ |
| kmp_uint32 type_size; |
| #if KMP_USE_HIER_SCHED |
| kmp_int32 hier_id; |
| kmp_hier_top_unit_t<T> *hier_parent; |
| // member functions |
| kmp_int32 get_hier_id() const { return hier_id; } |
| kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; } |
| #endif |
| enum cons_type pushed_ws; |
| }; |
| |
| // replaces dispatch_shared_info{32,64} structures and |
| // dispatch_shared_info{32,64}_t types |
| template <typename T> struct dispatch_shared_infoXX_template { |
| typedef typename traits_t<T>::unsigned_t UT; |
| /* chunk index under dynamic, number of idle threads under static-steal; |
| iteration index otherwise */ |
| volatile UT iteration; |
| volatile UT num_done; |
| volatile UT ordered_iteration; |
| // to retain the structure size making ordered_iteration scalar |
| UT ordered_dummy[KMP_MAX_ORDERED - 3]; |
| }; |
| |
| // replaces dispatch_shared_info structure and dispatch_shared_info_t type |
| template <typename T> struct dispatch_shared_info_template { |
| typedef typename traits_t<T>::unsigned_t UT; |
| // we need union here to keep the structure size |
| union shared_info_tmpl { |
| dispatch_shared_infoXX_template<UT> s; |
| dispatch_shared_info64_t s64; |
| } u; |
| volatile kmp_uint32 buffer_index; |
| #if OMP_45_ENABLED |
| volatile kmp_int32 doacross_buf_idx; // teamwise index |
| kmp_uint32 *doacross_flags; // array of iteration flags (0/1) |
| kmp_int32 doacross_num_done; // count finished threads |
| #endif |
| #if KMP_USE_HIER_SCHED |
| kmp_hier_t<T> *hier; |
| #endif |
| #if KMP_USE_HWLOC |
| // When linking with libhwloc, the ORDERED EPCC test slowsdown on big |
| // machines (> 48 cores). Performance analysis showed that a cache thrash |
| // was occurring and this padding helps alleviate the problem. |
| char padding[64]; |
| #endif |
| }; |
| |
| /* ------------------------------------------------------------------------ */ |
| /* ------------------------------------------------------------------------ */ |
| |
| #undef USE_TEST_LOCKS |
| |
| // test_then_add template (general template should NOT be used) |
| template <typename T> static __forceinline T test_then_add(volatile T *p, T d); |
| |
| template <> |
| __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p, |
| kmp_int32 d) { |
| kmp_int32 r; |
| r = KMP_TEST_THEN_ADD32(p, d); |
| return r; |
| } |
| |
| template <> |
| __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p, |
| kmp_int64 d) { |
| kmp_int64 r; |
| r = KMP_TEST_THEN_ADD64(p, d); |
| return r; |
| } |
| |
| // test_then_inc_acq template (general template should NOT be used) |
| template <typename T> static __forceinline T test_then_inc_acq(volatile T *p); |
| |
| template <> |
| __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) { |
| kmp_int32 r; |
| r = KMP_TEST_THEN_INC_ACQ32(p); |
| return r; |
| } |
| |
| template <> |
| __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) { |
| kmp_int64 r; |
| r = KMP_TEST_THEN_INC_ACQ64(p); |
| return r; |
| } |
| |
| // test_then_inc template (general template should NOT be used) |
| template <typename T> static __forceinline T test_then_inc(volatile T *p); |
| |
| template <> |
| __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) { |
| kmp_int32 r; |
| r = KMP_TEST_THEN_INC32(p); |
| return r; |
| } |
| |
| template <> |
| __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) { |
| kmp_int64 r; |
| r = KMP_TEST_THEN_INC64(p); |
| return r; |
| } |
| |
| // compare_and_swap template (general template should NOT be used) |
| template <typename T> |
| static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s); |
| |
| template <> |
| __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p, |
| kmp_int32 c, kmp_int32 s) { |
| return KMP_COMPARE_AND_STORE_REL32(p, c, s); |
| } |
| |
| template <> |
| __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p, |
| kmp_int64 c, kmp_int64 s) { |
| return KMP_COMPARE_AND_STORE_REL64(p, c, s); |
| } |
| |
| template <typename T> kmp_uint32 __kmp_ge(T value, T checker) { |
| return value >= checker; |
| } |
| template <typename T> kmp_uint32 __kmp_eq(T value, T checker) { |
| return value == checker; |
| } |
| |
| /* |
| Spin wait loop that first does pause, then yield. |
| Waits until function returns non-zero when called with *spinner and check. |
| Does NOT put threads to sleep. |
| Arguments: |
| UT is unsigned 4- or 8-byte type |
| spinner - memory location to check value |
| checker - value which spinner is >, <, ==, etc. |
| pred - predicate function to perform binary comparison of some sort |
| #if USE_ITT_BUILD |
| obj -- is higher-level synchronization object to report to ittnotify. It |
| is used to report locks consistently. For example, if lock is acquired |
| immediately, its address is reported to ittnotify via |
| KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately |
| and lock routine calls to KMP_WAIT_YIELD(), the later should report the |
| same address, not an address of low-level spinner. |
| #endif // USE_ITT_BUILD |
| TODO: make inline function (move to header file for icl) |
| */ |
| template <typename UT> |
| static UT __kmp_wait_yield(volatile UT *spinner, UT checker, |
| kmp_uint32 (*pred)(UT, UT) |
| USE_ITT_BUILD_ARG(void *obj)) { |
| // note: we may not belong to a team at this point |
| volatile UT *spin = spinner; |
| UT check = checker; |
| kmp_uint32 spins; |
| kmp_uint32 (*f)(UT, UT) = pred; |
| UT r; |
| |
| KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin)); |
| KMP_INIT_YIELD(spins); |
| // main wait spin loop |
| while (!f(r = *spin, check)) { |
| KMP_FSYNC_SPIN_PREPARE(obj); |
| /* GEH - remove this since it was accidentally introduced when kmp_wait was |
| split. |
| It causes problems with infinite recursion because of exit lock */ |
| /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) |
| __kmp_abort_thread(); */ |
| |
| // if we are oversubscribed, |
| // or have waited a bit (and KMP_LIBRARY=throughput, then yield |
| // pause is in the following code |
| KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); |
| KMP_YIELD_SPIN(spins); |
| } |
| KMP_FSYNC_SPIN_ACQUIRED(obj); |
| return r; |
| } |
| |
| /* ------------------------------------------------------------------------ */ |
| /* ------------------------------------------------------------------------ */ |
| |
| template <typename UT> |
| void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { |
| typedef typename traits_t<UT>::signed_t ST; |
| dispatch_private_info_template<UT> *pr; |
| |
| int gtid = *gtid_ref; |
| // int cid = *cid_ref; |
| kmp_info_t *th = __kmp_threads[gtid]; |
| KMP_DEBUG_ASSERT(th->th.th_dispatch); |
| |
| KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid)); |
| if (__kmp_env_consistency_check) { |
| pr = reinterpret_cast<dispatch_private_info_template<UT> *>( |
| th->th.th_dispatch->th_dispatch_pr_current); |
| if (pr->pushed_ws != ct_none) { |
| #if KMP_USE_DYNAMIC_LOCK |
| __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0); |
| #else |
| __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL); |
| #endif |
| } |
| } |
| |
| if (!th->th.th_team->t.t_serialized) { |
| dispatch_shared_info_template<UT> *sh = |
| reinterpret_cast<dispatch_shared_info_template<UT> *>( |
| th->th.th_dispatch->th_dispatch_sh_current); |
| UT lower; |
| |
| if (!__kmp_env_consistency_check) { |
| pr = reinterpret_cast<dispatch_private_info_template<UT> *>( |
| th->th.th_dispatch->th_dispatch_pr_current); |
| } |
| lower = pr->u.p.ordered_lower; |
| |
| #if !defined(KMP_GOMP_COMPAT) |
| if (__kmp_env_consistency_check) { |
| if (pr->ordered_bumped) { |
| struct cons_header *p = __kmp_threads[gtid]->th.th_cons; |
| __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, |
| ct_ordered_in_pdo, loc_ref, |
| &p->stack_data[p->w_top]); |
| } |
| } |
| #endif /* !defined(KMP_GOMP_COMPAT) */ |
| |
| KMP_MB(); |
| #ifdef KMP_DEBUG |
| { |
| char *buff; |
| // create format specifiers before the debug output |
| buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: " |
| "ordered_iter:%%%s lower:%%%s\n", |
| traits_t<UT>::spec, traits_t<UT>::spec); |
| KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); |
| __kmp_str_free(&buff); |
| } |
| #endif |
| __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower, |
| __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL)); |
| KMP_MB(); /* is this necessary? */ |
| #ifdef KMP_DEBUG |
| { |
| char *buff; |
| // create format specifiers before the debug output |
| buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: " |
| "ordered_iter:%%%s lower:%%%s\n", |
| traits_t<UT>::spec, traits_t<UT>::spec); |
| KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower)); |
| __kmp_str_free(&buff); |
| } |
| #endif |
| } |
| KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid)); |
| } |
| |
| template <typename UT> |
| void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { |
| typedef typename traits_t<UT>::signed_t ST; |
| dispatch_private_info_template<UT> *pr; |
| |
| int gtid = *gtid_ref; |
| // int cid = *cid_ref; |
| kmp_info_t *th = __kmp_threads[gtid]; |
| KMP_DEBUG_ASSERT(th->th.th_dispatch); |
| |
| KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid)); |
| if (__kmp_env_consistency_check) { |
| pr = reinterpret_cast<dispatch_private_info_template<UT> *>( |
| th->th.th_dispatch->th_dispatch_pr_current); |
| if (pr->pushed_ws != ct_none) { |
| __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref); |
| } |
| } |
| |
| if (!th->th.th_team->t.t_serialized) { |
| dispatch_shared_info_template<UT> *sh = |
| reinterpret_cast<dispatch_shared_info_template<UT> *>( |
| th->th.th_dispatch->th_dispatch_sh_current); |
| |
| if (!__kmp_env_consistency_check) { |
| pr = reinterpret_cast<dispatch_private_info_template<UT> *>( |
| th->th.th_dispatch->th_dispatch_pr_current); |
| } |
| |
| KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration)); |
| #if !defined(KMP_GOMP_COMPAT) |
| if (__kmp_env_consistency_check) { |
| if (pr->ordered_bumped != 0) { |
| struct cons_header *p = __kmp_threads[gtid]->th.th_cons; |
| /* How to test it? - OM */ |
| __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting, |
| ct_ordered_in_pdo, loc_ref, |
| &p->stack_data[p->w_top]); |
| } |
| } |
| #endif /* !defined(KMP_GOMP_COMPAT) */ |
| |
| KMP_MB(); /* Flush all pending memory write invalidates. */ |
| |
| pr->ordered_bumped += 1; |
| |
| KD_TRACE(1000, |
| ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n", |
| gtid, pr->ordered_bumped)); |
| |
| KMP_MB(); /* Flush all pending memory write invalidates. */ |
| |
| /* TODO use general release procedure? */ |
| test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration); |
| |
| KMP_MB(); /* Flush all pending memory write invalidates. */ |
| } |
| KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid)); |
| } |
| |
| /* Computes and returns x to the power of y, where y must a non-negative integer |
| */ |
| template <typename UT> |
| static __forceinline long double __kmp_pow(long double x, UT y) { |
| long double s = 1.0L; |
| |
| KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0); |
| // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned |
| while (y) { |
| if (y & 1) |
| s *= x; |
| x *= x; |
| y >>= 1; |
| } |
| return s; |
| } |
| |
| /* Computes and returns the number of unassigned iterations after idx chunks |
| have been assigned |
| (the total number of unassigned iterations in chunks with index greater than |
| or equal to idx). |
| __forceinline seems to be broken so that if we __forceinline this function, |
| the behavior is wrong |
| (one of the unit tests, sch_guided_analytical_basic.cpp, fails) |
| */ |
| template <typename T> |
| static __inline typename traits_t<T>::unsigned_t |
| __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base, |
| typename traits_t<T>::unsigned_t idx) { |
| /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at |
| least for ICL 8.1, long double arithmetic may not really have |
| long double precision, even with /Qlong_double. Currently, we |
| workaround that in the caller code, by manipulating the FPCW for |
| Windows* OS on IA-32 architecture. The lack of precision is not |
| expected to be a correctness issue, though. |
| */ |
| typedef typename traits_t<T>::unsigned_t UT; |
| |
| long double x = tc * __kmp_pow<UT>(base, idx); |
| UT r = (UT)x; |
| if (x == r) |
| return r; |
| return r + 1; |
| } |
| |
// Tuning knobs of the guided-iterative algorithm, for a factor n:
//   p2 = n * nproc * ( chunk + 1 )   point of switching to dynamic
//   p3 = 1 / ( n * nproc )           remaining iterations multiplier
// The default is n = 2. A larger n, e.g. n = 3, makes the distribution of
// chunks flatter; with n = 1 the first chunk is the same as for the static
// schedule, i.e. trip / nproc.
// guided_flt_param must stay equal to 1.0 / guided_int_param.
static const int guided_int_param = 2;
static const double guided_flt_param = 0.5;
| #endif // KMP_DISPATCH_H |