| #ifndef KMP_STATS_H |
| #define KMP_STATS_H |
| |
| /** @file kmp_stats.h |
| * Functions for collecting statistics. |
| */ |
| |
| //===----------------------------------------------------------------------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is dual licensed under the MIT and the University of Illinois Open |
| // Source Licenses. See LICENSE.txt for details. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "kmp_config.h" |
| #include "kmp_debug.h" |
| |
| #if KMP_STATS_ENABLED |
| /* Statistics accumulator. |
| Accumulates number of samples and computes min, max, mean, standard deviation |
| on the fly. |
| |
| Online variance calculation algorithm from |
| http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm |
| */ |
| |
| #include "kmp_stats_timing.h" |
| #include <limits> |
| #include <math.h> |
| #include <new> // placement new |
| #include <stdint.h> |
| #include <string> |
| #include <vector> |
| |
/* Developer statistics switch. Set this to a non-zero value to compile in the
   extra timers listed under KMP_FOREACH_DEVELOPER_TIMER(). They are more
   detailed than is useful for application characterisation and are intended
   for the runtime library developer. */
#define KMP_DEVELOPER_STATS 0

/* Enable/Disable histogram output. The value is the default for the doHist
   argument of the statistic constructor. */
#define KMP_STATS_HIST 0
| |
/*!
 * @ingroup STATS_GATHERING
 * \brief flags to describe the statistic (timer or counter)
 *
 * Every enumerator owns one distinct bit, so several of them can be OR-ed
 * together into a single flags word.
 */
enum stats_flags_e {
  //! do not show a TOTAL_aggregation for this statistic
  noTotal = 0x01,
  //! statistic is valid only for master
  onlyInMaster = 0x02,
  //! statistic doesn't need units printed next to it
  noUnits = 0x04,
  //! statistic is valid only for non-master threads
  notInMaster = 0x08,
  //! statistic can be logged on the event timeline when KMP_STATS_EVENTS is
  //! on (valid only for timers)
  logEvent = 0x10
};
| |
/*!
 * @ingroup STATS_GATHERING
 * \brief the states which a thread can be in
 *
 * The numeric values are spelled out; they are the same consecutive values the
 * compiler assigns when no initializers are given.
 */
enum stats_state_e {
  IDLE = 0,
  SERIAL_REGION = 1,
  FORK_JOIN_BARRIER = 2,
  PLAIN_BARRIER = 3,
  TASKWAIT = 4,
  TASKYIELD = 5,
  TASKGROUP = 6,
  IMPLICIT_TASK = 7,
  EXPLICIT_TASK = 8
};
| |
/*!
 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h
 *
 * @param macro a user defined macro that takes three arguments -
 * macro(COUNTER_NAME, flags, arg)
 * @param arg a user defined argument to send to the user defined macro
 *
 * \details A counter counts the occurrence of some event. Each thread
 * accumulates its own count; at the end of execution the counts are aggregated
 * treating each thread as a separate measurement (unless onlyInMaster is set,
 * in which case there is only a single measurement). The min, mean and max are
 * therefore the values for the threads. Adding the counter here and then
 * putting a KMP_COUNT_BLOCK(name) at the point you want to count is all you
 * need to do. All of the tables and printing are generated from this macro.
 * Format is "macro(name, flags, arg)"
 *
 * @ingroup STATS_GATHERING
 */
// clang-format off
#define KMP_FOREACH_COUNTER(macro, arg) \
  macro(OMP_PARALLEL, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, \
        arg) \
  macro(OMP_NESTED_PARALLEL,   0, arg) \
  macro(OMP_LOOP_STATIC,       0, arg) \
  macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
  macro(OMP_LOOP_DYNAMIC,      0, arg) \
  macro(OMP_DISTRIBUTE,        0, arg) \
  macro(OMP_BARRIER,           0, arg) \
  macro(OMP_CRITICAL,          0, arg) \
  macro(OMP_SINGLE,            0, arg) \
  macro(OMP_MASTER,            0, arg) \
  macro(OMP_TEAMS,             0, arg) \
  macro(OMP_set_lock,          0, arg) \
  macro(OMP_test_lock,         0, arg) \
  macro(REDUCE_wait,           0, arg) \
  macro(REDUCE_nowait,         0, arg) \
  macro(OMP_TASKYIELD,         0, arg) \
  macro(OMP_TASKLOOP,          0, arg) \
  macro(TASK_executed,         0, arg) \
  macro(TASK_cancelled,        0, arg) \
  macro(TASK_stolen,           0, arg)
// clang-format on
| |
/*!
 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h
 *
 * @param macro a user defined macro that takes three arguments -
 * macro(TIMER_NAME, flags, arg)
 * @param arg a user defined argument to send to the user defined macro
 *
 * \details A timer collects multiple samples of some count in each thread and
 * then finally aggregates all of the samples from all of the threads. For most
 * timers the printing code also provides an aggregation over the thread
 * totals; these are printed as TOTAL_foo. The count is normally a time (in
 * ticks), hence the name "timer", but it can be any value, so this is also
 * used for "number of arguments passed to fork". For timers the threads are
 * not significant; it is the individual observations that count, so the
 * statistics are at that level. Format is "macro(name, flags, arg)"
 *
 * The list ends by expanding KMP_FOREACH_DEVELOPER_TIMER(), so any developer
 * timers are appended after the entries written out here.
 *
 * @ingroup STATS_GATHERING2
 */
// clang-format off
#define KMP_FOREACH_TIMER(macro, arg) \
  macro(OMP_worker_thread_life,      stats_flags_e::logEvent, arg) \
  macro(OMP_parallel,                stats_flags_e::logEvent, arg) \
  macro(OMP_parallel_overhead,       stats_flags_e::logEvent, arg) \
  macro(OMP_loop_static,             0, arg) \
  macro(OMP_loop_static_scheduling,  0, arg) \
  macro(OMP_loop_dynamic,            0, arg) \
  macro(OMP_loop_dynamic_scheduling, 0, arg) \
  macro(OMP_critical,                0, arg) \
  macro(OMP_critical_wait,           0, arg) \
  macro(OMP_single,                  0, arg) \
  macro(OMP_master,                  0, arg) \
  macro(OMP_task_immediate,          0, arg) \
  macro(OMP_task_taskwait,           0, arg) \
  macro(OMP_task_taskyield,          0, arg) \
  macro(OMP_task_taskgroup,          0, arg) \
  macro(OMP_task_join_bar,           0, arg) \
  macro(OMP_task_plain_bar,          0, arg) \
  macro(OMP_taskloop_scheduling,     0, arg) \
  macro(OMP_plain_barrier,           stats_flags_e::logEvent, arg) \
  macro(OMP_idle,                    stats_flags_e::logEvent, arg) \
  macro(OMP_fork_barrier,            stats_flags_e::logEvent, arg) \
  macro(OMP_join_barrier,            stats_flags_e::logEvent, arg) \
  macro(OMP_serial,                  stats_flags_e::logEvent, arg) \
  macro(OMP_set_numthreads, \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_PARALLEL_args, \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_loop_static_iterations, \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(OMP_loop_dynamic_iterations, \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
// clang-format on
| |
// OMP_worker_thread_life      -- Time from thread becoming an OpenMP thread
//                                (either initializing OpenMP or being created
//                                by a master) until the thread is destroyed
// OMP_parallel                -- Time thread spends executing work directly
//                                within a #pragma omp parallel
// OMP_parallel_overhead       -- Time thread spends setting up a parallel
//                                region
// OMP_loop_static             -- Time thread spends executing loop iterations
//                                from a statically scheduled loop
// OMP_loop_static_scheduling  -- Time thread spends scheduling loop iterations
//                                from a statically scheduled loop
// OMP_loop_dynamic            -- Time thread spends executing loop iterations
//                                from a dynamically scheduled loop
// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
//                                from a dynamically scheduled loop
// OMP_critical                -- Time thread spends executing critical section
// OMP_critical_wait           -- Time thread spends waiting to enter
//                                a critical section
// OMP_single                  -- Time spent executing a "single" region
// OMP_master                  -- Time spent executing a "master" region
// OMP_task_immediate          -- Time spent executing non-deferred tasks
// OMP_task_taskwait           -- Time spent executing tasks inside a taskwait
//                                construct
// OMP_task_taskyield          -- Time spent executing tasks inside a taskyield
//                                construct
// OMP_task_taskgroup          -- Time spent executing tasks inside a taskgroup
//                                construct
// OMP_task_join_bar           -- Time spent executing tasks inside a join
//                                barrier
// OMP_task_plain_bar          -- Time spent executing tasks inside a barrier
//                                construct
// OMP_taskloop_scheduling     -- Time spent scheduling tasks inside a taskloop
//                                construct
// OMP_plain_barrier           -- Time spent in a #pragma omp barrier construct
//                                or inside implicit barrier at end of
//                                worksharing construct
// OMP_idle                    -- Time worker threads spend waiting for next
//                                parallel region
// OMP_fork_barrier            -- Time spent in the fork barrier surrounding a
//                                parallel region
// OMP_join_barrier            -- Time spent in the join barrier surrounding a
//                                parallel region
// OMP_serial                  -- Time thread zero spends executing serial code
// OMP_set_numthreads          -- Values passed to omp_set_num_threads
// OMP_PARALLEL_args           -- Number of arguments passed to a parallel
//                                region
// OMP_loop_static_iterations  -- Number of iterations thread is assigned for
//                                statically scheduled loops
// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
//                                dynamically scheduled loops
| |
#if (KMP_DEVELOPER_STATS)
// Timers which are of interest to runtime library developers, not end users.
// These have to be explicitly enabled in addition to the other stats.
//
// Entries carried over from the earlier description of this list:
// KMP_end_split_barrier -- time in __kmp_end_split_barrier
// KMP_setup_icv_copy    -- time in __kmp_setup_icv_copy
// KMP_linear_gather     -- time in __kmp_linear_barrier_gather
// KMP_linear_release    -- time in __kmp_linear_barrier_release
// KMP_tree_gather       -- time in __kmp_tree_barrier_gather
// KMP_tree_release      -- time in __kmp_tree_barrier_release
// KMP_hyper_gather      -- time in __kmp_hyper_barrier_gather
// KMP_hyper_release     -- time in __kmp_hyper_barrier_release
//
// NOTE(review): the remaining entries (KMP_fork_call, KMP_join_call,
// KMP_hier_gather, KMP_hier_release, USER_resume, USER_suspend,
// KMP_allocate_team, USER_icv_copy, FOR_static_steal_stolen and
// FOR_static_steal_chunks) have no description here. The earlier comment also
// described KMP_fork_barrier, KMP_join_barrier, KMP_barrier and KMP_icv_copy,
// none of which are in the list below -- confirm and document.
// clang-format off
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
  macro(KMP_fork_call,         0, arg) \
  macro(KMP_join_call,         0, arg) \
  macro(KMP_end_split_barrier, 0, arg) \
  macro(KMP_hier_gather,       0, arg) \
  macro(KMP_hier_release,      0, arg) \
  macro(KMP_hyper_gather,      0, arg) \
  macro(KMP_hyper_release,     0, arg) \
  macro(KMP_linear_gather,     0, arg) \
  macro(KMP_linear_release,    0, arg) \
  macro(KMP_tree_gather,       0, arg) \
  macro(KMP_tree_release,      0, arg) \
  macro(USER_resume,           0, arg) \
  macro(USER_suspend,          0, arg) \
  macro(KMP_allocate_team,     0, arg) \
  macro(KMP_setup_icv_copy,    0, arg) \
  macro(USER_icv_copy,         0, arg) \
  macro(FOR_static_steal_stolen, \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
  macro(FOR_static_steal_chunks, \
        stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
#else
// Developer statistics are off: the list contributes no timers.
#define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
#endif
// clang-format on
| |
/*!
 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
 *
 * @param macro a user defined macro that takes three arguments -
 * macro(TIMER_NAME, flags, arg)
 * @param arg a user defined argument to send to the user defined macro
 *
 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE
 * BAD THINGS WILL HAPPEN!
 *
 * \details Explicit timers are ones where we need to allocate a timer itself
 * (as well as the accumulated timing statistics). We allocate these on a
 * per-thread basis, and explicitly start and stop them. Block timers just
 * allocate the timer itself on the stack, and use the destructor to notice
 * block exit; they don't need to be defined here. The name here should be the
 * same as that of a timer above.
 *
 * As written, this macro simply forwards to KMP_FOREACH_TIMER(), so the two
 * lists are the same by construction.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \
  KMP_FOREACH_TIMER(macro, arg)
| |
| #define ENUMERATE(name, ignore, prefix) prefix##name, |
| enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST }; |
| |
| enum explicit_timer_e { |
| KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST |
| }; |
| |
| enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; |
| #undef ENUMERATE |
| |
| /* |
| * A logarithmic histogram. It accumulates the number of values in each power of |
| * ten bin. So 1<=x<10, 10<=x<100, ... |
| * Mostly useful where we have some big outliers and want to see information |
| * about them. |
| */ |
| class logHistogram { |
| enum { |
| numBins = 31, /* Number of powers of 10. If this changes you need to change |
| * the initializer for binMax */ |
| |
| /* |
| * If you want to use this to analyse values that may be less than 1, (for |
| * instance times in s), then the logOffset gives you negative powers. |
| * In our case here, we're just looking at times in ticks, or counts, so we |
| * can never see values with magnitude < 1 (other than zero), so we can set |
| * it to 0. As above change the initializer if you change this. |
| */ |
| logOffset = 0 |
| }; |
| uint32_t KMP_ALIGN_CACHE zeroCount; |
| struct { |
| uint32_t count; |
| double total; |
| } bins[numBins]; |
| |
| static double binMax[numBins]; |
| |
| #ifdef KMP_DEBUG |
| uint64_t _total; |
| |
| void check() const { |
| uint64_t t = zeroCount; |
| for (int i = 0; i < numBins; i++) |
| t += bins[i].count; |
| KMP_DEBUG_ASSERT(t == _total); |
| } |
| #else |
| void check() const {} |
| #endif |
| |
| public: |
| logHistogram() { reset(); } |
| |
| logHistogram(logHistogram const &o) { |
| for (int i = 0; i < numBins; i++) |
| bins[i] = o.bins[i]; |
| #ifdef KMP_DEBUG |
| _total = o._total; |
| #endif |
| } |
| |
| void reset() { |
| zeroCount = 0; |
| for (int i = 0; i < numBins; i++) { |
| bins[i].count = 0; |
| bins[i].total = 0; |
| } |
| |
| #ifdef KMP_DEBUG |
| _total = 0; |
| #endif |
| } |
| uint32_t count(int b) const { return bins[b + logOffset].count; } |
| double total(int b) const { return bins[b + logOffset].total; } |
| static uint32_t findBin(double sample); |
| |
| logHistogram &operator+=(logHistogram const &o) { |
| zeroCount += o.zeroCount; |
| for (int i = 0; i < numBins; i++) { |
| bins[i].count += o.bins[i].count; |
| bins[i].total += o.bins[i].total; |
| } |
| #ifdef KMP_DEBUG |
| _total += o._total; |
| check(); |
| #endif |
| |
| return *this; |
| } |
| |
| void addSample(double sample); |
| int minBin() const; |
| int maxBin() const; |
| |
| std::string format(char) const; |
| }; |
| |
| class statistic { |
| double KMP_ALIGN_CACHE minVal; |
| double maxVal; |
| double meanVal; |
| double m2; |
| uint64_t sampleCount; |
| double offset; |
| bool collectingHist; |
| logHistogram hist; |
| |
| public: |
| statistic(bool doHist = bool(KMP_STATS_HIST)) { |
| reset(); |
| collectingHist = doHist; |
| } |
| statistic(statistic const &o) |
| : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), |
| sampleCount(o.sampleCount), offset(o.offset), |
| collectingHist(o.collectingHist), hist(o.hist) {} |
| statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) |
| : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc), |
| sampleCount(sc), offset(0.0), collectingHist(false) {} |
| bool haveHist() const { return collectingHist; } |
| double getMin() const { return minVal; } |
| double getMean() const { return meanVal; } |
| double getMax() const { return maxVal; } |
| uint64_t getCount() const { return sampleCount; } |
| double getSD() const { return sqrt(m2 / sampleCount); } |
| double getTotal() const { return sampleCount * meanVal; } |
| logHistogram const *getHist() const { return &hist; } |
| void setOffset(double d) { offset = d; } |
| |
| void reset() { |
| minVal = std::numeric_limits<double>::max(); |
| maxVal = -minVal; |
| meanVal = 0.0; |
| m2 = 0.0; |
| sampleCount = 0; |
| offset = 0.0; |
| hist.reset(); |
| } |
| void addSample(double sample); |
| void scale(double factor); |
| void scaleDown(double f) { scale(1. / f); } |
| void forceCount(uint64_t count) { sampleCount = count; } |
| statistic &operator+=(statistic const &other); |
| |
| std::string format(char unit, bool total = false) const; |
| std::string formatHist(char unit) const { return hist.format(unit); } |
| }; |
| |
// Static description of one statistic, as stored in the timeStat::timerInfo
// and counter::counterInfo tables.
struct statInfo {
  char const *name; // name of the statistic
  uint32_t flags;   // bit mask tested against stats_flags_e values
};
| |
| class timeStat : public statistic { |
| static statInfo timerInfo[]; |
| |
| public: |
| timeStat() : statistic() {} |
| static const char *name(timer_e e) { return timerInfo[e].name; } |
| static bool noTotal(timer_e e) { |
| return timerInfo[e].flags & stats_flags_e::noTotal; |
| } |
| static bool masterOnly(timer_e e) { |
| return timerInfo[e].flags & stats_flags_e::onlyInMaster; |
| } |
| static bool workerOnly(timer_e e) { |
| return timerInfo[e].flags & stats_flags_e::notInMaster; |
| } |
| static bool noUnits(timer_e e) { |
| return timerInfo[e].flags & stats_flags_e::noUnits; |
| } |
| static bool logEvent(timer_e e) { |
| return timerInfo[e].flags & stats_flags_e::logEvent; |
| } |
| static void clearEventFlags() { |
| for (int i = 0; i < TIMER_LAST; i++) { |
| timerInfo[i].flags &= (~(stats_flags_e::logEvent)); |
| } |
| } |
| }; |
| |
| // Where we need explicitly to start and end the timer, this version can be used |
| // Since these timers normally aren't nicely scoped, so don't have a good place |
| // to live on the stack of the thread, they're more work to use. |
| class explicitTimer { |
| timeStat *stat; |
| timer_e timerEnumValue; |
| tsc_tick_count startTime; |
| tsc_tick_count pauseStartTime; |
| tsc_tick_count::tsc_interval_t totalPauseTime; |
| |
| public: |
| explicitTimer(timeStat *s, timer_e te) |
| : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0), |
| totalPauseTime() {} |
| |
| // void setStat(timeStat *s) { stat = s; } |
| void start(tsc_tick_count tick); |
| void pause(tsc_tick_count tick) { pauseStartTime = tick; } |
| void resume(tsc_tick_count tick) { |
| totalPauseTime += (tick - pauseStartTime); |
| } |
| void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr); |
| void reset() { |
| startTime = 0; |
| pauseStartTime = 0; |
| totalPauseTime = 0; |
| } |
| timer_e get_type() const { return timerEnumValue; } |
| }; |
| |
| // Where you need to partition a threads clock ticks into separate states |
| // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and |
| // DOING_NOTHING would render these conditions: |
| // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive |
| // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice |
| // versa |
| class partitionedTimers { |
| private: |
| std::vector<explicitTimer> timer_stack; |
| |
| public: |
| partitionedTimers(); |
| void init(explicitTimer timer); |
| void exchange(explicitTimer timer); |
| void push(explicitTimer timer); |
| void pop(); |
| void windup(); |
| }; |
| |
| // Special wrapper around the partioned timers to aid timing code blocks |
| // It avoids the need to have an explicit end, leaving the scope suffices. |
| class blockPartitionedTimer { |
| partitionedTimers *part_timers; |
| |
| public: |
| blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) |
| : part_timers(pt) { |
| part_timers->push(timer); |
| } |
| ~blockPartitionedTimer() { part_timers->pop(); } |
| }; |
| |
| // Special wrapper around the thread state to aid in keeping state in code |
| // blocks It avoids the need to have an explicit end, leaving the scope |
| // suffices. |
| class blockThreadState { |
| stats_state_e *state_pointer; |
| stats_state_e old_state; |
| |
| public: |
| blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state) |
| : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) { |
| *state_pointer = new_state; |
| } |
| ~blockThreadState() { *state_pointer = old_state; } |
| }; |
| |
| // If all you want is a count, then you can use this... |
| // The individual per-thread counts will be aggregated into a statistic at |
| // program exit. |
| class counter { |
| uint64_t value; |
| static const statInfo counterInfo[]; |
| |
| public: |
| counter() : value(0) {} |
| void increment() { value++; } |
| uint64_t getValue() const { return value; } |
| void reset() { value = 0; } |
| static const char *name(counter_e e) { return counterInfo[e].name; } |
| static bool masterOnly(counter_e e) { |
| return counterInfo[e].flags & stats_flags_e::onlyInMaster; |
| } |
| }; |
| |
| /* **************************************************************** |
| Class to implement an event |
| |
    There are four components to an event: start time, stop time,
    nest_level, and timer_name.
| The start and stop time should be obvious (recorded in clock ticks). |
| The nest_level relates to the bar width in the timeline graph. |
| The timer_name is used to determine which timer event triggered this event. |
| |
| the interface to this class is through four read-only operations: |
| 1) getStart() -- returns the start time as 64 bit integer |
| 2) getStop() -- returns the stop time as 64 bit integer |
| 3) getNestLevel() -- returns the nest level of the event |
| 4) getTimerName() -- returns the timer name that triggered event |
| |
| *MORE ON NEST_LEVEL* |
| The nest level is used in the bar graph that represents the timeline. |
    Its main purpose is for showing how events are nested inside each other.
| For example, say events, A, B, and C are recorded. If the timeline |
| looks like this: |
| |
| Begin -------------------------------------------------------------> Time |
| | | | | | | |
| A B C C B A |
| start start start end end end |
| |
| Then A, B, C will have a nest level of 1, 2, 3 respectively. |
| These values are then used to calculate the barwidth so you can |
| see that inside A, B has occurred, and inside B, C has occurred. |
| Currently, this is shown with A's bar width being larger than B's |
| bar width, and B's bar width being larger than C's bar width. |
| |
| **************************************************************** */ |
| class kmp_stats_event { |
| uint64_t start; |
| uint64_t stop; |
| int nest_level; |
| timer_e timer_name; |
| |
| public: |
| kmp_stats_event() |
| : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} |
| kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) |
| : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} |
| inline uint64_t getStart() const { return start; } |
| inline uint64_t getStop() const { return stop; } |
| inline int getNestLevel() const { return nest_level; } |
| inline timer_e getTimerName() const { return timer_name; } |
| }; |
| |
| /* **************************************************************** |
| Class to implement a dynamically expandable array of events |
| |
| --------------------------------------------------------- |
| | event 1 | event 2 | event 3 | event 4 | ... | event N | |
| --------------------------------------------------------- |
| |
    An event is pushed onto the back of this array at every
    explicitTimer->stop() call. The event records the start time, stop time,
    nest level (related to the bar width) and the name of the timer.
| |
| The event vector starts at size INIT_SIZE and grows (doubles in size) |
| if needed. An implication of this behavior is that log(N) |
| reallocations are needed (where N is number of events). If you want |
| to avoid reallocations, then set INIT_SIZE to a large value. |
| |
| the interface to this class is through six operations: |
| 1) reset() -- sets the internal_size back to 0 but does not deallocate any |
| memory |
| 2) size() -- returns the number of valid elements in the vector |
| 3) push_back(start, stop, nest, timer_name) -- pushes an event onto |
| the back of the array |
| 4) deallocate() -- frees all memory associated with the vector |
| 5) sort() -- sorts the vector by start time |
| 6) operator[index] or at(index) -- returns event reference at that index |
| **************************************************************** */ |
| class kmp_stats_event_vector { |
| kmp_stats_event *events; |
| int internal_size; |
| int allocated_size; |
| static const int INIT_SIZE = 1024; |
| |
| public: |
| kmp_stats_event_vector() { |
| events = |
| (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE); |
| internal_size = 0; |
| allocated_size = INIT_SIZE; |
| } |
| ~kmp_stats_event_vector() {} |
| inline void reset() { internal_size = 0; } |
| inline int size() const { return internal_size; } |
| void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, |
| timer_e name) { |
| int i; |
| if (internal_size == allocated_size) { |
| kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate( |
| sizeof(kmp_stats_event) * allocated_size * 2); |
| for (i = 0; i < internal_size; i++) |
| tmp[i] = events[i]; |
| __kmp_free(events); |
| events = tmp; |
| allocated_size *= 2; |
| } |
| events[internal_size] = |
| kmp_stats_event(start_time, stop_time, nest_level, name); |
| internal_size++; |
| return; |
| } |
| void deallocate(); |
| void sort(); |
| const kmp_stats_event &operator[](int index) const { return events[index]; } |
| kmp_stats_event &operator[](int index) { return events[index]; } |
| const kmp_stats_event &at(int index) const { return events[index]; } |
| kmp_stats_event &at(int index) { return events[index]; } |
| }; |
| |
| /* **************************************************************** |
| Class to implement a doubly-linked, circular, statistics list |
| |
| |---| ---> |---| ---> |---| ---> |---| ---> ... next |
| | | | | | | | | |
| |---| <--- |---| <--- |---| <--- |---| <--- ... prev |
| Sentinel first second third |
| Node node node node |
| |
| The Sentinel Node is the user handle on the list. |
| The first node corresponds to thread 0's statistics. |
| The second node corresponds to thread 1's statistics and so on... |
| |
    Each node has a _timers and a _counters array to hold that thread's
    statistics, plus a thread_life_timer (an explicitTimer) and a
    _partitionedTimers stack of explicitTimers. An explicitTimer points to the
    correct _timer and updates its statistics at every stop() call. The
    thread_life_timer's pointer is set up in the constructor. Each node also
    has an event vector to hold that thread's timing events. The event vector
    expands as necessary and records the start-stop times for each timer.
| |
| The nestLevel variable is for plotting events and is related |
| to the bar width in the timeline graph. |
| |
| Every thread will have a thread local pointer to its node in |
| the list. The sentinel node is used by the master thread to |
| store "dummy" statistics before __kmp_create_worker() is called. |
| **************************************************************** */ |
| class kmp_stats_list { |
| int gtid; |
| timeStat _timers[TIMER_LAST + 1]; |
| counter _counters[COUNTER_LAST + 1]; |
| explicitTimer thread_life_timer; |
| partitionedTimers _partitionedTimers; |
| int _nestLevel; // one per thread |
| kmp_stats_event_vector _event_vector; |
| kmp_stats_list *next; |
| kmp_stats_list *prev; |
| stats_state_e state; |
| int thread_is_idle_flag; |
| |
| public: |
| kmp_stats_list() |
| : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], |
| TIMER_OMP_worker_thread_life), |
| _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), |
| thread_is_idle_flag(0) {} |
| ~kmp_stats_list() {} |
| inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } |
| inline counter *getCounter(counter_e idx) { return &_counters[idx]; } |
| inline partitionedTimers *getPartitionedTimers() { |
| return &_partitionedTimers; |
| } |
| inline timeStat *getTimers() { return _timers; } |
| inline counter *getCounters() { return _counters; } |
| inline kmp_stats_event_vector &getEventVector() { return _event_vector; } |
| inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } |
| inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } |
| inline void resetEventVector() { _event_vector.reset(); } |
| inline void incrementNestValue() { _nestLevel++; } |
| inline int getNestValue() { return _nestLevel; } |
| inline void decrementNestValue() { _nestLevel--; } |
| inline int getGtid() const { return gtid; } |
| inline void setGtid(int newgtid) { gtid = newgtid; } |
| inline void setState(stats_state_e newstate) { state = newstate; } |
| inline stats_state_e getState() const { return state; } |
| inline stats_state_e *getStatePointer() { return &state; } |
| inline bool isIdle() { return thread_is_idle_flag == 1; } |
| inline void setIdleFlag() { thread_is_idle_flag = 1; } |
| inline void resetIdleFlag() { thread_is_idle_flag = 0; } |
| kmp_stats_list *push_back(int gtid); // returns newly created list node |
| inline void push_event(uint64_t start_time, uint64_t stop_time, |
| int nest_level, timer_e name) { |
| _event_vector.push_back(start_time, stop_time, nest_level, name); |
| } |
| void deallocate(); |
| class iterator; |
| kmp_stats_list::iterator begin(); |
| kmp_stats_list::iterator end(); |
| int size(); |
| class iterator { |
| kmp_stats_list *ptr; |
| friend kmp_stats_list::iterator kmp_stats_list::begin(); |
| friend kmp_stats_list::iterator kmp_stats_list::end(); |
| |
| public: |
| iterator(); |
| ~iterator(); |
| iterator operator++(); |
| iterator operator++(int dummy); |
| iterator operator--(); |
| iterator operator--(int dummy); |
| bool operator!=(const iterator &rhs); |
| bool operator==(const iterator &rhs); |
| kmp_stats_list *operator*() const; // dereference operator |
| }; |
| }; |
| |
| /* **************************************************************** |
| Class to encapsulate all output functions and the environment variables |
| |
| This module holds filenames for various outputs (normal stats, events, plot |
| file), as well as coloring information for the plot file. |
| |
| The filenames and flags variables are read from environment variables. |
| These are read once by the constructor of the global variable |
| __kmp_stats_output which calls init(). |
| |
| During this init() call, event flags for the timeStat::timerInfo[] global |
| array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). |
| |
| The only interface function that is public is outputStats(heading). This |
| function should print out everything it needs to, either to files or stderr, |
| depending on the environment variables described below |
| |
| ENVIRONMENT VARIABLES: |
| KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this |
| file, otherwise, print to stderr |
| KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to |
| either KMP_STATS_FILE or stderr |
| KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, |
| otherwise, the plot file is sent to "events.plt" |
| KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log |
| events |
| KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, |
| otherwise, output is sent to "events.dat" |
| **************************************************************** */ |
| class kmp_stats_output_module { |
| |
| public: |
| struct rgb_color { |
| float r; |
| float g; |
| float b; |
| }; |
| |
| private: |
| std::string outputFileName; |
| static const char *eventsFileName; |
| static const char *plotFileName; |
| static int printPerThreadFlag; |
| static int printPerThreadEventsFlag; |
| static const rgb_color globalColorArray[]; |
| static rgb_color timerColorInfo[]; |
| |
| void init(); |
| static void setupEventColors(); |
| static void printPloticusFile(); |
| static void printHeaderInfo(FILE *statsOut); |
| static void printTimerStats(FILE *statsOut, statistic const *theStats, |
| statistic const *totalStats); |
| static void printCounterStats(FILE *statsOut, statistic const *theStats); |
| static void printCounters(FILE *statsOut, counter const *theCounters); |
| static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents, |
| int gtid); |
| static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } |
| static void windupExplicitTimers(); |
| bool eventPrintingEnabled() const { return printPerThreadEventsFlag; } |
| |
| public: |
| kmp_stats_output_module() { init(); } |
| void outputStats(const char *heading); |
| }; |
| |
| #ifdef __cplusplus |
| extern "C" { |
| #endif |
| void __kmp_stats_init(); |
| void __kmp_stats_fini(); |
| void __kmp_reset_stats(); |
| void __kmp_output_stats(const char *); |
| void __kmp_accumulate_stats_at_exit(void); |
| // thread local pointer to stats node within list |
| extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr; |
| // head to stats list. |
| extern kmp_stats_list *__kmp_stats_list; |
| // lock for __kmp_stats_list |
| extern kmp_tas_lock_t __kmp_stats_lock; |
| // reference start time |
| extern tsc_tick_count __kmp_stats_start_time; |
| // interface to output |
| extern kmp_stats_output_module __kmp_stats_output; |
| |
| #ifdef __cplusplus |
| } |
| #endif |
| |
| // Simple, standard interfaces that drop out completely if stats aren't enabled |
| |
/*!
 * \brief Adds a sample value to the statistics of the specified timer (name).
 *
 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
 * @param value double precision sample value to add to the timer's statistics
 *
 * \details Expands to an addSample(value) call on the TIMER_<name> entry
 * obtained through the executing thread's __kmp_stats_thread_ptr.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_COUNT_VALUE(name, value)                                           \
  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)
| |
/*!
 * \brief Increments the specified counter (name).
 *
 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro
 *
 * \details Use the KMP_COUNT_BLOCK(name) macro to increment a statistics
 * counter for the executing thread. It expands to an increment() call on the
 * COUNTER_<name> entry obtained through __kmp_stats_thread_ptr.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_COUNT_BLOCK(name)                                                  \
  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
| |
/*!
 * \brief Outputs the current thread statistics and resets them.
 *
 * @param heading_string heading put above the final stats output
 *
 * \details Explicitly stops all timers and outputs all stats. The
 * `KMP_STATS_FILE` environment variable can be used to send the stats to a
 * file instead of stderr, and setting `KMP_STATS_THREADS` to "on" additionally
 * prints thread specific stats (the description of kmp_stats_output_module
 * lists all environment variables that affect the output). It should be noted
 * that all statistics are reset when this macro is called.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
| |
/*!
 * \brief Initializes the partitioned timers to begin with name.
 *
 * @param name timer which you want this thread to begin with
 *
 * \details Calls init() on the executing thread's partitioned timers, passing
 * an explicitTimer built from the TIMER_<name> entry.
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
  __kmp_stats_thread_ptr->getPartitionedTimers()->init(                        \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))
| |
/*!
 * \brief Declares a blockPartitionedTimer object for timer name.
 *
 * @param name timer name; TIMER_<name> selects the timer
 *
 * \details The declared object is always called __PBLOCKTIME__.
 * NOTE(review): presumably the timer stays active for the lifetime of that
 * object, i.e. until the end of the enclosing scope -- confirm against
 * blockPartitionedTimer.
 */
#define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
  blockPartitionedTimer __PBLOCKTIME__(                                        \
      __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

/*! \brief Calls push() on this thread's partitioned timers with timer name. */
#define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
  __kmp_stats_thread_ptr->getPartitionedTimers()->push(                        \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))

/*! \brief Calls pop() on this thread's partitioned timers. */
#define KMP_POP_PARTITIONED_TIMER()                                            \
  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()

/*! \brief Calls exchange() on this thread's partitioned timers with timer
 *  name. */
#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(                    \
      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
                    TIMER_##name))
| |
/*! \brief Passes state_name to setState() on the executing thread's stats
 *  node. */
#define KMP_SET_THREAD_STATE(state_name)                                       \
  __kmp_stats_thread_ptr->setState(state_name)

/*! \brief Yields the result of getState() on the executing thread's stats
 *  node. */
#define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()

/*!
 * \brief Declares a blockThreadState object, always called __BTHREADSTATE__,
 * built from the thread's state pointer and state_name.
 *
 * NOTE(review): presumably the state applies for the lifetime of that object
 * -- confirm against blockThreadState.
 */
#define KMP_SET_THREAD_STATE_BLOCK(state_name)                                 \
  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
                                    state_name)
| |
/*!
 * \brief Resets all stats (counters to 0, timers to 0 elapsed ticks).
 *
 * \details Resets the stats of all threads by calling __kmp_reset_stats().
 *
 * @ingroup STATS_GATHERING
 */
#define KMP_RESET_STATS() __kmp_reset_stats()
| |
// Developer-level variants of the statistics macros. They are active only
// when KMP_DEVELOPER_STATS is non-zero and expand to nothing otherwise.
#if !(KMP_DEVELOPER_STATS)
// Developer statistics off: null definitions
#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#else
// Developer statistics on: forward to the regular macros
#define KMP_TIME_DEVELOPER_BLOCK(n) KMP_TIME_BLOCK(n)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
#define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) KMP_START_EXPLICIT_TIMER(n)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) KMP_STOP_EXPLICIT_TIMER(n)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
#endif
| |
| #else // KMP_STATS_ENABLED |
| |
// Null definitions: with statistics disabled every statistics macro expands
// to the no-op expression ((void)0), so instrumented code still compiles.
// Every macro offered by the enabled branch needs a counterpart here.
#define KMP_TIME_BLOCK(n) ((void)0)
#define KMP_COUNT_VALUE(n, v) ((void)0)
#define KMP_COUNT_BLOCK(n) ((void)0)
#define KMP_START_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_EXPLICIT_TIMER(n) ((void)0)

#define KMP_OUTPUT_STATS(heading_string) ((void)0)
#define KMP_RESET_STATS() ((void)0)

#define KMP_TIME_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
#define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
#define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0)
#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
#define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
#define KMP_POP_PARTITIONED_TIMER() ((void)0)
// Counterpart of the enabled-branch macro; without it any use of
// KMP_EXCHANGE_PARTITIONED_TIMER fails to compile when stats are disabled.
#define KMP_EXCHANGE_PARTITIONED_TIMER(name) ((void)0)
#define KMP_SET_THREAD_STATE(state_name) ((void)0)
// NOTE: yields no value here, so the result must not be used by callers that
// also have to build with statistics disabled.
#define KMP_GET_THREAD_STATE() ((void)0)
#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
| #endif // KMP_STATS_ENABLED |
| |
| #endif // KMP_STATS_H |