1 /*
2  * kmp_barrier.cpp
3  */
4 
5 /* <copyright>
6  Copyright (c) 1997-2015 Intel Corporation. All Rights Reserved.
7 
8  Redistribution and use in source and binary forms, with or without
9  modification, are permitted provided that the following conditions
10  are met:
11 
12  * Redistributions of source code must retain the above copyright
13  notice, this list of conditions and the following disclaimer.
14  * Redistributions in binary form must reproduce the above copyright
15  notice, this list of conditions and the following disclaimer in the
16  documentation and/or other materials provided with the distribution.
17  * Neither the name of Intel Corporation nor the names of its
18  contributors may be used to endorse or promote products derived
19  from this software without specific prior written permission.
20 
21  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 </copyright> */
34 
35 #include "kmp.h"
36 #include "kmp_wait_release.h"
37 #include "kmp_stats.h"
38 #include "kmp_itt.h"
39 #include "kmp_os.h"
40 
41 
42 #if KMP_MIC
43 #include <immintrin.h>
44 #define USE_NGO_STORES 1
45 #endif // KMP_MIC
46 
47 #if KMP_MIC && USE_NGO_STORES
48 // ICV copying
49 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
50 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
51 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
52 #define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
53 #else
54 #define ngo_load(src) ((void)0)
55 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
56 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
57 #define ngo_sync() ((void)0)
58 #endif /* KMP_MIC && USE_NGO_STORES */
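/* Note on the ngo_* helpers above: they copy one cache line of ICVs with a single wide store.
   On KMP_MIC builds this uses 512-bit non-globally-ordered (streaming) stores, and ngo_sync()
   (a lock-prefixed add acting as a store fence) is issued after a batch of such stores; on
   other targets the helpers fall back to copy_icvs()/KMP_MEMCPY() and ngo_sync() is a no-op. */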
59 
60 void __kmp_print_structure(void); // Forward declaration
61 
62 // ---------------------------- Barrier Algorithms ----------------------------
63 
64 // Linear Barrier
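/* In the linear algorithm, each worker signals its arrival on its own b_arrived flag and the
   master waits for every worker in turn (an O(nproc) gather on the master); on release the
   master writes each worker's b_go flag directly. This is the fallback pattern used when the
   branch bits for a barrier type are set to 0. */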
65 static void
66 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
67  void (*reduce)(void *, void *)
68  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
69 {
70  KMP_TIME_BLOCK(KMP_linear_gather);
71  register kmp_team_t *team = this_thr->th.th_team;
72  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
73  register kmp_info_t **other_threads = team->t.t_threads;
74 
75  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
76  gtid, team->t.t_id, tid, bt));
77  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
78 
79 #if USE_ITT_BUILD && USE_ITT_NOTIFY
80  // Barrier imbalance - save arrive time to the thread
81  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
82  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
83  }
84 #endif
85  // We now perform a linear reduction to signal that all of the threads have arrived.
86  if (!KMP_MASTER_TID(tid)) {
87  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
88  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
89  __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
90  thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
91  // Mark arrival to master thread
92  /* After performing this write, a worker thread may not assume that the team is valid
93  any more - it could be deallocated by the master thread at any time. */
94  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
95  flag.release();
96  } else {
97  register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
98  register int nproc = this_thr->th.th_team_nproc;
99  register int i;
100  // No sleep bit or atomics to worry about here, since only the master updates the team-level counter
101  register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
102 
103  // Collect all the worker team member threads.
104  for (i=1; i<nproc; ++i) {
105 #if KMP_CACHE_MANAGE
106  // Prefetch next thread's arrived count
107  if (i+1 < nproc)
108  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
109 #endif /* KMP_CACHE_MANAGE */
110  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
111  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
112  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
113  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
114 
115  // Wait for worker thread to arrive
116  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
117  flag.wait(this_thr, FALSE
118  USE_ITT_BUILD_ARG(itt_sync_obj) );
119 #if USE_ITT_BUILD && USE_ITT_NOTIFY
120  // Barrier imbalance - write min of the thread time and the other thread time to the thread.
121  if (__kmp_forkjoin_frames_mode == 2) {
122  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
123  other_threads[i]->th.th_bar_min_time);
124  }
125 #endif
126  if (reduce) {
127  KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
128  team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
129  (*reduce)(this_thr->th.th_local.reduce_data,
130  other_threads[i]->th.th_local.reduce_data);
131  }
132  }
133  // No sleep bit or atomics to worry about here, since only the master updates the team-level counter
134  team_bar->b_arrived = new_state;
135  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
136  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
137  }
138  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
139  gtid, team->t.t_id, tid, bt));
140 }
141 
142 static void
143 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
144  int propagate_icvs
145  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
146 {
147  KMP_TIME_BLOCK(KMP_linear_release);
148  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
149  register kmp_team_t *team;
150 
151  if (KMP_MASTER_TID(tid)) {
152  register unsigned int i;
153  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
154  register kmp_info_t **other_threads;
155 
156  team = __kmp_threads[gtid]->th.th_team;
157  KMP_DEBUG_ASSERT(team != NULL);
158  other_threads = team->t.t_threads;
159 
160  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
161  gtid, team->t.t_id, tid, bt));
162 
163  if (nproc > 1) {
164 #if KMP_BARRIER_ICV_PUSH
165  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
166  if (propagate_icvs) {
167  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
168  for (i=1; i<nproc; ++i) {
169  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
170  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
171  &team->t.t_implicit_task_taskdata[0].td_icvs);
172  }
173  ngo_sync();
174  }
175  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
176 #endif // KMP_BARRIER_ICV_PUSH
177 
178  // Now, release all of the worker threads
179  for (i=1; i<nproc; ++i) {
180 #if KMP_CACHE_MANAGE
181  // Prefetch next thread's go flag
182  if (i+1 < nproc)
183  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
184 #endif /* KMP_CACHE_MANAGE */
185  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
186  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
187  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
188  &other_threads[i]->th.th_bar[bt].bb.b_go,
189  other_threads[i]->th.th_bar[bt].bb.b_go,
190  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
191  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
192  flag.release();
193  }
194  }
195  } else { // Wait for the MASTER thread to release us
196  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
197  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
198  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
199  flag.wait(this_thr, TRUE
200  USE_ITT_BUILD_ARG(itt_sync_obj) );
201 #if USE_ITT_BUILD && USE_ITT_NOTIFY
202  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
203  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
204  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
205  // Cancel wait on previous parallel region...
206  __kmp_itt_task_starting(itt_sync_obj);
207 
208  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
209  return;
210 
211  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
212  if (itt_sync_obj != NULL)
213  // Call prepare as early as possible for "new" barrier
214  __kmp_itt_task_finished(itt_sync_obj);
215  } else
216 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
217  // Early exit for reaping threads releasing forkjoin barrier
218  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
219  return;
220  // The worker thread may now assume that the team is valid.
221 #ifdef KMP_DEBUG
222  tid = __kmp_tid_from_gtid(gtid);
223  team = __kmp_threads[gtid]->th.th_team;
224 #endif
225  KMP_DEBUG_ASSERT(team != NULL);
226  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
227  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
228  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
229  KMP_MB(); // Flush all pending memory write invalidates.
230  }
231  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
232  gtid, team->t.t_id, tid, bt));
233 }
234 
235 // Tree barrier
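/* In the tree algorithm, threads form a tree with branch factor 2^branch_bits: thread tid's
   children are tids (tid << branch_bits) + 1 .. + branch_factor, and its parent is
   (tid - 1) >> branch_bits. Each parent waits on its children's b_arrived flags (folding in
   reductions as it goes) and then signals its own parent; the release walks the same tree
   top-down through the children's b_go flags. */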
236 static void
237 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
238  void (*reduce)(void *, void *)
239  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
240 {
241  KMP_TIME_BLOCK(KMP_tree_gather);
242  register kmp_team_t *team = this_thr->th.th_team;
243  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
244  register kmp_info_t **other_threads = team->t.t_threads;
245  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
246  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
247  register kmp_uint32 branch_factor = 1 << branch_bits;
248  register kmp_uint32 child;
249  register kmp_uint32 child_tid;
250  register kmp_uint64 new_state;
251 
252  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
253  gtid, team->t.t_id, tid, bt));
254  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
255 
256 #if USE_ITT_BUILD && USE_ITT_NOTIFY
257  // Barrier imbalance - save arrive time to the thread
258  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
259  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
260  }
261 #endif
262  // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
263  child_tid = (tid << branch_bits) + 1;
264  if (child_tid < nproc) {
265  // Parent threads wait for all their children to arrive
266  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
267  child = 1;
268  do {
269  register kmp_info_t *child_thr = other_threads[child_tid];
270  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
271 #if KMP_CACHE_MANAGE
272  // Prefetch next thread's arrived count
273  if (child+1 <= branch_factor && child_tid+1 < nproc)
274  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
275 #endif /* KMP_CACHE_MANAGE */
276  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
277  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
278  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
279  &child_bar->b_arrived, new_state));
280  // Wait for child to arrive
281  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
282  flag.wait(this_thr, FALSE
283  USE_ITT_BUILD_ARG(itt_sync_obj) );
284 #if USE_ITT_BUILD && USE_ITT_NOTIFY
285  // Barrier imbalance - write min of the thread time and a child time to the thread.
286  if (__kmp_forkjoin_frames_mode == 2) {
287  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
288  child_thr->th.th_bar_min_time);
289  }
290 #endif
291  if (reduce) {
292  KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
293  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
294  team->t.t_id, child_tid));
295  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
296  }
297  child++;
298  child_tid++;
299  }
300  while (child <= branch_factor && child_tid < nproc);
301  }
302 
303  if (!KMP_MASTER_TID(tid)) { // Worker threads
304  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
305 
306  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
307  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
308  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
309  &thr_bar->b_arrived, thr_bar->b_arrived,
310  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
311 
312  // Mark arrival to parent thread
313  /* After performing this write, a worker thread may not assume that the team is valid
314  any more - it could be deallocated by the master thread at any time. */
315  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
316  flag.release();
317  } else {
318  // Need to update the team arrived pointer if we are the master thread
319  if (nproc > 1) // New value was already computed above
320  team->t.t_bar[bt].b_arrived = new_state;
321  else
322  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
323  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
324  gtid, team->t.t_id, tid, team->t.t_id,
325  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
326  }
327  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
328  gtid, team->t.t_id, tid, bt));
329 }
330 
331 static void
332 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
333  int propagate_icvs
334  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
335 {
336  KMP_TIME_BLOCK(KMP_tree_release);
337  register kmp_team_t *team;
338  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
339  register kmp_uint32 nproc;
340  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
341  register kmp_uint32 branch_factor = 1 << branch_bits;
342  register kmp_uint32 child;
343  register kmp_uint32 child_tid;
344 
345  // Perform a tree release for all of the threads that have been gathered
346  if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
347  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
348  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
349  // Wait for parent thread to release us
350  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
351  flag.wait(this_thr, TRUE
352  USE_ITT_BUILD_ARG(itt_sync_obj) );
353 #if USE_ITT_BUILD && USE_ITT_NOTIFY
354  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
355  // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
356  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
357  // Cancel wait on previous parallel region...
358  __kmp_itt_task_starting(itt_sync_obj);
359 
360  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
361  return;
362 
363  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
364  if (itt_sync_obj != NULL)
365  // Call prepare as early as possible for "new" barrier
366  __kmp_itt_task_finished(itt_sync_obj);
367  } else
368 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
369  // Early exit for reaping threads releasing forkjoin barrier
370  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
371  return;
372 
373  // The worker thread may now assume that the team is valid.
374  team = __kmp_threads[gtid]->th.th_team;
375  KMP_DEBUG_ASSERT(team != NULL);
376  tid = __kmp_tid_from_gtid(gtid);
377 
378  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
379  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
380  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
381  KMP_MB(); // Flush all pending memory write invalidates.
382  } else {
383  team = __kmp_threads[gtid]->th.th_team;
384  KMP_DEBUG_ASSERT(team != NULL);
385  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
386  gtid, team->t.t_id, tid, bt));
387  }
388  nproc = this_thr->th.th_team_nproc;
389  child_tid = (tid << branch_bits) + 1;
390 
391  if (child_tid < nproc) {
392  register kmp_info_t **other_threads = team->t.t_threads;
393  child = 1;
394  // Parent threads release all their children
395  do {
396  register kmp_info_t *child_thr = other_threads[child_tid];
397  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
398 #if KMP_CACHE_MANAGE
399  // Prefetch next thread's go count
400  if (child+1 <= branch_factor && child_tid+1 < nproc)
401  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
402 #endif /* KMP_CACHE_MANAGE */
403 
404 #if KMP_BARRIER_ICV_PUSH
405  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
406  if (propagate_icvs) {
407  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
408  team, child_tid, FALSE);
409  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
410  &team->t.t_implicit_task_taskdata[0].td_icvs);
411  }
412  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
413 #endif // KMP_BARRIER_ICV_PUSH
414  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
415  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
416  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
417  child_tid, &child_bar->b_go, child_bar->b_go,
418  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
419  // Release child from barrier
420  kmp_flag_64 flag(&child_bar->b_go, child_thr);
421  flag.release();
422  child++;
423  child_tid++;
424  }
425  while (child <= branch_factor && child_tid < nproc);
426  }
427  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
428  gtid, team->t.t_id, tid, bt));
429 }
430 
431 
432 // Hyper Barrier
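/* The hyper(cube) algorithm embeds a tree in a hypercube. At each level a thread either
   signals the root of its sub-cube (when its bits at that level are non-zero) or gathers from
   up to branch_factor-1 children spaced 2^level apart, the stride growing by the branch
   factor at each level. The release mirrors the gather and, with KMP_REVERSE_HYPER_BAR
   defined (the default), visits children in the reverse of the gather order. */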
433 static void
434 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
435  void (*reduce)(void *, void *)
436  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
437 {
438  KMP_TIME_BLOCK(KMP_hyper_gather);
439  register kmp_team_t *team = this_thr->th.th_team;
440  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
441  register kmp_info_t **other_threads = team->t.t_threads;
442  register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
443  register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
444  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
445  register kmp_uint32 branch_factor = 1 << branch_bits;
446  register kmp_uint32 offset;
447  register kmp_uint32 level;
448 
449  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
450  gtid, team->t.t_id, tid, bt));
451 
452  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
453 
454 #if USE_ITT_BUILD && USE_ITT_NOTIFY
455  // Barrier imbalance - save arrive time to the thread
456  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
457  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
458  }
459 #endif
460  /* Perform a hypercube-embedded tree gather to wait until all of the threads have
461  arrived, and reduce any required data as we go. */
462  kmp_flag_64 p_flag(&thr_bar->b_arrived);
463  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
464  {
465  register kmp_uint32 child;
466  register kmp_uint32 child_tid;
467 
468  if (((tid >> level) & (branch_factor - 1)) != 0) {
469  register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
470 
471  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
472  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
473  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
474  &thr_bar->b_arrived, thr_bar->b_arrived,
475  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
476  // Mark arrival to parent thread
477  /* After performing this write (in the last iteration of the enclosing for loop),
478  a worker thread may not assume that the team is valid any more - it could be
479  deallocated by the master thread at any time. */
480  p_flag.set_waiter(other_threads[parent_tid]);
481  p_flag.release();
482  break;
483  }
484 
485  // Parent threads wait for children to arrive
486  if (new_state == KMP_BARRIER_UNUSED_STATE)
487  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
488  for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
489  child++, child_tid+=(1 << level))
490  {
491  register kmp_info_t *child_thr = other_threads[child_tid];
492  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
493 #if KMP_CACHE_MANAGE
494  register kmp_uint32 next_child_tid = child_tid + (1 << level);
495  // Prefetch next thread's arrived count
496  if (child+1 < branch_factor && next_child_tid < num_threads)
497  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
498 #endif /* KMP_CACHE_MANAGE */
499  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
500  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
501  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
502  &child_bar->b_arrived, new_state));
503  // Wait for child to arrive
504  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
505  c_flag.wait(this_thr, FALSE
506  USE_ITT_BUILD_ARG(itt_sync_obj) );
507 #if USE_ITT_BUILD && USE_ITT_NOTIFY
508  // Barrier imbalance - write min of the thread time and a child time to the thread.
509  if (__kmp_forkjoin_frames_mode == 2) {
510  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
511  child_thr->th.th_bar_min_time);
512  }
513 #endif
514  if (reduce) {
515  KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
516  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
517  team->t.t_id, child_tid));
518  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
519  }
520  }
521  }
522 
523  if (KMP_MASTER_TID(tid)) {
524  // Need to update the team arrived pointer if we are the master thread
525  if (new_state == KMP_BARRIER_UNUSED_STATE)
526  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
527  else
528  team->t.t_bar[bt].b_arrived = new_state;
529  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
530  gtid, team->t.t_id, tid, team->t.t_id,
531  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
532  }
533  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
534  gtid, team->t.t_id, tid, bt));
535 }
536 
537 // The reverse versions seem to beat the forward versions overall
538 #define KMP_REVERSE_HYPER_BAR
539 static void
540 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
541  int propagate_icvs
542  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
543 {
544  KMP_TIME_BLOCK(KMP_hyper_release);
545  register kmp_team_t *team;
546  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
547  register kmp_info_t **other_threads;
548  register kmp_uint32 num_threads;
549  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
550  register kmp_uint32 branch_factor = 1 << branch_bits;
551  register kmp_uint32 child;
552  register kmp_uint32 child_tid;
553  register kmp_uint32 offset;
554  register kmp_uint32 level;
555 
556  /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
557  If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
558  order of the corresponding gather, otherwise threads are released in the same order. */
559  if (KMP_MASTER_TID(tid)) { // master
560  team = __kmp_threads[gtid]->th.th_team;
561  KMP_DEBUG_ASSERT(team != NULL);
562  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
563  gtid, team->t.t_id, tid, bt));
564 #if KMP_BARRIER_ICV_PUSH
565  if (propagate_icvs) { // master already has ICVs in final destination; copy
566  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
567  }
568 #endif
569  }
570  else { // Handle fork barrier workers who aren't part of a team yet
571  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
572  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
573  // Wait for parent thread to release us
574  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
575  flag.wait(this_thr, TRUE
576  USE_ITT_BUILD_ARG(itt_sync_obj) );
577 #if USE_ITT_BUILD && USE_ITT_NOTIFY
578  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
579  // In fork barrier where we could not get the object reliably
580  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
581  // Cancel wait on previous parallel region...
582  __kmp_itt_task_starting(itt_sync_obj);
583 
584  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
585  return;
586 
587  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
588  if (itt_sync_obj != NULL)
589  // Call prepare as early as possible for "new" barrier
590  __kmp_itt_task_finished(itt_sync_obj);
591  } else
592 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
593  // Early exit for reaping threads releasing forkjoin barrier
594  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
595  return;
596 
597  // The worker thread may now assume that the team is valid.
598  team = __kmp_threads[gtid]->th.th_team;
599  KMP_DEBUG_ASSERT(team != NULL);
600  tid = __kmp_tid_from_gtid(gtid);
601 
602  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
603  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
604  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
605  KMP_MB(); // Flush all pending memory write invalidates.
606  }
607  num_threads = this_thr->th.th_team_nproc;
608  other_threads = team->t.t_threads;
609 
610 #ifdef KMP_REVERSE_HYPER_BAR
611  // Count up to correct level for parent
612  for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
613  level+=branch_bits, offset<<=branch_bits);
614 
615  // Now go down from there
616  for (level-=branch_bits, offset>>=branch_bits; offset != 0;
617  level-=branch_bits, offset>>=branch_bits)
618 #else
619  // Go down the tree, level by level
620  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
621 #endif // KMP_REVERSE_HYPER_BAR
622  {
623 #ifdef KMP_REVERSE_HYPER_BAR
624  /* Now go in reverse order through the children, highest to lowest.
625  Initial setting of child is conservative here. */
626  child = num_threads >> ((level==0)?level:level-1);
627  for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
628  child>=1; child--, child_tid-=(1<<level))
629 #else
630  if (((tid >> level) & (branch_factor - 1)) != 0)
631  // No need to go any lower than this, since this is the level at which the parent would be notified
632  break;
633  // Iterate through children on this level of the tree
634  for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
635  child++, child_tid+=(1<<level))
636 #endif // KMP_REVERSE_HYPER_BAR
637  {
638  if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
639  else {
640  register kmp_info_t *child_thr = other_threads[child_tid];
641  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
642 #if KMP_CACHE_MANAGE
643  register kmp_uint32 next_child_tid = child_tid - (1 << level);
644  // Prefetch next thread's go count
645 # ifdef KMP_REVERSE_HYPER_BAR
646  if (child-1 >= 1 && next_child_tid < num_threads)
647 # else
648  if (child+1 < branch_factor && next_child_tid < num_threads)
649 # endif // KMP_REVERSE_HYPER_BAR
650  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
651 #endif /* KMP_CACHE_MANAGE */
652 
653 #if KMP_BARRIER_ICV_PUSH
654  if (propagate_icvs) // push my fixed ICVs to my child
655  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
656 #endif // KMP_BARRIER_ICV_PUSH
657 
658  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
659  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
660  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
661  child_tid, &child_bar->b_go, child_bar->b_go,
662  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
663  // Release child from barrier
664  kmp_flag_64 flag(&child_bar->b_go, child_thr);
665  flag.release();
666  }
667  }
668  }
669 #if KMP_BARRIER_ICV_PUSH
670  if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
671  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
672  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
673  }
674 #endif
675  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
676  gtid, team->t.t_id, tid, bt));
677 }
678 
679 // Hierarchical Barrier
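/* The hierarchical algorithm arranges threads according to the machine hierarchy computed by
   __kmp_get_hierarchy. When blocktime is infinite and the barrier is not nested, leaf children
   that share a core check in by writing designated bytes of their parent's 64-bit flag word
   (the "oncore" path), while higher levels use ordinary per-thread b_arrived/b_go flags; ICVs
   can be piggybacked on the cache line that carries b_go. */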
680 
681 // Initialize thread barrier data
682 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
683  minimum amount of initialization required based on how the team has changed. Returns true if
684  leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
685  team size increases, threads already in the team will respond to on-core wakeup on their parent
686  thread, but threads newly added to the team will only be listening on their local b_go. */
687 static bool
688 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
689  int gtid, int tid, kmp_team_t *team)
690 {
691  // Checks to determine if (re-)initialization is needed
692  bool uninitialized = thr_bar->team == NULL;
693  bool team_changed = team != thr_bar->team;
694  bool team_sz_changed = nproc != thr_bar->nproc;
695  bool tid_changed = tid != thr_bar->old_tid;
696  bool retval = false;
697 
698  if (uninitialized || team_sz_changed) {
699  __kmp_get_hierarchy(nproc, thr_bar);
700  }
701 
702  if (uninitialized || team_sz_changed || tid_changed) {
703  thr_bar->my_level = thr_bar->depth-1; // default for master
704  thr_bar->parent_tid = -1; // default for master
705  if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
706  kmp_uint32 d=0;
707  while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
708  kmp_uint32 rem;
709  if (d == thr_bar->depth-2) { // reached level right below the master
710  thr_bar->parent_tid = 0;
711  thr_bar->my_level = d;
712  break;
713  }
714  else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
715  // thread is not a subtree root at next level, so this is max
716  thr_bar->parent_tid = tid - rem;
717  thr_bar->my_level = d;
718  break;
719  }
720  ++d;
721  }
722  }
723  thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
724  thr_bar->old_tid = tid;
725  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
726  }
727  if (uninitialized || team_changed || tid_changed) {
728  thr_bar->team = team;
729  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
730  retval = true;
731  }
732  if (uninitialized || team_sz_changed || tid_changed) {
733  thr_bar->nproc = nproc;
734  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
735  if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
736  if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
737  thr_bar->leaf_kids = nproc - tid - 1;
738  thr_bar->leaf_state = 0;
739  for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
740  }
741  return retval;
742 }
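/* Illustration (hypothetical tids, assuming a parent with three leaf kids): for parent tid 8
   with leaf children tids 9, 10 and 11, each child's thr_bar->offset evaluates to 7, 6 and 5
   respectively, and the parent's leaf_state has bytes [7], [6] and [5] set; each leaf child
   therefore checks in by writing its own byte of the parent's 64-bit b_arrived/b_go word. */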
743 
744 static void
745 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
746  int gtid, int tid, void (*reduce) (void *, void *)
747  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
748 {
749  KMP_TIME_BLOCK(KMP_hier_gather);
750  register kmp_team_t *team = this_thr->th.th_team;
751  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
752  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
753  register kmp_info_t **other_threads = team->t.t_threads;
754  register kmp_uint64 new_state;
755 
756  int level = team->t.t_level;
757  if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
758  if (this_thr->th.th_teams_size.nteams > 1)
759  ++level; // level was not increased in teams construct for team_of_masters
760  if (level == 1) thr_bar->use_oncore_barrier = 1;
761  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
762 
763  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
764  gtid, team->t.t_id, tid, bt));
765  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
766 
767 #if USE_ITT_BUILD && USE_ITT_NOTIFY
768  // Barrier imbalance - save arrive time to the thread
769  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
770  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
771  }
772 #endif
773 
774  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
775 
776  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
777  register kmp_int32 child_tid;
778  new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
779  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
780  if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
781  kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
782  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
783  flag.wait(this_thr, FALSE
784  USE_ITT_BUILD_ARG(itt_sync_obj) );
785  if (reduce) {
786  for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
787  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
788  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
789  team->t.t_id, child_tid));
790  (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
791  }
792  }
793  (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
794  }
795  // Next, wait for higher level children on each child's b_arrived flag
796  for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
797  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
798  if (last > nproc) last = nproc;
799  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
800  register kmp_info_t *child_thr = other_threads[child_tid];
801  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
802  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
803  "arrived(%p) == %llu\n",
804  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
805  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
806  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
807  flag.wait(this_thr, FALSE
808  USE_ITT_BUILD_ARG(itt_sync_obj) );
809  if (reduce) {
810  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
811  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
812  team->t.t_id, child_tid));
813  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
814  }
815  }
816  }
817  }
818  else { // Blocktime is not infinite
819  for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
820  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
821  if (last > nproc) last = nproc;
822  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
823  register kmp_info_t *child_thr = other_threads[child_tid];
824  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
825  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
826  "arrived(%p) == %llu\n",
827  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
828  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
829  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
830  flag.wait(this_thr, FALSE
831  USE_ITT_BUILD_ARG(itt_sync_obj) );
832  if (reduce) {
833  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
834  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
835  team->t.t_id, child_tid));
836  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
837  }
838  }
839  }
840  }
841  }
842  // All subordinates are gathered; now release parent if not master thread
843 
844  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
845  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
846  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
847  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
848  &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
849  /* Mark arrival to parent: After performing this write, a worker thread may not assume that
850  the team is valid any more - it could be deallocated by the master thread at any time. */
851  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
852  || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
853  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
854  flag.release();
855  }
856  else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
857  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
858  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
859  flag.set_waiter(other_threads[thr_bar->parent_tid]);
860  flag.release();
861  }
862  } else { // Master thread needs to update the team's b_arrived value
863  team->t.t_bar[bt].b_arrived = new_state;
864  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
865  gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
866  }
867  // Is the team access below unsafe or just technically invalid?
868  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
869  gtid, team->t.t_id, tid, bt));
870 }
871 
872 static void
873 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
874  int propagate_icvs
875  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
876 {
877  KMP_TIME_BLOCK(KMP_hier_release);
878  register kmp_team_t *team;
879  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
880  register kmp_uint32 nproc;
881  bool team_change = false; // indicates on-core barrier shouldn't be used
882 
883  if (KMP_MASTER_TID(tid)) {
884  team = __kmp_threads[gtid]->th.th_team;
885  KMP_DEBUG_ASSERT(team != NULL);
886  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
887  gtid, team->t.t_id, tid, bt));
888  }
889  else { // Worker threads
890  // Wait for parent thread to release me
891  if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
892  || thr_bar->my_level != 0 || thr_bar->team == NULL) {
893  // Use traditional method of waiting on my own b_go flag
894  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
895  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
896  flag.wait(this_thr, TRUE
897  USE_ITT_BUILD_ARG(itt_sync_obj) );
898  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
899  }
900  else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
901  // Wait on my "offset" bits on parent's b_go flag
902  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
903  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
904  bt, this_thr
905  USE_ITT_BUILD_ARG(itt_sync_obj) );
906  flag.wait(this_thr, TRUE);
907  if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
908  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
909  }
910  else { // Reset my bits on parent's b_go flag
911  ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
912  }
913  }
914  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
915  // Early exit for reaping threads releasing forkjoin barrier
916  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
917  return;
918  // The worker thread may now assume that the team is valid.
919  team = __kmp_threads[gtid]->th.th_team;
920  KMP_DEBUG_ASSERT(team != NULL);
921  tid = __kmp_tid_from_gtid(gtid);
922 
923  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
924  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
925  KMP_MB(); // Flush all pending memory write invalidates.
926  }
927 
928  nproc = this_thr->th.th_team_nproc;
929  int level = team->t.t_level;
930  if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
931  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
932  ++level; // level was not increased in teams construct for team_of_workers
933  if( this_thr->th.th_teams_size.nteams > 1 )
934  ++level; // level was not increased in teams construct for team_of_masters
935  }
936  if (level == 1) thr_bar->use_oncore_barrier = 1;
937  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
938 
939  // If the team size has increased, we still communicate with old leaves via oncore barrier.
940  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
941  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
942  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
943  // But if the entire team changes, we won't use oncore barrier at all
944  if (team_change) old_leaf_kids = 0;
945 
946 #if KMP_BARRIER_ICV_PUSH
947  if (propagate_icvs) {
948  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
949  if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
950  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
951  }
952  else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
953  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
954  // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
955  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
956  &thr_bar->parent_bar->th_fixed_icvs);
957  // non-leaves will get ICVs piggybacked with b_go via NGO store
958  }
959  else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
960  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
961  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
962  else // leaves copy parent's fixed ICVs directly to local ICV store
963  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
964  &thr_bar->parent_bar->th_fixed_icvs);
965  }
966  }
967 #endif // KMP_BARRIER_ICV_PUSH
968 
969  // Now, release my children
970  if (thr_bar->my_level) { // not a leaf
971  register kmp_int32 child_tid;
972  kmp_uint32 last;
973  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
974  if (KMP_MASTER_TID(tid)) { // do a flat release
975  // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
976  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
977  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
978  ngo_load(&thr_bar->th_fixed_icvs);
979  // This loops over all the threads skipping only the leaf nodes in the hierarchy
980  for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
981  register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
982  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
983  " go(%p): %u => %u\n",
984  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
985  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
986  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
987  // Use ngo store (if available) to both store ICVs and release child via child's b_go
988  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
989  }
990  ngo_sync();
991  }
992  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
993  // Now, release leaf children
994  if (thr_bar->leaf_kids) { // if there are any
995  // We test team_change on the off-chance that the level 1 team changed.
996  if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
997  if (old_leaf_kids) { // release old leaf kids
998  thr_bar->b_go |= old_leaf_state;
999  }
1000  // Release new leaf kids
1001  last = tid+thr_bar->skip_per_level[1];
1002  if (last > nproc) last = nproc;
1003  for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
1004  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1005  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1006  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1007  " T#%d(%d:%d) go(%p): %u => %u\n",
1008  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1009  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1010  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1011  // Release child using child's b_go flag
1012  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1013  flag.release();
1014  }
1015  }
1016  else { // Release all children at once with leaf_state bits on my own b_go flag
1017  thr_bar->b_go |= thr_bar->leaf_state;
1018  }
1019  }
1020  }
1021  else { // Blocktime is not infinite; do a simple hierarchical release
1022  for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1023  last = tid+thr_bar->skip_per_level[d+1];
1024  kmp_uint32 skip = thr_bar->skip_per_level[d];
1025  if (last > nproc) last = nproc;
1026  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1027  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1028  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1029  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1030  " go(%p): %u => %u\n",
1031  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1032  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1033  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1034  // Release child using child's b_go flag
1035  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1036  flag.release();
1037  }
1038  }
1039  }
1040 #if KMP_BARRIER_ICV_PUSH
1041  if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1042  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1043 #endif // KMP_BARRIER_ICV_PUSH
1044  }
1045  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1046  gtid, team->t.t_id, tid, bt));
1047 }
1048 
1049 
1050 // ---------------------------- End of Barrier Algorithms ----------------------------
1051 
1052 // Internal function to do a barrier.
1053 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1054  If reduce is non-NULL, also perform the reduction during the gather phase (split reduction barrier).
1055  Returns 0 if master thread, 1 if worker thread. */
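/* For orientation only (the entry points live in other translation units, e.g. kmp_csupport.c):
   an explicit user-level barrier typically reaches this routine as
       __kmpc_barrier(loc, gtid) -> __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
   the reduction entry points instead pass a reduction barrier type together with
   reduce_size/reduce_data/reduce so the gather phase can fold the partial results. */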
1056 int
1057 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1058  void *reduce_data, void (*reduce)(void *, void *))
1059 {
1060  KMP_TIME_BLOCK(KMP_barrier);
1061  register int tid = __kmp_tid_from_gtid(gtid);
1062  register kmp_info_t *this_thr = __kmp_threads[gtid];
1063  register kmp_team_t *team = this_thr->th.th_team;
1064  register int status = 0;
1065  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1066 #if OMPT_SUPPORT
1067  ompt_task_id_t my_task_id;
1068  ompt_parallel_id_t my_parallel_id;
1069 #endif
1070 
1071  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1072  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1073 
1074 #if OMPT_SUPPORT && OMPT_TRACE
1075  if (ompt_status & ompt_status_track) {
1076  if (ompt_status == ompt_status_track_callback) {
1077  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1078  my_parallel_id = team->t.ompt_team_info.parallel_id;
1079 
1080  if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1081  if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1082  ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1083  my_parallel_id, my_task_id);
1084  }
1085  }
1086  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1087  if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1088  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1089  my_parallel_id, my_task_id);
1090  }
1091  } else {
1092  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1093  }
1094  }
1095 #endif
1096 
1097  if (! team->t.t_serialized) {
1098 #if USE_ITT_BUILD
1099  // This value will be used in itt notify events below.
1100  void *itt_sync_obj = NULL;
1101 # if USE_ITT_NOTIFY
1102  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1103  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1104 # endif
1105 #endif /* USE_ITT_BUILD */
1106  if (__kmp_tasking_mode == tskm_extra_barrier) {
1107  __kmp_tasking_barrier(team, this_thr, gtid);
1108  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1109  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1110  }
1111 
1112  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1113  the team struct is not guaranteed to exist. */
1114  // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1115  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1116  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1117  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1118  }
1119 
1120 #if USE_ITT_BUILD
1121  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1122  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1123 #endif /* USE_ITT_BUILD */
1124 #if USE_DEBUGGER
1125  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1126  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1127  team->t.t_bar[bt].b_master_arrived += 1;
1128  } else {
1129  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1130  } // if
1131 #endif /* USE_DEBUGGER */
1132  if (reduce != NULL) {
1133  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1134  this_thr->th.th_local.reduce_data = reduce_data;
1135  }
1136 
1137  switch (__kmp_barrier_gather_pattern[bt]) {
1138  case bp_hyper_bar: {
1139  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1140  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1141  USE_ITT_BUILD_ARG(itt_sync_obj) );
1142  break;
1143  }
1144  case bp_hierarchical_bar: {
1145  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1146  USE_ITT_BUILD_ARG(itt_sync_obj));
1147  break;
1148  }
1149  case bp_tree_bar: {
1150  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1151  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1152  USE_ITT_BUILD_ARG(itt_sync_obj) );
1153  break;
1154  }
1155  default: {
1156  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1157  USE_ITT_BUILD_ARG(itt_sync_obj) );
1158  }
1159  }
1160 
1161  KMP_MB();
1162 
1163  if (KMP_MASTER_TID(tid)) {
1164  status = 0;
1165  if (__kmp_tasking_mode != tskm_immediate_exec) {
1166  {
1167  __kmp_task_team_wait(this_thr, team
1168  USE_ITT_BUILD_ARG(itt_sync_obj) );
1169  }
1170  __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
1171  }
1172 #if USE_DEBUGGER
1173  // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1174  team->t.t_bar[bt].b_team_arrived += 1;
1175 #endif
1176 
1177 #if USE_ITT_BUILD
1178  /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1179  before the final summation into the shared variable is done (final summation can be a
1180  long operation for array reductions). */
1181  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1182  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1183 #endif /* USE_ITT_BUILD */
1184 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1185  // Barrier - report frame end (only if active_level == 1)
1186  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1187 #if OMP_40_ENABLED
1188  this_thr->th.th_teams_microtask == NULL &&
1189 #endif
1190  team->t.t_active_level == 1)
1191  {
1192  kmp_uint64 cur_time = __itt_get_timestamp();
1193  kmp_info_t **other_threads = team->t.t_threads;
1194  int nproc = this_thr->th.th_team_nproc;
1195  int i;
1196  switch(__kmp_forkjoin_frames_mode) {
1197  case 1:
1198  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1199  this_thr->th.th_frame_time = cur_time;
1200  break;
1201  case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1202  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1203  break;
1204  case 3:
1205  if( __itt_metadata_add_ptr ) {
1206  // Initialize with master's wait time
1207  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1208  for (i=1; i<nproc; ++i) {
1209  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1210  }
1211  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1212  }
1213  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1214  this_thr->th.th_frame_time = cur_time;
1215  break;
1216  }
1217  }
1218 #endif /* USE_ITT_BUILD */
1219  } else {
1220  status = 1;
1221 #if USE_ITT_BUILD
1222  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1223  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1224 #endif /* USE_ITT_BUILD */
1225  }
1226  if (status == 1 || ! is_split) {
1227  switch (__kmp_barrier_release_pattern[bt]) {
1228  case bp_hyper_bar: {
1229  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1230  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1231  USE_ITT_BUILD_ARG(itt_sync_obj) );
1232  break;
1233  }
1234  case bp_hierarchical_bar: {
1235  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1236  USE_ITT_BUILD_ARG(itt_sync_obj) );
1237  break;
1238  }
1239  case bp_tree_bar: {
1240  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1241  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1242  USE_ITT_BUILD_ARG(itt_sync_obj) );
1243  break;
1244  }
1245  default: {
1246  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1247  USE_ITT_BUILD_ARG(itt_sync_obj) );
1248  }
1249  }
1250  if (__kmp_tasking_mode != tskm_immediate_exec) {
1251  __kmp_task_team_sync(this_thr, team);
1252  }
1253  }
1254 
1255 #if USE_ITT_BUILD
1256  /* GEH: TODO: Move this under if-condition above and also include in
1257  __kmp_end_split_barrier(). This will more accurately represent the actual release time
1258  of the threads for split barriers. */
1259  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1260  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1261 #endif /* USE_ITT_BUILD */
1262  } else { // Team is serialized.
1263  status = 0;
1264  if (__kmp_tasking_mode != tskm_immediate_exec) {
1265 #if OMP_41_ENABLED
1266  if ( this_thr->th.th_task_team != NULL ) {
1267  void *itt_sync_obj = NULL;
1268 #if USE_ITT_NOTIFY
1269  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1270  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1271  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1272  }
1273 #endif
1274 
1275  kmp_task_team_t * task_team = this_thr->th.th_task_team;
1276  KMP_DEBUG_ASSERT(task_team->tt.tt_found_proxy_tasks == TRUE);
1277  __kmp_task_team_wait(this_thr, team
1278  USE_ITT_BUILD_ARG(itt_sync_obj));
1279  __kmp_task_team_setup(this_thr, team, 0, 0);
1280 
1281 #if USE_ITT_BUILD
1282  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1283  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1284 #endif /* USE_ITT_BUILD */
1285  }
1286 #else
1287  // The task team should be NULL for serialized code (tasks will be executed immediately)
1288  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1289  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1290 #endif
1291  }
1292  }
1293  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1294  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1295 
1296 #if OMPT_SUPPORT
1297  if (ompt_status & ompt_status_track) {
1298 #if OMPT_TRACE
1299  if ((ompt_status == ompt_status_track_callback) &&
1300  ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1301  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1302  my_parallel_id, my_task_id);
1303  }
1304 #endif
1305  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1306  }
1307 #endif
1308 
1309  return status;
1310 }
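/* Illustrative only: the imbalance metadata reported under forkjoin frames mode 3 above is the
   team's aggregate wait time -- for every thread, the gap between its arrival at the barrier and
   the timestamp taken once the whole team has gathered, summed over the team.  A minimal
   stand-alone sketch of that computation (the helper name is hypothetical, not part of the
   runtime): */
static kmp_uint64
__sketch_barrier_imbalance(kmp_uint64 gather_time, kmp_uint64 const *arrive_times, int nproc)
{
    kmp_uint64 delta = 0;
    for (int i = 0; i < nproc; ++i)
        delta += gather_time - arrive_times[i];   // per-thread wait, accumulated into the total
    return delta; // reported via __kmp_itt_metadata_imbalance() as the frame's imbalance value
}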
1311 
1312 
1313 void
1314 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1315 {
1316  KMP_TIME_BLOCK(KMP_end_split_barrier);
1317  int tid = __kmp_tid_from_gtid(gtid);
1318  kmp_info_t *this_thr = __kmp_threads[gtid];
1319  kmp_team_t *team = this_thr->th.th_team;
1320 
1321  if (!team->t.t_serialized) {
1322  if (KMP_MASTER_GTID(gtid)) {
1323  switch (__kmp_barrier_release_pattern[bt]) {
1324  case bp_hyper_bar: {
1325  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1326  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1327  USE_ITT_BUILD_ARG(NULL) );
1328  break;
1329  }
1330  case bp_hierarchical_bar: {
1331  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1332  USE_ITT_BUILD_ARG(NULL));
1333  break;
1334  }
1335  case bp_tree_bar: {
1336  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1337  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1338  USE_ITT_BUILD_ARG(NULL) );
1339  break;
1340  }
1341  default: {
1342  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1343  USE_ITT_BUILD_ARG(NULL) );
1344  }
1345  }
1346  if (__kmp_tasking_mode != tskm_immediate_exec) {
1347  __kmp_task_team_sync(this_thr, team);
1348  } // if
1349  }
1350  }
1351 }
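/* Illustrative only: how the split form of __kmp_barrier() pairs with __kmp_end_split_barrier()
   above.  With is_split TRUE the master returns status 0 without running the release phase, so
   it can finish the final summation into the shared reduction variable while the workers are
   still held in the release wait; calling __kmp_end_split_barrier() then lets them go.  The
   caller sketched here is hypothetical, not the runtime's actual reduction entry point: */
static void
__sketch_split_reduce(enum barrier_type bt, int gtid)
{
    int status = __kmp_barrier(bt, gtid, TRUE /* is_split */, 0, NULL, NULL);
    if (status == 0) {                      // only the master comes back early (see above)
        /* ... combine the partial results into the shared reduction variable ... */
        __kmp_end_split_barrier(bt, gtid);  // release the workers waiting in the barrier
    }
}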
1352 
1353 
1354 void
1355 __kmp_join_barrier(int gtid)
1356 {
1357  KMP_TIME_BLOCK(KMP_join_barrier);
1358  register kmp_info_t *this_thr = __kmp_threads[gtid];
1359  register kmp_team_t *team;
1360  register kmp_uint nproc;
1361  kmp_info_t *master_thread;
1362  int tid;
1363 #ifdef KMP_DEBUG
1364  int team_id;
1365 #endif /* KMP_DEBUG */
1366 #if USE_ITT_BUILD
1367  void *itt_sync_obj = NULL;
1368 # if USE_ITT_NOTIFY
1369  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless it is needed
1370  // Get object created at fork_barrier
1371  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1372 # endif
1373 #endif /* USE_ITT_BUILD */
1374  KMP_MB();
1375 
1376  // Get current info
1377  team = this_thr->th.th_team;
1378  nproc = this_thr->th.th_team_nproc;
1379  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1380  tid = __kmp_tid_from_gtid(gtid);
1381 #ifdef KMP_DEBUG
1382  team_id = team->t.t_id;
1383 #endif /* KMP_DEBUG */
1384  master_thread = this_thr->th.th_team_master;
1385 #ifdef KMP_DEBUG
1386  if (master_thread != team->t.t_threads[0]) {
1387  __kmp_print_structure();
1388  }
1389 #endif /* KMP_DEBUG */
1390  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1391  KMP_MB();
1392 
1393  // Verify state
1394  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1395  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1396  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1397  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1398  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1399 
1400 #if OMPT_SUPPORT && OMPT_TRACE
1401  if ((ompt_status == ompt_status_track_callback) &&
1402  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1403  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1404  team->t.ompt_team_info.parallel_id,
1405  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1406  }
1407  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1408 #endif
1409 
1410  if (__kmp_tasking_mode == tskm_extra_barrier) {
1411  __kmp_tasking_barrier(team, this_thr, gtid);
1412  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1413  }
1414 # ifdef KMP_DEBUG
1415  if (__kmp_tasking_mode != tskm_immediate_exec) {
1416  KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1417  __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1418  this_thr->th.th_task_team));
1419  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1420  }
1421 # endif /* KMP_DEBUG */
1422 
1423  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1424  team struct is not guaranteed to exist. Doing these loads causes a cache miss and slows
1425  down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1426  since the values are not used by __kmp_wait_template() in that case. */
1427  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1428  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1429  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1430  }
1431 
1432 #if USE_ITT_BUILD
1433  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1434  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1435 #endif /* USE_ITT_BUILD */
1436 
1437  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1438  case bp_hyper_bar: {
1439  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1440  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1441  USE_ITT_BUILD_ARG(itt_sync_obj) );
1442  break;
1443  }
1444  case bp_hierarchical_bar: {
1445  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1446  USE_ITT_BUILD_ARG(itt_sync_obj) );
1447  break;
1448  }
1449  case bp_tree_bar: {
1450  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1451  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1452  USE_ITT_BUILD_ARG(itt_sync_obj) );
1453  break;
1454  }
1455  default: {
1456  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1457  USE_ITT_BUILD_ARG(itt_sync_obj) );
1458  }
1459  }
1460 
1461  /* From this point on, the team data structure may be deallocated at any time by the
1462  master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1463  data items that need to be referenced before the end of the barrier should be moved to
1464  the kmp_task_team_t structs. */
1465  if (KMP_MASTER_TID(tid)) {
1466  if (__kmp_tasking_mode != tskm_immediate_exec) {
1467  // Master shouldn't call decrease_load(). // TODO: enable master threads.
1468  // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1469  __kmp_task_team_wait(this_thr, team
1470  USE_ITT_BUILD_ARG(itt_sync_obj) );
1471  }
1472 #if USE_ITT_BUILD
1473  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1474  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1475 #endif /* USE_ITT_BUILD */
1476 
1477 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1478  // Join barrier - report frame end
1479  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1480 #if OMP_40_ENABLED
1481  this_thr->th.th_teams_microtask == NULL &&
1482 #endif
1483  team->t.t_active_level == 1)
1484  {
1485  kmp_uint64 cur_time = __itt_get_timestamp();
1486  ident_t * loc = team->t.t_ident;
1487  kmp_info_t **other_threads = team->t.t_threads;
1488  int nproc = this_thr->th.th_team_nproc;
1489  int i;
1490  switch(__kmp_forkjoin_frames_mode) {
1491  case 1:
1492  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1493  break;
1494  case 2:
1495  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1496  break;
1497  case 3:
1498  if( __itt_metadata_add_ptr ) {
1499  // Initialize with master's wait time
1500  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1501  for (i=1; i<nproc; ++i) {
1502  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1503  }
1504  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1505  }
1506  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1507  this_thr->th.th_frame_time = cur_time;
1508  break;
1509  }
1510  }
1511 # endif /* USE_ITT_BUILD */
1512  }
1513 #if USE_ITT_BUILD
1514  else {
1515  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1516  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1517  }
1518 #endif /* USE_ITT_BUILD */
1519 
1520 #if KMP_DEBUG
1521  if (KMP_MASTER_TID(tid)) {
1522  KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1523  gtid, team_id, tid, nproc));
1524  }
1525 #endif /* KMP_DEBUG */
1526 
1527  // TODO now, mark worker threads as done so they may be disbanded
1528  KMP_MB(); // Flush all pending memory write invalidates.
1529  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1530 
1531 #if OMPT_SUPPORT
1532  if (ompt_status & ompt_status_track) { // bit test so ompt_status_track_callback is included (matches __kmp_barrier above)
1533 #if OMPT_TRACE
1534  if ((ompt_status == ompt_status_track_callback) &&
1535  ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1536  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1537  team->t.ompt_team_info.parallel_id,
1538  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1539  }
1540 #endif
1541 
1542  // return to default state
1543  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1544  }
1545 #endif
1546 }
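/* Illustrative only: the blocktime copy in __kmp_join_barrier() above is one instance of a
   general rule around this barrier -- anything a thread will need while it waits must be
   cached in its own kmp_info_t beforehand, because the master may deallocate the team
   structure at any time once the gather completes (see the comment ahead of the gather
   switch).  A condensed sketch of that caching step (the helper name is hypothetical; the
   field accesses mirror the code above): */
static void
__sketch_cache_blocktime(kmp_info_t *thr, kmp_team_t *team, int tid)
{
    // Snapshot the per-team blocktime ICVs into the thread struct ...
    thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    thr->th.th_team_bt_set       = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
    // ... so the wait loop only ever reads thr->th.*, never team->t.*.
}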
1547 
1548 
1549 // TODO release worker threads' fork barriers as we are ready instead of all at once
1550 void
1551 __kmp_fork_barrier(int gtid, int tid)
1552 {
1553  KMP_TIME_BLOCK(KMP_fork_barrier);
1554  kmp_info_t *this_thr = __kmp_threads[gtid];
1555  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1556 #if USE_ITT_BUILD
1557  void * itt_sync_obj = NULL;
1558 #endif /* USE_ITT_BUILD */
1559 
1560  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1561  gtid, (team != NULL) ? team->t.t_id : -1, tid));
1562 
1563  // th_team pointer only valid for master thread here
1564  if (KMP_MASTER_TID(tid)) {
1565 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1566  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1567  // Create itt barrier object
1568  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1569  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1570  }
1571 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1572 
1573 #ifdef KMP_DEBUG
1574  register kmp_info_t **other_threads = team->t.t_threads;
1575  register int i;
1576 
1577  // Verify state
1578  KMP_MB();
1579 
1580  for(i=1; i<team->t.t_nproc; ++i) {
1581  KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1582  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1583  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1584  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1585  KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1586  & ~(KMP_BARRIER_SLEEP_STATE))
1587  == KMP_INIT_BARRIER_STATE);
1588  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1589  }
1590 #endif
1591 
1592  if (__kmp_tasking_mode != tskm_immediate_exec) {
1593  __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates setup both task teams if nthreads > 1
1594  }
1595 
1596  /* The master thread may have changed its blocktime between the join barrier and the
1597  fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1598  access it when the team struct is not guaranteed to exist. */
1599  // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1600  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1601  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1602  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1603  }
1604  } // master
1605 
1606  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1607  case bp_hyper_bar: {
1608  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1609  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1610  USE_ITT_BUILD_ARG(itt_sync_obj) );
1611  break;
1612  }
1613  case bp_hierarchical_bar: {
1614  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1615  USE_ITT_BUILD_ARG(itt_sync_obj) );
1616  break;
1617  }
1618  case bp_tree_bar: {
1619  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1620  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1621  USE_ITT_BUILD_ARG(itt_sync_obj) );
1622  break;
1623  }
1624  default: {
1625  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1626  USE_ITT_BUILD_ARG(itt_sync_obj) );
1627  }
1628  }
1629 
1630  // Early exit for reaping threads releasing forkjoin barrier
1631  if (TCR_4(__kmp_global.g.g_done)) {
1632  if (this_thr->th.th_task_team != NULL) {
1633  if (KMP_MASTER_TID(tid)) {
1634  TCW_PTR(this_thr->th.th_task_team, NULL);
1635  }
1636  else {
1637  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1638  }
1639  }
1640 
1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1642  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1643  if (!KMP_MASTER_TID(tid)) {
1644  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1645  if (itt_sync_obj)
1646  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1647  }
1648  }
1649 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1650  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1651  return;
1652  }
1653 
1654  /* We can now assume that a valid team structure has been allocated by the master and
1655  propagated to all worker threads. The current thread, however, may not be part of the
1656  team, so we can't blindly assume that the team pointer is non-null. */
1657  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1658  KMP_DEBUG_ASSERT(team != NULL);
1659  tid = __kmp_tid_from_gtid(gtid);
1660 
1661 
1662 #if KMP_BARRIER_ICV_PULL
1663  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1664  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1665  this data before this function is called. We cannot modify __kmp_fork_call() to look at
1666  the fixed ICVs in the master's thread struct, because it is not always the case that the
1667  threads arrays have been allocated when __kmp_fork_call() is executed. */
1668  KMP_START_EXPLICIT_TIMER(USER_icv_copy);
1669  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1670  // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1671  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1672  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1673  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1674  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1675  }
1676  KMP_STOP_EXPLICIT_TIMER(USER_icv_copy);
1677 #endif // KMP_BARRIER_ICV_PULL
1678 
1679  if (__kmp_tasking_mode != tskm_immediate_exec) {
1680  __kmp_task_team_sync(this_thr, team);
1681  }
1682 
1683 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1684  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1685  if (proc_bind == proc_bind_intel) {
1686 #endif
1687 #if KMP_AFFINITY_SUPPORTED
1688  // Call dynamic affinity settings
1689  if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1690  __kmp_balanced_affinity(tid, team->t.t_nproc);
1691  }
1692 #endif // KMP_AFFINITY_SUPPORTED
1693 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1694  }
1695  else if (proc_bind != proc_bind_false) {
1696  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1697  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1698  __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1699  }
1700  else {
1701  __kmp_affinity_set_place(gtid);
1702  }
1703  }
1704 #endif
1705 
1706 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1707  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1708  if (!KMP_MASTER_TID(tid)) {
1709  // Get correct barrier object
1710  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1711  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1712  } // (prepare called inside barrier_release)
1713  }
1714 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1715  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1716 }
1717 
1718 
1719 void
1720 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1721 {
1722  KMP_TIME_BLOCK(KMP_setup_icv_copy);
1723  int f;
1724 
1725  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1726  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1727 
1728  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1729  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1730  this data before this function is called. */
1731 #if KMP_BARRIER_ICV_PULL
1732  /* Copy the ICVs into th_fixed_icvs in the master's thread structure (which is left untouched
1733  afterwards), where all of the worker threads can access them and make their own copies after the barrier. */
1734  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1735  copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1736  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1737  0, team->t.t_threads[0], team));
1738 #elif KMP_BARRIER_ICV_PUSH
1739  // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1740  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1741  0, team->t.t_threads[0], team));
1742 #else
1743  // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1744  ngo_load(new_icvs);
1745  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1746  for (f=1; f<new_nproc; ++f) { // Skip the master thread
1747  // TODO: GEH - pass in better source location info since usually NULL here
1748  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1749  f, team->t.t_threads[f], team));
1750  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1751  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1752  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1753  f, team->t.t_threads[f], team));
1754  }
1755  ngo_sync();
1756 #endif // KMP_BARRIER_ICV_PULL
1757 }
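/* Illustrative only: the three compile-time ICV propagation strategies above pair with the fork
   barrier as follows.  PULL: this routine stores the new ICVs into the master's th_fixed_icvs and
   each worker copies them out for itself in __kmp_fork_barrier() (its KMP_BARRIER_ICV_PULL block).
   PUSH: the ICVs are propagated as part of the fork-barrier release.  Otherwise: the master copies
   them into every worker's implicit task right here, in O(nthreads) time.  A condensed sketch of
   the worker-side pull (the helper name is hypothetical; the copy mirrors __kmp_fork_barrier()): */
static void
__sketch_icv_pull(kmp_team_t *team, int tid)
{
    // A worker copies the master's fixed ICVs into its own implicit task after being released.
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
}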