23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
26__device__
static inline unsigned __hip_ds_bpermute(
int index,
unsigned src) {
27 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
28 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
32__device__
static inline float __hip_ds_bpermutef(
int index,
float src) {
33 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
34 tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
38__device__
static inline unsigned __hip_ds_permute(
int index,
unsigned src) {
39 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
40 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
44__device__
static inline float __hip_ds_permutef(
int index,
float src) {
45 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
46 tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
50#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
51#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
54__device__
static inline unsigned __hip_ds_swizzle_N(
unsigned int src) {
55 union {
int i;
unsigned u;
float f; } tmp; tmp.u = src;
56 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
61__device__
static inline float __hip_ds_swizzlef_N(
float src) {
62 union {
int i;
unsigned u;
float f; } tmp; tmp.f = src;
63 tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
67#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
68 __hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
70template <
int dpp_ctrl,
int row_mask,
int bank_mask,
bool bound_ctrl>
71__device__
static inline int __hip_move_dpp_N(
int src) {
72 return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
76static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
80int __shfl(
int var,
int src_lane,
int width = warpSize) {
81 int self = __lane_id();
82 int index = (src_lane & (width - 1)) + (self & ~(width-1));
83 return __builtin_amdgcn_ds_bpermute(index<<2, var);
87unsigned int __shfl(
unsigned int var,
int src_lane,
int width = warpSize) {
88 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
89 tmp.i = __shfl(tmp.i, src_lane, width);
94float __shfl(
float var,
int src_lane,
int width = warpSize) {
95 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
96 tmp.i = __shfl(tmp.i, src_lane, width);
101double __shfl(
double var,
int src_lane,
int width = warpSize) {
102 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
103 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
105 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
106 tmp[0] = __shfl(tmp[0], src_lane, width);
107 tmp[1] = __shfl(tmp[1], src_lane, width);
109 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
110 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
115long __shfl(
long var,
int src_lane,
int width = warpSize)
118 static_assert(
sizeof(long) == 2 *
sizeof(
int),
"");
119 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
121 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
122 tmp[0] = __shfl(tmp[0], src_lane, width);
123 tmp[1] = __shfl(tmp[1], src_lane, width);
125 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
126 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
129 static_assert(
sizeof(long) ==
sizeof(
int),
"");
130 return static_cast<long>(__shfl(
static_cast<int>(var), src_lane, width));
135unsigned long __shfl(
unsigned long var,
int src_lane,
int width = warpSize) {
137 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
138 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
140 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
141 tmp[0] = __shfl(tmp[0], src_lane, width);
142 tmp[1] = __shfl(tmp[1], src_lane, width);
144 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
145 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
148 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
149 return static_cast<unsigned long>(__shfl(
static_cast<unsigned int>(var), src_lane, width));
154long long __shfl(
long long var,
int src_lane,
int width = warpSize)
156 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
157 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
159 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
160 tmp[0] = __shfl(tmp[0], src_lane, width);
161 tmp[1] = __shfl(tmp[1], src_lane, width);
163 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
164 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
169unsigned long long __shfl(
unsigned long long var,
int src_lane,
int width = warpSize) {
170 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
171 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
173 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
174 tmp[0] = __shfl(tmp[0], src_lane, width);
175 tmp[1] = __shfl(tmp[1], src_lane, width);
177 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
178 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
184int __shfl_up(
int var,
unsigned int lane_delta,
int width = warpSize) {
185 int self = __lane_id();
186 int index = self - lane_delta;
187 index = (index < (self & ~(width-1)))?self:index;
188 return __builtin_amdgcn_ds_bpermute(index<<2, var);
192unsigned int __shfl_up(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
193 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
194 tmp.i = __shfl_up(tmp.i, lane_delta, width);
199float __shfl_up(
float var,
unsigned int lane_delta,
int width = warpSize) {
200 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
201 tmp.i = __shfl_up(tmp.i, lane_delta, width);
206double __shfl_up(
double var,
unsigned int lane_delta,
int width = warpSize) {
207 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
208 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
210 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
211 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
212 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
214 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
215 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
220long __shfl_up(
long var,
unsigned int lane_delta,
int width = warpSize)
223 static_assert(
sizeof(long) == 2 *
sizeof(
int),
"");
224 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
226 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
227 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
228 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
230 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
231 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
234 static_assert(
sizeof(long) ==
sizeof(
int),
"");
235 return static_cast<long>(__shfl_up(
static_cast<int>(var), lane_delta, width));
241unsigned long __shfl_up(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
244 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
245 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
247 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
248 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
249 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
251 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
252 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
255 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
256 return static_cast<unsigned long>(__shfl_up(
static_cast<unsigned int>(var), lane_delta, width));
262long long __shfl_up(
long long var,
unsigned int lane_delta,
int width = warpSize)
264 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
265 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
266 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
267 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
268 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
269 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
270 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
276unsigned long long __shfl_up(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
278 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
279 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
280 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
281 tmp[0] = __shfl_up(tmp[0], lane_delta, width);
282 tmp[1] = __shfl_up(tmp[1], lane_delta, width);
283 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
284 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
290int __shfl_down(
int var,
unsigned int lane_delta,
int width = warpSize) {
291 int self = __lane_id();
292 int index = self + lane_delta;
293 index = (int)((self&(width-1))+lane_delta) >= width?self:index;
294 return __builtin_amdgcn_ds_bpermute(index<<2, var);
298unsigned int __shfl_down(
unsigned int var,
unsigned int lane_delta,
int width = warpSize) {
299 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
300 tmp.i = __shfl_down(tmp.i, lane_delta, width);
305float __shfl_down(
float var,
unsigned int lane_delta,
int width = warpSize) {
306 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
307 tmp.i = __shfl_down(tmp.i, lane_delta, width);
312double __shfl_down(
double var,
unsigned int lane_delta,
int width = warpSize) {
313 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
314 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
316 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
317 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
318 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
320 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
321 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
326long __shfl_down(
long var,
unsigned int lane_delta,
int width = warpSize)
329 static_assert(
sizeof(long) == 2 *
sizeof(
int),
"");
330 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
332 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
333 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
334 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
336 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
337 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
340 static_assert(
sizeof(long) ==
sizeof(
int),
"");
341 return static_cast<long>(__shfl_down(
static_cast<int>(var), lane_delta, width));
346unsigned long __shfl_down(
unsigned long var,
unsigned int lane_delta,
int width = warpSize)
349 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
350 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
352 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
353 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
354 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
356 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
357 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
360 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
361 return static_cast<unsigned long>(__shfl_down(
static_cast<unsigned int>(var), lane_delta, width));
366long long __shfl_down(
long long var,
unsigned int lane_delta,
int width = warpSize)
368 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
369 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
370 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
371 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
372 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
373 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
374 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
379unsigned long long __shfl_down(
unsigned long long var,
unsigned int lane_delta,
int width = warpSize)
381 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
382 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
383 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
384 tmp[0] = __shfl_down(tmp[0], lane_delta, width);
385 tmp[1] = __shfl_down(tmp[1], lane_delta, width);
386 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
387 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
393int __shfl_xor(
int var,
int lane_mask,
int width = warpSize) {
394 int self = __lane_id();
395 int index = self^lane_mask;
396 index = index >= ((self+width)&~(width-1))?self:index;
397 return __builtin_amdgcn_ds_bpermute(index<<2, var);
401unsigned int __shfl_xor(
unsigned int var,
int lane_mask,
int width = warpSize) {
402 union {
int i;
unsigned u;
float f; } tmp; tmp.u = var;
403 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
408float __shfl_xor(
float var,
int lane_mask,
int width = warpSize) {
409 union {
int i;
unsigned u;
float f; } tmp; tmp.f = var;
410 tmp.i = __shfl_xor(tmp.i, lane_mask, width);
415double __shfl_xor(
double var,
int lane_mask,
int width = warpSize) {
416 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
417 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
419 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
420 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
421 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
423 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
424 double tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
429long __shfl_xor(
long var,
int lane_mask,
int width = warpSize)
432 static_assert(
sizeof(long) == 2 *
sizeof(
int),
"");
433 static_assert(
sizeof(long) ==
sizeof(uint64_t),
"");
435 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
436 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
437 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
439 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
440 long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
443 static_assert(
sizeof(long) ==
sizeof(
int),
"");
444 return static_cast<long>(__shfl_xor(
static_cast<int>(var), lane_mask, width));
449unsigned long __shfl_xor(
unsigned long var,
int lane_mask,
int width = warpSize)
452 static_assert(
sizeof(
unsigned long) == 2 *
sizeof(
unsigned int),
"");
453 static_assert(
sizeof(
unsigned long) ==
sizeof(uint64_t),
"");
455 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
456 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
457 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
459 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
460 unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
463 static_assert(
sizeof(
unsigned long) ==
sizeof(
unsigned int),
"");
464 return static_cast<unsigned long>(__shfl_xor(
static_cast<unsigned int>(var), lane_mask, width));
469long long __shfl_xor(
long long var,
int lane_mask,
int width = warpSize)
471 static_assert(
sizeof(
long long) == 2 *
sizeof(
int),
"");
472 static_assert(
sizeof(
long long) ==
sizeof(uint64_t),
"");
473 int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
474 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
475 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
476 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
477 long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
482unsigned long long __shfl_xor(
unsigned long long var,
int lane_mask,
int width = warpSize)
484 static_assert(
sizeof(
unsigned long long) == 2 *
sizeof(
unsigned int),
"");
485 static_assert(
sizeof(
unsigned long long) ==
sizeof(uint64_t),
"");
486 unsigned int tmp[2]; __builtin_memcpy(tmp, &var,
sizeof(tmp));
487 tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
488 tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
489 uint64_t tmp0 = (
static_cast<uint64_t
>(tmp[1]) << 32ull) |
static_cast<uint32_t
>(tmp[0]);
490 unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));