-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpredict_model_snippet.cu
148 lines (120 loc) · 4.08 KB
/
predict_model_snippet.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#include <cub/cub.cuh>
/* sum: 2x1 array */
/*
* kernel_array_beam_slave_sincos kernel that performs the reduction manually, instead of using a library
*
* This kernel can be executed with any number of threads per block, as long as it's a power of 2
* The number of threads per block is however unrelated to N
* This kernel is also intended to be executed with only a single thread block
*/
extern "C"
__global__ void
sincos_manual(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum) {
int n=threadIdx.x;
__shared__ float sh_sin[block_size_x];
__shared__ float sh_cos[block_size_x];
float l_sin = 0.0f;
float l_cos = 0.0f;
//thread block iterates over elements 0 to N
//values are accumulated in shared memory
for (int i=n; i<N; i+=block_size_x) {
float ss,cc;
sincosf((r1*__ldg(&x[i])+r2*__ldg(&y[i])+r3*__ldg(&z[i])),&ss,&cc);
l_sin += ss;
l_cos += cc;
}
sh_sin[n] = l_sin;
sh_cos[n] = l_cos;
__syncthreads();
//reduction loop
#pragma unroll
for (unsigned int s=block_size_x/2; s>0; s>>=1) {
if (n < s) {
sh_sin[n] += sh_sin[n+s];
sh_cos[n] += sh_cos[n+s];
}
__syncthreads();
}
// now thread 0 will add up results
if (n==0) {
sum[0] = sh_sin[0];
sum[1] = sh_cos[0];
}
}
/*
* kernel_array_beam_slave_sincos kernel that performs the reduction manually, instead of using a library
*
* This kernel can be executed with any number of threads per block, recommended to use a multiple of 32
* The number of threads per block is unrelated to N
* This kernel is also intended to be executed with only a single thread block
*/
extern "C"
__global__ void
sincos_cub(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum) {
int n=threadIdx.x;
// Specialize BlockReduce for a 1D block of block_size_x threads on type T
typedef cub::BlockReduce<float, block_size_x> BlockReduce;
// Allocate shared memory for BlockReduce
__shared__ typename BlockReduce::TempStorage temp_storage;
float sin_sum = 0.0f;
float cos_sum = 0.0f;
// accumulate in thread-local registers
for (int i=n; i<N; i+=block_size_x) {
float ss,cc;
sincosf((r1*__ldg(&x[i])+r2*__ldg(&y[i])+r3*__ldg(&z[i])),&ss,&cc);
sin_sum += ss;
cos_sum += cc;
}
__syncthreads();
//reduce using CUB library
sin_sum = BlockReduce(temp_storage).Sum(sin_sum);
__syncthreads(); // because temp_storage is reused, sync is needed here
cos_sum = BlockReduce(temp_storage).Sum(cos_sum);
//thread 0 stores per-thread block result
if (n==0) {
sum[0]=sin_sum;
sum[1]=cos_sum;
}
}
// Old kernel
extern "C"
__global__ void
kernel_array_beam_slave_sincos_original(int N, float r1, float r2, float r3, float *x, float *y, float *z, float *sum, int blockDim_2) {
unsigned int n=threadIdx.x; //+blockDim.x*blockIdx.x;
__shared__ float tmpsum[1000]; /* assumed to be size 2*Nx1 */
if (n<N) {
float ss,cc;
sincosf((r1*__ldg(&x[n])+r2*__ldg(&y[n])+r3*__ldg(&z[n])),&ss,&cc);
tmpsum[2*n]=ss;
tmpsum[2*n+1]=cc;
}
__syncthreads();
// Build summation tree over elements, handling case where total threads is not a power of two.
int nTotalThreads = blockDim_2; // Total number of threads (==N), rounded up to the next power of two
while(nTotalThreads > 1) {
int halfPoint = (nTotalThreads >> 1); // divide by two
if (n < halfPoint) {
int thread2 = n + halfPoint;
if (thread2 < blockDim.x) { // Skipping the fictitious threads >N ( blockDim.x ... blockDim_2-1 )
tmpsum[2*n] = tmpsum[2*n]+tmpsum[2*thread2];
tmpsum[2*n+1] = tmpsum[2*n+1]+tmpsum[2*thread2+1];
}
}
__syncthreads();
nTotalThreads = halfPoint; // Reducing the binary tree size by two
}
/* now thread 0 will add up results */
if (threadIdx.x==0) {
sum[0]=tmpsum[0];
sum[1]=tmpsum[1];
}
}
__device__ int
NearestPowerOf2 (int n){
if (!n) return n; //(0 == 2^0)
int x = 1;
while(x < n) {
x <<= 1;
}
return x;
}