#include "toeplitz.h"

#include "gpu_vector_kernel.cu"
/*
 * Block-cooperative copy of size bytes, moved as 4-byte words.  Assumes both
 * pointers are 4-byte aligned and size is a multiple of 4; every thread in
 * the block must reach the call.
 */
__device__ void memcpy4(void *dest, const void *src, unsigned int size) {

    unsigned int tid = threadIdx.x;
    float *destf = (float *)dest;
    const float *srcf = (const float *)src;

    __syncthreads();

    // One word per thread, striding the whole block over the buffer.
    size >>= 2;
    for(unsigned int b=0; b<size; b+=blockSize, srcf+=blockSize, destf+=blockSize) {
        unsigned int i = b + tid;
        if(i < size) destf[tid] = srcf[tid];
    }

    __syncthreads();
}
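
/*
 * Usage note (a sketch, not from the original source): memcpy4 is typically
 * used to stage a small struct in shared memory, e.g.
 *
 *     __shared__ interval_gpu_info info;
 *     memcpy4(&info, exec_window + blockIdx.x, sizeof(interval_gpu_info));
 *
 * exactly as si2w_parallel() does below.
 */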

/*
 * Levinson-Durbin recursion: solves the symmetric Toeplitz system T x = b,
 * where T has diagonal td and first column (td, t[0], ..., t[n-2]).  b is
 * scaled by 1/td in place.  work must hold at least 2 * align(n) elements
 * (the backward vector y and the normalized off-diagonals tn); swork is
 * shared scratch for the vector primitives.
 */
__device__ void levinson(unsigned int n, T td, const T *t, T *b, T *x, T *work, T *swork) {

    unsigned int tid = threadIdx.x;
    unsigned int n_align = align(n);

    if(n == 0) return;

#ifdef SHOW_PARAMS
    // Device-side printf; fprintf is not available in device code.
    if(tid == 0) {
        printf("levinson: n = %u, td = %.6f, t = [", n, td);
        for(unsigned int i=0; i<n-1; ++i) printf(" %.6f", t[i]);
        printf(" ], b = [");
        for(unsigned int i=0; i<n; ++i) printf(" %.6f", b[i]);
        printf(" ]\n");
    }
#endif

    // Workspace layout: backward vector y, then the normalized off-diagonals.
    T *y = work;
    T *tn = y + n_align;

    // Half-warp caches of the b and tn entries around the current step.
    __shared__ T b_cache[half_warp], tn_cache[half_warp];
    __shared__ T alpha, mu;
    T beta = (T)1.0;            // Only thread 0's copy is updated and read.

    // Normalize the system by the diagonal: tn = t / td, b = b / td.
    T factor = (T)1.0 / td;
    for(unsigned int i=0; i<n-1; i+=blockSize) {
        unsigned int ti = i + tid;
        if(ti < n-1) {
            T aux = t[ti] * factor;
            if(ti < half_warp) tn_cache[ti] = aux;
            tn[ti] = aux;
        }

        __syncthreads();
    }

    for(unsigned int i=0; i<n; i+=blockSize) {
        unsigned int bi = i + tid;
        if(bi < n) {
            T aux = b[bi] * factor;
            if(bi < half_warp) b_cache[bi] = aux;
            b[bi] = aux;
        }

        __syncthreads();
    }

    // Order-1 solution.
    if(tid == 0) {
        x[0] = b_cache[0];
        if(n > 1) {             // tn_cache[0] only exists for n > 1.
            y[0] = -tn_cache[0];
            alpha = -tn_cache[0];
        }
    }
    __syncthreads();

    // Grow the solution by one order per iteration.
    for(unsigned int k=1; k<n; ++k) {

        // Refill the half-warp caches every half_warp steps.
        unsigned int k_half_warp = k & half_warp_mask;
        if(k_half_warp == 0) {
            if(tid < half_warp) {
                unsigned int i = k + tid;
                if(i < n) {
                    b_cache[tid] = b[i];
                    tn_cache[tid] = tn[i];
                }
            }
            __syncthreads();
        }

        // mu = (b[k] - <reverse(tn), x>) / beta, with beta updated from alpha.
        T dot_result = dot_reverse_y(k, tn, x, swork);
        if(tid == 0) {
            beta = ((T)1.0 - alpha * alpha) * beta;
            mu = (b_cache[k_half_warp] - dot_result) / beta;
        }
        __syncthreads();

        // x[0..k-1] += mu * reverse(y); x[k] = mu.
        axpy_reverse_x(k, mu, y, x, swork);
        if(tid == k_half_warp) x[k] = mu;
        __syncthreads();

        // Update the backward vector y for the next order.
        if(k < n-1) {
            dot_result = dot_reverse_y(k, tn, y, swork);
            if(tid == 0) alpha = -(tn_cache[k_half_warp] + dot_result) / beta;
            __syncthreads();

            axpxb_reverse_x(k, alpha, y, y, (T)1.0, swork);
            if(tid == k_half_warp) y[k] = alpha;
        }

        __syncthreads();
    }
}
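
/*
 * A minimal usage sketch (not part of the original source): a kernel that
 * solves a single system with levinson().  The name levinson_single and the
 * assumption that the vector primitives need blockSize elements of shared
 * scratch are mine, not the library's.
 */
__global__ void levinson_single(unsigned int n, T td, const T *t, T *b, T *x, T *work) {
    extern __shared__ T swork[];
    // work must provide at least 2 * align(n) elements (y and tn).
    levinson(n, td, t, b, x, work, swork);
}
// Possible launch, one block since levinson() is block-cooperative:
//   levinson_single<<<1, blockSize, blockSize * sizeof(T)>>>(n, td, d_t, d_b, d_x, d_work);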

/*
 * Classical Gram-Schmidt reorthogonalization.  With the k_size basis vectors
 * stored as rows of p (row stride align(n)), this computes the projections
 * h = P v and removes them, v -= P^T h.  swork must hold at least
 * 2 * blockSize elements.
 */
__device__ void reorthogonalize(unsigned int n, unsigned int k_size, const T *p, T *v, T *h, T *swork) {

    unsigned int tid = threadIdx.x;
    unsigned int n_align = align(n);

    // Phase 1: h = P v, one dot product per basis vector, buffered through
    // shared memory half a warp at a time.
    const T *pk = p;
    for(unsigned int bk=0; bk<k_size; bk+=half_warp) {

        __shared__ T dr[half_warp];

        for(unsigned int w=0; w<half_warp && bk+w<k_size; ++w, pk+=n_align) {
            T aux = dot(n, pk, v, swork);
            if(tid == 0) dr[w] = aux;
            __syncthreads();
        }

        unsigned int bi = bk + tid;
        if(tid < half_warp && bi < k_size) h[bi] = dr[tid];
        __syncthreads();
    }

    // Phase 2: v -= P^T h, accumulating over tiles of blockSize coefficients.
    for(unsigned int bn=0; bn<n; bn+=blockSize) {
        unsigned int j = bn + tid;

        T *bh = swork;              // tile of h
        T *ba = bh + blockSize;     // per-thread accumulator

        ba[tid] = (T)0.0;
        __syncthreads();

        for(unsigned int bk=0; bk<k_size; bk+=blockSize) {
            unsigned int k = bk + tid;

            if(k < k_size) bh[tid] = h[k];
            __syncthreads();

            // The tile offset bk must enter both the coefficient bound and
            // the row index into p.
            for(unsigned int ci=0; ci<blockSize && bk+ci<k_size; ++ci) {
                if(j < n) ba[tid] += bh[ci] * p[(bk + ci) * n_align + j];
            }
            __syncthreads();        // bh is overwritten by the next tile
        }

        if(j < n) v[j] -= ba[tid];
    }

    __syncthreads();
}
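
/*
 * Usage note (inferred from si2w() below): each Lanczos step calls this once
 * per branch, reorthogonalize(n, k, p, vs, work_ort, swork), after k vectors
 * have been produced, with work_ort supplying the align(n)-element buffer
 * used for h.
 */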

/*
 * Shift-and-invert two-way Lanczos iteration for the symmetric Toeplitz
 * matrix with diagonal td and off-diagonals t.  Resumes at step min_k,
 * advances at most num_it steps, and never goes past max_k.  p and q hold
 * the symmetric and antisymmetric Lanczos bases (row stride align(n)), m
 * the tridiagonal entries of the shift-inverted operator (T - sigma I)^{-1},
 * and work at least 5 * align(n) scratch elements (two of which are consumed
 * by levinson()).
 */
__device__ void si2w(unsigned int n, T td, const T *t, unsigned int min_k, unsigned int num_it, unsigned int max_k, T sigma,
                     T *p, T *q, tridiagonal_entry *m, T *work) {

    if(min_k > max_k || num_it == 0 || n == 0) return;

    unsigned int tid = threadIdx.x;
    unsigned int n_align = align(n);

    extern __shared__ T swork[];
    __shared__ tridiagonal_entry mk;

    // When resuming, reload the last stored tridiagonal entry; this assumes
    // tridiagonal_entry is exactly four T fields (alpha, beta, gamma, delta).
    if(min_k > 0) {
        if(tid < 4) ((T *)&mk)[tid] = ((T *)&m[min_k - 1])[tid];
        __syncthreads();
    }

    // Fresh start: p0 = (1, 0, ..., 0, 1)/sqrt(2) is symmetric under
    // reversal, q0 = (1, 0, ..., 0, -1)/sqrt(2) antisymmetric.
    if(min_k == 0) {
        const T isq2 = (T)0.7071068;    // 1/sqrt(2)
        for(unsigned int b=0; b<n; b+=blockSize) {
            unsigned int i = b + tid;
            if(i == 0) { p[i] = isq2; q[i] = isq2; }
            else if(i == n-1) { p[i] = isq2; q[i] = -isq2; }
            else if(i < n) { p[i] = (T)0.0; q[i] = (T)0.0; }

            __syncthreads();
        }
    }

    // Workspace layout: w (the Levinson right-hand side) and va share the
    // first slot, since w is consumed before va is written; work_ort likewise
    // reuses v once the solve is done.
    T *w = work;
    T *va = work;

    T *vs = work + n_align;

    T *v = work + n_align * 2;
    T *work_ort = v;

    T *work_lev = work + n_align * 3;

    T *pk = p + min_k * n_align;
    T *qk = q + min_k * n_align;
    unsigned int max_it = min(max_k, min_k + num_it);
    __syncthreads();

    for(unsigned int k=min_k+1; k<=max_it; ++k) {

        // Combined right-hand side w = pk + qk.
        for(unsigned int b=0; b<n; b+=blockSize) {
            unsigned int i = b + tid;
            if(i < n) w[i] = pk[i] + qk[i];
        }
        __syncthreads();

        // Solve the shifted Toeplitz system (T - sigma I) v = w.
        levinson(n, td - sigma, t, w, v, work_lev, swork);

        // Split v into its symmetric (vs) and antisymmetric (va) parts.
        axpxb_reverse_x(n, (T)1.0, v, vs, (T)0.5, swork);
        axpxb_reverse_x(n, -(T)1.0, v, va, (T)0.5, swork);

        // Diagonal entries of the two tridiagonal branches.
        T aux_alpha = dot(n, vs, pk, swork);
        T aux_gamma = dot(n, va, qk, swork);

        if(tid == 0) { mk.alpha = aux_alpha; mk.gamma = aux_gamma; }
        __syncthreads();

        if(k < max_k) {

            // Three-term recurrence: remove the current Lanczos directions...
            axpy(n, -mk.alpha, pk, vs);
            axpy(n, -mk.gamma, qk, va);

            // ...and the previous ones.
            if(k > 1) {
                axpy(n, -mk.beta, pk - n_align, vs);
                axpy(n, -mk.delta, qk - n_align, va);
            }

            // Full reorthogonalization against all previous vectors.
            reorthogonalize(n, k, p, vs, work_ort, swork);
            reorthogonalize(n, k, q, va, work_ort, swork);

            // Off-diagonal entries are the residual norms.
            T aux_beta = norm(n, vs, swork);
            T aux_delta = norm(n, va, swork);
            if(tid == 0) { mk.beta = aux_beta; mk.delta = aux_delta; }
            __syncthreads();

            // Normalize the residuals into the next basis vectors.
            pk += n_align; qk += n_align;
            T ibk = (T)1.0 / mk.beta;
            T idk = (T)1.0 / mk.delta;

            for(unsigned int b=0; b<n; b+=blockSize) {
                unsigned int i = b + tid;
                if(i < n) {
                    pk[i] = vs[i] * ibk;
                    qk[i] = va[i] * idk;
                }
            }
            __syncthreads();
        }

        // Store this step's tridiagonal entry (same 4-field layout as above).
        if(tid < 4) ((T *)&m[k-1])[tid] = ((T *)&mk)[tid];
        __syncthreads();
    }
}
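
/*
 * Sketch of the output layout (inferred, not stated in the source): step k
 * writes m[k-1], so after K steps the symmetric branch defines the K x K
 * tridiagonal with diagonal alpha_1..alpha_K and off-diagonals
 * beta_1..beta_{K-1}, and the antisymmetric branch the analogous matrix
 * built from the gamma and delta fields.
 */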

/*
 * One thread block per interval descriptor: each block stages its
 * interval_gpu_info in shared memory and, unless the interval is already
 * processed, advances its own si2w iteration by up to inc_k steps.
 */
__global__ void si2w_parallel(unsigned int n, T td, T *t, unsigned int k, unsigned int inc_k, unsigned int max_k, interval_gpu_info *exec_window) {

    // Stage this block's descriptor in shared memory.
    __shared__ interval_gpu_info i;
    memcpy4(&i, exec_window + blockIdx.x, sizeof(interval_gpu_info));

    // Skip intervals that are already done.
    if(i.processed) return;

    // Translate the global step counter into this interval's local step.
    unsigned int k_real = k - i.start_k;
    if(k_real <= max_k) si2w(n, td, t, k_real, inc_k, max_k, i.sigma, i.p, i.q, i.m, i.workspace);
}
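
/*
 * A host-side launch sketch (mine, not from the original source): the names
 * si2w_step, d_t, d_windows and num_windows are assumptions, as is the
 * dynamic shared size, taken as 2 * blockSize elements -- the most the swork
 * scratch is explicitly asked to hold here (by reorthogonalize()) and
 * assumed sufficient for the dot/axpy reductions as well.
 */
static void si2w_step(unsigned int n, T td, T *d_t, unsigned int k, unsigned int inc_k,
                      unsigned int max_k, interval_gpu_info *d_windows, unsigned int num_windows) {
    size_t shared_bytes = 2 * blockSize * sizeof(T);    // swork for the vector primitives
    si2w_parallel<<<num_windows, blockSize, shared_bytes>>>(n, td, d_t, k, inc_k, max_k, d_windows);
}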