00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00028
00029 #include "toeplitz.h"
00030
00040 __device__ T norm(unsigned int n, const T *x, T *swork) {
00041
00042
00043 unsigned int tid = threadIdx.x;
00044 T *br = swork;
00045 T result = (T)0.0;
00046
00047 #ifdef SINGLE_THREAD
00048
00049 if(tid == 0) {
00050 for(unsigned int i=0; i<n; ++i) result += x[i] * x[i];
00051 #ifdef SINGLE
00052 result = sqrtf(result);
00053 #else
00054 result = sqrt(result);
00055 #endif
00056 }
00057 __syncthreads();
00058 return result;
00059 #endif
00060
00061
00062 for(unsigned int b=0; b<n; b+=blockSize) {
00063 unsigned int i = b + tid;
00064
00065
00066 if(i < n) {
00067 T aux = x[i];
00068 br[tid] = aux * aux;
00069 } else br[tid] = (T)0.0;
00070 __syncthreads();
00071
00072
00073
00074 int s = blockSize;
00075 while(s > 8) {
00076 s >>= 1;
00077 if(tid < s) br[tid] += br[tid + s];
00078 __syncthreads();
00079 }
00080
00081 if(tid == 0) {
00082 for(unsigned int j=0; j<s; ++j) result += br[j];
00083 }
00084 __syncthreads();
00085 }
00086
00087 if(tid == 0) {
00088 #ifdef SINGLE
00089 result = sqrtf(result);
00090 #else
00091 result = sqrt(result);
00092 #endif
00093 }
00094 __syncthreads();
00095
00096
00097 return result;
00098 }
00099
00100
00110 __device__ T dot(unsigned int n, const T *x, const T *y, T *swork) {
00111
00112
00113 unsigned int tid = threadIdx.x;
00114 T *br = swork;
00115 T result = (T)0.0;
00116
00117 #ifdef SINGLE_THREAD
00118
00119 if(tid == 0) {
00120 for(unsigned int i=0; i<n; ++i) result += x[i] * y[i];
00121 }
00122 __syncthreads();
00123 return result;
00124 #endif
00125
00126
00127 for(unsigned int b=0; b<n; b+=blockSize) {
00128 unsigned int i = b + tid;
00129
00130
00131 if(i < n) br[tid] = x[i] * y[i];
00132 else br[tid] = (T)0.0;
00133 __syncthreads();
00134
00135
00136
00137 int s = blockSize;
00138 while(s > 8) {
00139 s >>= 1;
00140 if(tid < s) br[tid] += br[tid + s];
00141 __syncthreads();
00142 }
00143
00144 if(tid == 0) {
00145 for(unsigned int j=0; j<s; ++j) result += br[j];
00146 }
00147 __syncthreads();
00148 }
00149
00150
00151 return result;
00152 }
00153
00163 __device__ T dot_reverse_y(unsigned int n, const T *x, const T *y, T *swork) {
00164
00165
00166 T *bx = swork;
00167 T *by = bx + blockSize * 2 - 1;
00168 T *br = by + blockSize;
00169
00170
00171 unsigned int tid = threadIdx.x;
00172 unsigned int n_align = align(n);
00173 unsigned int shift = n_align - n;
00174 T result = 0.0;
00175
00176 #ifdef SHOW_PARAMS
00177
00178 if(tid == 0) {
00179 fprintf(stderr, "dot_reverse_y: n = %d, x = [", n);
00180 for(int i=0; i<n; ++i) fprintf(stderr, " %.6f", x[i]);
00181 fprintf(stderr, " ], y = [");
00182 for(int i=0; i<n; ++i) fprintf(stderr, " %.6f", y[i]);
00183 fprintf(stderr, " ]\n");
00184 }
00185 #endif
00186
00187 #ifdef SINGLE_THREAD
00188
00189 if(tid == 0) {
00190 for(unsigned int i=0; i<n; ++i) result += x[i] * y[n-i-1];
00191 }
00192 __syncthreads();
00193 return result;
00194 #endif
00195
00196
00197 bx[tid] = (T)0.0;
00198 __syncthreads();
00199
00200
00201 for(unsigned int i=0; i<n; i+=blockSize) {
00202
00203
00204 unsigned int xi = i + tid;
00205 unsigned int yi = n_align - i - blockSize + tid;
00206
00207 if(xi < n) bx[shift + tid] = x[xi];
00208 if(yi < n) by[tid] = y[yi]; else by[tid] = (T)0.0;
00209 __syncthreads();
00210
00211
00212 br[tid] = bx[tid] * by[blockSize - tid - 1];
00213 __syncthreads();
00214
00215
00216 if(tid < shift) bx[tid] = bx[tid + blockSize];
00217
00218
00219
00220 int s = blockSize;
00221 while(s > 8) {
00222 s >>= 1;
00223 if(tid < s) br[tid] += br[tid + s];
00224 __syncthreads();
00225 }
00226
00227 if(tid == 0) {
00228 for(unsigned int j=0; j<s; ++j) result += br[j];
00229 }
00230 __syncthreads();
00231 }
00232
00233
00234 return result;
00235 }
00236
00245 __device__ void axpy(unsigned int n, T a, const T *x, T *y) {
00246
00247
00248 unsigned int tid = threadIdx.x;
00249
00250 #ifdef SINGLE_THREAD
00251
00252 if(tid == 0) {
00253 for(unsigned int i=0; i<n; ++i) y[i] += a * x[i];
00254 }
00255 __syncthreads();
00256 return;
00257 #endif
00258
00259
00260 for(unsigned int b=0; b<n; b+=blockSize) {
00261
00262
00263 unsigned int i = b + tid;
00264 if(i < n) y[i] += a * x[i];
00265 }
00266 __syncthreads();
00267 }
00268
00278 __device__ void axpy_reverse_x(unsigned int n, T a, const T *x, T *y, T *swork) {
00279
00280
00281 T *bx = swork;
00282 T *by = bx + blockSize;
00283 T *br = by + 2 * blockSize - 1;
00284
00285
00286 unsigned int tid = threadIdx.x;
00287 unsigned int n_align = align(n);
00288 unsigned int shift = n_align - n;
00289 unsigned int lastblock = blockSize - shift;
00290
00291 #ifdef SHOW_PARAMS
00292
00293 if(tid == 0) {
00294 fprintf(stderr, "axpy_reverse_x: n = %d, a = %.6f, x = [", n, a);
00295 for(unsigned int i=0; i<n; ++i) fprintf(stderr, " %.6f", x[i]);
00296 fprintf(stderr, " ], y = [");
00297 for(unsigned int i=0; i<n; ++i) fprintf(stderr, " %.6f", y[i]);
00298 fprintf(stderr, " ]\n");
00299 }
00300 #endif
00301
00302 #ifdef SINGLE_THREAD
00303
00304 if(tid == 0) {
00305 for(unsigned int i=0; i<n; ++i) y[i] += a * x[n-i-1];
00306 }
00307 __syncthreads();
00308 return;
00309 #endif
00310
00311
00312 for(unsigned int i=0; i<n; i+=blockSize) {
00313
00314
00315 unsigned int xi = n_align - i - blockSize + tid;
00316 unsigned int yi = i + tid;
00317
00318 if(xi < n) bx[tid] = x[xi];
00319 if(yi < n) by[shift + tid] = y[yi];
00320 __syncthreads();
00321
00322 #ifdef DEBUG
00323 if(tid == 0) {
00324 fprintf(stderr, "by:");
00325 for(unsigned int i=0; i<blockSize*2-1; ++i) fprintf(stderr, " %.6f", by[i]);
00326 fprintf(stderr, "\n");
00327
00328 fprintf(stderr, "bx:");
00329 for(unsigned int i=0; i<blockSize; ++i) fprintf(stderr, " %.6f", bx[i]);
00330 fprintf(stderr, "\n");
00331 }
00332 #endif
00333
00334
00335 if(i > 0) {
00336
00337 br[tid + lastblock] = a * bx[blockSize - tid - 1] + by[tid];
00338
00339 } else {
00340
00341 if(tid >= shift) br[tid - shift] = a * bx[blockSize - tid - 1] + by[tid];
00342 }
00343 __syncthreads();
00344
00345 #ifdef DEBUG
00346 if(tid == 0) {
00347 fprintf(stderr, "br:");
00348 for(unsigned int i=0; i<2*blockSize; ++i) fprintf(stderr, " %.6f", br[i]);
00349 fprintf(stderr, "\n\n");
00350 }
00351 #endif
00352
00353
00354 if(i > 0) y[yi - blockSize] = br[tid];
00355
00356
00357 if(tid < shift) by[tid] = by[tid + blockSize];
00358
00359
00360 if(i > 0 && tid < lastblock) br[tid] = br[tid + blockSize];
00361 __syncthreads();
00362 }
00363
00364
00365 unsigned int yi = n_align - blockSize + tid;
00366 if(yi < n) y[yi] = br[tid];
00367 __syncthreads();
00368 }
00369
00380 __device__ void axpxb_reverse_x(unsigned int n, T a, const T *x, T *y, T b, T *swork) {
00381
00382
00383 T *bb = swork;
00384 T *bf = bb + blockSize;
00385 T *brf = bf + blockSize * 2 - 1;
00386 T *brb = brf + blockSize * 2;
00387
00388
00389 unsigned int tid = threadIdx.x;
00390 unsigned int n_align = align(n);
00391 unsigned int shift = n_align - n;
00392 unsigned int lastblock = blockSize - shift;
00393
00394 #ifdef SHOW_PARAMS
00395
00396 if(tid == 0) {
00397 fprintf(stderr, "axpxb_reverse_x: n = %d, a = %.6f, x = [", n, a);
00398 for(unsigned int i=0; i<n; ++i) fprintf(stderr, " %.6f", x[i]);
00399 fprintf(stderr, " ]\n");
00400 }
00401 #endif
00402
00403 #ifdef SINGLE_THREAD
00404
00405 if(tid == 0) {
00406 for(unsigned int i=0; i<n/2; ++i) {
00407 T aux = x[i];
00408 y[i] = b * (aux + a * x[n-i-1]);
00409 y[n-i-1] = b * (x[n-i-1] + a * aux);
00410 }
00411
00412 if(n & 1) y[n/2] = b * (x[n/2] + a * x[n/2]);
00413 }
00414 __syncthreads();
00415 return;
00416 #endif
00417
00418
00419 unsigned int num_blocks = n_align / blockSize;
00420 unsigned int max_n = blockSize * (1 + (num_blocks >> 1));
00421 for(unsigned int i=0; i<max_n; i+=blockSize) {
00422
00423
00424 unsigned int bi = n_align - i - blockSize + tid;
00425 unsigned int fi = i + tid;
00426
00427 if(bi < n) bb[tid] = x[bi];
00428 if(fi < n) bf[shift + tid] = x[fi];
00429 __syncthreads();
00430
00431 #ifdef DEBUG
00432 if(tid == 0) {
00433 fprintf(stderr, "bf:");
00434 for(unsigned int i=0; i<blockSize*2-1; ++i) fprintf(stderr, " %.6f", bf[i]);
00435 fprintf(stderr, "\n");
00436
00437 fprintf(stderr, "bb:");
00438 for(unsigned int i=0; i<blockSize; ++i) fprintf(stderr, " %.6f", bb[i]);
00439 fprintf(stderr, "\n");
00440 }
00441 #endif
00442
00443
00444 if(i > 0) {
00445
00446 brf[tid + lastblock] = b * (a * bb[blockSize - tid - 1] + bf[tid]);
00447
00448
00449 brb[tid] = b * (bb[tid] + a * bf[blockSize - tid - 1]);
00450
00451 } else {
00452
00453 if(tid >= shift) brf[tid - shift] = b * (a * bb[blockSize - tid - 1] + bf[tid]);
00454
00455
00456 if(tid < lastblock) brb[tid] = b * (bb[tid] + a * bf[blockSize - tid - 1]);
00457 }
00458 __syncthreads();
00459
00460 #ifdef DEBUG
00461 if(tid == 0) {
00462 fprintf(stderr, "brf:");
00463 for(unsigned int i=0; i<2*blockSize; ++i) fprintf(stderr, " %.6f", brf[i]);
00464 fprintf(stderr, "\nbrb:");
00465 for(unsigned int i=0; i<blockSize; ++i) fprintf(stderr, " %.6f", brb[i]);
00466 fprintf(stderr, "\n\n");
00467 }
00468 #endif
00469
00470
00471 if(i > 0) y[fi - blockSize] = brf[tid];
00472
00473
00474 if(i + blockSize >= max_n && !(num_blocks & 1)) break;
00475
00476
00477 if(bi < n) y[bi] = brb[tid];
00478
00479
00480 if(i + blockSize >= max_n) break;
00481
00482
00483 if(tid < shift) bf[tid] = bf[tid + blockSize];
00484
00485
00486 if(i > 0 && tid < lastblock) brf[tid] = brf[tid + blockSize];
00487 __syncthreads();
00488 }
00489 __syncthreads();
00490 }
00491