00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00028
00029 #include <math.h>
00030 #include <time.h>
00031 #include <stdio.h>
00032
00033
00034 #include "toeplitz.h"
00035
00036
00037 #include "cmdline_cpu.h"
00038
00039
00040 static struct gengetopt_args_info_cpu args_info;
00041
00049 void parse_cmdline(int argc, char *argv[], struct gengetopt_args_info_cpu *args_info) {
00050
00051 if(cmdline_parser(argc, argv, args_info) != 0) {
00052 fprintf(stderr, "Run %s --help ('-h') to see the list of options.\n", argv[0]);
00053 exit(1);
00054 }
00055
00056 if(args_info->help_given) {
00057 cmdline_parser_print_help();
00058 exit(0);
00059 }
00060
00061 if(args_info->version_given) {
00062 cmdline_parser_print_version();
00063 exit(0);
00064 }
00065 }
00066
00084 int si2w_cpu(unsigned int n, const T *t, T ia, T ib, T sigma, unsigned int nEig, unsigned int max_k, T tolerance, T *l_out, T *v_out) {
00085
00086
00087 if(nEig == 0) return 0;
00088
00089
00090 T *a = (T *) malloc(max_k * sizeof(T));
00091 T *b = (T *) malloc(max_k * sizeof(T));
00092 T *g = (T *) malloc(max_k * sizeof(T));
00093 T *d = (T *) malloc(max_k * sizeof(T));
00094 T *p = (T *) malloc(n * max_k * sizeof(T));
00095 T *q = (T *) malloc(n * max_k * sizeof(T));
00096 T *pk = p, *qk = q;
00097
00098
00099 int i, j, k;
00100 memset(p, 0, n * sizeof(T));
00101 memset(q, 0, n * sizeof(T));
00102 p[0] = p[n-1] = (T)1.0;
00103 q[0] = (T)1.0; q[n-1] = (T)-1.0;
00104
00105
00106 T normP = _nrm2(n, p, 1);
00107 T normQ = _nrm2(n, q, 1);
00108 _scal(n, (T)1.0 / normP, p, 1);
00109 _scal(n, (T)1.0 / normQ, q, 1);
00110
00111
00112 int eigen_left = nEig;
00113 unsigned int extracted_s = 0, extracted_a = 0;
00114 unsigned int converged_s = 0, converged_a = 0;
00115
00116 T ls[nEig], la[nEig];
00117 T *rs = (T *) malloc(nEig * n * sizeof(T));
00118 T *ra = (T *) malloc(nEig * n * sizeof(T));
00119
00120
00121 T *w = (T *) malloc(n * sizeof(T));
00122 T *v = (T *) malloc(n * sizeof(T));
00123 T *vs = (T *) malloc(n * sizeof(T));
00124 T *va = (T *) malloc(n * sizeof(T));
00125 T *aux = (T *) malloc(n * sizeof(T));
00126 T *aux_k = (T *) malloc(max_k * sizeof(T));
00127
00128
00129 for(k=1; k<=max_k; ++k) {
00130
00131
00132 for(i=0; i<n; ++i) w[i] = pk[i] + qk[i];
00133
00134
00135 levinson(n, t[0] - sigma, &t[1], w, v);
00136
00137
00138 for(i=0; i<n; ++i) {
00139 vs[i] = (T)0.5 * (v[i] + v[n-i-1]);
00140 va[i] = (T)0.5 * (v[i] - v[n-i-1]);
00141 }
00142
00143
00144 a[k-1] = _dot(n, vs, 1, pk, 1);
00145 g[k-1] = _dot(n, va, 1, qk, 1);
00146
00147 if(k < max_k) {
00148
00149 _axpy(n, -a[k-1], pk, 1, vs, 1);
00150 _axpy(n, -g[k-1], qk, 1, va, 1);
00151
00152
00153 if(k > 1) {
00154 _axpy(n, -b[k-2], pk - n, 1, vs, 1);
00155 _axpy(n, -d[k-2], qk - n, 1, va, 1);
00156 }
00157
00158
00159
00160 _gemv(CblasColMajor, CblasTrans, n, k, (T)1.0, p, n, vs, 1, 0.0, aux_k, 1);
00161 _gemv(CblasColMajor, CblasNoTrans, n, k, (T)-1.0, p, n, aux_k, 1, 1.0, vs, 1);
00162
00163
00164 _gemv(CblasColMajor, CblasTrans, n, k, (T)1.0, q, n, va, 1, 0.0, aux_k, 1);
00165 _gemv(CblasColMajor, CblasNoTrans, n, k, (T)-1.0, q, n, aux_k, 1, 1.0, va, 1);
00166
00167
00168 b[k-1] = _nrm2(n, vs, 1);
00169 d[k-1] = _nrm2(n, va, 1);
00170
00171
00172 pk += n; qk += n;
00173 memset(pk, 0, n * sizeof(T));
00174 memset(qk, 0, n * sizeof(T));
00175 _axpy(n, (T)1.0 / b[k-1], vs, 1, pk, 1);
00176 _axpy(n, (T)1.0 / d[k-1], va, 1, qk, 1);
00177 }
00178
00179
00180 eigen_left -= extract_eigenvalues(n, k, max_k, n, &converged_s, &extracted_s, eigen_left, ia, ib, t, p, a, b, tolerance, ls, rs);
00181
00182
00183 eigen_left -= extract_eigenvalues(n, k, max_k, n, &converged_a, &extracted_a, eigen_left, ia, ib, t, q, g, d, tolerance, la, ra);
00184
00185
00186 if(eigen_left <= 0) break;
00187 }
00188
00189
00190 int l = 0;
00191 for(j=0; j<extracted_s; ++j) {
00192 l_out[l] = ls[j];
00193 memcpy(v_out + l * n, rs + j * n, n * sizeof(T));
00194 ++l;
00195 }
00196
00197 for(j=0; j<extracted_a; ++j) {
00198 l_out[l] = la[j];
00199 memcpy(v_out + l * n, ra + j * n, n * sizeof(T));
00200 ++l;
00201 }
00202
00203
00204 free(w); free(v); free(va); free(vs);
00205 free(a); free(b); free(g); free(d);
00206 free(p); free(q); free(rs); free(ra);
00207 free(aux); free(aux_k);
00208
00209
00210 return l;
00211 }
00212
00213
00214 int main(int argc, char *argv[]) {
00215
00216
00217 parse_cmdline(argc, argv, &args_info);
00218
00219
00220 if(args_info.tolerance_arg <= 0.0f) {
00221 fprintf(stderr, "Error: invalid tolerance param value.\n");
00222 return 1;
00223 }
00224
00225
00226 if(args_info.max_eigs_interval_arg <= 0) {
00227 fprintf(stderr, "Error: invalid value for max-eigs-interval param.\n");
00228 return 1;
00229 }
00230
00231
00232 if(args_info.max_k_arg <= 0) {
00233 fprintf(stderr, "Error: invalid value for max-k param.\n");
00234 return 1;
00235 }
00236
00237
00238 if(args_info.max_interval_retries_arg < 0) {
00239 fprintf(stderr, "Error: invalid value for max-interval-retries param.\n");
00240 return 1;
00241 }
00242
00243
00244 extract_global_info global;
00245 global.tolerance = args_info.tolerance_arg;
00246 global.max_eig_interval = args_info.max_eigs_interval_arg;
00247 global.max_k = args_info.max_k_arg;
00248 global.max_retry_count = args_info.max_interval_retries_arg;
00249
00250
00251 if(args_info.random_seed_given) srand(args_info.random_seed_arg);
00252 else srand(time(NULL));
00253 unsigned int i;
00254
00255
00256 if(args_info.file_given) {
00257
00258
00259 if(!strcmp(args_info.file_arg, "-")) {
00260 if(!readInput((int *) &global.n, &global.t, stdin)) {
00261 fprintf(stderr, "Error reading data from standard input.\n");
00262 free(global.device);
00263 return 1;
00264 }
00265
00266
00267 } else {
00268 FILE *input = fopen(args_info.file_arg, "r");
00269 if(!input) {
00270 fprintf(stderr, "Error opening input file \"%s\"\n", args_info.file_arg);
00271 free(global.device);
00272 return 1;
00273 }
00274
00275 if(!readInput((int *) &global.n, &global.t, input)) {
00276 fprintf(stderr, "Error reading input file \"%s\"\n", args_info.file_arg);
00277 free(global.device);
00278 fclose(input);
00279 return 1;
00280 }
00281
00282 fclose(input);
00283 }
00284 }
00285
00286
00287 else if(args_info.random_data_given) {
00288
00289
00290 if(args_info.random_data_arg <= 0) {
00291 fprintf(stderr, "Error: invalid data size.\n");
00292 free(global.device);
00293 return 1;
00294 }
00295
00296
00297 global.n = args_info.random_data_arg;
00298 global.t = (T *) malloc(global.n * sizeof(T));
00299
00300
00301 for(i=0; i<global.n; ++i) {
00302 global.t[i] = rand() / (T)RAND_MAX;
00303 global.t[i] *= (rand() & 1) ? (T)1 : (T)-1;
00304 }
00305
00306
00307 if(args_info.save_random_data_given) {
00308
00309 FILE *out = fopen(args_info.save_random_data_arg, "w");
00310 if(!out) {
00311 fprintf(stderr, "Error opening input file \"%s\"\n", args_info.save_random_data_arg);
00312 free(global.device);
00313 free(global.t);
00314 return 1;
00315 }
00316
00317 fprintf(out, "%d\n", global.n);
00318 for(i=0; i<global.n; ++i) fprintf(out, "%.8f ", global.t[i]);
00319 fprintf(out, "\n");
00320
00321 fclose(out);
00322 }
00323 }
00324
00325
00326 T lbg, ubg;
00327 clock_t t1_intervals = clock();
00328 toeplitz_gershgorin(global.t, global.n, &lbg, &ubg);
00329
00330
00331 if(args_info.show_stats_flag) {
00332 printf("Calculating extraction intervals based on Gershgorin disc [%.6f, %.6f]: up to %d eigenvalues per interval.\n",
00333 lbg, ubg, global.max_eig_interval);
00334 }
00335
00336 global.cpu = calc_eigen_intervals(global.t, global.n, lbg, ubg, global.max_eig_interval, &global.num_intervals);
00337 clock_t t2_intervals = clock();
00338
00339
00340 T *out_l = (T *) malloc(global.n * sizeof(T));
00341 T *out_v = (T *) malloc(global.n * global.n * sizeof(T));
00342 T *out_l_int = out_l, *out_v_int = out_v;
00343
00344
00345 clock_t t1_extraction = clock();
00346 unsigned int total_extracted = 0;
00347 unsigned int intervals_restarted = 0, restart_count = 0;
00348 unsigned int intervals_partial = 0, intervals_failed = 0;
00349
00350 for(i=0; i<global.num_intervals; ++i) {
00351 interval_cpu_info *ic = &global.cpu[i];
00352 T sigma = (ic->lb + ic->ub) * (T)0.5;
00353
00354 unsigned int extracted, retry = 0;
00355 do {
00356 extracted = si2w_cpu(global.n, global.t, ic->lb, ic->ub, sigma, ic->num_eigs, global.max_k, global.tolerance, out_l_int, out_v_int);
00357 if(extracted < ic->num_eigs && ++retry <= global.max_retry_count) {
00358
00359 if(args_info.show_interval_status_flag) {
00360 printf("Retrying interval %d/%d: [%.6f, %.6f] with sigma %.6f, retry count = %d\n",
00361 i + 1, global.num_intervals, ic->lb, ic->ub, sigma, retry);
00362 }
00363
00364 sigma = (rand() / (T)RAND_MAX) * (ic->ub - ic->lb) + ic->lb;
00365
00366 if(retry == 1) ++intervals_restarted;
00367 ++restart_count;
00368 }
00369
00370 if(retry > global.max_retry_count) {
00371 if(args_info.show_interval_status_flag) {
00372 printf("Skipping interval %d/%d: [%.6f, %.6f] (max retry count reached, %d / %d eigenvalues extracted)\n",
00373 i + 1, global.num_intervals, ic->lb, ic->ub, ic->extracted_s + ic->extracted_a, ic->num_eigs);
00374 }
00375
00376 if(extracted == 0) ++intervals_failed;
00377 else ++intervals_partial;
00378
00379 break;
00380 }
00381
00382 } while(extracted < ic->num_eigs);
00383
00384 out_l_int += extracted;
00385 out_v_int += extracted * global.n;
00386 total_extracted += extracted;
00387 }
00388
00389 clock_t t2_extraction = clock();
00390
00391
00392 sort_eigenvalues(total_extracted, global.n, out_l, out_v);
00393
00394
00395 float time_intervals = (t2_intervals - t1_intervals) / (float)CLOCKS_PER_SEC;
00396 float time_extraction = (t2_extraction - t1_extraction) / (float)CLOCKS_PER_SEC;
00397 float time_total = time_intervals + time_extraction;
00398
00399
00400 if(args_info.show_stats_flag) {
00401 printf("\n");
00402
00403 printf("Extracted %d of %d (%.2f%%) eigenvalues (maximum subspace of size %d).\n",
00404 total_extracted, global.n, 100.0f * total_extracted / (float)global.n, global.max_k);
00405
00406 printf("Total elapsed time: %.3fs. Interval precalculation: %.3fs (%.2f%%), eigenvalue extraction: %.3fs (%.2f%%).\n",
00407 time_total, time_intervals, 100.0f * time_intervals / time_total, time_extraction, 100.0f * time_extraction / time_total);
00408
00409 printf("From a total of %d intervals, %d (%.2f%%) were restarted (%.2f retries/int), ",
00410 global.num_intervals, intervals_restarted, 100.0f * intervals_restarted / (float) global.num_intervals,
00411 restart_count / (float) max(1, intervals_restarted));
00412
00413 printf("%d had partial extraction (%.2f%%) and %d failed to extract (%.2f%%).\n",
00414 intervals_partial, 100.0f * intervals_partial / (float) global.num_intervals, intervals_failed,
00415 100.0f * intervals_failed / (float) global.num_intervals);
00416 }
00417
00418
00419 if(args_info.show_eigenvalues_flag) print_vector("eigenvalues", out_l, total_extracted);
00420 if(args_info.show_eigenvectors_flag) print_matrix("eigenvectors", out_v, global.n, total_extracted, global.n);
00421
00422
00423 free(global.t);
00424 free(global.cpu);
00425 free(out_l);
00426 free(out_v);
00427
00428 return 0;
00429 }
00430