00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
00031
00032
00033 extern "C" {
00034
00035
00036 #include "toeplitz.h"
00037 }
00038
00039
00040 #include "cmdline_gpu.h"
00041
00042
00043 #include "toeplitz_kernel.cu"
00044
00045
00046 static struct gengetopt_args_info_gpu args_info;
00047
/*
 * Runs the gengetopt parser over the command line and resolves the purely
 * informational options. Does not return on a parse failure (process exits
 * with code 1) or when --help / --version was requested (exits with code 0).
 */
void parse_cmdline(int argc, char *argv[], struct gengetopt_args_info_gpu *args_info) {

    const int parse_status = cmdline_parser(argc, argv, args_info);

    if(parse_status != 0) {
        fprintf(stderr, "Run %s --help ('-h') to see the list of options.\n", argv[0]);
        exit(1);
    }

    /* Informational requests short-circuit normal execution; --help wins
       when both are present (same precedence as the original checks). */
    if(args_info->help_given || args_info->version_given) {
        if(args_info->help_given) cmdline_parser_print_help();
        else cmdline_parser_print_version();
        exit(0);
    }
}
00072
/*
 * Binds execution-window slot `window_pos` of device `num_device` to interval
 * `num_interval`: resets the interval's CPU-side progress counters, picks a
 * shift (sigma) and points its CPU/GPU pointers at the slot's slices of the
 * per-device buffers.
 *
 * If `num_interval` is past the last interval, the slot is emptied instead and
 * the window usage counters shrink. `start_k` records the global iteration at
 * which this interval (re)starts on the device.
 */
void assign_interval(extract_global_info *global, unsigned int num_device, unsigned int window_pos, unsigned int num_interval, unsigned int start_k) {

    if(!global) return;

    extract_device_info *info = &global->device[num_device];

    if(window_pos >= info->window_size) return;

    /* No interval left for this slot: mark it empty. */
    if(num_interval >= global->num_intervals) {
        /* Fix: only shrink the usage counters when the slot actually held an
           interval. Decrementing for an already-empty slot would underflow
           the unsigned window_used / global_window_used counters. */
        if(info->exec_window[window_pos].interval_index != INTERVAL_NONE) {
            --info->window_used;
            --global->global_window_used;
        }
        info->exec_window[window_pos].interval_index = INTERVAL_NONE;
        return;
    }

    interval_cpu_info *cpu = &global->cpu[num_interval];
    interval_gpu_info *gpu = &global->gpu[num_interval];

    /* Reset convergence progress for a fresh (re)start of this interval. */
    cpu->extracted_s = 0; cpu->extracted_a = 0;
    cpu->converged_s = 0; cpu->converged_a = 0;
    cpu->eigen_left = cpu->num_eigs;

    gpu->start_k = start_k;
    gpu->processed = false;

    /* A fresh assignment uses the interval midpoint as shift; reassigning the
       same interval (a retry) draws a random shift inside [lb, ub] instead. */
    if(info->exec_window[window_pos].interval_index != num_interval) gpu->sigma = (cpu->lb + cpu->ub) * (T)0.5;
    else gpu->sigma = (rand() / (T)RAND_MAX) * (cpu->ub - cpu->lb) + cpu->lb;

    /* Host-side slices of the per-device arrays for this window slot. */
    cpu->p = info->cpu_p + window_pos * global->size_pq;
    cpu->q = info->cpu_q + window_pos * global->size_pq;
    cpu->m = info->cpu_m + window_pos * global->inc_k;

    cpu->alpha = info->cpu_alpha + window_pos * global->max_k;
    cpu->beta = info->cpu_beta + window_pos * global->max_k;
    cpu->gamma = info->cpu_gamma + window_pos * global->max_k;
    cpu->delta = info->cpu_delta + window_pos * global->max_k;

    /* Device-side slices for this window slot. */
    gpu->p = info->gpu_p + window_pos * global->size_pq;
    gpu->q = info->gpu_q + window_pos * global->size_pq;
    gpu->m = info->gpu_m + window_pos * global->max_k;
    gpu->workspace = info->gpu_workspace + window_pos * global->size_workspace;

    /* Filling a previously empty slot grows the usage counters. */
    if(info->exec_window[window_pos].interval_index == INTERVAL_NONE) {
        ++info->window_used;
        ++global->global_window_used;
    }

    info->exec_window[window_pos].interval_index = num_interval;
}
00146
/*
 * Releases every resource allocated by calc_memory_parameters and
 * prepare_devices: per-device streams, GPU buffers, pinned host buffers and
 * heap arrays, the per-interval result arrays, and the top-level bookkeeping
 * arrays. The Toeplitz data (global->t) is NOT freed here — the caller owns it.
 */
void delete_extract_info(extract_global_info *global) {

    /* Fix: tolerate a NULL argument, for consistency with assign_interval
       and calc_memory_parameters. */
    if(!global) return;

    for(unsigned int j=0; j<global->num_devices; ++j) {

        extract_device_info *info = &global->device[j];
        cudaSetDevice(j);

        /* Destroy streams first, then free the buffers they referenced. */
        cudaStreamDestroy(info->exec_stream);
        for(unsigned int i=0; i<info->window_size; ++i) cudaStreamDestroy(info->exec_window[i].copy_stream);

        /* Device-side buffers. */
        cudaFree(info->gpu_t);
        cudaFree(info->gpu_p);
        cudaFree(info->gpu_q);
        cudaFree(info->gpu_m);
        cudaFree(info->gpu_workspace);
        cudaFree(info->exec_window_gpu);

        /* Pinned host buffers (allocated with cudaMallocHost). */
        cudaFreeHost(info->cpu_p);
        cudaFreeHost(info->cpu_q);
        cudaFreeHost(info->cpu_m);

        /* Ordinary heap buffers. */
        free(info->cpu_alpha);
        free(info->cpu_beta);
        free(info->cpu_gamma);
        free(info->cpu_delta);
        free(info->exec_window);
        free(info->device_prop);
    }

    /* Per-interval eigenvalue/eigenvector result arrays. */
    for(unsigned int i=0; i<global->num_intervals; ++i) {
        free(global->cpu[i].ls);
        free(global->cpu[i].la);
        free(global->cpu[i].vs);
        free(global->cpu[i].va);
    }

    cudaFreeHost(global->gpu);
    free(global->cpu);
    free(global->device);
}
00196
/*
 * Computes the derived buffer sizes (size_pq, size_workspace, shared-memory
 * footprint, ...) and chooses each device's execution-window size — how many
 * intervals it evaluates in parallel — bounded by global memory, shared
 * memory, the grid size limit and the interval count, then balances the
 * remaining interval slots across devices.
 *
 * Returns TRUE on success, FALSE on any fatal error.
 */
int calc_memory_parameters(extract_global_info *global, unsigned int max_window_size) {

    if(!global) return FALSE;

    unsigned int n_align = align(global->n);

    global->size_pq = n_align * global->max_k;
    global->size_workspace = max(5 * n_align, 4 * n_align + global->max_k);
    global->size_toeplitz_data = (global->n - 1) * sizeof(T);
    global->size_smem = (6 * blockSize - 1) * sizeof(T);
    global->global_window_used = 0;

    /* Global-memory footprint of a single window slot (in bytes). */
    int gmem_window_entry = (global->max_k * sizeof(tridiagonal_entry) + sizeof(interval_gpu_info) +
                             (2 * global->size_pq + global->size_workspace) * sizeof(T));

    for(unsigned int i=0; i<global->num_devices; ++i) {

        extract_device_info *info = &global->device[i];
        cudaSetDevice(i);

        info->device_prop = (cudaDeviceProp *) malloc(sizeof(cudaDeviceProp));
        if(cudaGetDeviceProperties(info->device_prop, i) != cudaSuccess) {
            fprintf(stderr, "Error obtaining device properties: %s\n", cudaGetErrorString(cudaGetLastError()));
            /* Fix: previously returned 1 (== TRUE), so the caller's
               `if(!calc_memory_parameters(...))` treated this fatal error
               as success and carried on with an unusable device. */
            return FALSE;
        }

        if(blockSize > info->device_prop->maxThreadsPerBlock) {
            fprintf(stderr, "Fatal error: too many threads per block in device #%d. Try to reduce the blockSize macro.\n", i);
            return FALSE;
        }

        /* How many window slots fit in the device's global memory. */
        int max_window_size_gmem = (info->device_prop->totalGlobalMem - global->size_toeplitz_data) / gmem_window_entry;
        if(max_window_size_gmem <= 0) {
            fprintf(stderr, "Fatal error: not enough global memory in device #%d. Try to reduce maximum subspace size or number of eigenvalues per interval.\n", i);
            return FALSE;
        }

        /* NOTE(review): this treats the leftover shared-memory BYTES (minus a
           512-byte reserve) as a window-slot count; the units look suspicious
           but are preserved as-is — confirm against the kernel's per-slot
           shared-memory usage before changing. */
        int max_window_size_smem = info->device_prop->sharedMemPerBlock - global->size_smem - 512;
        if(max_window_size_smem <= 0) {
            fprintf(stderr, "Fatal error: not enough shared memory in device #%d. Try to reduce number of threads per block (blockSize macro).\n", i);
            return FALSE;
        }

        /* Window size = tightest of the memory, grid and interval limits. */
        info->window_size = min(max_window_size_gmem, max_window_size_smem);
        info->window_size = min(info->window_size, info->device_prop->maxGridSize[0]);
        info->window_size = min(info->window_size, global->num_intervals);
        info->window_used = 0;

        /* Optional user-imposed cap. */
        if(max_window_size > 0) info->window_size = min(info->window_size, max_window_size);
    }

    /* Spread the intervals evenly across devices; slots a device cannot take
       (window too small) spill over to the following devices. */
    int extra_slots = 0;
    int mean_slots = (global->num_intervals + global->num_devices - 1) / global->num_devices;
    int slots_left = global->num_intervals;

    for(unsigned int i=0; i<global->num_devices; ++i) {

        extract_device_info *info = &global->device[i];

        extra_slots = max(0, mean_slots + extra_slots - (int)info->window_size);
        info->window_size = min(mean_slots + extra_slots, info->window_size);
        info->window_size = min(slots_left, info->window_size);
        slots_left -= info->window_size;

        unsigned int gmem_size = info->window_size * gmem_window_entry + global->size_toeplitz_data;

        if(args_info.show_stats_flag) {
            /* Fix: size_t values (sharedMemPerBlock, totalGlobalMem) were
               passed to %d, which is undefined behavior on LP64; cast the
               arguments explicitly to match the format. */
            printf("Device #%d: shared memory = %d/%d KB, global memory = %d/%d MB. Up to %d intervals evaluated in parallel.\n",
                   i, (int)(global->size_smem / 1024), (int)(info->device_prop->sharedMemPerBlock / 1024), (int)(gmem_size / 1024 / 1024),
                   (int)(info->device_prop->totalGlobalMem / 1024 / 1024), (int)info->window_size);
        }
    }

    return TRUE;
}
00296
/*
 * Allocates every host/device buffer needed for extraction, creates the
 * per-slot copy streams, primes each device's execution window with its first
 * batch of intervals and uploads the Toeplitz data to every device.
 *
 * NOTE(review): error handling relies on assert(...) wrapping the allocation
 * calls; under NDEBUG the whole expression (including the allocation!) is
 * compiled out, so this must not be built with assertions disabled.
 */
void prepare_devices(extract_global_info *global) {

    /* Next interval index to hand out to a window slot. */
    global->next_interval = 0;

    /* Pinned host mirror of the per-interval GPU state (pinned memory is
       required for the async copies issued in the main loop). */
    assert(cudaMallocHost((void **)&global->gpu, global->num_intervals * sizeof(interval_gpu_info)) != cudaErrorMemoryAllocation);

    /* Per-interval result arrays: eigenvalues (ls/la) and eigenvectors
       (vs/va) for the two recurrences, sized by the interval's num_eigs. */
    for(unsigned int i=0; i<global->num_intervals; ++i) {
        global->cpu[i].ls = (T *) malloc(global->cpu[i].num_eigs * sizeof(T));
        global->cpu[i].la = (T *) malloc(global->cpu[i].num_eigs * sizeof(T));
        global->cpu[i].vs = (T *) malloc(global->cpu[i].num_eigs * global->n * sizeof(T));
        global->cpu[i].va = (T *) malloc(global->cpu[i].num_eigs * global->n * sizeof(T));
    }

    for(unsigned int j=0; j<global->num_devices; ++j) {

        extract_device_info *info = &global->device[j];
        cudaSetDevice(j);

        /* Device-side working buffers, one slice per window slot.
           Note gpu_m holds max_k entries per slot while cpu_m (below) holds
           only inc_k — the host only ever receives one batch at a time. */
        assert(cudaMalloc((void **)&info->gpu_p, info->window_size * global->size_pq * sizeof(T)) != cudaErrorMemoryAllocation);
        assert(cudaMalloc((void **)&info->gpu_q, info->window_size * global->size_pq * sizeof(T)) != cudaErrorMemoryAllocation);
        assert(cudaMalloc((void **)&info->gpu_m, info->window_size * global->max_k * sizeof(tridiagonal_entry)) != cudaErrorMemoryAllocation);
        assert(cudaMalloc((void **)&info->gpu_workspace, info->window_size * global->size_workspace * sizeof(T)) != cudaErrorMemoryAllocation);
        assert(cudaMalloc((void **)&info->exec_window_gpu, info->window_size * sizeof(interval_gpu_info)) != cudaErrorMemoryAllocation);

        /* Host-side window bookkeeping: each slot starts empty and gets its
           own stream for the device-to-host result copies. */
        info->exec_window = (exec_window_entry *) malloc(info->window_size * sizeof(exec_window_entry));
        for(unsigned int i=0; i<info->window_size; ++i) {
            info->exec_window[i].interval_index = INTERVAL_NONE;
            cudaStreamCreate(&info->exec_window[i].copy_stream);
        }

        /* Host copies of the recurrence coefficients, one max_k run per slot. */
        info->cpu_alpha = (T *) malloc(info->window_size * global->max_k * sizeof(T));
        info->cpu_beta = (T *) malloc(info->window_size * global->max_k * sizeof(T));
        info->cpu_gamma = (T *) malloc(info->window_size * global->max_k * sizeof(T));
        info->cpu_delta = (T *) malloc(info->window_size * global->max_k * sizeof(T));

        /* Pinned host staging buffers for the async device-to-host copies. */
        assert(cudaMallocHost((void **)&info->cpu_p, info->window_size * global->size_pq * sizeof(T)) != cudaErrorMemoryAllocation);
        assert(cudaMallocHost((void **)&info->cpu_q, info->window_size * global->size_pq * sizeof(T)) != cudaErrorMemoryAllocation);
        assert(cudaMallocHost((void **)&info->cpu_m, info->window_size * global->inc_k * sizeof(tridiagonal_entry)) != cudaErrorMemoryAllocation);

        /* Fill every slot with the next unassigned intervals (start_k = 0). */
        for(unsigned int i=0; i<info->window_size; ++i) assign_interval(global, j, i, global->next_interval + i, 0);

        /* Upload the freshly initialized interval state for this device's
           window, then advance the global interval cursor. */
        cudaMemcpy(info->exec_window_gpu, global->gpu + global->next_interval, info->window_size * sizeof(interval_gpu_info), cudaMemcpyHostToDevice);
        global->next_interval += info->window_size;

        /* Toeplitz data t[1..n-1]; t[0] is passed to the kernel separately. */
        assert(cudaMalloc((void **)&info->gpu_t, global->size_toeplitz_data) != cudaErrorMemoryAllocation);
        cudaMemcpy(info->gpu_t, &global->t[1], global->size_toeplitz_data, cudaMemcpyHostToDevice);

        /* Stream used for the host-to-device window-state refreshes. */
        cudaStreamCreate(&info->exec_stream);
    }
}
00364
00365
/*
 * Program entry point.
 *
 * Pipeline:
 *   1. parse and validate command-line options,
 *   2. load the Toeplitz data from a file/stdin or generate it randomly,
 *   3. bound the spectrum with a Gershgorin disc and split it into intervals,
 *   4. size the per-device execution windows and allocate all buffers,
 *   5. iterate: launch the kernel on every device, overlap a CPU convergence
 *      check of the previous batch, reassign finished/stalled window slots,
 *      and start async copy-backs of the new batch's results,
 *   6. merge, sort and report the extracted eigenvalues/eigenvectors.
 *
 * Returns 0 on success, 1 on any fatal error.
 */
int main(int argc, char *argv[]) {

    parse_cmdline(argc, argv, &args_info);

    extract_global_info global;
    cudaGetDeviceCount((int *) &global.num_devices);

    if(global.num_devices == 0) {
        fprintf(stderr, "Fatal error: no CUDA capable devices found!!\n");
        return 1;
    }

    /* Optional cap on how many devices participate. */
    if(args_info.max_devices_given) {
        if(args_info.max_devices_arg <= 0) {
            fprintf(stderr, "Error: invalid value for max-devices param.\n");
            return 1;
        }

        global.num_devices = min(args_info.max_devices_arg, global.num_devices);
    }

    /* Up-front validation of the numeric parameters. */
    if(args_info.max_window_size_given && args_info.max_window_size_arg <= 0) {
        fprintf(stderr, "Error: invalid value for max-window-size param.\n");
        return 1;
    }

    if(args_info.tolerance_arg <= 0.0f) {
        fprintf(stderr, "Error: invalid tolerance param value.\n");
        return 1;
    }

    if(args_info.max_eigs_interval_arg <= 0) {
        fprintf(stderr, "Error: invalid value for max-eigs-interval param.\n");
        return 1;
    }

    if(args_info.max_k_arg <= 0) {
        fprintf(stderr, "Error: invalid value for max-k param.\n");
        return 1;
    }

    if(args_info.inc_k_arg <= 0) {
        fprintf(stderr, "Error: invalid value for inc-k param.\n");
        return 1;
    }

    if(args_info.max_interval_retries_arg < 0) {
        fprintf(stderr, "Error: invalid value for max-interval-retries param.\n");
        return 1;
    }

    global.tolerance = args_info.tolerance_arg;
    global.max_eig_interval = args_info.max_eigs_interval_arg;
    global.max_k = args_info.max_k_arg;
    global.inc_k = args_info.inc_k_arg;
    global.max_retry_count = args_info.max_interval_retries_arg;

    /* calloc so every per-device pointer starts NULL (safe to free on the
       error paths below even for devices never initialized). */
    global.device = (extract_device_info *) calloc(global.num_devices, sizeof(extract_device_info));

    if(args_info.random_seed_given) srand(args_info.random_seed_arg);
    else srand(time(NULL));

    /* --- Input acquisition: a file ("-" meaning stdin) or random data. --- */
    if(args_info.file_given) {

        if(!strcmp(args_info.file_arg, "-")) {
            if(!readInput((int *) &global.n, &global.t, stdin)) {
                fprintf(stderr, "Error reading data from standard input.\n");
                free(global.device);
                return 1;
            }

        } else {
            FILE *input = fopen(args_info.file_arg, "r");
            if(!input) {
                fprintf(stderr, "Error opening input file \"%s\"\n", args_info.file_arg);
                free(global.device);
                return 1;
            }

            if(!readInput((int *) &global.n, &global.t, input)) {
                fprintf(stderr, "Error reading input file \"%s\"\n", args_info.file_arg);
                free(global.device);
                fclose(input);
                return 1;
            }

            fclose(input);
        }
    }

    else if(args_info.random_data_given) {

        if(args_info.random_data_arg <= 0) {
            fprintf(stderr, "Error: invalid data size.\n");
            free(global.device);
            return 1;
        }

        global.n = args_info.random_data_arg;
        global.t = (T *) malloc(global.n * sizeof(T));

        /* Uniform values in [0, 1] with a random sign. */
        for(unsigned int i=0; i<global.n; ++i) {
            global.t[i] = rand() / (T)RAND_MAX;
            global.t[i] *= (rand() & 1) ? (T)1 : (T)-1;
        }

        if(args_info.save_random_data_given) {

            FILE *out = fopen(args_info.save_random_data_arg, "w");
            if(!out) {
                /* Fix: the message used to say "input file" for what is the
                   output file of --save-random-data. */
                fprintf(stderr, "Error opening output file \"%s\"\n", args_info.save_random_data_arg);
                free(global.device);
                free(global.t);
                return 1;
            }

            fprintf(out, "%d\n", global.n);
            for(unsigned int i=0; i<global.n; ++i) fprintf(out, "%.8f ", global.t[i]);
            fprintf(out, "\n");

            fclose(out);
        }
    }

    /* Fix: previously, giving neither --file nor --random-data fell through
       with global.n and global.t uninitialized, and toeplitz_gershgorin
       below read garbage. Fail explicitly instead. */
    else {
        fprintf(stderr, "Error: no input data given. Use --file or --random-data.\n");
        free(global.device);
        return 1;
    }

    /* Bound the spectrum and partition it into extraction intervals. */
    T lbg, ubg;
    clock_t t1_intervals = clock();
    toeplitz_gershgorin(global.t, global.n, &lbg, &ubg);

    if(args_info.show_stats_flag) {
        printf("Calculating extraction intervals based on Gershgorin disc [%.6f, %.6f]: up to %d eigenvalues per interval.\n",
               lbg, ubg, global.max_eig_interval);
    }

    global.cpu = calc_eigen_intervals(global.t, global.n, lbg, ubg, global.max_eig_interval, &global.num_intervals);
    clock_t t2_intervals = clock();

    /* Size the per-device windows; bail out cleanly on failure. */
    unsigned int max_window_size = args_info.max_window_size_given ? args_info.max_window_size_arg : 0;
    if(!calc_memory_parameters(&global, max_window_size)) {
        for(unsigned int i=0; i<global.num_devices; ++i) free(global.device[i].device_prop);
        free(global.device);
        free(global.cpu);
        free(global.t);
        return 1;
    }

    prepare_devices(&global);

    unsigned int convergence_time_acc = 0;
    clock_t t1_extraction = clock();

    /* Main extraction loop: grow the subspace by inc_k iterations per pass
       until no device has an interval left in its execution window. */
    unsigned int k;
    for(k=0; global.global_window_used > 0; k += global.inc_k) {

        /* Launch the next batch of iterations on every device. */
        for(unsigned int dev=0; dev<global.num_devices; ++dev) {

            extract_device_info *info = &global.device[dev];
            cudaSetDevice(dev);

            /* Make sure the previous window-state upload has completed. */
            cudaStreamSynchronize(info->exec_stream);

            dim3 block(blockSize);
            dim3 grid(info->window_size);
            si2w_parallel <<< grid, block, global.size_smem >>> (global.n, global.t[0], info->gpu_t, k, global.inc_k, global.max_k, info->exec_window_gpu);
        }

        /* While the kernels run, check convergence of the PREVIOUS batch on
           the CPU (there is no previous batch on the first pass, k == 0). */
        clock_t t1_convergence = clock();
        for(unsigned int dev=0; k > 0 && dev<global.num_devices; ++dev) {

            extract_device_info *info = &global.device[dev];
            cudaSetDevice(dev);

            for(unsigned int j=0; j<global.inc_k; ++j) {

                /* Absolute iteration index of entry j in the previous batch. */
                unsigned int kj = k - global.inc_k + j;

                for(unsigned int i=0; i<info->window_size; ++i) {

                    int interval_index = info->exec_window[i].interval_index;
                    if(interval_index == INTERVAL_NONE) continue;

                    interval_gpu_info *ig = &global.gpu[interval_index];
                    interval_cpu_info *ic = &global.cpu[interval_index];

                    /* Iteration index relative to this interval's (re)start. */
                    int k_real = kj - (int)ig->start_k;
                    if(ig->processed || k_real < 0) continue;

                    /* Wait for this slot's async copy-back to finish. */
                    cudaStreamSynchronize(info->exec_window[i].copy_stream);

                    ic->alpha[k_real] = ic->m[j].alpha;
                    ic->beta [k_real] = ic->m[j].beta;
                    ic->gamma[k_real] = ic->m[j].gamma;
                    ic->delta[k_real] = ic->m[j].delta;

                    /* Harvest converged eigenpairs from both recurrences. */
                    unsigned int n_align = align(global.n);
                    ic->eigen_left -= extract_eigenvalues(global.n, k_real + 1, global.max_k, n_align, &ic->converged_s,
                                                          &ic->extracted_s, ic->eigen_left, ic->lb, ic->ub, global.t, ic->p, ic->alpha,
                                                          ic->beta, global.tolerance, ic->ls, ic->vs);

                    ic->eigen_left -= extract_eigenvalues(global.n, k_real + 1, global.max_k, n_align, &ic->converged_a,
                                                          &ic->extracted_a, ic->eigen_left, ic->lb, ic->ub, global.t, ic->q, ic->gamma,
                                                          ic->delta, global.tolerance, ic->la, ic->va);

                    /* Interval fully extracted: hand the slot the next one. */
                    if(ic->eigen_left <= 0) {

                        ig->processed = true;
                        assign_interval(&global, dev, i, global.next_interval++, k + global.inc_k);
                    }

                    /* Subspace exhausted without full convergence: retry with
                       a new random shift, or give up after max_retry_count. */
                    else if(k_real + 1 >= global.max_k) {

                        if(++ic->retry_count <= global.max_retry_count) {
                            ig->sigma = (rand() / (T)RAND_MAX) * (ic->ub - ic->lb) + ic->lb;
                            assign_interval(&global, dev, i, interval_index, k + global.inc_k);

                            if(args_info.show_interval_status_flag) {
                                printf("Retrying interval %d/%d in device #%d: [%.6f, %.6f] with sigma %.6f, retry count = %d\n",
                                       interval_index + 1, global.num_intervals, dev, ic->lb, ic->ub, ig->sigma, ic->retry_count);
                            }

                        } else {

                            assign_interval(&global, dev, i, global.next_interval++, k + global.inc_k);
                            ig->processed = true;

                            if(args_info.show_interval_status_flag) {
                                printf("Skipping interval %d/%d in device #%d: [%.6f, %.6f] (max retry count reached, %d / %d eigenvalues extracted)\n",
                                       interval_index + 1, global.num_intervals, dev, ic->lb, ic->ub, ic->extracted_s + ic->extracted_a, ic->num_eigs);
                            }
                        }
                    }
                }
            }
        }

        clock_t t2_convergence = clock();
        convergence_time_acc += t2_convergence - t1_convergence;

        /* Wait for the kernels, push refreshed window state to each device
           and kick off the async copy-back of this batch's results. */
        for(unsigned int dev=0; dev<global.num_devices; ++dev) {

            extract_device_info *info = &global.device[dev];
            cudaSetDevice(dev);

            /* NOTE(review): cudaThreadSynchronize() is deprecated in favor of
               cudaDeviceSynchronize(); kept for old-toolkit compatibility. */
            cudaThreadSynchronize();

            if(info->window_used <= 0) continue;

            for(unsigned i=0; i<info->window_size; ++i) {

                int interval_index = info->exec_window[i].interval_index;
                if(interval_index == INTERVAL_NONE) continue;

                interval_gpu_info *ig = &global.gpu[interval_index];
                interval_cpu_info *ic = &global.cpu[interval_index];

                /* Re-upload this slot's state when its interval was just
                   (re)assigned or its subspace is about to run out. */
                int k_real = k - (int)ig->start_k;
                if(k_real < 0 || k_real + global.inc_k >= global.max_k) {
                    cudaMemcpyAsync(info->exec_window_gpu + i, global.gpu + interval_index, sizeof(interval_gpu_info), cudaMemcpyHostToDevice, info->exec_stream);
                }

                if(ig->processed || k_real < 0) continue;

                /* Copy back the tridiagonal entries and the p/q vectors of
                   this batch for next pass's convergence check. */
                unsigned int n_align = align(global.n);
                unsigned int num_it = min(global.inc_k, global.max_k - k_real);
                cudaMemcpyAsync(ic->m, ig->m + k_real, num_it * sizeof(tridiagonal_entry), cudaMemcpyDeviceToHost, info->exec_window[i].copy_stream);
                cudaMemcpyAsync(ic->p + n_align * k_real, ig->p + n_align * k_real, num_it * n_align * sizeof(T), cudaMemcpyDeviceToHost, info->exec_window[i].copy_stream);
                cudaMemcpyAsync(ic->q + n_align * k_real, ig->q + n_align * k_real, num_it * n_align * sizeof(T), cudaMemcpyDeviceToHost, info->exec_window[i].copy_stream);
            }
        }
    }

    clock_t t2_extraction = clock();

    /* Accumulate per-interval statistics. */
    unsigned int total_extracted = 0;
    unsigned int intervals_restarted = 0, restart_count = 0;
    unsigned int intervals_partial = 0, intervals_failed = 0;

    for(unsigned int i=0; i<global.num_intervals; ++i) {
        unsigned int extracted = global.cpu[i].extracted_s + global.cpu[i].extracted_a;
        total_extracted += extracted;

        if(global.cpu[i].retry_count > 0) {
            restart_count += global.cpu[i].retry_count - 1;
            ++intervals_restarted;

            if(global.cpu[i].retry_count > global.max_retry_count) {
                if(extracted == 0) ++intervals_failed;
                else ++intervals_partial;
            }
        }
    }

    /* Gather all per-interval results into contiguous arrays. */
    T *el = (T *) malloc(total_extracted * sizeof(T));
    T *ev = (T *) malloc(total_extracted * global.n * sizeof(T));
    T *el_p = el, *ev_p = ev;

    for(unsigned int i=0; i<global.num_intervals; ++i) {
        interval_cpu_info *ic = &global.cpu[i];

        memcpy(el_p, ic->ls, ic->extracted_s * sizeof(T));
        el_p += ic->extracted_s;

        memcpy(el_p, ic->la, ic->extracted_a * sizeof(T));
        el_p += ic->extracted_a;

        memcpy(ev_p, ic->vs, ic->extracted_s * global.n * sizeof(T));
        ev_p += ic->extracted_s * global.n;

        memcpy(ev_p, ic->va, ic->extracted_a * global.n * sizeof(T));
        ev_p += ic->extracted_a * global.n;
    }

    delete_extract_info(&global);

    sort_eigenvalues(total_extracted, global.n, el, ev);

    float time_intervals = (t2_intervals - t1_intervals) / (float)CLOCKS_PER_SEC;
    float time_extraction = (t2_extraction - t1_extraction) / (float)CLOCKS_PER_SEC;
    float time_convergence = convergence_time_acc / (float)CLOCKS_PER_SEC;
    float time_total = time_intervals + time_extraction;

    if(args_info.show_stats_flag) {
        printf("\n");

        printf("Extracted %d of %d (%.2f%%) eigenvalues (maximum subspace of size %d).\n",
               total_extracted, global.n, 100.0f * total_extracted / (float)global.n, global.max_k);

        printf("Total elapsed time: %.3fs. Interval precalculation: %.3fs (%.2f%%), eigenvalue extraction: %.3fs (%.2f%%), convergence checking: %.3fs.\n",
               time_total, time_intervals, 100.0f * time_intervals / time_total, time_extraction, 100.0f * time_extraction / time_total, time_convergence);

        printf("From a total of %d intervals, %d (%.2f%%) were restarted (%.2f retries/int), ",
               global.num_intervals, intervals_restarted, 100.0f * intervals_restarted / (float) global.num_intervals,
               restart_count / (float) max(1, intervals_restarted));

        printf("%d had partial extraction (%.2f%%) and %d failed to extract (%.2f%%).\n",
               intervals_partial, 100.0f * intervals_partial / (float) global.num_intervals, intervals_failed,
               100.0f * intervals_failed / (float) global.num_intervals);
    }

    if(args_info.show_eigenvalues_flag) print_vector("eigenvalues", el, total_extracted);
    if(args_info.show_eigenvectors_flag) print_matrix("eigenvectors", ev, global.n, total_extracted, global.n);

    /* Fix: also release the Toeplitz data, which leaked on the success path
       (delete_extract_info does not own it). */
    free(el);
    free(ev);
    free(global.t);

    return 0;
}
00779