00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00028 #ifndef _TOEPLITZ_H_
00029 #define _TOEPLITZ_H_
00030
00031
00032 #include <stdio.h>
00033 #include <stdlib.h>
00034 #include <string.h>
00035
00036
00037 #include <f2c.h>
00038 #include <clapack.h>
00039 #include <gsl/gsl_cblas.h>
00040
00041
00042 #include <cuda_runtime.h>
00043
00044
00045 #define SINGLE // Use simple precision floating point format instead of double
00046
00047
00048
00049
00050
00051
00052
00053
00054 #define blockSize 256 //512
00055 #define align(x) (((x) + 255) & ~255) //(((x) + 511) & ~511)
00056
00057 #define half_warp 16
00058 #define half_warp_mask 0x0F
00059
00060 #define print_vector_gpu(name, v, n) { if(tid == 0) { \
00061 fprintf(stderr, "%s: [", name); \
00062 for(int _i=0; _i<n; ++_i) fprintf(stderr, " %.6f", v[_i]); \
00063 fprintf(stderr, " ]\n"); } }
00064
00065
00066 #ifdef SINGLE
00067 #define T float
00068 #define Tsf "%f"
00069 #define _nrm2 cblas_snrm2
00070 #define _scal cblas_sscal
00071 #define _axpy cblas_saxpy
00072 #define _gemv cblas_sgemv
00073 #define _gemm cblas_sgemm
00074 #define _spmv cblas_sspmv
00075 #define _dot cblas_sdot
00076 #define _spsv sspsv_
00077 #define _stev sstev_
00078 #else
00079 #define T double
00080 #define Tsf "%lf"
00081 #define _nrm2 cblas_dnrm2
00082 #define _scal cblas_dscal
00083 #define _axpy cblas_daxpy
00084 #define _gemv cblas_dgemv
00085 #define _gemm cblas_dgemm
00086 #define _spmv cblas_dspmv
00087 #define _dot cblas_ddot
00088 #define _spsv dspsv_
00089 #define _stev dstev_
00090 #endif
00091
00092
00093 #ifndef TRUE
00094 #define TRUE 1
00095 #endif
00096
00097 #ifndef FALSE
00098 #define FALSE 0
00099 #endif
00100
00101
00102 #define INTERVAL_NONE -1
00103
00104
00105 #ifdef __GNUC__
00106 typedef struct __attribute__ ((aligned(16 * sizeof(T)))) tridiagonal_entry_struct {
00107 #else
00108 typedef struct __align__(16 * sizeof(T)) tridiagonal_entry_struct {
00109 #endif
00110 T alpha;
00111 T gamma;
00112 T beta;
00113 T delta;
00114 } tridiagonal_entry;
00115
00116
00117 typedef struct interval_cpu_info_struct {
00118
00119
00120 T lb, ub;
00121 unsigned int num_eigs;
00122
00123
00124 T *p, *q;
00125 T *ls, *la, *vs, *va;
00126 T *alpha, *beta, *gamma, *delta;
00127 tridiagonal_entry *m;
00128
00129 int eigen_left;
00130 unsigned int extracted_s, extracted_a;
00131 unsigned int converged_s, converged_a;
00132 unsigned int retry_count;
00133
00134 } interval_cpu_info;
00135
00136
00137 #ifdef __GNUC__
00138 typedef struct __attribute__ ((aligned(16 * sizeof(float)))) interval_gpu_info_struct {
00139 #else
00140 typedef struct __align__(16 * sizeof(float)) interval_gpu_info_struct {
00141 #endif
00142 unsigned int start_k;
00143 unsigned char processed;
00144 T sigma;
00145 T *p, *q;
00146 T *workspace;
00147 tridiagonal_entry *m;
00148 } interval_gpu_info;
00149
00150
00151 typedef struct exec_window_entry_struct {
00152 int interval_index;
00153 cudaStream_t copy_stream;
00154 } exec_window_entry;
00155
00156
00157 typedef struct extract_device_info_struct {
00158
00159
00160 unsigned int window_size;
00161 unsigned int window_used;
00162 exec_window_entry *exec_window;
00163 interval_gpu_info *exec_window_gpu;
00164 cudaStream_t exec_stream;
00165
00166
00167 struct cudaDeviceProp *device_prop;
00168
00169
00170 T *gpu_t;
00171 T *gpu_p, *gpu_q;
00172 T *gpu_workspace;
00173 tridiagonal_entry *gpu_m;
00174
00175 T *cpu_p, *cpu_q;
00176 T *cpu_alpha, *cpu_beta;
00177 T *cpu_gamma, *cpu_delta;
00178 tridiagonal_entry *cpu_m;
00179
00180 } extract_device_info;
00181
00182
00183 typedef struct extract_global_info_struct {
00184
00185
00186 unsigned int num_intervals;
00187 unsigned int max_eig_interval;
00188 unsigned int max_k;
00189 unsigned int max_retry_count;
00190 unsigned int inc_k;
00191
00192
00193 unsigned int n;
00194 T *t;
00195 T tolerance;
00196
00197
00198 interval_cpu_info *cpu;
00199 interval_gpu_info *gpu;
00200 unsigned int next_interval;
00201 unsigned int global_window_used;
00202
00203
00204 unsigned int num_devices;
00205 extract_device_info *device;
00206
00207
00208 unsigned int size_pq;
00209 unsigned int size_workspace;
00210 unsigned int size_toeplitz_data;
00211 unsigned int size_smem;
00212
00213 } extract_global_info;
00214
00215
00216 int readInput(int *n, T **t, FILE *file);
00217 void print_vector(const char *name, T *v, int n);
00218 void print_matrix(const char *name, T *m, int cols, int rows, int lda);
00219
00220
00221 void toeplitz_gershgorin(const T *t, unsigned int n, T *low, T *up);
00222 void toeplitz_mult(int n, const T *t, const T *x, T *y);
00223 void levinson(unsigned int n, T td, const T *t0, const T *b0, T *x);
00224
00225
00226 void sort_eigenvalues(unsigned int extracted, unsigned int n, T *el, T *ev);
00227 interval_cpu_info *calc_eigen_intervals(T *t, unsigned int n, T lbg, T ubg, unsigned int max_eig_interval, unsigned int *num_intervals);
00228 unsigned int extract_eigenvalues(unsigned int n, unsigned int k, unsigned int max_k, unsigned int p_lda,
00229 unsigned int *converged, unsigned int *extracted, unsigned int required,
00230 T a, T b, const T *t, const T *p, const T *alpha, const T *beta, T tolerance,
00231 T *l, T *v);
00232 #endif
00233