From eb29dc003a38e0741d2a6bed1ce36e98146f789b Mon Sep 17 00:00:00 2001
From: Sébastien Villemot
Date: Mon, 1 Feb 2021 14:03:57 +0100
Subject: [PATCH] Bytecode: remove CUDA-specific stuff

---
 mex/sources/bytecode/Interpreter.cc         |   12 +-
 mex/sources/bytecode/Interpreter.hh         |    8 +-
 mex/sources/bytecode/SparseMatrix.cc        | 1594 +------------------
 mex/sources/bytecode/SparseMatrix.hh        |   63 +-
 mex/sources/bytecode/SparseMatrix_kernel.cu |  121 --
 mex/sources/bytecode/bytecode.cc            |  190 +--
 6 files changed, 9 insertions(+), 1979 deletions(-)
 delete mode 100644 mex/sources/bytecode/SparseMatrix_kernel.cu

diff --git a/mex/sources/bytecode/Interpreter.cc b/mex/sources/bytecode/Interpreter.cc
index b34bcc066..7a8492aa6 100644
--- a/mex/sources/bytecode/Interpreter.cc
+++ b/mex/sources/bytecode/Interpreter.cc
@@ -33,16 +33,8 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub
                          int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
                          string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
                          bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
-                         bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg
-#ifdef CUDA
-                         , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
-#endif
-                         )
-: dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg
-#ifdef CUDA
-                  , CUDA_device_arg, cublas_handle_arg, cusparse_handle_arg, descr_arg
-#endif
-                  )
+                         bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg)
+: dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg)
 {
   params = params_arg;
   y = y_arg;
diff --git a/mex/sources/bytecode/Interpreter.hh b/mex/sources/bytecode/Interpreter.hh
index 5ab8cd784..3d990d588 100644
--- a/mex/sources/bytecode/Interpreter.hh
+++ b/mex/sources/bytecode/Interpreter.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2007-2017 Dynare Team
+ * Copyright © 2007-2021 Dynare Team
  *
  * This file is part of Dynare.
  *
@@ -58,11 +58,7 @@ public:
                      int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
                      string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
                      bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
-                     bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg
-#ifdef CUDA
-                     , const int CUDA_device, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
-#endif
-                     );
+                     bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg);
   bool extended_path(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks, int nb_periods, vector<string> sextended_path, vector<string> sconstrained_extended_path, vector<string> dates, table_conditional_global_type table_conditional_global);
   bool compute_blocks(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks);
   void check_for_controlled_exo_validity(FBEGINBLOCK_ *fb, vector<string> sconstrained_extended_path);
diff --git a/mex/sources/bytecode/SparseMatrix.cc b/mex/sources/bytecode/SparseMatrix.cc
index edd9a3b2e..7009f1d3d 100644
--- a/mex/sources/bytecode/SparseMatrix.cc
+++ b/mex/sources/bytecode/SparseMatrix.cc
@@ -27,10 +27,6 @@
 //#include
 #include "SparseMatrix.hh"
 
-#ifdef CUDA
-# include "SparseMatrix_kernel.cu"
-#endif
-
 using namespace std;
 
 dynSparseMatrix::dynSparseMatrix()
@@ -55,11 +51,7 @@ dynSparseMatrix::dynSparseMatrix()
 }
 
 dynSparseMatrix::dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg,
-                                 const int minimal_solving_periods_arg, const double slowc_arg
-#ifdef CUDA
-                                 , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
-#endif
-                                 ) :
+                                 const int minimal_solving_periods_arg, const double slowc_arg) :
   Evaluate(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg)
 {
   pivotva = nullptr;
@@ -79,12 +71,6 @@ dynSparseMatrix::dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, con
   lu_inc_tol = 1e-10;
   Symbolic = nullptr;
   Numeric = nullptr;
-#ifdef CUDA
-  CUDA_device = CUDA_device_arg;
-  cublas_handle = cublas_handle_arg;
-  cusparse_handle = cusparse_handle_arg;
-  CUDA_descr = descr_arg;
-#endif
 }
 
 int
@@ -1095,409 +1081,6 @@ dynSparseMatrix::Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Si
 #endif
 }
 
-void
-dynSparseMatrix::Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m)
-{
-  int eq, var;
-
-  *b = static_cast<double *>(mxMalloc(Size * sizeof(double)));
-  test_mxMalloc(*b, __LINE__, __FILE__, __func__, Size * sizeof(double));
-  if (!(*b))
-    {
-      ostringstream tmp;
-      tmp << " in Init_CUDA_Sparse, can't retrieve b vector\n";
-      throw FatalExceptionHandling(tmp.str());
-    }
-  double *Host_x0 = mxGetPr(x0_m);
-  if (!Host_x0)
-    {
-      ostringstream tmp;
-      tmp << " in Init_CUDA_Sparse_Simple, can't retrieve x0 vector\n";
-      throw FatalExceptionHandling(tmp.str());
-    }
-  *Ap = static_cast<SuiteSparse_long *>(mxMalloc((Size+1) * sizeof(SuiteSparse_long)));
-  test_mxMalloc(*Ap, __LINE__, __FILE__, __func__, (Size+1) * sizeof(SuiteSparse_long));
-  if (!(*Ap))
-    {
-      ostringstream tmp;
-      tmp << " in Init_CUDA_Sparse, can't allocate Ap index vector\n";
-      throw
FatalExceptionHandling(tmp.str()); - } - size_t prior_nz = IM.size(); - *Ai = static_cast(mxMalloc(prior_nz * sizeof(SuiteSparse_long))); - test_mxMalloc(*Ai, __LINE__, __FILE__, __func__, prior_nz * sizeof(SuiteSparse_long)); - if (!(*Ai)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, can't allocate Ai index vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - *Ax = static_cast(mxMalloc(prior_nz * sizeof(double))); - test_mxMalloc(*Ax, __LINE__, __FILE__, __func__, prior_nz * sizeof(double)); - if (!(*Ax)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, can't retrieve Ax matrix\n"; - throw FatalExceptionHandling(tmp.str()); - } - - map, int>, int>::iterator it4; - for (int i = 0; i < Size; i++) - { - int eq = index_vara[i]; - ya[eq+it_*y_size] = y[eq+it_*y_size]; - } - -#ifdef DEBUG - unsigned int max_nze = prior_nz; //mxGetNzmax(A_m); -#endif - unsigned int NZE = 0; - int last_var = 0; - double cum_abs_sum = 0; - for (int i = 0; i < Size; i++) - { - (*b)[i] = u[i]; - cum_abs_sum += fabs((*b)[i]); - (*x0)[i] = y[i]; - } - if (cum_abs_sum < 1e-20) - zero_solution = true; - else - zero_solution = false; - - (*Ap)[0] = 0; - last_var = -1; - it4 = IM.begin(); - while (it4 != IM.end()) - { - var = it4->first.first.first; - if (var != last_var) - { - (*Ap)[1+last_var] = NZE; - last_var = var; - } - eq = it4->first.second; - int index = it4->second; -#ifdef DEBUG - if (index < 0 || index >= u_count_alloc || index > Size + Size*Size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (NZE >= max_nze) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, exceeds the capacity of A_m sparse matrix\n"; - throw FatalExceptionHandling(tmp.str()); - } -#endif - (*Ax)[NZE] = u[index]; - (*Ai)[NZE] = eq; - NZE++; -#ifdef DEBUG - if (eq < 0 || eq >= Size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, index (" << eq << ") out of range for b vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (var < 0 || var >= Size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (index_vara[var] < 0 || index_vara[var] >= y_size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n"; - throw FatalExceptionHandling(tmp.str()); - } -#endif - it4++; - } - (*Ap)[Size] = NZE; -} - -#ifdef CUDA -void -dynSparseMatrix::Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner) -{ - //cudaError_t cuda_error; - int t, eq, var, lag, ti_y_kmin, ti_y_kmax; - int n = periods * Size; - size_t prior_nz = IM.size() * periods; - size_t preconditioner_size = 0; - map, int> jacob_struct; - - /* ask cuda how many devices it can find */ - int device_count; - cudaGetDeviceCount(&device_count); - - cudaSetDevice(CUDA_device); - - double *Host_b = (double *) mxMalloc(n * sizeof(double)); - test_mxMalloc(Host_b, __LINE__, __FILE__, __func__, n * sizeof(double)); - cudaChk(cudaMalloc((void **) b, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate b vector on the 
graphic card\n"); - - double *Host_x0 = mxGetPr(x0_m); - if (!Host_x0) - { - ostringstream tmp; - tmp << " in Init_Cuda_Sparse, can't retrieve x0 vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - cudaChk(cudaMalloc((void **) x0, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate x0 vector on the graphic card\n"); - - int *Host_Ap = (int *) mxMalloc((n+1) * sizeof(int)); - test_mxMalloc(Host_Ap, __LINE__, __FILE__, __func__, (n+1) * sizeof(int)); - - int *Host_Ai = (int *) mxMalloc(prior_nz * sizeof(int)); - test_mxMalloc(Host_Ai, __LINE__, __FILE__, __func__, prior_nz * sizeof(int)); - - double *Host_Ax = (double *) mxMalloc(prior_nz * sizeof(double)); - test_mxMalloc(Host_Ax, __LINE__, __FILE__, __func__, prior_nz * sizeof(double)); - - int *Host_Ai_tild, *Host_Ap_tild; - if (preconditioner == 3) - { - Host_Ap_tild = (int *) mxMalloc((n+1)*sizeof(int)); - test_mxMalloc(Host_Ap_tild, __LINE__, __FILE__, __func__, (n+1)*sizeof(int)); - Host_Ai_tild = (int *) mxMalloc(prior_nz*sizeof(int)); - test_mxMalloc(Host_Ai_tild, __LINE__, __FILE__, __func__, prior_nz*sizeof(int)); - Host_Ap_tild[0] = 0; - } - - if (preconditioner == 0) - preconditioner_size = n; - else if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) - preconditioner_size = prior_nz; - - double *Host_A_tild = (double *) mxMalloc(preconditioner_size * sizeof(double)); - test_mxMalloc(Host_A_tild, __LINE__, __FILE__, __func__, preconditioner_size * sizeof(double)); - - map, int>, int>::iterator it4; - for (int i = 0; i < y_size*(periods+y_kmin); i++) - ya[i] = y[i]; -# ifdef DEBUG - unsigned int max_nze = mxGetNzmax(A_m); -# endif - unsigned int NZE = 0, NZE_tild = 0; - int last_eq = 0; - for (int i = 0; i < periods*Size; i++) - { - Host_b[i] = 0; - Host_x0[i] = y[index_vara[Size*y_kmin+i]]; - } - - //Ordered in CSR and not in CSC - - Host_Ap[0] = 0; - for (t = 0; t < periods; t++) - { - last_eq = -1; - it4 = IM.begin(); - while (it4 != IM.end()) - { - eq = it4->first.first.first; - if (eq != last_eq) - { -# ifdef DEBUG - if (1+last_eq + t * Size > (n + 1)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, 1+last_eq + t * Size (" << 1+last_eq + t * Size << ") out of range for Host_Ap vector\n"; - throw FatalExceptionHandling(tmp.str()); - } -# endif - Host_Ap[1+last_eq + t * Size] = NZE; - if (preconditioner == 3 && t == 0) - Host_Ap_tild[1+last_eq] = NZE_tild; - last_eq = eq; - } - var = it4->first.second+Size*t; - lag = it4->first.first.second; - int index = it4->second+ (t /*+ lag*/) * u_count_init; - if (eq < (periods+y_kmax)*Size) - { - ti_y_kmin = -min(t, y_kmin); - ti_y_kmax = min(periods-(t + 1), y_kmax); - if ((lag <= ti_y_kmax && lag >= ti_y_kmin) || preconditioner == 3) /*Build the index for sparse matrix containing the jacobian : u*/ - { -# ifdef DEBUG - if (index < 0 || index >= u_count_alloc || index > (periods-1)* IM.size() + Size * Size + periods * Size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector max = " << (periods-1)* IM.size() + Size * Size + periods * Size << " allocated = " << u_count_alloc << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (NZE >= prior_nz) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, exceeds the capacity of A_i or A_x sparse matrix\n"; - throw FatalExceptionHandling(tmp.str()); - } -# endif - bool to_store = true; - if (preconditioner == 0) - { - if (lag == 0 && it4->first.second == eq) - Host_A_tild[var] = u[index]; - } - else if (preconditioner == 
1 || preconditioner == 2) - Host_A_tild[NZE] = u[index]; - else if (preconditioner == 3) - { - if (lag > ti_y_kmax || lag < ti_y_kmin) - { - Host_b[eq + t * Size] += u[index]*y[index_vara[var+Size*(y_kmin+lag)]]; - to_store = false; - } - if (t == 0) - { - map, int>::const_iterator it = jacob_struct.find(make_pair(eq + t * Size, var)); - if (it != jacob_struct.end()) - Host_A_tild[it->second] += u[index]; - else - { - jacob_struct[make_pair(eq, var)] = NZE_tild; - Host_A_tild[NZE_tild] = u[index]; - Host_Ai_tild[NZE_tild] = var; - NZE_tild++; - } - } - } - if (to_store) - { - Host_Ax[NZE] = u[index]; - Host_Ai[NZE] = var + lag * Size; - NZE++; - } - } - else - { -# ifdef DEBUG - if (var < 0 || var >= Size * periods) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (var+Size*(y_kmin+t+lag) < 0 || var+Size*(y_kmin+lag) >= Size*(periods+y_kmin+y_kmax)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << var+Size*(y_kmin+lag) << ") out of range for index_vara vector max=" << Size*(periods+y_kmin+y_kmax) << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (index_vara[var+Size*(y_kmin+lag)] < 0 || index_vara[var+Size*(y_kmin+lag)] >= y_size*(periods+y_kmin+y_kmax)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << index_vara[var+Size*(y_kmin+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n"; - throw FatalExceptionHandling(tmp.str()); - } -# endif - Host_b[eq + t * Size] += u[index]*y[index_vara[var+Size*(y_kmin+lag)]]; - } - } - else // ...and store it in the u vector - { -# ifdef DEBUG - if (index < 0 || index >= u_count_alloc) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (var < 0 || var >= (Size*periods)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n"; - throw FatalExceptionHandling(tmp.str()); - } -# endif - Host_b[var] += u[index]; - } - it4++; - } - } - Host_Ap[Size*periods] = NZE; - if (preconditioner == 3) - { - int *tmp_Ap_tild = (int *) mxMalloc((Size + 1) * sizeof(int)); - test_mxMalloc(tmp_Ap_tild, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(int)); - int *tmp_Ai_tild = (int *) mxMalloc(NZE_tild * sizeof(int)); - test_mxMalloc(tmp_Ai_tild, __LINE__, __FILE__, __func__, NZE_tild * sizeof(int)); - double *tmp_A_tild = (double *) mxMalloc(NZE_tild * sizeof(double)); - test_mxMalloc(tmp_A_tild, __LINE__, __FILE__, __func__, NZE_tild * sizeof(double)); - memcpy(tmp_Ap_tild, Host_Ap_tild, (Size + 1) * sizeof(int)); - memcpy(tmp_Ai_tild, Host_Ai_tild, NZE_tild * sizeof(int)); - memcpy(tmp_A_tild, Host_A_tild, NZE_tild * sizeof(double)); - //int NZE_tild_old = NZE_tild; - NZE_tild = 0; - Host_Ap_tild[0] = NZE_tild; - - for (int i = 0; i < Size; i++) - { - for (int j = tmp_Ap_tild[i]; j < tmp_Ap_tild[i+1]; j++) - if (abs(tmp_A_tild[j]) > 1.0e-20) - { - Host_A_tild[NZE_tild] = tmp_A_tild[j]; - Host_Ai_tild[NZE_tild] = tmp_Ai_tild[j]; - NZE_tild++; - } - Host_Ap_tild[i+1] = NZE_tild; - } - mxFree(tmp_Ap_tild); - mxFree(tmp_Ai_tild); - mxFree(tmp_A_tild); - } - - *nnz = NZE; - *nnz_tild = NZE_tild; - if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) - preconditioner_size = NZE; - -# ifdef DEBUG - mexPrintf("Host_Ax = ["); - for (int i = 0; i < NZE; i++) - mexPrintf("%f ", Host_Ax[i]); - mexPrintf("]\n"); - 
- mexPrintf("Host_Ap = ["); - for (int i = 0; i < n+1; i++) - mexPrintf("%d ", Host_Ap[i]); - mexPrintf("]\n"); - - mexPrintf("Host_Ai = ["); - for (int i = 0; i < NZE; i++) - mexPrintf("%d ", Host_Ai[i]); - mexPrintf("]\n"); -# endif - cudaChk(cudaMalloc((void **) Ai, NZE * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai index vector on the graphic card\n"); - cudaChk(cudaMalloc((void **) Ax, NZE * sizeof(double)), " in Init_Cuda_Sparse, can't allocate Ax on the graphic card\n"); - cudaChk(cudaMalloc((void **) Ap, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap index vector on the graphic card\n"); - if (preconditioner == 3) - { - cudaChk(cudaMalloc((void **) Ai_tild, NZE_tild * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai_tild index vector on the graphic card\n"); - cudaChk(cudaMalloc((void **) Ap_tild, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap_tild index vector on the graphic card\n"); - } - cudaChk(cudaMalloc((void **) A_tild, preconditioner_size * sizeof(double)), " in Init_Cuda_Sparse, can't allocate A_tild on the graphic card\n"); - - cudaChk(cudaMemcpy(*x0, Host_x0, n * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy x0 = Host_x0 failed"); - cudaChk(cudaMemcpy(*b, Host_b, n * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy b = Host_b failed"); - cudaChk(cudaMemcpy(*Ap, Host_Ap, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap = Host_Ap failed"); - cudaChk(cudaMemcpy(*Ai, Host_Ai, NZE * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai = Host_Ai failed"); - cudaChk(cudaMemcpy(*Ax, Host_Ax, NZE * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ax = Host_Ax failed"); - if (preconditioner == 3) - { - cudaChk(cudaMemcpy(*Ap_tild, Host_Ap_tild, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap_tild = Host_Ap_tild failed"); - cudaChk(cudaMemcpy(*Ai_tild, Host_Ai_tild, NZE_tild * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai_tild = Host_Ai_til failed"); - } - cudaChk(cudaMemcpy(*A_tild, Host_A_tild, preconditioner_size * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy A_tild = Host_A_tild failed"); -} -#endif - void dynSparseMatrix::PrintM(int n, double *Ax, mwIndex *Ap, mwIndex *Ai) { @@ -3356,85 +2939,6 @@ dynSparseMatrix::Solve_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double s mxDestroyArray(b_m); } -#ifdef CUDA -void -printM(int n, double *Ax, int *Ap, int *Ai, cusparseMatDescr_t descrA, cusparseHandle_t cusparse_handle) -{ - //cudaError_t cuda_error; - //cusparseStatus_t cusparse_status; - double *A_dense; - cudaChk(cudaMalloc((void **) &A_dense, n * n *sizeof(double)), "A_dense cudaMalloc has failed\n"); - - cusparseChk(cusparseDcsr2dense(cusparse_handle, n, n, descrA, - Ax, Ap, Ai, A_dense, n), "cusparseDcsr2dense has failed\n"); - double *A_dense_hoste = (double *) mxMalloc(n * n * sizeof(double)); - test_mxMalloc(A_dense_hoste, __LINE__, __FILE__, __func__, n * n * sizeof(double)); - cudaChk(cudaMemcpy(A_dense_hoste, A_dense, n * n * sizeof(double), cudaMemcpyDeviceToHost), " cudaMemcpy(A_dense_hoste, A_dense) has failed\n"); - mexPrintf("----------------------\n"); - mexPrintf("FillMode=%d, IndexBase=%d, MatType=%d, DiagType=%d\n", cusparseGetMatFillMode(descrA), cusparseGetMatIndexBase(descrA), cusparseGetMatType(descrA), cusparseGetMatDiagType(descrA)); - //mexEvalString("drawnow;"); - for 
(int i = 0; i < n; i++) - { - for (int j = 0; j < n; j++) - mexPrintf("%-6.3f ", A_dense_hoste[i + j * n]); - mexPrintf("\n"); - } - mxFree(A_dense_hoste); - cudaChk(cudaFree(A_dense), "cudaFree(A_dense) has failed\n"); -} - -void -dynSparseMatrix::Solve_CUDA_BiCGStab_Free(double *tmp_vect_host, double *p, double *r, double *v, double *s, double *t, double *y_, double *z, double *tmp_, - int *Ai, double *Ax, int *Ap, double *x0, double *b, double *A_tild, int *A_tild_i, int *A_tild_p /*, double* Lx, int* Li, int* Lp, - double* Ux, int* Ui, int* Up, int* device_n*/, cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU, - cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner) -{ - //cudaError_t cuda_error; - //cusparseStatus_t cusparse_status; - mxFree(tmp_vect_host); - cudaChk(cudaFree(p), " in Solve_Cuda_BiCGStab, can't free p\n"); - cudaChk(cudaFree(r), " in Solve_Cuda_BiCGStab, can't free r\n"); - cudaChk(cudaFree(v), " in Solve_Cuda_BiCGStab, can't free v\n"); - cudaChk(cudaFree(s), " in Solve_Cuda_BiCGStab, can't free s\n"); - cudaChk(cudaFree(t), " in Solve_Cuda_BiCGStab, can't free t\n"); - cudaChk(cudaFree(y_), " in Solve_Cuda_BiCGStab, can't free y_\n"); - cudaChk(cudaFree(z), " in Solve_Cuda_BiCGStab, can't free z\n"); - cudaChk(cudaFree(tmp_), " in Solve_Cuda_BiCGStab, can't free tmp_\n"); - cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); - cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); - cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); - cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); - cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); - /*if (preconditioner == 0) - {*/ - cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild (1)\n"); - cudaChk(cudaFree(A_tild_i), " in Solve_Cuda_BiCGStab, can't free A_tild_i (1)\n"); - cudaChk(cudaFree(A_tild_p), " in Solve_Cuda_BiCGStab, can't free A_tild_p (1)\n"); - /*} - else - { - cudaChk(cudaFree(Lx), " in Solve_Cuda_BiCGStab, can't free Lx\n"); - cudaChk(cudaFree(Li), " in Solve_Cuda_BiCGStab, can't free Li\n"); - cudaChk(cudaFree(Lp), " in Solve_Cuda_BiCGStab, can't free Lp\n"); - cudaChk(cudaFree(Ux), " in Solve_Cuda_BiCGStab, can't free Ux\n"); - cudaChk(cudaFree(Ui), " in Solve_Cuda_BiCGStab, can't free Ui\n"); - cudaChk(cudaFree(Up), " in Solve_Cuda_BiCGStab, can't free Up\n"); - }*/ - //cudaChk(cudaFree(device_n), " in Solve_Cuda_BiCGStab, can't free device_n\n"); - if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) - { - cusparseChk(cusparseDestroySolveAnalysisInfo(infoL), - " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoL\n"); - cusparseChk(cusparseDestroySolveAnalysisInfo(infoU), - " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoU\n"); - } - cusparseChk(cusparseDestroyMatDescr(descrL), - " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrL\n"); - cusparseChk(cusparseDestroyMatDescr(descrU), - " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrU\n"); -} -#endif - void Solve(double *Ax, int *Ap, int *Ai, double *b, int n, bool Lower, double *x) { @@ -3507,1087 +3011,6 @@ Check(int n, double *Ax, int *Ap, int *Ai, double *b, double *x, bool Lower) } } -#ifdef CUDA -int -dynSparseMatrix::Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, - int it_, int 
nnz, int nnz_tild, int preconditioner, int max_iterations, int block) -{ - cusparseSolveAnalysisInfo_t info, infoL, infoU; - cusparseMatDescr_t descrL, descrU; - const double tol = 1.0e-6; //1.0e-6; - const double eps = 1.0e-16; - double *p, *r, *r0, *v, *s, *t, *y_, *z, *tmp_; - int *A_tild_i, *A_tild_p; - double *Qx; - int *Qi, *Qj; - double *Px; - int *Pi, *Pj; - int Q_nnz, P_nnz; - int W_nnz; - double bnorm; - double tmp1, tmp2; - int refinement_needed = 0, stagnation = 0; - int max_refinement = min(min(int (floor(double (n)/50)), 10), n-max_iterations), max_stagnation = 3; - int nblocks = ceil(double (n) / double (1024)); - int n_threads; - if (nblocks == 0) - n_threads = n; - else - n_threads = 1024; - int periods = n / Size; - - double *tmp_vect_host = (double *) mxMalloc(n * sizeof(double)); - test_mxMalloc(tmp_vect_host, __LINE__, __FILE__, __func__, n * sizeof(double)); - - cublasChk(cublasDnrm2(cublas_handle, n, b, 1, &bnorm), - " in Solve_Cuda_BiCGStab, cublasDnrm2(b) has failed\n"); - - double tolb = tol * bnorm; - - if (bnorm == 0.0) - { - // if b = 0 the A.x = 0 => x = 0 - cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); - cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); - cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); - if (preconditioner == 3) - { - cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, can't free Ai_tild\n"); - cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, can't free Ap_tild\n"); - } - cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild\n"); - cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); - cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); - if (is_two_boundaries) - for (int i = 0; i < n; i++) - { - int eq = index_vara[i+Size*y_kmin]; - double yy = -y[eq]; - direction[eq] = yy; - y[eq] += slowc * yy; - } - else - for (int i = 0; i < n; i++) - { - int eq = index_vara[i]; - double yy = -y[eq+it_*y_size]; - direction[eq] = yy; - y[eq+it_*y_size] += slowc * yy; - } - return 0; - } - - int iteration = 0; - bool convergence = false; - double zeros = 0.0, one = 1.0, m_one = -1.0; - - cudaChk(cudaMalloc((void **) &tmp_, n * sizeof(double)), " in Solve_Cuda_Sparse, can't allocate tmp_ on the graphic card\n"); - - cudaChk(cudaMalloc((void **) &r, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate r on the graphic card\n"); - - cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n"); - - //r = b - A * x0 - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, - n, nnz, &m_one, - CUDA_descr, Ax, - Ap, Ai, - x0, &one, - r), "in Solve_Cuda_BiCGStab, cusparseDcsrmv A * x0 has failed"); - - cudaChk(cudaMemcpy(tmp_vect_host, r, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n"); - /*mexPrintf("r\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - cudaChk(cudaMalloc((void **) &r0, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate r0 on the graphic card\n"); - cudaChk(cudaMemcpy(r0, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r0 = r has failed\n"); - - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - r, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); - double conv_criteria = tmp1; - - convergence = conv_criteria < tolb; - if (convergence) - { - /* the initial value (x0) is 
solution of A x = b*/ - cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); - cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); - cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); - if (preconditioner == 3) - { - cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, can't free Ai_tild\n"); - cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, can't free Ap_tild\n"); - } - cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild\n"); - cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); - cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); - return 0; - } - - if (preconditioner == 0) - { - //Apply the Jacobi preconditioner - /*VecDiv<<>>(r_, A_tild, z_, n); - cuda_error = cudaMemcpy(zz_, z_, n * sizeof(double), cudaMemcpyDeviceToDevice);*/ - } - else if (preconditioner == 1) - { - //Apply an incomplete LU decomposition of A as preconditioner - cusparseChk(cusparseCreateSolveAnalysisInfo(&info), " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo for info has failed\n"); - - cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, nnz, CUDA_descr, - A_tild, Ap, Ai, - info), - " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis(info) has failed\n"); - - cusparseChk(cusparseDcsrilu0(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, CUDA_descr, - A_tild, Ap, Ai, - info), - " in Solve_Cuda_BiCGStab, cusparseDcsrilu0 has failed\n"); - - //Make a copy of the indexes in A_tild_i and A_tild_p to use it the Bicgstab algorithm - cudaChk(cudaMalloc((void **) &A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_i on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_i, Ai, nnz * sizeof(int), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Ai has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_p, Ap, (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Ap has failed\n"); - } - else if (preconditioner == 2) - { - //Because the Jacobian matrix A is store in CSC format in matlab - // we have to transpose it to get a CSR format used by CUDA - mwIndex *Awi, *Awp; - double *A_tild_host = (double *) mxMalloc(nnz*sizeof(double)); - test_mxMalloc(A_tild_host, __LINE__, __FILE__, __func__, nnz*sizeof(double)); - Awi = (mwIndex *) mxMalloc(nnz * sizeof(mwIndex)); - test_mxMalloc(Awi, __LINE__, __FILE__, __func__, nnz * sizeof(mwIndex)); - Awp = (mwIndex *) mxMalloc((n + 1) * sizeof(mwIndex)); - test_mxMalloc(Awp, __LINE__, __FILE__, __func__, (n + 1) * sizeof(mwIndex)); - int *Aii = (int *) mxMalloc(nnz * sizeof(int)); - test_mxMalloc(Aii, __LINE__, __FILE__, __func__, nnz * sizeof(int)); - int *Aip = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Aip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); - cudaChk(cudaMemcpy(Aii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai has failed\n"); - cudaChk(cudaMemcpy(Aip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ai has failed\n"); - for (int i = 0; i < nnz; i++) - Awi[i] = Aii[i]; - for (int i = 0; i < n + 1; i++) - Awp[i] = Aip[i]; - mxFree(Aii); - 
mxFree(Aip); - mxArray *At_m = mxCreateSparse(n, n, nnz, mxREAL); - mxSetIr(At_m, Awi); - mxSetJc(At_m, Awp); - mxSetPr(At_m, A_tild_host); - mxArray *A_m; - mexCallMATLAB(1, &A_m, 1, &At_m, "transpose"); - mxDestroyArray(At_m); - - /*mexPrintf("A_m\n"); - mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/ - /*mxFree(Awi); - mxFree(Awp);*/ - - /*[L1, U1] = ilu(g1a=;*/ - const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"}; - const int type = 0; - const int droptol = 1; - const int milu = 2; - const int udiag = 3; - const int thresh = 4; - mwSize dims[1] = {(mwSize) 1 }; - mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names); - mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp")); - //mxSetFieldByNumber(Setup, 0, type, mxCreateString("nofill")); - mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol)); - mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off")); - mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0)); - mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1)); - //mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1)); - mxArray *lhs0[2]; - mxArray *rhs0[2]; - rhs0[0] = A_m; - rhs0[1] = Setup; - ostringstream tmp; - if (mexCallMATLAB(2, lhs0, 2, rhs0, "ilu")) - { - tmp << " In BiCGStab, the incomplet LU decomposition (ilu) ahs failed.\n"; - throw FatalExceptionHandling(tmp.str()); - } - mxDestroyArray(Setup); - - /* //ILUT preconditionner computed by Matlab (todo: in futur version of cuda replace it by a new equivalent cuda function) - const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"}; - const int type = 0; - const int droptol = 1; - const int milu = 2; - const int udiag = 3; - const int thresh = 4; - mwSize dims[1] = {(mwSize)1 }; - mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names); - mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp")); - mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol)); - mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off")); - mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0)); - mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0)); - mxArray *lhs0[2], *rhs0[2]; - rhs0[0] = A_m; - rhs0[1] = Setup; - mexCallMATLAB(1, lhs0, 2, rhs0, "ilu"); - */ - // To store the resultng matrix in a CSR format we have to transpose it - mxArray *Wt = lhs0[0]; - mwIndex *Wtj = mxGetJc(Wt); - nnz = Wtj[n]; - mxArray *W; - mexCallMATLAB(1, &W, 1, &Wt, "transpose"); - mxDestroyArray(Wt); - double *pW = mxGetPr(W); - mwIndex *Wi = mxGetIr(W); - mwIndex *Wp = mxGetJc(W); - int *Wii = (int *) mxMalloc(nnz * sizeof(int)); - test_mxMalloc(Wii, __LINE__, __FILE__, __func__, nnz * sizeof(int)); - int *Wip = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Wip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - for (int i = 0; i < nnz; i++) - Wii[i] = Wi[i]; - for (int i = 0; i < n + 1; i++) - Wip[i] = Wp[i]; - - //mxFree(A_tild_host); - - cudaChk(cudaFree(A_tild), "cudaFree(A_tild) has failed\n"); - - cudaChk(cudaMalloc((void **) &A_tild, nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, 
cudaMemcpy A_tild_i = A_tild_i_host has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n"); - /*mxFree(pW); - mxFree(Wi); - mxFree(Wj);*/ - mxDestroyArray(W); - mxFree(Wii); - mxFree(Wip); - } - else if (preconditioner == 3) - { - mwIndex *Aowi, *Aowp; - double *A_host = (double *) mxMalloc(nnz*sizeof(double)); - test_mxMalloc(A_host, __LINE__, __FILE__, __func__, nnz*sizeof(double)); - Aowi = (mwIndex *) mxMalloc(nnz * sizeof(mwIndex)); - test_mxMalloc(Aowi, __LINE__, __FILE__, __func__, nnz * sizeof(mwIndex)); - Aowp = (mwIndex *) mxMalloc((n + 1) * sizeof(mwIndex)); - test_mxMalloc(Aowp, __LINE__, __FILE__, __func__, (n + 1) * sizeof(mwIndex)); - int *Aoii = (int *) mxMalloc(nnz * sizeof(int)); - test_mxMalloc(Aoii, __LINE__, __FILE__, __func__, nnz * sizeof(int)); - int *Aoip = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Aoip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - cudaChk(cudaMemcpy(A_host, Ax, nnz*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); - cudaChk(cudaMemcpy(Aoii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n"); - cudaChk(cudaMemcpy(Aoip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n"); - for (int i = 0; i < nnz; i++) - Aowi[i] = Aoii[i]; - for (int i = 0; i < n + 1; i++) - Aowp[i] = Aoip[i]; - mxFree(Aoii); - mxFree(Aoip); - mxArray *Ao_m = mxCreateSparse(n, n, nnz, mxREAL); - mxSetIr(Ao_m, Aowi); - mxSetJc(Ao_m, Aowp); - mxSetPr(Ao_m, A_host); - /*mexPrintf("A_m\n"); - mxArray *Aoo; - mexCallMATLAB(1, &Aoo, 1, &Ao_m, "transpose"); - mexCallMATLAB(0, NULL, 1, &Aoo, "disp_dense"); - mxDestroyArray(Ao_m); - mxDestroyArray(Aoo);*/ - - //Because the Jacobian matrix A is store in CSC format in matlab - // we have to transpose it to get a CSR format used by CUDA - mwIndex *Awi, *Awp; - double *A_tild_host = (double *) mxMalloc(nnz_tild*sizeof(double)); - test_mxMalloc(A_tild_host, __LINE__, __FILE__, __func__, nnz_tild*sizeof(double)); - Awi = (mwIndex *) mxMalloc(nnz_tild * sizeof(mwIndex)); - test_mxMalloc(Awi, __LINE__, __FILE__, __func__, nnz_tild * sizeof(mwIndex)); - Awp = (mwIndex *) mxMalloc((Size + 1) * sizeof(mwIndex)); - test_mxMalloc(Awp, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(mwIndex)); - int *Aii = (int *) mxMalloc(nnz_tild * sizeof(int)); - test_mxMalloc(Aii, __LINE__, __FILE__, __func__, nnz_tild * sizeof(int)); - int *Aip = (int *) mxMalloc((Size + 1) * sizeof(int)); - test_mxMalloc(Aip, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(int)); - cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz_tild*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); - cudaChk(cudaMemcpy(Aii, Ai_tild, nnz_tild*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n"); - cudaChk(cudaMemcpy(Aip, Ap_tild, (Size+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n"); - for (int i = 0; i < nnz_tild; i++) - Awi[i] = Aii[i]; - for (int i = 0; i < Size + 1; i++) - Awp[i] = Aip[i]; - /*for (int i = 0; i < nnz_tild; i++) - 
mexPrintf("%20.17f\n",A_tild_host[i]);*/ - mxFree(Aii); - mxFree(Aip); - mxArray *At_m = mxCreateSparse(Size, Size, nnz_tild, mxREAL); - mxSetIr(At_m, Awi); - mxSetJc(At_m, Awp); - mxSetPr(At_m, A_tild_host); - mxArray *A_m; - mexCallMATLAB(1, &A_m, 1, &At_m, "transpose"); - /*mexPrintf("A_tild_m\n"); - mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/ - mxDestroyArray(At_m); - mxArray *P, *Q, *L, *U; - mxArray *lhs0[4]; - mexCallMATLAB(4, lhs0, 1, &A_m, "lu"); - - mxArray *P0, *Q0, *L0, *U0; - L0 = lhs0[0]; - U0 = lhs0[1]; - P0 = lhs0[2]; - Q0 = lhs0[3]; - mexCallMATLAB(1, &P, 1, &P0, "transpose"); - mexCallMATLAB(1, &Q, 1, &Q0, "transpose"); - mexCallMATLAB(1, &L, 1, &L0, "transpose"); - mexCallMATLAB(1, &U, 1, &U0, "transpose"); - mxDestroyArray(P0); - mxDestroyArray(Q0); - mxDestroyArray(L0); - mxDestroyArray(U0); - /*L = lhs0[0]; - U = lhs0[1]; - P = lhs0[2]; - Q = lhs0[3];*/ - - /*mexPrintf("L\n"); - mexCallMATLAB(0, NULL, 1, &L, "disp_dense"); - - mexPrintf("U\n"); - mexCallMATLAB(0, NULL, 1, &U, "disp_dense"); - - mexPrintf("P\n"); - mexCallMATLAB(0, NULL, 1, &P, "disp_dense"); - - mexPrintf("Q\n"); - mexCallMATLAB(0, NULL, 1, &Q, "disp_dense");*/ - - mwIndex *Qiw_host = mxGetIr(Q); - mwIndex *Qjw_host = mxGetJc(Q); - double *Qx_host = mxGetPr(Q); - Q_nnz = Qjw_host[Size]; - mexPrintf("Q_nnz=%d\n", Q_nnz); - int *Qi_host = (int *) mxMalloc(Q_nnz * periods * sizeof(int)); - test_mxMalloc(Qi_host, __LINE__, __FILE__, __func__, Q_nnz * periods * sizeof(int)); - double *Q_x_host = (double *) mxMalloc(Q_nnz * periods * sizeof(double)); - test_mxMalloc(Q_x_host, __LINE__, __FILE__, __func__, Q_nnz * periods * sizeof(double)); - int *Qj_host = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Qj_host, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - for (int t = 0; t < periods; t++) - { - for (int i = 0; i < Q_nnz; i++) - { - Qi_host[i + t * Q_nnz] = Qiw_host[i] + t * Size; - Q_x_host[i + t * Q_nnz] = Qx_host[i]; - } - for (int i = 0; i < Size; i++) - { - Qj_host[i + t * Size] = Qjw_host[i] + t * Q_nnz; - } - } - Qj_host[periods * Size] = periods * Q_nnz; - - /*mwIndex *Qtiw_host = (mwIndex*) mxMalloc(Q_nnz * periods * sizeof(mwIndex)); - double *Qt_x_host = (double*)mxMalloc(Q_nnz * periods * sizeof(double)); - mwIndex *Qtjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); - mexPrintf("n = %d\n",n); - for (int i = 0; i < n + 1; i++) - Qtjw_host[i] = Qj_host[i]; - for (int i = 0; i < Q_nnz * periods; i++) - { - Qtiw_host[i] = Qi_host[i]; - Qt_x_host[i] = Q_x_host[i]; - } - mxArray* Qt_m = mxCreateSparse(n,n,Q_nnz * periods,mxREAL); - mxSetIr(Qt_m, Qtiw_host); - mxSetJc(Qt_m, Qtjw_host); - mxSetPr(Qt_m, Qt_x_host); - mexPrintf("Qt_m\n"); - mexCallMATLAB(0, NULL, 1, &Qt_m, "disp_dense");*/ - - /*mexPrintf("Qtjw_host[periods * Size=%d]=%d\n", periods * Size, Qtjw_host[periods * Size]); - for (int i = 0; i < n; i++) - for (int j = Qtjw_host[i]; j < Qtjw_host[i+1]; j++) - mexPrintf("(i=%d, j=%d) = %f\n", i, Qtiw_host[j], Qt_x_host[j]);*/ - //mxDestroyArray(Qt_m); - - cudaChk(cudaMalloc((void **) &Qx, Q_nnz * periods * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Qx on the graphic card\n"); - cudaChk(cudaMemcpy(Qx, Q_x_host, Q_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qx = Qx_host has failed\n"); - cudaChk(cudaMalloc((void **) &Qi, Q_nnz * periods * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Qi on the graphic card\n"); - cudaChk(cudaMemcpy(Qi, Qi_host, Q_nnz * periods * sizeof(int), 
cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qi = Qi_host has failed\n"); - cudaChk(cudaMalloc((void **) &Qj, (Size * periods + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Qj on the graphic card\n"); - cudaChk(cudaMemcpy(Qj, Qj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qj = Qj_host has failed\n"); - mxFree(Qi_host); - mxFree(Qj_host); - mxFree(Q_x_host); - mxDestroyArray(Q); - - mwIndex *Piw_host = mxGetIr(P); - mwIndex *Pjw_host = mxGetJc(P); - double *Px_host = mxGetPr(P); - P_nnz = Pjw_host[Size]; - int *Pi_host = (int *) mxMalloc(P_nnz * periods * sizeof(int)); - test_mxMalloc(Pi_host, __LINE__, __FILE__, __func__, P_nnz * periods * sizeof(int)); - double *P_x_host = (double *) mxMalloc(P_nnz * periods * sizeof(double)); - test_mxMalloc(P_x_host, __LINE__, __FILE__, __func__, P_nnz * periods * sizeof(double)); - int *Pj_host = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Pj_host, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - for (int t = 0; t < periods; t++) - { - for (int i = 0; i < P_nnz; i++) - { - Pi_host[i + t * P_nnz] = Piw_host[i] + t * Size; - P_x_host[i + t * P_nnz] = Px_host[i]; - } - for (int i = 0; i < Size; i++) - Pj_host[i + t * Size] = Pjw_host[i] + t * P_nnz; - } - Pj_host[periods * Size] = periods * P_nnz; - - /*mwIndex *Ptiw_host = (mwIndex*) mxMalloc(P_nnz * periods * sizeof(mwIndex)); - double *Pt_x_host = (double*)mxMalloc(P_nnz * periods * sizeof(double)); - mwIndex *Ptjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); - for (int i = 0; i < n + 1; i++) - Ptjw_host[i] = Pj_host[i]; - for (int i = 0; i < P_nnz * periods; i++) - { - Ptiw_host[i] = Pi_host[i]; - Pt_x_host[i] = P_x_host[i]; - } - mxArray* Pt_m = mxCreateSparse(n,n,P_nnz * periods,mxREAL); - mxSetIr(Pt_m, Ptiw_host); - mxSetJc(Pt_m, Ptjw_host); - mxSetPr(Pt_m, Pt_x_host); - mexPrintf("Pt_m\n"); - mexCallMATLAB(0, NULL, 1, &Pt_m, "disp_dense"); - mxDestroyArray(Pt_m);*/ - - cudaChk(cudaMalloc((void **) &Px, P_nnz * periods * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); - cudaChk(cudaMemcpy(Px, P_x_host, P_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n"); - cudaChk(cudaMalloc((void **) &Pi, P_nnz * periods * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); - cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * periods * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n"); - cudaChk(cudaMalloc((void **) &Pj, (Size * periods + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); - cudaChk(cudaMemcpy(Pj, Pj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n"); - mxFree(Pi_host); - mxFree(Pj_host); - mxFree(P_x_host); - mxDestroyArray(P); - - /*mwIndex* Piw_host = mxGetIr(P); - mwIndex* Pjw_host = mxGetJc(P); - double* Px_host = mxGetPr(P); - P_nnz = Pjw_host[Size]; - int *Pi_host = (int*)mxMalloc(P_nnz * sizeof(int)); - int *Pj_host = (int*)mxMalloc((Size + 1) * sizeof(int)); - for (int i = 0; i < P_nnz; i++) - Pi_host[i] = Piw_host[i]; - for (int i = 0; i < Size + 1; i++) - Pj_host[i] = Pjw_host[i]; - - cudaChk(cudaMalloc((void**)&Px, P_nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); - cudaChk(cudaMemcpy(Px, Px_host, P_nnz * sizeof(double), 
cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n"); - cudaChk(cudaMalloc((void**)&Pi, P_nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); - cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n"); - cudaChk(cudaMalloc((void**)&Pj, (Size + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); - cudaChk(cudaMemcpy(Pj, Pj_host, (Size + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n"); - mxFree(Pi_host); - mxFree(Pj_host); - mxDestroyArray(P);*/ - - /*mexPrintf("L\n"); - mexCallMATLAB(0, NULL, 1, &L, "disp_dense"); - - mexPrintf("U\n"); - mexCallMATLAB(0, NULL, 1, &U, "disp_dense");*/ - - mwIndex *Liw_host = mxGetIr(L); - mwIndex *Ljw_host = mxGetJc(L); - double *Lx_host = mxGetPr(L); - int L_nnz = Ljw_host[Size]; - - mwIndex *Uiw_host = mxGetIr(U); - mwIndex *Ujw_host = mxGetJc(U); - double *Ux_host = mxGetPr(U); - int U_nnz = Ujw_host[Size]; - - double *pW = (double *) mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(double)); - test_mxMalloc(pW, __LINE__, __FILE__, __func__, (L_nnz + U_nnz - Size) * periods * sizeof(double)); - int *Wi = (int *) mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(int)); - test_mxMalloc(Wi, __LINE__, __FILE__, __func__, (L_nnz + U_nnz - Size) * periods * sizeof(int)); - int *Wj = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Wj, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - Wj[0] = 0; - W_nnz = 0; - for (int t = 0; t < periods; t++) - for (int i = 0; i < Size; i++) - { - for (mwIndex l = Ujw_host[i]; l < Ujw_host[i+1]; l++) - { - Wi[W_nnz] = Uiw_host[l] + t * Size; - pW[W_nnz] = Ux_host[l]; - //mexPrintf("Wj[%d] = %d, Wi[%d] = Uiw_host[%d] + t * Size = %d, pW[%d]=%f\n", i + t * Size, Wj[i + t * Size], W_nnz, l, Uiw_host[l] + t * Size, W_nnz, Ux_host[l]); - W_nnz++; - } - for (mwIndex l = Ljw_host[i]; l < Ljw_host[i+1]; l++) - { - if (Liw_host[l] > i) - { - Wi[W_nnz] = Liw_host[l] + t * Size; - pW[W_nnz] = Lx_host[l]; - //mexPrintf("Wj[%d] = %d, Wi[%d] = Liw_host[%d] + t * Size = %d, pW[%d]=%f\n", i + t * Size, Wj[i + t * Size], W_nnz, l, Liw_host[l] + t * Size, W_nnz, Lx_host[l]); - W_nnz++; - } - } - Wj[i + 1 + t * Size] = W_nnz; - } - //mexPrintf("Wj[%d] = %d, n=%d\n", periods * Size, Wj[periods * Size], n); - cudaChk(cudaMalloc((void **) &A_tild, W_nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild, pW, W_nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_i, W_nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_i, Wi, W_nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Wi has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_p, Wj, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Wj has failed\n"); - - /*mwIndex *Wwi = (mwIndex*)mxMalloc(W_nnz * sizeof(mwIndex)); - mwIndex *Wwj = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); - for (int i = 0; i < W_nnz; i++) - Wwi[i] = Wi[i]; - for (int i = 0; i < n + 1; i++) - Wwj[i] = Wj[i]; - mxFree(Wi); 
- mxFree(Wj); - mxArray* Ao_tild = mxCreateSparse(n,n,W_nnz,mxREAL); - mxSetIr(Ao_tild, Wwi); - mxSetJc(Ao_tild, Wwj); - mxSetPr(Ao_tild, pW); - mexPrintf("Ao_tild\n"); - mexCallMATLAB(0, NULL, 1, &Ao_tild, "disp_dense"); - mxDestroyArray(Ao_tild);*/ - - /*ostringstream tmp; - tmp << "debugging"; - mexWarnMsgTxt(tmp.str().c_str()); - return 4;*/ - - /* Apply the permutation matrices (P and Q) to the b vector of system to solve : - b_tild = P-1 . b = P' . b */ - /*cudaChk(cudaMalloc((void**)&b_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, - n, n, nnz, &one, CUDA_descr, - Px, Pj, Pi, - b, &zeros, - b_tild), - " in Solve_Cuda_BiCGStab, b_tild = cusparseDcsrmv(P', b) has failed\n"); - - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, - n, n, nnz, &one, CUDA_descr, - Px, Pj, Pi, - b, &zeros, - b), - " in Solve_Cuda_BiCGStab, b = cusparseDcsrmv(P', b) has failed\n"); - */ - /*mexPrintf("Wt = lu(A_m)\n"); - mexCallMATLAB(0, NULL, 1, &Wt, "disp_dense");*/ - /*ostringstream tmp; - tmp << "debugging"; - mexWarnMsgTxt(tmp.str().c_str()); - return 4;*/ - // To store the resultng matrix in a CSR format we have to transpose it - /*mwIndex* Wtj = mxGetJc(Wt); - nnz = Wtj[n]; - mxArray* W; - mexCallMATLAB(1, &W, 1, &Wt, "transpose"); - mxDestroyArray(Wt); - pW = mxGetPr(W); - Wwi = mxGetIr(W); - mwIndex* Wp = mxGetJc(W); - int *Wii = (int*)mxMalloc(nnz * sizeof(int)); - int *Wip = (int*)mxMalloc((n + 1) * sizeof(int)); - for (int i = 0; i < nnz; i++) - Wii[i] = Wi[i]; - for (int i = 0; i < n + 1; i++) - Wip[i] = Wp[i]; - - //mxFree(A_tild_host); - - cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, cudaFree(Ai_tild) has failed\n"); - cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, cudaFree(Ap_tild) has failed\n"); - cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, cudaFree(A_tild) has failed\n"); - - cudaChk(cudaMalloc((void**)&A_tild, nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); - cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = A_tild_i_host has failed\n"); - cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n"); - mxDestroyArray(W); - mxFree(Wii); - mxFree(Wip);*/ - } - if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) - { - cusparseChk(cusparseCreateMatDescr(&descrL), - " in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrL\n"); - cusparseChk(cusparseSetMatIndexBase(descrL, CUSPARSE_INDEX_BASE_ZERO), - " in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrL\n"); - cusparseChk(cusparseSetMatType(descrL, CUSPARSE_MATRIX_TYPE_GENERAL), - " in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrL\n"); - cusparseChk(cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER), - " in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrL\n"); - 
cusparseChk(cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT), - " in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrL\n"); - - cusparseChk(cusparseCreateMatDescr(&descrU), - " in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrU\n"); - cusparseChk(cusparseSetMatIndexBase(descrU, CUSPARSE_INDEX_BASE_ZERO), - " in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrU\n"); - cusparseChk(cusparseSetMatType(descrU, CUSPARSE_MATRIX_TYPE_GENERAL), - " in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrU\n"); - cusparseChk(cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER), - " in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrU\n"); - cusparseChk(cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT), - " in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrU\n"); - - int host_nnz_tild; - if (preconditioner == 3) - host_nnz_tild = W_nnz; - else - host_nnz_tild = nnz; - - if (preconditioner == 1) - cusparseChk(cusparseDestroySolveAnalysisInfo(info), - " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for info\n"); - - cusparseChk(cusparseCreateSolveAnalysisInfo(&infoL), - " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoL\n"); - cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, host_nnz_tild, descrL, - A_tild, A_tild_p, A_tild_i, - infoL), - " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoL has failed\n"); - - cusparseChk(cusparseCreateSolveAnalysisInfo(&infoU), - " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoU\n"); - cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, host_nnz_tild, descrU, - A_tild, A_tild_p, A_tild_i, - infoU), - " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoU has failed\n"); - } - - cudaChk(cudaMalloc((void **) &v, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate v on the graphic card\n"); - cudaChk(cudaMalloc((void **) &p, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate p on the graphic card\n"); - //cudaChk(cudaMemset(p, 0, n * sizeof(double)), " in Solve_Cuda_BiCGStab, cudaMemset p = 0 has failed\n"); - cudaChk(cudaMalloc((void **) &s, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate s on the graphic card\n"); - cudaChk(cudaMalloc((void **) &t, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate t on the graphic card\n"); - cudaChk(cudaMalloc((void **) &y_, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate y_ on the graphic card\n"); - cudaChk(cudaMalloc((void **) &z, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate z on the graphic card\n"); - - double rho = 1.0, alpha = 1.0, omega = 1.0; - - //residual = P*B*Q - L*U; - //norm(Z,1) should be close to 0 - - while (iteration < 50 /*max_iterations*/ && !convergence) - { - double rho_prev = rho; - /**store in s previous value of r*/ - cudaChk(cudaMemcpy(s, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy s = r has failed\n"); - - /**rho = r0 . 
r*/ - cublasChk(cublasDdot(cublas_handle, n, // numerator - r0, 1, - r, 1, - &rho), - " in Solve_Cuda_BiCGStab, rho = cublasDdot(r0, r) has failed\n"); - - mexPrintf("rho=%f\n", rho); - - double beta; - - if (iteration == 0) - { - cudaChk(cudaMemcpy(p, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy p = r has failed\n"); - } - else - { - /**beta = (rho / rho_prev) . (alpha / omega);*/ - beta = rho / rho_prev * alpha / omega; - - /**p = r + beta * (p - omega * v)*/ - // tmp_ = p - omega * v - VecAdd<<< nblocks, n_threads>>> (tmp_, p, -omega, v, n); - //p = r + beta * tmp_ - VecAdd<<< nblocks, n_threads>>> (p, r, beta, tmp_, n); - } - - /**y_ solution of A_tild * y_ = p <=> L . U . y_ = p*/ - // L tmp_ = p => tmp_ = L^-1 p, with tmp_ = U . y_ - - if (preconditioner == 3) - { - double *p_tild; - - cudaChk(cudaMemcpy(tmp_vect_host, p, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p has failed\n"); - /*mexPrintf("p\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - cudaChk(cudaMalloc((void **) &p_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, P_nnz * periods, &one, CUDA_descr, - Px, Pj, Pi, - p, &zeros, - p_tild), - " in Solve_Cuda_BiCGStab, p_tild = cusparseDcsrmv(P', p) has failed\n"); - - /*mexPrintf("P\n"); - printM(n, Px, Pj, Pi, CUDA_descr, cusparse_handle);*/ - - cudaChk(cudaMemcpy(tmp_vect_host, p_tild, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n"); - /*mexPrintf("p_tild\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrL, - A_tild, A_tild_p, A_tild_i, - infoL, p_tild, - tmp_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p_tild has failed\n"); - cudaChk(cudaFree(p_tild), " in Solve_Cuda_BiCGStab, can't free p_tild\n"); - - cudaChk(cudaMemcpy(tmp_vect_host, tmp_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); - /*mexPrintf("tmp_\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - } - else - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrL, - A_tild, A_tild_p, A_tild_i, - infoL, p, - tmp_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p has failed\n"); - - // U . y_ = L^-1 p <=> U . y_ = tmp_ => y_ = U^-1 L^-1 p - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrU, - A_tild, A_tild_p, A_tild_i, - infoU, tmp_, - y_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . 
y_ = tmp_ has failed\n"); - - /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); - mexPrintf("y_\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - if (preconditioner == 3) - { - double *y_tild; - cudaChk(cudaMalloc((void **) &y_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); - cudaChk(cudaMemcpy(y_tild, y_, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy y_tild = y_ has failed\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, Q_nnz * periods, &one, CUDA_descr, - Qx, Qj, Qi, - y_tild, &zeros, - y_), - " in Solve_Cuda_BiCGStab, y_ = cusparseDcsrmv(Q', y_tild) has failed\n"); - cudaChk(cudaFree(y_tild), " in Solve_Cuda_BiCGStab, can't free y_tild\n"); - } - /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); - mexPrintf("y_\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - /**v = A*y_*/ - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, nnz, &one, CUDA_descr, - Ax, Ap, Ai, - y_, &zeros, - v), - " in Solve_Cuda_BiCGStab, v = cusparseDcsrmv(A, y_) has failed\n"); - cudaChk(cudaMemcpy(tmp_vect_host, v, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); - /*mexPrintf("v\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - /**alpha = rho / (rr0 . v) with rr0 = r0*/ - cublasChk(cublasDdot(cublas_handle, n, // numerator - r0, 1, - v, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDdot(r0, v) has failed\n"); - - alpha = rho / tmp1; - mexPrintf("rho = %f, tmp1 = %f\n", rho, tmp1); - mexPrintf("alpha = %f\n", alpha); - - if (alpha == 0 || isinf(alpha) || isnan(alpha)) - { - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); - ostringstream tmp; - tmp << "one of the scalar quantities (alpha=" << alpha << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 4; - } - - /** Check for potential stagnation*/ - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - y_, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n"); - cublasChk(cublasDnrm2(cublas_handle, n, // denominator - x0, 1, - &tmp2), - " in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n"); - mexPrintf("abs(alpha)*tmp1 = %f, alpha = %f, tmp1 = %f, tmp2 = %f, eps = %f\n", abs(alpha)*tmp1, alpha, tmp1, tmp2, eps); - if (abs(alpha)*tmp1 < eps * tmp2) - stagnation++; - else - stagnation = 0; - - /**x = x + alpha * y_*/ - VecInc<<< nblocks, n_threads>>> (x0, alpha, y_, n); - - /**s = r_prev - alpha *v with r_prev = s*/ - VecInc<<< nblocks, n_threads>>> (s, -alpha, v, n); - - /**Has BiCGStab converged?*/ - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - s, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n"); - conv_criteria = tmp1; - mexPrintf("conv_criteria = %f, tolb = %f\n", conv_criteria, tolb); - convergence = conv_criteria < tolb; - - if (convergence || stagnation >= max_stagnation || refinement_needed) - { - /**s = b - A * x0*/ - cudaChk(cudaMemcpy(s, b, n * sizeof(double), 
cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy s = b has failed\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, nnz, &m_one, CUDA_descr, - Ax, Ap, Ai, - x0, &one, - s), - " in Solve_Cuda_BiCGStab, s = b - cusparseDcsrmv(A, x0) has failed\n"); - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - s, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n"); - conv_criteria = tmp1; - convergence = conv_criteria < tolb; - if (convergence) - { - break; - } - else - { - if (stagnation >= max_stagnation && refinement_needed == 0) - stagnation = 0; - refinement_needed++; - if (refinement_needed > max_refinement) - { - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); - ostringstream tmp; - tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 3; - } - } - } - - /**z solution of A_tild * z = s*/ - // L tmp_ = s => tmp_ = L^-1 s, with tmp_ = U . z - if (preconditioner == 3) - { - double *s_tild; - cudaChk(cudaMalloc((void **) &s_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, P_nnz * periods, &one, CUDA_descr, - Px, Pj, Pi, - s, &zeros, - s_tild), - " in Solve_Cuda_BiCGStab, s_tild = cusparseDcsrmv(P', s) has failed\n"); - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrL, - A_tild, A_tild_p, A_tild_i, - infoL, s_tild, - tmp_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s_tild has failed\n"); - cudaChk(cudaFree(s_tild), " in Solve_Cuda_BiCGStab, can't free s_tild\n"); - } - else - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrL, - //Lx, Lp, Li, - A_tild, A_tild_p, A_tild_i, - infoL, s, - tmp_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s has failed\n"); - // U . z = L^-1 s <=> U . z = tmp_ => z = U^-1 L^-1 s - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrU, - //Ux, Up, Ui, - A_tild, A_tild_p, A_tild_i, - infoU, tmp_, - z), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . 
z = tmp_ has failed\n"); - if (preconditioner == 3) - { - double *z_tild; - cudaChk(cudaMalloc((void **) &z_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate z_tild on the graphic card\n"); - cudaChk(cudaMemcpy(z_tild, z, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy z_tild = z has failed\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, Q_nnz * periods, &one, CUDA_descr, - Qx, Qj, Qi, - z_tild, &zeros, - z), - " in Solve_Cuda_BiCGStab, z = cusparseDcsrmv(Q, z_tild) has failed\n"); - cudaChk(cudaFree(z_tild), " in Solve_Cuda_BiCGStab, can't free x_tild\n"); - } - /**t = A * z*/ - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, nnz, &one, CUDA_descr, - Ax, Ap, Ai, - z, &zeros, - t), - " in Solve_Cuda_BiCGStab, t = cusparseDcsrmv(A, z) has failed\n"); - - /** omega = (t' s) / (t' t)*/ - cublasChk(cublasDdot(cublas_handle, n, // numerator - t, 1, - s, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDdot(t, s) has failed\n"); - - cublasChk(cublasDdot(cublas_handle, n, // numerator - t, 1, - t, 1, - &tmp2), - " in Solve_Cuda_BiCGStab, cublasDdot(t, t) has failed\n"); - - omega = tmp1 / tmp2; - - if (omega == 0 || isinf(omega) || isnan(omega)) - { - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); - ostringstream tmp; - mexEvalString("diary off;"); - tmp << "one of the scalar quantities (omega=" << omega << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 4; - } - - /**x = x + omega * z*/ - VecInc<<< nblocks, n_threads>>> (x0, omega, z, n); - - /**r = s - omega * t*/ - VecAdd<<< nblocks, n_threads>>> (r, s, -omega, t, n); - - /**Has BiCGStab converged?*/ - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - r, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); - conv_criteria = tmp1; - - convergence = conv_criteria < tolb; - - if (convergence || stagnation >= max_stagnation || refinement_needed) - { - /**r = b - A * x0*/ - cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, nnz, &m_one, CUDA_descr, - Ax, Ap, Ai, - x0, &one, - r), - " in Solve_Cuda_BiCGStab, r = b - cusparseDcsrmv(A, x0) has failed\n"); - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - r, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); - conv_criteria = tmp1; - convergence = conv_criteria < tolb; - if (convergence) - { - mexPrintf("convergence achieved\n"); - break; - } - else - { - if (stagnation >= max_stagnation && refinement_needed == 0) - stagnation = 0; - refinement_needed++; - if (refinement_needed > max_refinement) - { - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, /*Lx, Li, Lp, Ux, Ui, Up, device_n, */ infoL, infoU, descrL, descrU, preconditioner); - ostringstream tmp; - mexEvalString("diary off;"); - tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 3; - } - } - } - - iteration++; - } - cudaChk(cudaMemcpy(tmp_vect_host, x0, n * sizeof(double), cudaMemcpyDeviceToHost), " in 
Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = x0 has failed\n"); - - if (is_two_boundaries) - for (int i = 0; i < n; i++) - { - int eq = index_vara[i+Size*y_kmin]; - double yy = -(tmp_vect_host[i] + y[eq]); - direction[eq] = yy; - y[eq] += slowc * yy; - } - else - for (int i = 0; i < n; i++) - { - int eq = index_vara[i]; - double yy = -(tmp_vect_host[i] + y[eq+it_*y_size]); - direction[eq] = yy; - y[eq+it_*y_size] += slowc * yy; - } - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); - - if (iteration >= max_iterations) - { - ostringstream tmp; - mexEvalString("diary off;"); - tmp << "Error in bytecode: No convergence inside BiCGStab, in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 1; - } - else - return 0; -} -#endif - void dynSparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m) { @@ -6339,13 +4762,6 @@ dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin { double top = 0.5; double bottom = 0.1; -#ifdef CUDA - int nnz, nnz_tild; - int *Ap_i, *Ai_i; - int *Ap_i_tild, *Ai_i_tild; - double *x0, *A_tild; - -#endif int preconditioner = 2; if (start_compare == 0) start_compare = y_kmin; @@ -6551,10 +4967,6 @@ dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin } if (stack_solve_algo == 0 || stack_solve_algo == 4) Init_UMFPACK_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap, &Ai, &Ax, &b, x0_m, vector_table_conditional_local, blck); -#ifdef CUDA - else if (stack_solve_algo == 7) - Init_CUDA_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap_i, &Ai_i, &Ax, &Ap_i_tild, &Ai_i_tild, &A_tild, &b, &x0, x0_m, &nnz, &nnz_tild, preconditioner); -#endif else Init_Matlab_Sparse(periods, y_kmin, y_kmax, Size, IM_i, A_m, b_m, x0_m); @@ -6569,10 +4981,6 @@ dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, true, 0, x0_m, 1); else if (stack_solve_algo == 5) Solve_ByteCode_Symbolic_Sparse_GaussianElimination(Size, symbolic, blck); -#ifdef CUDA - else if (stack_solve_algo == 7) - Solve_CUDA_BiCGStab(Ap_i, Ai_i, Ax, Ap_i_tild, Ai_i_tild, A_tild, b, x0, Size * periods, Size, slowc, true, 0, nnz, nnz_tild, preconditioner, Size * periods, blck); -#endif } if (print_it) { diff --git a/mex/sources/bytecode/SparseMatrix.hh b/mex/sources/bytecode/SparseMatrix.hh index 6780e5120..e662c0a21 100644 --- a/mex/sources/bytecode/SparseMatrix.hh +++ b/mex/sources/bytecode/SparseMatrix.hh @@ -28,51 +28,11 @@ #include "dynblas.h" #include "dynumfpack.h" -#ifdef CUDA -# include "cuda.h" -# include "cuda_runtime_api.h" -# include "cublas_v2.h" -# include "cusparse_v2.h" -#endif - #include "Mem_Mngr.hh" #include "ErrorHandling.hh" //#include "Interpreter.hh" #include "Evaluate.hh" -#define cudaChk(x, y) \ - { \ - cudaError_t cuda_error = x; \ - if (cuda_error != cudaSuccess) \ - { \ - ostringstream tmp; \ - tmp << y; \ - throw FatalExceptionHandling(tmp.str()); \ - } \ - }; - -#define cusparseChk(x, y) \ - { \ - cusparseStatus_t cusparse_status = x; \ - if (cusparse_status != CUSPARSE_STATUS_SUCCESS) \ - { \ - ostringstream tmp; \ - tmp << y; \ - throw FatalExceptionHandling(tmp.str()); \ - } \ - }; - -#define cublasChk(x, y) \ - { \ - cublasStatus_t cublas_status = x; \ - if (cublas_status != CUBLAS_STATUS_SUCCESS) \ - { \ - ostringstream tmp; \ - tmp << y; \ - throw 
FatalExceptionHandling(tmp.str()); \
-      } \
-  };
-
 #define NEW_ALLOC
 #define MARKOVITZ
@@ -101,11 +61,7 @@ class dynSparseMatrix : public Evaluate
 {
 public:
   dynSparseMatrix();
-  dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg, const double slowc_arg
-#ifdef CUDA
-                  , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
-#endif
-                  );
+  dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg, const double slowc_arg);
   void Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names, vector_table_conditional_local_type vector_table_conditional_local);
   void Simulate_Newton_One_Boundary(bool forward);
   void fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1);
@@ -123,12 +79,8 @@ private:
   void Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM);
   void Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m);
   void Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m, vector_table_conditional_local_type vector_table_conditional_local, int block_num);
-#ifdef CUDA
-  void Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner);
-#endif
   void Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m);
   void Init_UMFPACK_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m);
-  void Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m);
   void Simple_Init(int Size, std::map<pair<pair<int, int>, int>, int> &IM, bool &zero_solution);
   void End_GE(int Size);
   bool mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc);
@@ -145,13 +97,6 @@
   void Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int it_);
   void End_Matlab_LU_UMFPack();
-#ifdef CUDA
-  void Solve_CUDA_BiCGStab_Free(double *tmp_vect_host, double *p, double *r, double *v, double *s, double *t, double *y_, double *z, double *tmp_,
-                                int *Ai, double *Ax, int *Ap, double *x0, double *b, double *A_tild, int *A_tild_i, int *A_tild_p,
-                                cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU,
-                                cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner);
-  int Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, int it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block);
-#endif
   void
Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m); void Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int precond); void Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old); @@ -196,12 +141,6 @@ private: mxArray *Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m); mxArray *Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m); mxArray *substract_A_B(mxArray *A_m, mxArray *B_m); -#ifdef CUDA - int CUDA_device; - cublasHandle_t cublas_handle; - cusparseHandle_t cusparse_handle; - cusparseMatDescr_t CUDA_descr; -#endif protected: stack Stack; int nb_prologue_table_u, nb_first_table_u, nb_middle_table_u, nb_last_table_u; diff --git a/mex/sources/bytecode/SparseMatrix_kernel.cu b/mex/sources/bytecode/SparseMatrix_kernel.cu deleted file mode 100644 index 9d37596fa..000000000 --- a/mex/sources/bytecode/SparseMatrix_kernel.cu +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (C) 2007-2012 Dynare Team - * - * This file is part of Dynare. - * - * Dynare is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Dynare is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Dynare. If not, see . - */ - -#ifndef SPARMATRIX_KERNEL -#define SPARMATRIX_KERNEL - -// Kernel definition of vector division -__global__ void -VecDiv(double* A, double* B, double* C, int n) -{ - int i = blockIdx.x * 1024 + threadIdx.x; - if (i < n) - C[i] = (B[i] != 0.0 ? 
A[i] / B[i] : A[i]); -} - -__global__ void -VecAdd(double* res, double* r, double alpha, double* x, int n) -{ - int i = blockIdx.x * 1024 + threadIdx.x; - if (i < n) - res[i] = r[i] + alpha * x[i]; -} - -__global__ void -VecInc(double* res, double alpha, double* x, int n) -{ - int i = blockIdx.x * 1024 + threadIdx.x; - if (i < n) - res[i] += alpha * x[i]; -} - - -__global__ void -update_x(double* x, double alpha, double* y, double omega, double *z) -{ - int i = threadIdx.x; - x[i] += alpha * y[i] + omega * z[i]; -} - -__global__ void -Get_LU_dim(int *n, int* A_tild_i, int *A_tild_p, int *nnz_l, int *nnz_u) -{ - nnz_u[0] = 0; - nnz_l[0] = 0; - for (int i = 0; i < n[0]; i++) - { - for (int j = A_tild_p[i]; j < A_tild_p[i+1]; j++) - { - if (A_tild_i[j] < i) - nnz_l[0]++; - else if (A_tild_i[j] == i) - { - nnz_u[0]++; - //nnz_l[0]++; - } - else - nnz_u[0]++; - } - } -} - -__global__ void -Get_LU1_dim(int* n, int *nnz_l, int *nnz_u) -{ - nnz_u[0] = 3+n[0]; - nnz_l[0] = 1+n[0]; -} - - -__global__ void -Get_L_and_U(int *n, double* A_tild_x, int* A_tild_i, int *A_tild_p, double* Lx, int* Li, int *Lp, double* Ux, int* Ui, int* Up) -{ - int nnz_u = 0, nnz_l = 0; - Lp[0] = 0; - Up[0] = 0; - for (int i = 0; i < n[0]; i++) - { - for (int j = A_tild_p[i]; j < A_tild_p[i+1]; j++) - { - if (A_tild_i[j] < i) - { - Lx[nnz_l] = A_tild_x[j]; - Li[nnz_l] = A_tild_i[j]; - nnz_l++; - } - else if (A_tild_i[j] == i) - { - Ux[nnz_u] = A_tild_x[j]; - Lx[nnz_l] = 1.0; - Li[nnz_l] = Ui[nnz_u] = A_tild_i[j]; - nnz_u++; - //nnz_l++; - } - else - { - Ux[nnz_u] = A_tild_x[j]; - Ui[nnz_u] = A_tild_i[j]; - nnz_u++; - } - } - Lp[i+1] = nnz_l; - Up[i+1] = nnz_u; - } -} -#endif diff --git a/mex/sources/bytecode/bytecode.cc b/mex/sources/bytecode/bytecode.cc index c3ddfdc1f..d338296cd 100644 --- a/mex/sources/bytecode/bytecode.cc +++ b/mex/sources/bytecode/bytecode.cc @@ -57,164 +57,6 @@ Get_Argument(const mxArray *prhs) //#include #include -#ifdef CUDA -int -GPU_Test_and_Info(cublasHandle_t *cublas_handle, cusparseHandle_t *cusparse_handle, cusparseMatDescr_t *descr) -{ - cudaDeviceProp deviceProp; - int device_count, device, version, version_max = 0; - cublasStatus_t cublas_status; - cudaError_t cuda_error; - *descr = 0; - - /* ask cuda how many devices it can find */ - cudaGetDeviceCount(&device_count); - if (device_count < 1) - { - /* if it couldn't find any fail out */ - ostringstream tmp; - tmp << " Unable to find a CUDA device. 
Unable to implement CUDA solvers\n"; - throw FatalExceptionHandling(tmp.str()); - } - else - { - mexPrintf("-----------------------------------------\n"); - for (int i = 0; i < device_count; i++) - { - cudaSetDevice(i); - // Statistics about the GPU device - cuda_error = cudaGetDeviceProperties(&deviceProp, i); - if (cuda_error != cudaSuccess) - { - ostringstream tmp; - tmp << " bytecode cudaGetDeviceProperties failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - mexPrintf("> GPU device %d: \"%s\" has:\n - %d Multi-Processors,\n - %d threads per multiprocessor,\n", i, deviceProp.name, deviceProp.multiProcessorCount, deviceProp.maxThreadsPerMultiProcessor); - mexEvalString("drawnow;"); - version = (deviceProp.major * 0x10 + deviceProp.minor); - if (version >= version_max) - { - device = i; - version_max = version; - } - mexPrintf(" - %4.2fMhz clock rate,\n - %2.0fMb of memory,\n - %d.%d compute capabilities.\n", double (deviceProp.clockRate) / (1024 * 1024), double (deviceProp.totalGlobalMem) / (1024 * 1024), deviceProp.major, deviceProp.minor); - mexEvalString("drawnow;"); - } - } - mexPrintf("> Device %d selected\n", device); - mexEvalString("drawnow;"); - - cuda_error = cudaSetDevice(device); - if (cuda_error != cudaSuccess) - { - ostringstream tmp; - tmp << " bytecode cudaSetDevice failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - - if (version_max < 0x11) - { - ostringstream tmp; - tmp << " bytecode requires a minimum CUDA compute 1.1 capability\n"; - cudaDeviceReset(); - throw FatalExceptionHandling(tmp.str()); - } - - // Initialize CuBlas library - cublas_status = cublasCreate(cublas_handle); - if (cublas_status != CUBLAS_STATUS_SUCCESS) - { - ostringstream tmp; - switch (cublas_status) - { - case CUBLAS_STATUS_NOT_INITIALIZED: - tmp << " the CUBLAS initialization failed.\n"; - break; - case CUBLAS_STATUS_ALLOC_FAILED: - tmp << " the resources could not be allocated.\n"; - break; - default: - tmp << " unknown error during the initialization of cusparse library.\n"; - } - throw FatalExceptionHandling(tmp.str()); - } - - // Initialize the CuSparse library - cusparseStatus_t cusparse_status; - cusparse_status = cusparseCreate(cusparse_handle); - if (cusparse_status != CUSPARSE_STATUS_SUCCESS) - { - ostringstream tmp; - switch (cusparse_status) - { - case CUSPARSE_STATUS_NOT_INITIALIZED: - tmp << " the CUDA Runtime initialization failed.\n"; - break; - case CUSPARSE_STATUS_ALLOC_FAILED: - tmp << " the resources could not be allocated.\n"; - break; - case CUSPARSE_STATUS_ARCH_MISMATCH: - tmp << " the device compute capability (CC) is less than 1.1. 
The CC of at least 1.1 is required.\n"; - break; - default: - tmp << " unknown error during the initialization of cusparse library.\n"; - } - throw FatalExceptionHandling(tmp.str()); - } - - // Create and setup matrix descriptor - cusparse_status = cusparseCreateMatDescr(descr); - if (cusparse_status != CUSPARSE_STATUS_SUCCESS) - { - ostringstream tmp; - tmp << " Matrix descriptor initialization failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - cusparseSetMatType(*descr, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(*descr, CUSPARSE_INDEX_BASE_ZERO); - - mexPrintf("> Driver version:\n"); - int cuda_version; - cuda_error = cudaDriverGetVersion(&cuda_version); - if (cuda_error != cudaSuccess) - { - ostringstream tmp; - tmp << " cudaGetVersion has failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - mexPrintf(" - CUDA version %5.3f\n", double (cuda_version) / 1000); - int cublas_version; - cublas_status = cublasGetVersion(*cublas_handle, &cublas_version); - if (cublas_status != CUBLAS_STATUS_SUCCESS) - { - ostringstream tmp; - tmp << " cublasGetVersion has failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - mexPrintf(" - CUBLAS version %5.3f\n", double (cublas_version) / 1000); - int cusparse_version; - cusparse_status = cusparseGetVersion(*cusparse_handle, &cusparse_version); - if (cusparse_status != CUSPARSE_STATUS_SUCCESS) - { - ostringstream tmp; - tmp << " cusparseGetVersion has failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - mexPrintf(" - CUSPARSE version %5.3f\n", double (cusparse_version) / 1000); - mexPrintf("-----------------------------------------\n"); - return device; -} - -void -GPU_close(cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle, cusparseMatDescr_t descr) -{ - cublasChk(cublasDestroy(cublas_handle), "in bytecode cublasDestroy failed\n"); - cusparseChk(cusparseDestroyMatDescr(descr), "in bytecode cusparseDestroyMatDescr failed\n"); - cusparseChk(cusparseDestroy(cusparse_handle), "in bytecode cusparseDestroy failed\n"); -} - -#endif string deblank(string x) { @@ -437,12 +279,6 @@ main(int nrhs, const char *prhs[]) int max_periods = 0; -#ifdef CUDA - int CUDA_device = -1; - cublasHandle_t cublas_handle; - cusparseHandle_t cusparse_handle; - cusparseMatDescr_t descr; -#endif try { Get_Arguments_and_global_variables(nrhs, prhs, count_array_argument, @@ -1005,20 +841,9 @@ main(int nrhs, const char *prhs[]) mexWarnMsgTxt("Not enough space. Filename is truncated."); string file_name = fname; -#ifdef CUDA - try - { - if (stack_solve_algo == 7 && !steady_state) - CUDA_device = GPU_Test_and_Info(&cublas_handle, &cusparse_handle, &descr); - } - catch (GeneralExceptionHandling &feh) - { - mexErrMsgTxt(feh.GetErrorMsg().c_str()); - } -#else if (stack_solve_algo == 7 && !steady_state) - mexErrMsgTxt("bytecode has not been compiled with CUDA option. 
Bytecode Can't use options_.stack_solve_algo=7\n");
-#endif
+    mexErrMsgTxt("Bytecode: Can't use option stack_solve_algo=7\n");
+
   size_t size_of_direction = col_y*row_y*sizeof(double);
   auto *y = static_cast<double *>(mxMalloc(size_of_direction));
   error_msg.test_mxMalloc(y, __LINE__, __FILE__, __func__, size_of_direction);
@@ -1045,11 +870,7 @@ main(int nrhs, const char *prhs[])
   clock_t t0 = clock();
   Interpreter interprete(params, y, ya, x, steady_yd, steady_xd, direction, y_size, nb_row_x, nb_row_xd, periods, y_kmin, y_kmax, maxit_, solve_tolf, size_of_direction, slowc, y_decal, markowitz_c, file_name, minimal_solving_periods, stack_solve_algo, solve_algo, global_temporary_terms, print, print_error, GlobalTemporaryTerms, steady_state,
-                         print_it, col_x, col_y
-#ifdef CUDA
-                         , CUDA_device, cublas_handle, cusparse_handle, descr
-#endif
-                         );
+                         print_it, col_x, col_y);
   string f(fname);
   mxFree(fname);
   int nb_blocks = 0;
@@ -1078,11 +899,6 @@ main(int nrhs, const char *prhs[])
         }
     }
 
-#ifdef CUDA
-  if (stack_solve_algo == 7 && !steady_state)
-    GPU_close(cublas_handle, cusparse_handle, descr);
-#endif
-
   clock_t t1 = clock();
   if (!steady_state && !evaluate && print)
     mexPrintf("Simulation Time=%f milliseconds\n", 1000.0*(double (t1)-double (t0))/double (CLOCKS_PER_SEC));