Bytecode: remove CUDA-specific stuff

2021-02-01 14:03:57 +01:00 · 2021-02-01 14:03:57 +01:00 · eb29dc003a
parent 5ef440ad9b
commit eb29dc003a
6 changed files with 9 additions and 1979 deletions
--- a/mex/sources/bytecode/Interpreter.cc
+++ b/mex/sources/bytecode/Interpreter.cc
@ -33,16 +33,8 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub
                         int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
                         string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
                         bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
-                         bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg
+                         bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg)
-#ifdef CUDA
+: dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg)
                         , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
 #endif
                         )
 : dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg
 #ifdef CUDA
                  , CUDA_device_arg, cublas_handle_arg, cusparse_handle_arg, descr_arg
 #endif
                  )
 {
  params = params_arg;
  y = y_arg;
--- a/mex/sources/bytecode/Interpreter.hh
+++ b/mex/sources/bytecode/Interpreter.hh
@ -1,5 +1,5 @@
 /*
- * Copyright © 2007-2017 Dynare Team
+ * Copyright © 2007-2021 Dynare Team
 *
 * This file is part of Dynare.
 *
@ -58,11 +58,7 @@ public:
              int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
              string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
              bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
-              bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg
+              bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg);
 #ifdef CUDA
              , const int CUDA_device, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
 #endif
              );
  bool extended_path(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks, int nb_periods, vector<s_plan> sextended_path, vector<s_plan> sconstrained_extended_path, vector<string> dates, table_conditional_global_type table_conditional_global);
  bool compute_blocks(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks);
  void check_for_controlled_exo_validity(FBEGINBLOCK_ *fb, vector<s_plan> sconstrained_extended_path);
--- a/mex/sources/bytecode/SparseMatrix.cc
+++ b/mex/sources/bytecode/SparseMatrix.cc
--- a/mex/sources/bytecode/SparseMatrix.hh
+++ b/mex/sources/bytecode/SparseMatrix.hh
@ -28,51 +28,11 @@
 #include "dynblas.h"
 #include "dynumfpack.h"
 #ifdef CUDA
 # include "cuda.h"
 # include "cuda_runtime_api.h"
 # include "cublas_v2.h"
 # include "cusparse_v2.h"
 #endif
 #include "Mem_Mngr.hh"
 #include "ErrorHandling.hh"
 //#include "Interpreter.hh"
 #include "Evaluate.hh"
 #define cudaChk(x, y)                                   \
  {                                                     \
    cudaError_t cuda_error = x;                         \
    if (cuda_error != cudaSuccess)                      \
      {                                                 \
        ostringstream tmp;                              \
        tmp << y;                                       \
        throw FatalExceptionHandling(tmp.str());        \
      }                                                 \
  };
 #define cusparseChk(x, y)                               \
  {                                                     \
    cusparseStatus_t cusparse_status = x;               \
    if (cusparse_status != CUSPARSE_STATUS_SUCCESS)     \
      {                                                 \
        ostringstream tmp;                              \
        tmp << y;                                       \
        throw FatalExceptionHandling(tmp.str());        \
      }                                                 \
  };
 #define cublasChk(x, y)                                 \
  {                                                     \
    cublasStatus_t cublas_status = x;                   \
    if (cublas_status != CUBLAS_STATUS_SUCCESS)         \
      {                                                 \
        ostringstream tmp;                              \
        tmp << y;                                       \
        throw FatalExceptionHandling(tmp.str());        \
      }                                                 \
  };
 #define NEW_ALLOC
 #define MARKOVITZ
@ -101,11 +61,7 @@ class dynSparseMatrix : public Evaluate
 {
 public:
  dynSparseMatrix();
-  dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg, const double slowc_arg
+  dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg, const double slowc_arg);
 #ifdef CUDA
                  , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
 #endif
                  );
  void Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names, vector_table_conditional_local_type vector_table_conditional_local);
  void Simulate_Newton_One_Boundary(bool forward);
  void fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1);
@ -123,12 +79,8 @@ private:
  void Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM);
  void Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m);
  void Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m, vector_table_conditional_local_type vector_table_conditional_local, int block_num);
 #ifdef CUDA
  void Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner);
 #endif
  void Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m);
  void Init_UMFPACK_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m);
  void Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m);
  void Simple_Init(int Size, std::map<std::pair<std::pair<int, int>, int>, int> &IM, bool &zero_solution);
  void End_GE(int Size);
  bool mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc);
@ -145,13 +97,6 @@ private:
  void Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int it_);
  void End_Matlab_LU_UMFPack();
 #ifdef CUDA
  void Solve_CUDA_BiCGStab_Free(double *tmp_vect_host, double *p, double *r, double *v, double *s, double *t, double *y_, double *z, double *tmp_,
                                int *Ai, double *Ax, int *Ap, double *x0, double *b, double *A_tild, int *A_tild_i, int *A_tild_p,
                                cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU,
                                cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner);
  int Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, int it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block);
 #endif
  void Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m);
  void Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int precond);
  void Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old);
@ -196,12 +141,6 @@ private:
  mxArray *Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m);
  mxArray *Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m);
  mxArray *substract_A_B(mxArray *A_m, mxArray *B_m);
 #ifdef CUDA
  int CUDA_device;
  cublasHandle_t cublas_handle;
  cusparseHandle_t cusparse_handle;
  cusparseMatDescr_t CUDA_descr;
 #endif
 protected:
  stack<double> Stack;
  int nb_prologue_table_u, nb_first_table_u, nb_middle_table_u, nb_last_table_u;
--- a/mex/sources/bytecode/SparseMatrix_kernel.cu
+++ b/mex/sources/bytecode/SparseMatrix_kernel.cu
@ -1,121 +0,0 @@
 /*
 * Copyright (C) 2007-2012 Dynare Team
 *
 * This file is part of Dynare.
 *
 * Dynare is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dynare is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dynare.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef SPARMATRIX_KERNEL
 #define SPARMATRIX_KERNEL
 // Kernel definition of vector division
 __global__ void
 VecDiv(double* A, double* B, double* C, int n)
 {
  int i = blockIdx.x * 1024 + threadIdx.x;
  if (i < n)
    C[i] = (B[i] != 0.0 ? A[i] / B[i] : A[i]);
 }
 __global__ void
 VecAdd(double* res, double* r, double alpha, double* x, int n)
 {
  int i = blockIdx.x * 1024 + threadIdx.x;
  if (i < n)
    res[i] = r[i] + alpha * x[i];
 }
 __global__ void
 VecInc(double* res, double alpha, double* x, int n)
 {
  int i = blockIdx.x * 1024 + threadIdx.x;
  if (i < n)
    res[i] += alpha * x[i];
 }
 __global__ void
 update_x(double* x, double alpha, double* y, double omega, double *z)
 {
  int i = threadIdx.x;
  x[i] += alpha * y[i] + omega * z[i];
 }
 __global__ void
 Get_LU_dim(int *n, int* A_tild_i, int *A_tild_p, int *nnz_l, int *nnz_u)
 {
  nnz_u[0] = 0;
  nnz_l[0] = 0;
  for (int i = 0; i < n[0]; i++)
    {
      for (int j = A_tild_p[i]; j < A_tild_p[i+1]; j++)
        {
          if (A_tild_i[j] < i)
            nnz_l[0]++;
          else if (A_tild_i[j] == i)
            {
              nnz_u[0]++;
              //nnz_l[0]++;
            }
          else
            nnz_u[0]++;
        }
    }
 }
 __global__ void
 Get_LU1_dim(int* n, int *nnz_l, int *nnz_u)
 {
  nnz_u[0] = 3+n[0];
  nnz_l[0] = 1+n[0];
 }
 __global__ void
 Get_L_and_U(int *n, double* A_tild_x, int* A_tild_i, int *A_tild_p, double* Lx, int* Li, int *Lp, double* Ux, int* Ui, int* Up)
 {
  int nnz_u = 0, nnz_l = 0;
  Lp[0] = 0;
  Up[0] = 0;
  for (int i = 0; i < n[0]; i++)
    {
      for (int j = A_tild_p[i]; j < A_tild_p[i+1]; j++)
        {
          if (A_tild_i[j] < i)
            {
              Lx[nnz_l] = A_tild_x[j];
              Li[nnz_l] = A_tild_i[j];
              nnz_l++;
            }
          else if (A_tild_i[j] == i)
            {
              Ux[nnz_u] = A_tild_x[j];
              Lx[nnz_l] = 1.0;
              Li[nnz_l] = Ui[nnz_u] = A_tild_i[j];
              nnz_u++;
              //nnz_l++;
            }
          else
            {
              Ux[nnz_u] = A_tild_x[j];
              Ui[nnz_u] = A_tild_i[j];
              nnz_u++;
            }
        }
      Lp[i+1] = nnz_l;
      Up[i+1] = nnz_u;
    }
 }
 #endif
--- a/mex/sources/bytecode/bytecode.cc
+++ b/mex/sources/bytecode/bytecode.cc
@ -57,164 +57,6 @@ Get_Argument(const mxArray *prhs)
 //#include <windows.h>
 #include <cstdio>
 #ifdef CUDA
 int
 GPU_Test_and_Info(cublasHandle_t *cublas_handle, cusparseHandle_t *cusparse_handle, cusparseMatDescr_t *descr)
 {
  cudaDeviceProp deviceProp;
  int device_count, device, version, version_max = 0;
  cublasStatus_t cublas_status;
  cudaError_t cuda_error;
  *descr = 0;
  /* ask cuda how many devices it can find */
  cudaGetDeviceCount(&device_count);
  if (device_count < 1)
    {
      /* if it couldn't find any fail out */
      ostringstream tmp;
      tmp << " Unable to find a CUDA device. Unable to implement CUDA solvers\n";
      throw FatalExceptionHandling(tmp.str());
    }
  else
    {
      mexPrintf("-----------------------------------------\n");
      for (int i = 0; i < device_count; i++)
        {
          cudaSetDevice(i);
          // Statistics about the GPU device
          cuda_error = cudaGetDeviceProperties(&deviceProp, i);
          if (cuda_error != cudaSuccess)
            {
              ostringstream tmp;
              tmp << "  bytecode cudaGetDeviceProperties failed\n";
              throw FatalExceptionHandling(tmp.str());
            }
          mexPrintf("> GPU device %d: \"%s\" has:\n   - %d Multi-Processors,\n   - %d threads per multiprocessor,\n", i, deviceProp.name, deviceProp.multiProcessorCount, deviceProp.maxThreadsPerMultiProcessor);
          mexEvalString("drawnow;");
          version = (deviceProp.major * 0x10 + deviceProp.minor);
          if (version >= version_max)
            {
              device = i;
              version_max = version;
            }
          mexPrintf("   - %4.2fMhz clock rate,\n   - %2.0fMb of memory,\n   - %d.%d compute capabilities.\n", double (deviceProp.clockRate) / (1024 * 1024), double (deviceProp.totalGlobalMem) / (1024 * 1024), deviceProp.major, deviceProp.minor);
          mexEvalString("drawnow;");
        }
    }
  mexPrintf("> Device %d selected\n", device);
  mexEvalString("drawnow;");
  cuda_error = cudaSetDevice(device);
  if (cuda_error != cudaSuccess)
    {
      ostringstream tmp;
      tmp << "  bytecode cudaSetDevice failed\n";
      throw FatalExceptionHandling(tmp.str());
    }
  if (version_max < 0x11)
    {
      ostringstream tmp;
      tmp << "  bytecode requires a minimum CUDA compute 1.1 capability\n";
      cudaDeviceReset();
      throw FatalExceptionHandling(tmp.str());
    }
  // Initialize CuBlas library
  cublas_status = cublasCreate(cublas_handle);
  if (cublas_status != CUBLAS_STATUS_SUCCESS)
    {
      ostringstream tmp;
      switch (cublas_status)
        {
        case CUBLAS_STATUS_NOT_INITIALIZED:
          tmp << " the CUBLAS initialization failed.\n";
          break;
        case CUBLAS_STATUS_ALLOC_FAILED:
          tmp << " the resources could not be allocated.\n";
          break;
        default:
          tmp << " unknown error during the initialization of cusparse library.\n";
        }
      throw FatalExceptionHandling(tmp.str());
    }
  // Initialize the CuSparse library
  cusparseStatus_t cusparse_status;
  cusparse_status = cusparseCreate(cusparse_handle);
  if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      ostringstream tmp;
      switch (cusparse_status)
        {
        case CUSPARSE_STATUS_NOT_INITIALIZED:
          tmp << " the CUDA Runtime initialization failed.\n";
          break;
        case CUSPARSE_STATUS_ALLOC_FAILED:
          tmp <<  " the resources could not be allocated.\n";
          break;
        case CUSPARSE_STATUS_ARCH_MISMATCH:
          tmp <<  " the device compute capability (CC) is less than 1.1. The CC of at least 1.1 is required.\n";
          break;
        default:
          tmp << " unknown error during the initialization of cusparse library.\n";
        }
      throw FatalExceptionHandling(tmp.str());
    }
  // Create and setup matrix descriptor
  cusparse_status = cusparseCreateMatDescr(descr);
  if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      ostringstream tmp;
      tmp << " Matrix descriptor initialization failed\n";
      throw FatalExceptionHandling(tmp.str());
    }
  cusparseSetMatType(*descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(*descr, CUSPARSE_INDEX_BASE_ZERO);
  mexPrintf("> Driver version:\n");
  int cuda_version;
  cuda_error = cudaDriverGetVersion(&cuda_version);
  if (cuda_error != cudaSuccess)
    {
      ostringstream tmp;
      tmp << " cudaGetVersion has failed\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mexPrintf("   - CUDA version %5.3f\n", double (cuda_version) / 1000);
  int cublas_version;
  cublas_status = cublasGetVersion(*cublas_handle, &cublas_version);
  if (cublas_status != CUBLAS_STATUS_SUCCESS)
    {
      ostringstream tmp;
      tmp << " cublasGetVersion has failed\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mexPrintf("   - CUBLAS version %5.3f\n", double (cublas_version) / 1000);
  int cusparse_version;
  cusparse_status = cusparseGetVersion(*cusparse_handle, &cusparse_version);
  if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
    {
      ostringstream tmp;
      tmp << " cusparseGetVersion has failed\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mexPrintf("   - CUSPARSE version %5.3f\n", double (cusparse_version) / 1000);
  mexPrintf("-----------------------------------------\n");
  return device;
 }
 void
 GPU_close(cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle, cusparseMatDescr_t descr)
 {
  cublasChk(cublasDestroy(cublas_handle), "in bytecode cublasDestroy failed\n");
  cusparseChk(cusparseDestroyMatDescr(descr), "in bytecode cusparseDestroyMatDescr failed\n");
  cusparseChk(cusparseDestroy(cusparse_handle), "in bytecode cusparseDestroy failed\n");
 }
 #endif
 string
 deblank(string x)
 {
@ -437,12 +279,6 @@ main(int nrhs, const char *prhs[])
  int max_periods = 0;
 #ifdef CUDA
  int CUDA_device = -1;
  cublasHandle_t cublas_handle;
  cusparseHandle_t cusparse_handle;
  cusparseMatDescr_t descr;
 #endif
  try
    {
      Get_Arguments_and_global_variables(nrhs, prhs, count_array_argument,
@ -1005,20 +841,9 @@ main(int nrhs, const char *prhs[])
    mexWarnMsgTxt("Not enough space. Filename is truncated.");
  string file_name = fname;
 #ifdef CUDA
  try
    {
  if (stack_solve_algo == 7 && !steady_state)
-        CUDA_device = GPU_Test_and_Info(&cublas_handle, &cusparse_handle, &descr);
+    mexErrMsgTxt("Bytecode: Can't use option stack_solve_algo=7\n");
-    }
+
  catch (GeneralExceptionHandling &feh)
    {
      mexErrMsgTxt(feh.GetErrorMsg().c_str());
    }
 #else
  if (stack_solve_algo == 7 && !steady_state)
    mexErrMsgTxt("bytecode has not been compiled with CUDA option. Bytecode Can't use options_.stack_solve_algo=7\n");
 #endif
  size_t size_of_direction = col_y*row_y*sizeof(double);
  auto *y = static_cast<double *>(mxMalloc(size_of_direction));
  error_msg.test_mxMalloc(y, __LINE__, __FILE__, __func__, size_of_direction);
@ -1045,11 +870,7 @@ main(int nrhs, const char *prhs[])
  clock_t t0 = clock();
  Interpreter interprete(params, y, ya, x, steady_yd, steady_xd, direction, y_size, nb_row_x, nb_row_xd, periods, y_kmin, y_kmax, maxit_, solve_tolf, size_of_direction, slowc, y_decal,
                         markowitz_c, file_name, minimal_solving_periods, stack_solve_algo, solve_algo, global_temporary_terms, print, print_error, GlobalTemporaryTerms, steady_state,
-                         print_it, col_x, col_y
+                         print_it, col_x, col_y);
 #ifdef CUDA
                         , CUDA_device, cublas_handle, cusparse_handle, descr
 #endif
                         );
  string f(fname);
  mxFree(fname);
  int nb_blocks = 0;
@ -1078,11 +899,6 @@ main(int nrhs, const char *prhs[])
        }
    }
 #ifdef CUDA
  if (stack_solve_algo == 7 && !steady_state)
    GPU_close(cublas_handle, cusparse_handle, descr);
 #endif
  clock_t t1 = clock();
  if (!steady_state && !evaluate && print)
    mexPrintf("Simulation Time=%f milliseconds\n", 1000.0*(double (t1)-double (t0))/double (CLOCKS_PER_SEC));