From eb29dc003a38e0741d2a6bed1ce36e98146f789b Mon Sep 17 00:00:00 2001
From: Sébastien Villemot
Date: Mon, 1 Feb 2021 14:03:57 +0100
Subject: [PATCH] Bytecode: remove CUDA-specific stuff

---
 mex/sources/bytecode/Interpreter.cc         |   12 +-
 mex/sources/bytecode/Interpreter.hh         |    8 +-
 mex/sources/bytecode/SparseMatrix.cc        | 1594 +------------------
 mex/sources/bytecode/SparseMatrix.hh        |   63 +-
 mex/sources/bytecode/SparseMatrix_kernel.cu |  121 --
 mex/sources/bytecode/bytecode.cc            |  190 +--
 6 files changed, 9 insertions(+), 1979 deletions(-)
 delete mode 100644 mex/sources/bytecode/SparseMatrix_kernel.cu

diff --git a/mex/sources/bytecode/Interpreter.cc b/mex/sources/bytecode/Interpreter.cc
index b34bcc066..7a8492aa6 100644
--- a/mex/sources/bytecode/Interpreter.cc
+++ b/mex/sources/bytecode/Interpreter.cc
@@ -33,16 +33,8 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub
                          int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
                          string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
                          bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
-                         bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg
-#ifdef CUDA
-                         , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
-#endif
-                         )
-: dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg
-#ifdef CUDA
-                  , CUDA_device_arg, cublas_handle_arg, cusparse_handle_arg, descr_arg
-#endif
-                  )
+                         bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg)
+: dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg)
 {
   params = params_arg;
   y = y_arg;
diff --git a/mex/sources/bytecode/Interpreter.hh b/mex/sources/bytecode/Interpreter.hh
index 5ab8cd784..3d990d588 100644
--- a/mex/sources/bytecode/Interpreter.hh
+++ b/mex/sources/bytecode/Interpreter.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2007-2017 Dynare Team
+ * Copyright © 2007-2021 Dynare Team
  *
  * This file is part of Dynare.
  *
@@ -58,11 +58,7 @@ public:
                      int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
                      string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
                      bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
-                     bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg
-#ifdef CUDA
-                     , const int CUDA_device, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
-#endif
-                     );
+                     bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg);
   bool extended_path(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks, int nb_periods, vector<string> sextended_path, vector<string> sconstrained_extended_path, vector<string> dates, table_conditional_global_type table_conditional_global);
   bool compute_blocks(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks);
   void check_for_controlled_exo_validity(FBEGINBLOCK_ *fb, vector<string> sconstrained_extended_path);
diff --git a/mex/sources/bytecode/SparseMatrix.cc b/mex/sources/bytecode/SparseMatrix.cc
index edd9a3b2e..7009f1d3d 100644
--- a/mex/sources/bytecode/SparseMatrix.cc
+++ b/mex/sources/bytecode/SparseMatrix.cc
@@ -27,10 +27,6 @@
 //#include
 #include "SparseMatrix.hh"
 
-#ifdef CUDA
-# include "SparseMatrix_kernel.cu"
-#endif
-
 using namespace std;
 
 dynSparseMatrix::dynSparseMatrix()
@@ -55,11 +51,7 @@ dynSparseMatrix::dynSparseMatrix()
 }
 
 dynSparseMatrix::dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg,
-                                 const int minimal_solving_periods_arg, const double slowc_arg
-#ifdef CUDA
-                                 , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
-#endif
-                                 ) :
+                                 const int minimal_solving_periods_arg, const double slowc_arg) :
   Evaluate(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg)
 {
   pivotva = nullptr;
@@ -79,12 +71,6 @@ dynSparseMatrix::dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, con
   lu_inc_tol = 1e-10;
   Symbolic = nullptr;
   Numeric = nullptr;
-#ifdef CUDA
-  CUDA_device = CUDA_device_arg;
-  cublas_handle = cublas_handle_arg;
-  cusparse_handle = cusparse_handle_arg;
-  CUDA_descr = descr_arg;
-#endif
 }
 
 int
@@ -1095,409 +1081,6 @@ dynSparseMatrix::Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Si
 #endif
 }
 
-void
-dynSparseMatrix::Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m)
-{
-  int eq, var;
-
-  *b = static_cast<double *>(mxMalloc(Size * sizeof(double)));
-  test_mxMalloc(*b, __LINE__, __FILE__, __func__, Size * sizeof(double));
-  if (!(*b))
-    {
-      ostringstream tmp;
-      tmp << " in Init_CUDA_Sparse, can't retrieve b vector\n";
-      throw FatalExceptionHandling(tmp.str());
-    }
-  double *Host_x0 = mxGetPr(x0_m);
-  if (!Host_x0)
-    {
-      ostringstream tmp;
-      tmp << " in Init_CUDA_Sparse_Simple, can't retrieve x0 vector\n";
-      throw FatalExceptionHandling(tmp.str());
-    }
-  *Ap = static_cast<SuiteSparse_long *>(mxMalloc((Size+1) * sizeof(SuiteSparse_long)));
-  test_mxMalloc(*Ap, __LINE__, __FILE__, __func__, (Size+1) * sizeof(SuiteSparse_long));
-  if (!(*Ap))
-    {
-      ostringstream tmp;
-      tmp << " in Init_CUDA_Sparse, can't allocate Ap index vector\n";
-      throw
FatalExceptionHandling(tmp.str()); - } - size_t prior_nz = IM.size(); - *Ai = static_cast(mxMalloc(prior_nz * sizeof(SuiteSparse_long))); - test_mxMalloc(*Ai, __LINE__, __FILE__, __func__, prior_nz * sizeof(SuiteSparse_long)); - if (!(*Ai)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, can't allocate Ai index vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - *Ax = static_cast(mxMalloc(prior_nz * sizeof(double))); - test_mxMalloc(*Ax, __LINE__, __FILE__, __func__, prior_nz * sizeof(double)); - if (!(*Ax)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, can't retrieve Ax matrix\n"; - throw FatalExceptionHandling(tmp.str()); - } - - map, int>, int>::iterator it4; - for (int i = 0; i < Size; i++) - { - int eq = index_vara[i]; - ya[eq+it_*y_size] = y[eq+it_*y_size]; - } - -#ifdef DEBUG - unsigned int max_nze = prior_nz; //mxGetNzmax(A_m); -#endif - unsigned int NZE = 0; - int last_var = 0; - double cum_abs_sum = 0; - for (int i = 0; i < Size; i++) - { - (*b)[i] = u[i]; - cum_abs_sum += fabs((*b)[i]); - (*x0)[i] = y[i]; - } - if (cum_abs_sum < 1e-20) - zero_solution = true; - else - zero_solution = false; - - (*Ap)[0] = 0; - last_var = -1; - it4 = IM.begin(); - while (it4 != IM.end()) - { - var = it4->first.first.first; - if (var != last_var) - { - (*Ap)[1+last_var] = NZE; - last_var = var; - } - eq = it4->first.second; - int index = it4->second; -#ifdef DEBUG - if (index < 0 || index >= u_count_alloc || index > Size + Size*Size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (NZE >= max_nze) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, exceeds the capacity of A_m sparse matrix\n"; - throw FatalExceptionHandling(tmp.str()); - } -#endif - (*Ax)[NZE] = u[index]; - (*Ai)[NZE] = eq; - NZE++; -#ifdef DEBUG - if (eq < 0 || eq >= Size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, index (" << eq << ") out of range for b vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (var < 0 || var >= Size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (index_vara[var] < 0 || index_vara[var] >= y_size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n"; - throw FatalExceptionHandling(tmp.str()); - } -#endif - it4++; - } - (*Ap)[Size] = NZE; -} - -#ifdef CUDA -void -dynSparseMatrix::Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner) -{ - //cudaError_t cuda_error; - int t, eq, var, lag, ti_y_kmin, ti_y_kmax; - int n = periods * Size; - size_t prior_nz = IM.size() * periods; - size_t preconditioner_size = 0; - map, int> jacob_struct; - - /* ask cuda how many devices it can find */ - int device_count; - cudaGetDeviceCount(&device_count); - - cudaSetDevice(CUDA_device); - - double *Host_b = (double *) mxMalloc(n * sizeof(double)); - test_mxMalloc(Host_b, __LINE__, __FILE__, __func__, n * sizeof(double)); - cudaChk(cudaMalloc((void **) b, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate b vector on the 
graphic card\n"); - - double *Host_x0 = mxGetPr(x0_m); - if (!Host_x0) - { - ostringstream tmp; - tmp << " in Init_Cuda_Sparse, can't retrieve x0 vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - cudaChk(cudaMalloc((void **) x0, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate x0 vector on the graphic card\n"); - - int *Host_Ap = (int *) mxMalloc((n+1) * sizeof(int)); - test_mxMalloc(Host_Ap, __LINE__, __FILE__, __func__, (n+1) * sizeof(int)); - - int *Host_Ai = (int *) mxMalloc(prior_nz * sizeof(int)); - test_mxMalloc(Host_Ai, __LINE__, __FILE__, __func__, prior_nz * sizeof(int)); - - double *Host_Ax = (double *) mxMalloc(prior_nz * sizeof(double)); - test_mxMalloc(Host_Ax, __LINE__, __FILE__, __func__, prior_nz * sizeof(double)); - - int *Host_Ai_tild, *Host_Ap_tild; - if (preconditioner == 3) - { - Host_Ap_tild = (int *) mxMalloc((n+1)*sizeof(int)); - test_mxMalloc(Host_Ap_tild, __LINE__, __FILE__, __func__, (n+1)*sizeof(int)); - Host_Ai_tild = (int *) mxMalloc(prior_nz*sizeof(int)); - test_mxMalloc(Host_Ai_tild, __LINE__, __FILE__, __func__, prior_nz*sizeof(int)); - Host_Ap_tild[0] = 0; - } - - if (preconditioner == 0) - preconditioner_size = n; - else if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) - preconditioner_size = prior_nz; - - double *Host_A_tild = (double *) mxMalloc(preconditioner_size * sizeof(double)); - test_mxMalloc(Host_A_tild, __LINE__, __FILE__, __func__, preconditioner_size * sizeof(double)); - - map, int>, int>::iterator it4; - for (int i = 0; i < y_size*(periods+y_kmin); i++) - ya[i] = y[i]; -# ifdef DEBUG - unsigned int max_nze = mxGetNzmax(A_m); -# endif - unsigned int NZE = 0, NZE_tild = 0; - int last_eq = 0; - for (int i = 0; i < periods*Size; i++) - { - Host_b[i] = 0; - Host_x0[i] = y[index_vara[Size*y_kmin+i]]; - } - - //Ordered in CSR and not in CSC - - Host_Ap[0] = 0; - for (t = 0; t < periods; t++) - { - last_eq = -1; - it4 = IM.begin(); - while (it4 != IM.end()) - { - eq = it4->first.first.first; - if (eq != last_eq) - { -# ifdef DEBUG - if (1+last_eq + t * Size > (n + 1)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, 1+last_eq + t * Size (" << 1+last_eq + t * Size << ") out of range for Host_Ap vector\n"; - throw FatalExceptionHandling(tmp.str()); - } -# endif - Host_Ap[1+last_eq + t * Size] = NZE; - if (preconditioner == 3 && t == 0) - Host_Ap_tild[1+last_eq] = NZE_tild; - last_eq = eq; - } - var = it4->first.second+Size*t; - lag = it4->first.first.second; - int index = it4->second+ (t /*+ lag*/) * u_count_init; - if (eq < (periods+y_kmax)*Size) - { - ti_y_kmin = -min(t, y_kmin); - ti_y_kmax = min(periods-(t + 1), y_kmax); - if ((lag <= ti_y_kmax && lag >= ti_y_kmin) || preconditioner == 3) /*Build the index for sparse matrix containing the jacobian : u*/ - { -# ifdef DEBUG - if (index < 0 || index >= u_count_alloc || index > (periods-1)* IM.size() + Size * Size + periods * Size) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector max = " << (periods-1)* IM.size() + Size * Size + periods * Size << " allocated = " << u_count_alloc << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (NZE >= prior_nz) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, exceeds the capacity of A_i or A_x sparse matrix\n"; - throw FatalExceptionHandling(tmp.str()); - } -# endif - bool to_store = true; - if (preconditioner == 0) - { - if (lag == 0 && it4->first.second == eq) - Host_A_tild[var] = u[index]; - } - else if (preconditioner == 
1 || preconditioner == 2) - Host_A_tild[NZE] = u[index]; - else if (preconditioner == 3) - { - if (lag > ti_y_kmax || lag < ti_y_kmin) - { - Host_b[eq + t * Size] += u[index]*y[index_vara[var+Size*(y_kmin+lag)]]; - to_store = false; - } - if (t == 0) - { - map, int>::const_iterator it = jacob_struct.find(make_pair(eq + t * Size, var)); - if (it != jacob_struct.end()) - Host_A_tild[it->second] += u[index]; - else - { - jacob_struct[make_pair(eq, var)] = NZE_tild; - Host_A_tild[NZE_tild] = u[index]; - Host_Ai_tild[NZE_tild] = var; - NZE_tild++; - } - } - } - if (to_store) - { - Host_Ax[NZE] = u[index]; - Host_Ai[NZE] = var + lag * Size; - NZE++; - } - } - else - { -# ifdef DEBUG - if (var < 0 || var >= Size * periods) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (var+Size*(y_kmin+t+lag) < 0 || var+Size*(y_kmin+lag) >= Size*(periods+y_kmin+y_kmax)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << var+Size*(y_kmin+lag) << ") out of range for index_vara vector max=" << Size*(periods+y_kmin+y_kmax) << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (index_vara[var+Size*(y_kmin+lag)] < 0 || index_vara[var+Size*(y_kmin+lag)] >= y_size*(periods+y_kmin+y_kmax)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << index_vara[var+Size*(y_kmin+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n"; - throw FatalExceptionHandling(tmp.str()); - } -# endif - Host_b[eq + t * Size] += u[index]*y[index_vara[var+Size*(y_kmin+lag)]]; - } - } - else // ...and store it in the u vector - { -# ifdef DEBUG - if (index < 0 || index >= u_count_alloc) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector\n"; - throw FatalExceptionHandling(tmp.str()); - } - if (var < 0 || var >= (Size*periods)) - { - ostringstream tmp; - tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n"; - throw FatalExceptionHandling(tmp.str()); - } -# endif - Host_b[var] += u[index]; - } - it4++; - } - } - Host_Ap[Size*periods] = NZE; - if (preconditioner == 3) - { - int *tmp_Ap_tild = (int *) mxMalloc((Size + 1) * sizeof(int)); - test_mxMalloc(tmp_Ap_tild, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(int)); - int *tmp_Ai_tild = (int *) mxMalloc(NZE_tild * sizeof(int)); - test_mxMalloc(tmp_Ai_tild, __LINE__, __FILE__, __func__, NZE_tild * sizeof(int)); - double *tmp_A_tild = (double *) mxMalloc(NZE_tild * sizeof(double)); - test_mxMalloc(tmp_A_tild, __LINE__, __FILE__, __func__, NZE_tild * sizeof(double)); - memcpy(tmp_Ap_tild, Host_Ap_tild, (Size + 1) * sizeof(int)); - memcpy(tmp_Ai_tild, Host_Ai_tild, NZE_tild * sizeof(int)); - memcpy(tmp_A_tild, Host_A_tild, NZE_tild * sizeof(double)); - //int NZE_tild_old = NZE_tild; - NZE_tild = 0; - Host_Ap_tild[0] = NZE_tild; - - for (int i = 0; i < Size; i++) - { - for (int j = tmp_Ap_tild[i]; j < tmp_Ap_tild[i+1]; j++) - if (abs(tmp_A_tild[j]) > 1.0e-20) - { - Host_A_tild[NZE_tild] = tmp_A_tild[j]; - Host_Ai_tild[NZE_tild] = tmp_Ai_tild[j]; - NZE_tild++; - } - Host_Ap_tild[i+1] = NZE_tild; - } - mxFree(tmp_Ap_tild); - mxFree(tmp_Ai_tild); - mxFree(tmp_A_tild); - } - - *nnz = NZE; - *nnz_tild = NZE_tild; - if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) - preconditioner_size = NZE; - -# ifdef DEBUG - mexPrintf("Host_Ax = ["); - for (int i = 0; i < NZE; i++) - mexPrintf("%f ", Host_Ax[i]); - mexPrintf("]\n"); - 
- mexPrintf("Host_Ap = ["); - for (int i = 0; i < n+1; i++) - mexPrintf("%d ", Host_Ap[i]); - mexPrintf("]\n"); - - mexPrintf("Host_Ai = ["); - for (int i = 0; i < NZE; i++) - mexPrintf("%d ", Host_Ai[i]); - mexPrintf("]\n"); -# endif - cudaChk(cudaMalloc((void **) Ai, NZE * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai index vector on the graphic card\n"); - cudaChk(cudaMalloc((void **) Ax, NZE * sizeof(double)), " in Init_Cuda_Sparse, can't allocate Ax on the graphic card\n"); - cudaChk(cudaMalloc((void **) Ap, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap index vector on the graphic card\n"); - if (preconditioner == 3) - { - cudaChk(cudaMalloc((void **) Ai_tild, NZE_tild * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai_tild index vector on the graphic card\n"); - cudaChk(cudaMalloc((void **) Ap_tild, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap_tild index vector on the graphic card\n"); - } - cudaChk(cudaMalloc((void **) A_tild, preconditioner_size * sizeof(double)), " in Init_Cuda_Sparse, can't allocate A_tild on the graphic card\n"); - - cudaChk(cudaMemcpy(*x0, Host_x0, n * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy x0 = Host_x0 failed"); - cudaChk(cudaMemcpy(*b, Host_b, n * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy b = Host_b failed"); - cudaChk(cudaMemcpy(*Ap, Host_Ap, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap = Host_Ap failed"); - cudaChk(cudaMemcpy(*Ai, Host_Ai, NZE * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai = Host_Ai failed"); - cudaChk(cudaMemcpy(*Ax, Host_Ax, NZE * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ax = Host_Ax failed"); - if (preconditioner == 3) - { - cudaChk(cudaMemcpy(*Ap_tild, Host_Ap_tild, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap_tild = Host_Ap_tild failed"); - cudaChk(cudaMemcpy(*Ai_tild, Host_Ai_tild, NZE_tild * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai_tild = Host_Ai_til failed"); - } - cudaChk(cudaMemcpy(*A_tild, Host_A_tild, preconditioner_size * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy A_tild = Host_A_tild failed"); -} -#endif - void dynSparseMatrix::PrintM(int n, double *Ax, mwIndex *Ap, mwIndex *Ai) { @@ -3356,85 +2939,6 @@ dynSparseMatrix::Solve_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double s mxDestroyArray(b_m); } -#ifdef CUDA -void -printM(int n, double *Ax, int *Ap, int *Ai, cusparseMatDescr_t descrA, cusparseHandle_t cusparse_handle) -{ - //cudaError_t cuda_error; - //cusparseStatus_t cusparse_status; - double *A_dense; - cudaChk(cudaMalloc((void **) &A_dense, n * n *sizeof(double)), "A_dense cudaMalloc has failed\n"); - - cusparseChk(cusparseDcsr2dense(cusparse_handle, n, n, descrA, - Ax, Ap, Ai, A_dense, n), "cusparseDcsr2dense has failed\n"); - double *A_dense_hoste = (double *) mxMalloc(n * n * sizeof(double)); - test_mxMalloc(A_dense_hoste, __LINE__, __FILE__, __func__, n * n * sizeof(double)); - cudaChk(cudaMemcpy(A_dense_hoste, A_dense, n * n * sizeof(double), cudaMemcpyDeviceToHost), " cudaMemcpy(A_dense_hoste, A_dense) has failed\n"); - mexPrintf("----------------------\n"); - mexPrintf("FillMode=%d, IndexBase=%d, MatType=%d, DiagType=%d\n", cusparseGetMatFillMode(descrA), cusparseGetMatIndexBase(descrA), cusparseGetMatType(descrA), cusparseGetMatDiagType(descrA)); - //mexEvalString("drawnow;"); - for 
(int i = 0; i < n; i++) - { - for (int j = 0; j < n; j++) - mexPrintf("%-6.3f ", A_dense_hoste[i + j * n]); - mexPrintf("\n"); - } - mxFree(A_dense_hoste); - cudaChk(cudaFree(A_dense), "cudaFree(A_dense) has failed\n"); -} - -void -dynSparseMatrix::Solve_CUDA_BiCGStab_Free(double *tmp_vect_host, double *p, double *r, double *v, double *s, double *t, double *y_, double *z, double *tmp_, - int *Ai, double *Ax, int *Ap, double *x0, double *b, double *A_tild, int *A_tild_i, int *A_tild_p /*, double* Lx, int* Li, int* Lp, - double* Ux, int* Ui, int* Up, int* device_n*/, cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU, - cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner) -{ - //cudaError_t cuda_error; - //cusparseStatus_t cusparse_status; - mxFree(tmp_vect_host); - cudaChk(cudaFree(p), " in Solve_Cuda_BiCGStab, can't free p\n"); - cudaChk(cudaFree(r), " in Solve_Cuda_BiCGStab, can't free r\n"); - cudaChk(cudaFree(v), " in Solve_Cuda_BiCGStab, can't free v\n"); - cudaChk(cudaFree(s), " in Solve_Cuda_BiCGStab, can't free s\n"); - cudaChk(cudaFree(t), " in Solve_Cuda_BiCGStab, can't free t\n"); - cudaChk(cudaFree(y_), " in Solve_Cuda_BiCGStab, can't free y_\n"); - cudaChk(cudaFree(z), " in Solve_Cuda_BiCGStab, can't free z\n"); - cudaChk(cudaFree(tmp_), " in Solve_Cuda_BiCGStab, can't free tmp_\n"); - cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); - cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); - cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); - cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); - cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); - /*if (preconditioner == 0) - {*/ - cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild (1)\n"); - cudaChk(cudaFree(A_tild_i), " in Solve_Cuda_BiCGStab, can't free A_tild_i (1)\n"); - cudaChk(cudaFree(A_tild_p), " in Solve_Cuda_BiCGStab, can't free A_tild_p (1)\n"); - /*} - else - { - cudaChk(cudaFree(Lx), " in Solve_Cuda_BiCGStab, can't free Lx\n"); - cudaChk(cudaFree(Li), " in Solve_Cuda_BiCGStab, can't free Li\n"); - cudaChk(cudaFree(Lp), " in Solve_Cuda_BiCGStab, can't free Lp\n"); - cudaChk(cudaFree(Ux), " in Solve_Cuda_BiCGStab, can't free Ux\n"); - cudaChk(cudaFree(Ui), " in Solve_Cuda_BiCGStab, can't free Ui\n"); - cudaChk(cudaFree(Up), " in Solve_Cuda_BiCGStab, can't free Up\n"); - }*/ - //cudaChk(cudaFree(device_n), " in Solve_Cuda_BiCGStab, can't free device_n\n"); - if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) - { - cusparseChk(cusparseDestroySolveAnalysisInfo(infoL), - " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoL\n"); - cusparseChk(cusparseDestroySolveAnalysisInfo(infoU), - " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoU\n"); - } - cusparseChk(cusparseDestroyMatDescr(descrL), - " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrL\n"); - cusparseChk(cusparseDestroyMatDescr(descrU), - " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrU\n"); -} -#endif - void Solve(double *Ax, int *Ap, int *Ai, double *b, int n, bool Lower, double *x) { @@ -3507,1087 +3011,6 @@ Check(int n, double *Ax, int *Ap, int *Ai, double *b, double *x, bool Lower) } } -#ifdef CUDA -int -dynSparseMatrix::Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, - int it_, int 
nnz, int nnz_tild, int preconditioner, int max_iterations, int block) -{ - cusparseSolveAnalysisInfo_t info, infoL, infoU; - cusparseMatDescr_t descrL, descrU; - const double tol = 1.0e-6; //1.0e-6; - const double eps = 1.0e-16; - double *p, *r, *r0, *v, *s, *t, *y_, *z, *tmp_; - int *A_tild_i, *A_tild_p; - double *Qx; - int *Qi, *Qj; - double *Px; - int *Pi, *Pj; - int Q_nnz, P_nnz; - int W_nnz; - double bnorm; - double tmp1, tmp2; - int refinement_needed = 0, stagnation = 0; - int max_refinement = min(min(int (floor(double (n)/50)), 10), n-max_iterations), max_stagnation = 3; - int nblocks = ceil(double (n) / double (1024)); - int n_threads; - if (nblocks == 0) - n_threads = n; - else - n_threads = 1024; - int periods = n / Size; - - double *tmp_vect_host = (double *) mxMalloc(n * sizeof(double)); - test_mxMalloc(tmp_vect_host, __LINE__, __FILE__, __func__, n * sizeof(double)); - - cublasChk(cublasDnrm2(cublas_handle, n, b, 1, &bnorm), - " in Solve_Cuda_BiCGStab, cublasDnrm2(b) has failed\n"); - - double tolb = tol * bnorm; - - if (bnorm == 0.0) - { - // if b = 0 the A.x = 0 => x = 0 - cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); - cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); - cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); - if (preconditioner == 3) - { - cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, can't free Ai_tild\n"); - cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, can't free Ap_tild\n"); - } - cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild\n"); - cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); - cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); - if (is_two_boundaries) - for (int i = 0; i < n; i++) - { - int eq = index_vara[i+Size*y_kmin]; - double yy = -y[eq]; - direction[eq] = yy; - y[eq] += slowc * yy; - } - else - for (int i = 0; i < n; i++) - { - int eq = index_vara[i]; - double yy = -y[eq+it_*y_size]; - direction[eq] = yy; - y[eq+it_*y_size] += slowc * yy; - } - return 0; - } - - int iteration = 0; - bool convergence = false; - double zeros = 0.0, one = 1.0, m_one = -1.0; - - cudaChk(cudaMalloc((void **) &tmp_, n * sizeof(double)), " in Solve_Cuda_Sparse, can't allocate tmp_ on the graphic card\n"); - - cudaChk(cudaMalloc((void **) &r, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate r on the graphic card\n"); - - cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n"); - - //r = b - A * x0 - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, - n, nnz, &m_one, - CUDA_descr, Ax, - Ap, Ai, - x0, &one, - r), "in Solve_Cuda_BiCGStab, cusparseDcsrmv A * x0 has failed"); - - cudaChk(cudaMemcpy(tmp_vect_host, r, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n"); - /*mexPrintf("r\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - cudaChk(cudaMalloc((void **) &r0, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate r0 on the graphic card\n"); - cudaChk(cudaMemcpy(r0, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r0 = r has failed\n"); - - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - r, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); - double conv_criteria = tmp1; - - convergence = conv_criteria < tolb; - if (convergence) - { - /* the initial value (x0) is 
solution of A x = b*/ - cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); - cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); - cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); - if (preconditioner == 3) - { - cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, can't free Ai_tild\n"); - cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, can't free Ap_tild\n"); - } - cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild\n"); - cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); - cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); - return 0; - } - - if (preconditioner == 0) - { - //Apply the Jacobi preconditioner - /*VecDiv<<>>(r_, A_tild, z_, n); - cuda_error = cudaMemcpy(zz_, z_, n * sizeof(double), cudaMemcpyDeviceToDevice);*/ - } - else if (preconditioner == 1) - { - //Apply an incomplete LU decomposition of A as preconditioner - cusparseChk(cusparseCreateSolveAnalysisInfo(&info), " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo for info has failed\n"); - - cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, nnz, CUDA_descr, - A_tild, Ap, Ai, - info), - " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis(info) has failed\n"); - - cusparseChk(cusparseDcsrilu0(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, CUDA_descr, - A_tild, Ap, Ai, - info), - " in Solve_Cuda_BiCGStab, cusparseDcsrilu0 has failed\n"); - - //Make a copy of the indexes in A_tild_i and A_tild_p to use it the Bicgstab algorithm - cudaChk(cudaMalloc((void **) &A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_i on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_i, Ai, nnz * sizeof(int), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Ai has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_p, Ap, (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Ap has failed\n"); - } - else if (preconditioner == 2) - { - //Because the Jacobian matrix A is store in CSC format in matlab - // we have to transpose it to get a CSR format used by CUDA - mwIndex *Awi, *Awp; - double *A_tild_host = (double *) mxMalloc(nnz*sizeof(double)); - test_mxMalloc(A_tild_host, __LINE__, __FILE__, __func__, nnz*sizeof(double)); - Awi = (mwIndex *) mxMalloc(nnz * sizeof(mwIndex)); - test_mxMalloc(Awi, __LINE__, __FILE__, __func__, nnz * sizeof(mwIndex)); - Awp = (mwIndex *) mxMalloc((n + 1) * sizeof(mwIndex)); - test_mxMalloc(Awp, __LINE__, __FILE__, __func__, (n + 1) * sizeof(mwIndex)); - int *Aii = (int *) mxMalloc(nnz * sizeof(int)); - test_mxMalloc(Aii, __LINE__, __FILE__, __func__, nnz * sizeof(int)); - int *Aip = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Aip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); - cudaChk(cudaMemcpy(Aii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai has failed\n"); - cudaChk(cudaMemcpy(Aip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ai has failed\n"); - for (int i = 0; i < nnz; i++) - Awi[i] = Aii[i]; - for (int i = 0; i < n + 1; i++) - Awp[i] = Aip[i]; - mxFree(Aii); - 
mxFree(Aip); - mxArray *At_m = mxCreateSparse(n, n, nnz, mxREAL); - mxSetIr(At_m, Awi); - mxSetJc(At_m, Awp); - mxSetPr(At_m, A_tild_host); - mxArray *A_m; - mexCallMATLAB(1, &A_m, 1, &At_m, "transpose"); - mxDestroyArray(At_m); - - /*mexPrintf("A_m\n"); - mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/ - /*mxFree(Awi); - mxFree(Awp);*/ - - /*[L1, U1] = ilu(g1a=;*/ - const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"}; - const int type = 0; - const int droptol = 1; - const int milu = 2; - const int udiag = 3; - const int thresh = 4; - mwSize dims[1] = {(mwSize) 1 }; - mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names); - mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp")); - //mxSetFieldByNumber(Setup, 0, type, mxCreateString("nofill")); - mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol)); - mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off")); - mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0)); - mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1)); - //mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1)); - mxArray *lhs0[2]; - mxArray *rhs0[2]; - rhs0[0] = A_m; - rhs0[1] = Setup; - ostringstream tmp; - if (mexCallMATLAB(2, lhs0, 2, rhs0, "ilu")) - { - tmp << " In BiCGStab, the incomplet LU decomposition (ilu) ahs failed.\n"; - throw FatalExceptionHandling(tmp.str()); - } - mxDestroyArray(Setup); - - /* //ILUT preconditionner computed by Matlab (todo: in futur version of cuda replace it by a new equivalent cuda function) - const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"}; - const int type = 0; - const int droptol = 1; - const int milu = 2; - const int udiag = 3; - const int thresh = 4; - mwSize dims[1] = {(mwSize)1 }; - mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names); - mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp")); - mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol)); - mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off")); - mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0)); - mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0)); - mxArray *lhs0[2], *rhs0[2]; - rhs0[0] = A_m; - rhs0[1] = Setup; - mexCallMATLAB(1, lhs0, 2, rhs0, "ilu"); - */ - // To store the resultng matrix in a CSR format we have to transpose it - mxArray *Wt = lhs0[0]; - mwIndex *Wtj = mxGetJc(Wt); - nnz = Wtj[n]; - mxArray *W; - mexCallMATLAB(1, &W, 1, &Wt, "transpose"); - mxDestroyArray(Wt); - double *pW = mxGetPr(W); - mwIndex *Wi = mxGetIr(W); - mwIndex *Wp = mxGetJc(W); - int *Wii = (int *) mxMalloc(nnz * sizeof(int)); - test_mxMalloc(Wii, __LINE__, __FILE__, __func__, nnz * sizeof(int)); - int *Wip = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Wip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - for (int i = 0; i < nnz; i++) - Wii[i] = Wi[i]; - for (int i = 0; i < n + 1; i++) - Wip[i] = Wp[i]; - - //mxFree(A_tild_host); - - cudaChk(cudaFree(A_tild), "cudaFree(A_tild) has failed\n"); - - cudaChk(cudaMalloc((void **) &A_tild, nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, 
cudaMemcpy A_tild_i = A_tild_i_host has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n"); - /*mxFree(pW); - mxFree(Wi); - mxFree(Wj);*/ - mxDestroyArray(W); - mxFree(Wii); - mxFree(Wip); - } - else if (preconditioner == 3) - { - mwIndex *Aowi, *Aowp; - double *A_host = (double *) mxMalloc(nnz*sizeof(double)); - test_mxMalloc(A_host, __LINE__, __FILE__, __func__, nnz*sizeof(double)); - Aowi = (mwIndex *) mxMalloc(nnz * sizeof(mwIndex)); - test_mxMalloc(Aowi, __LINE__, __FILE__, __func__, nnz * sizeof(mwIndex)); - Aowp = (mwIndex *) mxMalloc((n + 1) * sizeof(mwIndex)); - test_mxMalloc(Aowp, __LINE__, __FILE__, __func__, (n + 1) * sizeof(mwIndex)); - int *Aoii = (int *) mxMalloc(nnz * sizeof(int)); - test_mxMalloc(Aoii, __LINE__, __FILE__, __func__, nnz * sizeof(int)); - int *Aoip = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Aoip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - cudaChk(cudaMemcpy(A_host, Ax, nnz*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); - cudaChk(cudaMemcpy(Aoii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n"); - cudaChk(cudaMemcpy(Aoip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n"); - for (int i = 0; i < nnz; i++) - Aowi[i] = Aoii[i]; - for (int i = 0; i < n + 1; i++) - Aowp[i] = Aoip[i]; - mxFree(Aoii); - mxFree(Aoip); - mxArray *Ao_m = mxCreateSparse(n, n, nnz, mxREAL); - mxSetIr(Ao_m, Aowi); - mxSetJc(Ao_m, Aowp); - mxSetPr(Ao_m, A_host); - /*mexPrintf("A_m\n"); - mxArray *Aoo; - mexCallMATLAB(1, &Aoo, 1, &Ao_m, "transpose"); - mexCallMATLAB(0, NULL, 1, &Aoo, "disp_dense"); - mxDestroyArray(Ao_m); - mxDestroyArray(Aoo);*/ - - //Because the Jacobian matrix A is store in CSC format in matlab - // we have to transpose it to get a CSR format used by CUDA - mwIndex *Awi, *Awp; - double *A_tild_host = (double *) mxMalloc(nnz_tild*sizeof(double)); - test_mxMalloc(A_tild_host, __LINE__, __FILE__, __func__, nnz_tild*sizeof(double)); - Awi = (mwIndex *) mxMalloc(nnz_tild * sizeof(mwIndex)); - test_mxMalloc(Awi, __LINE__, __FILE__, __func__, nnz_tild * sizeof(mwIndex)); - Awp = (mwIndex *) mxMalloc((Size + 1) * sizeof(mwIndex)); - test_mxMalloc(Awp, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(mwIndex)); - int *Aii = (int *) mxMalloc(nnz_tild * sizeof(int)); - test_mxMalloc(Aii, __LINE__, __FILE__, __func__, nnz_tild * sizeof(int)); - int *Aip = (int *) mxMalloc((Size + 1) * sizeof(int)); - test_mxMalloc(Aip, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(int)); - cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz_tild*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); - cudaChk(cudaMemcpy(Aii, Ai_tild, nnz_tild*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n"); - cudaChk(cudaMemcpy(Aip, Ap_tild, (Size+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n"); - for (int i = 0; i < nnz_tild; i++) - Awi[i] = Aii[i]; - for (int i = 0; i < Size + 1; i++) - Awp[i] = Aip[i]; - /*for (int i = 0; i < nnz_tild; i++) - 
mexPrintf("%20.17f\n",A_tild_host[i]);*/ - mxFree(Aii); - mxFree(Aip); - mxArray *At_m = mxCreateSparse(Size, Size, nnz_tild, mxREAL); - mxSetIr(At_m, Awi); - mxSetJc(At_m, Awp); - mxSetPr(At_m, A_tild_host); - mxArray *A_m; - mexCallMATLAB(1, &A_m, 1, &At_m, "transpose"); - /*mexPrintf("A_tild_m\n"); - mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/ - mxDestroyArray(At_m); - mxArray *P, *Q, *L, *U; - mxArray *lhs0[4]; - mexCallMATLAB(4, lhs0, 1, &A_m, "lu"); - - mxArray *P0, *Q0, *L0, *U0; - L0 = lhs0[0]; - U0 = lhs0[1]; - P0 = lhs0[2]; - Q0 = lhs0[3]; - mexCallMATLAB(1, &P, 1, &P0, "transpose"); - mexCallMATLAB(1, &Q, 1, &Q0, "transpose"); - mexCallMATLAB(1, &L, 1, &L0, "transpose"); - mexCallMATLAB(1, &U, 1, &U0, "transpose"); - mxDestroyArray(P0); - mxDestroyArray(Q0); - mxDestroyArray(L0); - mxDestroyArray(U0); - /*L = lhs0[0]; - U = lhs0[1]; - P = lhs0[2]; - Q = lhs0[3];*/ - - /*mexPrintf("L\n"); - mexCallMATLAB(0, NULL, 1, &L, "disp_dense"); - - mexPrintf("U\n"); - mexCallMATLAB(0, NULL, 1, &U, "disp_dense"); - - mexPrintf("P\n"); - mexCallMATLAB(0, NULL, 1, &P, "disp_dense"); - - mexPrintf("Q\n"); - mexCallMATLAB(0, NULL, 1, &Q, "disp_dense");*/ - - mwIndex *Qiw_host = mxGetIr(Q); - mwIndex *Qjw_host = mxGetJc(Q); - double *Qx_host = mxGetPr(Q); - Q_nnz = Qjw_host[Size]; - mexPrintf("Q_nnz=%d\n", Q_nnz); - int *Qi_host = (int *) mxMalloc(Q_nnz * periods * sizeof(int)); - test_mxMalloc(Qi_host, __LINE__, __FILE__, __func__, Q_nnz * periods * sizeof(int)); - double *Q_x_host = (double *) mxMalloc(Q_nnz * periods * sizeof(double)); - test_mxMalloc(Q_x_host, __LINE__, __FILE__, __func__, Q_nnz * periods * sizeof(double)); - int *Qj_host = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Qj_host, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - for (int t = 0; t < periods; t++) - { - for (int i = 0; i < Q_nnz; i++) - { - Qi_host[i + t * Q_nnz] = Qiw_host[i] + t * Size; - Q_x_host[i + t * Q_nnz] = Qx_host[i]; - } - for (int i = 0; i < Size; i++) - { - Qj_host[i + t * Size] = Qjw_host[i] + t * Q_nnz; - } - } - Qj_host[periods * Size] = periods * Q_nnz; - - /*mwIndex *Qtiw_host = (mwIndex*) mxMalloc(Q_nnz * periods * sizeof(mwIndex)); - double *Qt_x_host = (double*)mxMalloc(Q_nnz * periods * sizeof(double)); - mwIndex *Qtjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); - mexPrintf("n = %d\n",n); - for (int i = 0; i < n + 1; i++) - Qtjw_host[i] = Qj_host[i]; - for (int i = 0; i < Q_nnz * periods; i++) - { - Qtiw_host[i] = Qi_host[i]; - Qt_x_host[i] = Q_x_host[i]; - } - mxArray* Qt_m = mxCreateSparse(n,n,Q_nnz * periods,mxREAL); - mxSetIr(Qt_m, Qtiw_host); - mxSetJc(Qt_m, Qtjw_host); - mxSetPr(Qt_m, Qt_x_host); - mexPrintf("Qt_m\n"); - mexCallMATLAB(0, NULL, 1, &Qt_m, "disp_dense");*/ - - /*mexPrintf("Qtjw_host[periods * Size=%d]=%d\n", periods * Size, Qtjw_host[periods * Size]); - for (int i = 0; i < n; i++) - for (int j = Qtjw_host[i]; j < Qtjw_host[i+1]; j++) - mexPrintf("(i=%d, j=%d) = %f\n", i, Qtiw_host[j], Qt_x_host[j]);*/ - //mxDestroyArray(Qt_m); - - cudaChk(cudaMalloc((void **) &Qx, Q_nnz * periods * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Qx on the graphic card\n"); - cudaChk(cudaMemcpy(Qx, Q_x_host, Q_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qx = Qx_host has failed\n"); - cudaChk(cudaMalloc((void **) &Qi, Q_nnz * periods * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Qi on the graphic card\n"); - cudaChk(cudaMemcpy(Qi, Qi_host, Q_nnz * periods * sizeof(int), 
cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qi = Qi_host has failed\n"); - cudaChk(cudaMalloc((void **) &Qj, (Size * periods + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Qj on the graphic card\n"); - cudaChk(cudaMemcpy(Qj, Qj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qj = Qj_host has failed\n"); - mxFree(Qi_host); - mxFree(Qj_host); - mxFree(Q_x_host); - mxDestroyArray(Q); - - mwIndex *Piw_host = mxGetIr(P); - mwIndex *Pjw_host = mxGetJc(P); - double *Px_host = mxGetPr(P); - P_nnz = Pjw_host[Size]; - int *Pi_host = (int *) mxMalloc(P_nnz * periods * sizeof(int)); - test_mxMalloc(Pi_host, __LINE__, __FILE__, __func__, P_nnz * periods * sizeof(int)); - double *P_x_host = (double *) mxMalloc(P_nnz * periods * sizeof(double)); - test_mxMalloc(P_x_host, __LINE__, __FILE__, __func__, P_nnz * periods * sizeof(double)); - int *Pj_host = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Pj_host, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - for (int t = 0; t < periods; t++) - { - for (int i = 0; i < P_nnz; i++) - { - Pi_host[i + t * P_nnz] = Piw_host[i] + t * Size; - P_x_host[i + t * P_nnz] = Px_host[i]; - } - for (int i = 0; i < Size; i++) - Pj_host[i + t * Size] = Pjw_host[i] + t * P_nnz; - } - Pj_host[periods * Size] = periods * P_nnz; - - /*mwIndex *Ptiw_host = (mwIndex*) mxMalloc(P_nnz * periods * sizeof(mwIndex)); - double *Pt_x_host = (double*)mxMalloc(P_nnz * periods * sizeof(double)); - mwIndex *Ptjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); - for (int i = 0; i < n + 1; i++) - Ptjw_host[i] = Pj_host[i]; - for (int i = 0; i < P_nnz * periods; i++) - { - Ptiw_host[i] = Pi_host[i]; - Pt_x_host[i] = P_x_host[i]; - } - mxArray* Pt_m = mxCreateSparse(n,n,P_nnz * periods,mxREAL); - mxSetIr(Pt_m, Ptiw_host); - mxSetJc(Pt_m, Ptjw_host); - mxSetPr(Pt_m, Pt_x_host); - mexPrintf("Pt_m\n"); - mexCallMATLAB(0, NULL, 1, &Pt_m, "disp_dense"); - mxDestroyArray(Pt_m);*/ - - cudaChk(cudaMalloc((void **) &Px, P_nnz * periods * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); - cudaChk(cudaMemcpy(Px, P_x_host, P_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n"); - cudaChk(cudaMalloc((void **) &Pi, P_nnz * periods * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); - cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * periods * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n"); - cudaChk(cudaMalloc((void **) &Pj, (Size * periods + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); - cudaChk(cudaMemcpy(Pj, Pj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n"); - mxFree(Pi_host); - mxFree(Pj_host); - mxFree(P_x_host); - mxDestroyArray(P); - - /*mwIndex* Piw_host = mxGetIr(P); - mwIndex* Pjw_host = mxGetJc(P); - double* Px_host = mxGetPr(P); - P_nnz = Pjw_host[Size]; - int *Pi_host = (int*)mxMalloc(P_nnz * sizeof(int)); - int *Pj_host = (int*)mxMalloc((Size + 1) * sizeof(int)); - for (int i = 0; i < P_nnz; i++) - Pi_host[i] = Piw_host[i]; - for (int i = 0; i < Size + 1; i++) - Pj_host[i] = Pjw_host[i]; - - cudaChk(cudaMalloc((void**)&Px, P_nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); - cudaChk(cudaMemcpy(Px, Px_host, P_nnz * sizeof(double), 
cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n"); - cudaChk(cudaMalloc((void**)&Pi, P_nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); - cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n"); - cudaChk(cudaMalloc((void**)&Pj, (Size + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); - cudaChk(cudaMemcpy(Pj, Pj_host, (Size + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n"); - mxFree(Pi_host); - mxFree(Pj_host); - mxDestroyArray(P);*/ - - /*mexPrintf("L\n"); - mexCallMATLAB(0, NULL, 1, &L, "disp_dense"); - - mexPrintf("U\n"); - mexCallMATLAB(0, NULL, 1, &U, "disp_dense");*/ - - mwIndex *Liw_host = mxGetIr(L); - mwIndex *Ljw_host = mxGetJc(L); - double *Lx_host = mxGetPr(L); - int L_nnz = Ljw_host[Size]; - - mwIndex *Uiw_host = mxGetIr(U); - mwIndex *Ujw_host = mxGetJc(U); - double *Ux_host = mxGetPr(U); - int U_nnz = Ujw_host[Size]; - - double *pW = (double *) mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(double)); - test_mxMalloc(pW, __LINE__, __FILE__, __func__, (L_nnz + U_nnz - Size) * periods * sizeof(double)); - int *Wi = (int *) mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(int)); - test_mxMalloc(Wi, __LINE__, __FILE__, __func__, (L_nnz + U_nnz - Size) * periods * sizeof(int)); - int *Wj = (int *) mxMalloc((n + 1) * sizeof(int)); - test_mxMalloc(Wj, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int)); - Wj[0] = 0; - W_nnz = 0; - for (int t = 0; t < periods; t++) - for (int i = 0; i < Size; i++) - { - for (mwIndex l = Ujw_host[i]; l < Ujw_host[i+1]; l++) - { - Wi[W_nnz] = Uiw_host[l] + t * Size; - pW[W_nnz] = Ux_host[l]; - //mexPrintf("Wj[%d] = %d, Wi[%d] = Uiw_host[%d] + t * Size = %d, pW[%d]=%f\n", i + t * Size, Wj[i + t * Size], W_nnz, l, Uiw_host[l] + t * Size, W_nnz, Ux_host[l]); - W_nnz++; - } - for (mwIndex l = Ljw_host[i]; l < Ljw_host[i+1]; l++) - { - if (Liw_host[l] > i) - { - Wi[W_nnz] = Liw_host[l] + t * Size; - pW[W_nnz] = Lx_host[l]; - //mexPrintf("Wj[%d] = %d, Wi[%d] = Liw_host[%d] + t * Size = %d, pW[%d]=%f\n", i + t * Size, Wj[i + t * Size], W_nnz, l, Liw_host[l] + t * Size, W_nnz, Lx_host[l]); - W_nnz++; - } - } - Wj[i + 1 + t * Size] = W_nnz; - } - //mexPrintf("Wj[%d] = %d, n=%d\n", periods * Size, Wj[periods * Size], n); - cudaChk(cudaMalloc((void **) &A_tild, W_nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild, pW, W_nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_i, W_nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_i, Wi, W_nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Wi has failed\n"); - cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_p, Wj, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Wj has failed\n"); - - /*mwIndex *Wwi = (mwIndex*)mxMalloc(W_nnz * sizeof(mwIndex)); - mwIndex *Wwj = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); - for (int i = 0; i < W_nnz; i++) - Wwi[i] = Wi[i]; - for (int i = 0; i < n + 1; i++) - Wwj[i] = Wj[i]; - mxFree(Wi); 
- mxFree(Wj); - mxArray* Ao_tild = mxCreateSparse(n,n,W_nnz,mxREAL); - mxSetIr(Ao_tild, Wwi); - mxSetJc(Ao_tild, Wwj); - mxSetPr(Ao_tild, pW); - mexPrintf("Ao_tild\n"); - mexCallMATLAB(0, NULL, 1, &Ao_tild, "disp_dense"); - mxDestroyArray(Ao_tild);*/ - - /*ostringstream tmp; - tmp << "debugging"; - mexWarnMsgTxt(tmp.str().c_str()); - return 4;*/ - - /* Apply the permutation matrices (P and Q) to the b vector of system to solve : - b_tild = P-1 . b = P' . b */ - /*cudaChk(cudaMalloc((void**)&b_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, - n, n, nnz, &one, CUDA_descr, - Px, Pj, Pi, - b, &zeros, - b_tild), - " in Solve_Cuda_BiCGStab, b_tild = cusparseDcsrmv(P', b) has failed\n"); - - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, - n, n, nnz, &one, CUDA_descr, - Px, Pj, Pi, - b, &zeros, - b), - " in Solve_Cuda_BiCGStab, b = cusparseDcsrmv(P', b) has failed\n"); - */ - /*mexPrintf("Wt = lu(A_m)\n"); - mexCallMATLAB(0, NULL, 1, &Wt, "disp_dense");*/ - /*ostringstream tmp; - tmp << "debugging"; - mexWarnMsgTxt(tmp.str().c_str()); - return 4;*/ - // To store the resultng matrix in a CSR format we have to transpose it - /*mwIndex* Wtj = mxGetJc(Wt); - nnz = Wtj[n]; - mxArray* W; - mexCallMATLAB(1, &W, 1, &Wt, "transpose"); - mxDestroyArray(Wt); - pW = mxGetPr(W); - Wwi = mxGetIr(W); - mwIndex* Wp = mxGetJc(W); - int *Wii = (int*)mxMalloc(nnz * sizeof(int)); - int *Wip = (int*)mxMalloc((n + 1) * sizeof(int)); - for (int i = 0; i < nnz; i++) - Wii[i] = Wi[i]; - for (int i = 0; i < n + 1; i++) - Wip[i] = Wp[i]; - - //mxFree(A_tild_host); - - cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, cudaFree(Ai_tild) has failed\n"); - cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, cudaFree(Ap_tild) has failed\n"); - cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, cudaFree(A_tild) has failed\n"); - - cudaChk(cudaMalloc((void**)&A_tild, nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); - cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = A_tild_i_host has failed\n"); - cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); - cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n"); - mxDestroyArray(W); - mxFree(Wii); - mxFree(Wip);*/ - } - if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) - { - cusparseChk(cusparseCreateMatDescr(&descrL), - " in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrL\n"); - cusparseChk(cusparseSetMatIndexBase(descrL, CUSPARSE_INDEX_BASE_ZERO), - " in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrL\n"); - cusparseChk(cusparseSetMatType(descrL, CUSPARSE_MATRIX_TYPE_GENERAL), - " in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrL\n"); - cusparseChk(cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER), - " in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrL\n"); - 
cusparseChk(cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT), - " in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrL\n"); - - cusparseChk(cusparseCreateMatDescr(&descrU), - " in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrU\n"); - cusparseChk(cusparseSetMatIndexBase(descrU, CUSPARSE_INDEX_BASE_ZERO), - " in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrU\n"); - cusparseChk(cusparseSetMatType(descrU, CUSPARSE_MATRIX_TYPE_GENERAL), - " in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrU\n"); - cusparseChk(cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER), - " in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrU\n"); - cusparseChk(cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT), - " in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrU\n"); - - int host_nnz_tild; - if (preconditioner == 3) - host_nnz_tild = W_nnz; - else - host_nnz_tild = nnz; - - if (preconditioner == 1) - cusparseChk(cusparseDestroySolveAnalysisInfo(info), - " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for info\n"); - - cusparseChk(cusparseCreateSolveAnalysisInfo(&infoL), - " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoL\n"); - cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, host_nnz_tild, descrL, - A_tild, A_tild_p, A_tild_i, - infoL), - " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoL has failed\n"); - - cusparseChk(cusparseCreateSolveAnalysisInfo(&infoU), - " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoU\n"); - cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, host_nnz_tild, descrU, - A_tild, A_tild_p, A_tild_i, - infoU), - " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoU has failed\n"); - } - - cudaChk(cudaMalloc((void **) &v, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate v on the graphic card\n"); - cudaChk(cudaMalloc((void **) &p, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate p on the graphic card\n"); - //cudaChk(cudaMemset(p, 0, n * sizeof(double)), " in Solve_Cuda_BiCGStab, cudaMemset p = 0 has failed\n"); - cudaChk(cudaMalloc((void **) &s, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate s on the graphic card\n"); - cudaChk(cudaMalloc((void **) &t, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate t on the graphic card\n"); - cudaChk(cudaMalloc((void **) &y_, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate y_ on the graphic card\n"); - cudaChk(cudaMalloc((void **) &z, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate z on the graphic card\n"); - - double rho = 1.0, alpha = 1.0, omega = 1.0; - - //residual = P*B*Q - L*U; - //norm(Z,1) should be close to 0 - - while (iteration < 50 /*max_iterations*/ && !convergence) - { - double rho_prev = rho; - /**store in s previous value of r*/ - cudaChk(cudaMemcpy(s, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy s = r has failed\n"); - - /**rho = r0 . 
r*/ - cublasChk(cublasDdot(cublas_handle, n, // numerator - r0, 1, - r, 1, - &rho), - " in Solve_Cuda_BiCGStab, rho = cublasDdot(r0, r) has failed\n"); - - mexPrintf("rho=%f\n", rho); - - double beta; - - if (iteration == 0) - { - cudaChk(cudaMemcpy(p, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy p = r has failed\n"); - } - else - { - /**beta = (rho / rho_prev) . (alpha / omega);*/ - beta = rho / rho_prev * alpha / omega; - - /**p = r + beta * (p - omega * v)*/ - // tmp_ = p - omega * v - VecAdd<<< nblocks, n_threads>>> (tmp_, p, -omega, v, n); - //p = r + beta * tmp_ - VecAdd<<< nblocks, n_threads>>> (p, r, beta, tmp_, n); - } - - /**y_ solution of A_tild * y_ = p <=> L . U . y_ = p*/ - // L tmp_ = p => tmp_ = L^-1 p, with tmp_ = U . y_ - - if (preconditioner == 3) - { - double *p_tild; - - cudaChk(cudaMemcpy(tmp_vect_host, p, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p has failed\n"); - /*mexPrintf("p\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - cudaChk(cudaMalloc((void **) &p_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, P_nnz * periods, &one, CUDA_descr, - Px, Pj, Pi, - p, &zeros, - p_tild), - " in Solve_Cuda_BiCGStab, p_tild = cusparseDcsrmv(P', p) has failed\n"); - - /*mexPrintf("P\n"); - printM(n, Px, Pj, Pi, CUDA_descr, cusparse_handle);*/ - - cudaChk(cudaMemcpy(tmp_vect_host, p_tild, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n"); - /*mexPrintf("p_tild\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrL, - A_tild, A_tild_p, A_tild_i, - infoL, p_tild, - tmp_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p_tild has failed\n"); - cudaChk(cudaFree(p_tild), " in Solve_Cuda_BiCGStab, can't free p_tild\n"); - - cudaChk(cudaMemcpy(tmp_vect_host, tmp_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); - /*mexPrintf("tmp_\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - } - else - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrL, - A_tild, A_tild_p, A_tild_i, - infoL, p, - tmp_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p has failed\n"); - - // U . y_ = L^-1 p <=> U . y_ = tmp_ => y_ = U^-1 L^-1 p - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrU, - A_tild, A_tild_p, A_tild_i, - infoU, tmp_, - y_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . 
y_ = tmp_ has failed\n"); - - /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); - mexPrintf("y_\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - if (preconditioner == 3) - { - double *y_tild; - cudaChk(cudaMalloc((void **) &y_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); - cudaChk(cudaMemcpy(y_tild, y_, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy y_tild = y_ has failed\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, Q_nnz * periods, &one, CUDA_descr, - Qx, Qj, Qi, - y_tild, &zeros, - y_), - " in Solve_Cuda_BiCGStab, y_ = cusparseDcsrmv(Q', y_tild) has failed\n"); - cudaChk(cudaFree(y_tild), " in Solve_Cuda_BiCGStab, can't free y_tild\n"); - } - /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); - mexPrintf("y_\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - /**v = A*y_*/ - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, nnz, &one, CUDA_descr, - Ax, Ap, Ai, - y_, &zeros, - v), - " in Solve_Cuda_BiCGStab, v = cusparseDcsrmv(A, y_) has failed\n"); - cudaChk(cudaMemcpy(tmp_vect_host, v, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); - /*mexPrintf("v\n"); - for (int i = 0; i < n; i++) - mexPrintf("%f\n",tmp_vect_host[i]);*/ - - /**alpha = rho / (rr0 . v) with rr0 = r0*/ - cublasChk(cublasDdot(cublas_handle, n, // numerator - r0, 1, - v, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDdot(r0, v) has failed\n"); - - alpha = rho / tmp1; - mexPrintf("rho = %f, tmp1 = %f\n", rho, tmp1); - mexPrintf("alpha = %f\n", alpha); - - if (alpha == 0 || isinf(alpha) || isnan(alpha)) - { - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); - ostringstream tmp; - tmp << "one of the scalar quantities (alpha=" << alpha << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 4; - } - - /** Check for potential stagnation*/ - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - y_, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n"); - cublasChk(cublasDnrm2(cublas_handle, n, // denominator - x0, 1, - &tmp2), - " in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n"); - mexPrintf("abs(alpha)*tmp1 = %f, alpha = %f, tmp1 = %f, tmp2 = %f, eps = %f\n", abs(alpha)*tmp1, alpha, tmp1, tmp2, eps); - if (abs(alpha)*tmp1 < eps * tmp2) - stagnation++; - else - stagnation = 0; - - /**x = x + alpha * y_*/ - VecInc<<< nblocks, n_threads>>> (x0, alpha, y_, n); - - /**s = r_prev - alpha *v with r_prev = s*/ - VecInc<<< nblocks, n_threads>>> (s, -alpha, v, n); - - /**Has BiCGStab converged?*/ - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - s, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n"); - conv_criteria = tmp1; - mexPrintf("conv_criteria = %f, tolb = %f\n", conv_criteria, tolb); - convergence = conv_criteria < tolb; - - if (convergence || stagnation >= max_stagnation || refinement_needed) - { - /**s = b - A * x0*/ - cudaChk(cudaMemcpy(s, b, n * sizeof(double), 
cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy s = b has failed\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, nnz, &m_one, CUDA_descr, - Ax, Ap, Ai, - x0, &one, - s), - " in Solve_Cuda_BiCGStab, s = b - cusparseDcsrmv(A, x0) has failed\n"); - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - s, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n"); - conv_criteria = tmp1; - convergence = conv_criteria < tolb; - if (convergence) - { - break; - } - else - { - if (stagnation >= max_stagnation && refinement_needed == 0) - stagnation = 0; - refinement_needed++; - if (refinement_needed > max_refinement) - { - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); - ostringstream tmp; - tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 3; - } - } - } - - /**z solution of A_tild * z = s*/ - // L tmp_ = s => tmp_ = L^-1 s, with tmp_ = U . z - if (preconditioner == 3) - { - double *s_tild; - cudaChk(cudaMalloc((void **) &s_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, P_nnz * periods, &one, CUDA_descr, - Px, Pj, Pi, - s, &zeros, - s_tild), - " in Solve_Cuda_BiCGStab, s_tild = cusparseDcsrmv(P', s) has failed\n"); - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrL, - A_tild, A_tild_p, A_tild_i, - infoL, s_tild, - tmp_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s_tild has failed\n"); - cudaChk(cudaFree(s_tild), " in Solve_Cuda_BiCGStab, can't free s_tild\n"); - } - else - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrL, - //Lx, Lp, Li, - A_tild, A_tild_p, A_tild_i, - infoL, s, - tmp_), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s has failed\n"); - // U . z = L^-1 s <=> U . z = tmp_ => z = U^-1 L^-1 s - cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, &one, - descrU, - //Ux, Up, Ui, - A_tild, A_tild_p, A_tild_i, - infoU, tmp_, - z), - " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . 
z = tmp_ has failed\n"); - if (preconditioner == 3) - { - double *z_tild; - cudaChk(cudaMalloc((void **) &z_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate z_tild on the graphic card\n"); - cudaChk(cudaMemcpy(z_tild, z, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy z_tild = z has failed\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, Q_nnz * periods, &one, CUDA_descr, - Qx, Qj, Qi, - z_tild, &zeros, - z), - " in Solve_Cuda_BiCGStab, z = cusparseDcsrmv(Q, z_tild) has failed\n"); - cudaChk(cudaFree(z_tild), " in Solve_Cuda_BiCGStab, can't free x_tild\n"); - } - /**t = A * z*/ - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, nnz, &one, CUDA_descr, - Ax, Ap, Ai, - z, &zeros, - t), - " in Solve_Cuda_BiCGStab, t = cusparseDcsrmv(A, z) has failed\n"); - - /** omega = (t' s) / (t' t)*/ - cublasChk(cublasDdot(cublas_handle, n, // numerator - t, 1, - s, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDdot(t, s) has failed\n"); - - cublasChk(cublasDdot(cublas_handle, n, // numerator - t, 1, - t, 1, - &tmp2), - " in Solve_Cuda_BiCGStab, cublasDdot(t, t) has failed\n"); - - omega = tmp1 / tmp2; - - if (omega == 0 || isinf(omega) || isnan(omega)) - { - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); - ostringstream tmp; - mexEvalString("diary off;"); - tmp << "one of the scalar quantities (omega=" << omega << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 4; - } - - /**x = x + omega * z*/ - VecInc<<< nblocks, n_threads>>> (x0, omega, z, n); - - /**r = s - omega * t*/ - VecAdd<<< nblocks, n_threads>>> (r, s, -omega, t, n); - - /**Has BiCGStab converged?*/ - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - r, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); - conv_criteria = tmp1; - - convergence = conv_criteria < tolb; - - if (convergence || stagnation >= max_stagnation || refinement_needed) - { - /**r = b - A * x0*/ - cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n"); - cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - n, n, nnz, &m_one, CUDA_descr, - Ax, Ap, Ai, - x0, &one, - r), - " in Solve_Cuda_BiCGStab, r = b - cusparseDcsrmv(A, x0) has failed\n"); - cublasChk(cublasDnrm2(cublas_handle, n, // numerator - r, 1, - &tmp1), - " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); - conv_criteria = tmp1; - convergence = conv_criteria < tolb; - if (convergence) - { - mexPrintf("convergence achieved\n"); - break; - } - else - { - if (stagnation >= max_stagnation && refinement_needed == 0) - stagnation = 0; - refinement_needed++; - if (refinement_needed > max_refinement) - { - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, /*Lx, Li, Lp, Ux, Ui, Up, device_n, */ infoL, infoU, descrL, descrU, preconditioner); - ostringstream tmp; - mexEvalString("diary off;"); - tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 3; - } - } - } - - iteration++; - } - cudaChk(cudaMemcpy(tmp_vect_host, x0, n * sizeof(double), cudaMemcpyDeviceToHost), " in 
Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = x0 has failed\n"); - - if (is_two_boundaries) - for (int i = 0; i < n; i++) - { - int eq = index_vara[i+Size*y_kmin]; - double yy = -(tmp_vect_host[i] + y[eq]); - direction[eq] = yy; - y[eq] += slowc * yy; - } - else - for (int i = 0; i < n; i++) - { - int eq = index_vara[i]; - double yy = -(tmp_vect_host[i] + y[eq+it_*y_size]); - direction[eq] = yy; - y[eq+it_*y_size] += slowc * yy; - } - Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); - - if (iteration >= max_iterations) - { - ostringstream tmp; - mexEvalString("diary off;"); - tmp << "Error in bytecode: No convergence inside BiCGStab, in block " << block+1; - mexWarnMsgTxt(tmp.str().c_str()); - return 1; - } - else - return 0; -} -#endif - void dynSparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m) { @@ -6339,13 +4762,6 @@ dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin { double top = 0.5; double bottom = 0.1; -#ifdef CUDA - int nnz, nnz_tild; - int *Ap_i, *Ai_i; - int *Ap_i_tild, *Ai_i_tild; - double *x0, *A_tild; - -#endif int preconditioner = 2; if (start_compare == 0) start_compare = y_kmin; @@ -6551,10 +4967,6 @@ dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin } if (stack_solve_algo == 0 || stack_solve_algo == 4) Init_UMFPACK_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap, &Ai, &Ax, &b, x0_m, vector_table_conditional_local, blck); -#ifdef CUDA - else if (stack_solve_algo == 7) - Init_CUDA_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap_i, &Ai_i, &Ax, &Ap_i_tild, &Ai_i_tild, &A_tild, &b, &x0, x0_m, &nnz, &nnz_tild, preconditioner); -#endif else Init_Matlab_Sparse(periods, y_kmin, y_kmax, Size, IM_i, A_m, b_m, x0_m); @@ -6569,10 +4981,6 @@ dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, true, 0, x0_m, 1); else if (stack_solve_algo == 5) Solve_ByteCode_Symbolic_Sparse_GaussianElimination(Size, symbolic, blck); -#ifdef CUDA - else if (stack_solve_algo == 7) - Solve_CUDA_BiCGStab(Ap_i, Ai_i, Ax, Ap_i_tild, Ai_i_tild, A_tild, b, x0, Size * periods, Size, slowc, true, 0, nnz, nnz_tild, preconditioner, Size * periods, blck); -#endif } if (print_it) { diff --git a/mex/sources/bytecode/SparseMatrix.hh b/mex/sources/bytecode/SparseMatrix.hh index 6780e5120..e662c0a21 100644 --- a/mex/sources/bytecode/SparseMatrix.hh +++ b/mex/sources/bytecode/SparseMatrix.hh @@ -28,51 +28,11 @@ #include "dynblas.h" #include "dynumfpack.h" -#ifdef CUDA -# include "cuda.h" -# include "cuda_runtime_api.h" -# include "cublas_v2.h" -# include "cusparse_v2.h" -#endif - #include "Mem_Mngr.hh" #include "ErrorHandling.hh" //#include "Interpreter.hh" #include "Evaluate.hh" -#define cudaChk(x, y) \ - { \ - cudaError_t cuda_error = x; \ - if (cuda_error != cudaSuccess) \ - { \ - ostringstream tmp; \ - tmp << y; \ - throw FatalExceptionHandling(tmp.str()); \ - } \ - }; - -#define cusparseChk(x, y) \ - { \ - cusparseStatus_t cusparse_status = x; \ - if (cusparse_status != CUSPARSE_STATUS_SUCCESS) \ - { \ - ostringstream tmp; \ - tmp << y; \ - throw FatalExceptionHandling(tmp.str()); \ - } \ - }; - -#define cublasChk(x, y) \ - { \ - cublasStatus_t cublas_status = x; \ - if (cublas_status != CUBLAS_STATUS_SUCCESS) \ - { \ - ostringstream tmp; \ - tmp << y; \ - throw 
FatalExceptionHandling(tmp.str()); \
-      } \
-  };
-
 #define NEW_ALLOC
 #define MARKOVITZ
@@ -101,11 +61,7 @@ class dynSparseMatrix : public Evaluate
 {
 public:
   dynSparseMatrix();
-  dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg, const double slowc_arg
-#ifdef CUDA
-                  , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
-#endif
-                  );
+  dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg, const double slowc_arg);
   void Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names, vector_table_conditional_local_type vector_table_conditional_local);
   void Simulate_Newton_One_Boundary(bool forward);
   void fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1);
@@ -123,12 +79,8 @@ private:
   void Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM);
   void Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m);
   void Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m, vector_table_conditional_local_type vector_table_conditional_local, int block_num);
-#ifdef CUDA
-  void Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner);
-#endif
   void Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m);
   void Init_UMFPACK_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m);
-  void Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m);
   void Simple_Init(int Size, std::map<pair<pair<int, int>, int>, int> &IM, bool &zero_solution);
   void End_GE(int Size);
   bool mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc);
@@ -145,13 +97,6 @@
   void Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int it_);
   void End_Matlab_LU_UMFPack();
-#ifdef CUDA
-  void Solve_CUDA_BiCGStab_Free(double *tmp_vect_host, double *p, double *r, double *v, double *s, double *t, double *y_, double *z, double *tmp_,
-                                int *Ai, double *Ax, int *Ap, double *x0, double *b, double *A_tild, int *A_tild_i, int *A_tild_p,
-                                cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU,
-                                cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner);
-  int Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, int it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block);
-#endif
   void
Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m); void Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int precond); void Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old); @@ -196,12 +141,6 @@ private: mxArray *Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m); mxArray *Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m); mxArray *substract_A_B(mxArray *A_m, mxArray *B_m); -#ifdef CUDA - int CUDA_device; - cublasHandle_t cublas_handle; - cusparseHandle_t cusparse_handle; - cusparseMatDescr_t CUDA_descr; -#endif protected: stack Stack; int nb_prologue_table_u, nb_first_table_u, nb_middle_table_u, nb_last_table_u; diff --git a/mex/sources/bytecode/SparseMatrix_kernel.cu b/mex/sources/bytecode/SparseMatrix_kernel.cu deleted file mode 100644 index 9d37596fa..000000000 --- a/mex/sources/bytecode/SparseMatrix_kernel.cu +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (C) 2007-2012 Dynare Team - * - * This file is part of Dynare. - * - * Dynare is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Dynare is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Dynare. If not, see . - */ - -#ifndef SPARMATRIX_KERNEL -#define SPARMATRIX_KERNEL - -// Kernel definition of vector division -__global__ void -VecDiv(double* A, double* B, double* C, int n) -{ - int i = blockIdx.x * 1024 + threadIdx.x; - if (i < n) - C[i] = (B[i] != 0.0 ? 
A[i] / B[i] : A[i]); -} - -__global__ void -VecAdd(double* res, double* r, double alpha, double* x, int n) -{ - int i = blockIdx.x * 1024 + threadIdx.x; - if (i < n) - res[i] = r[i] + alpha * x[i]; -} - -__global__ void -VecInc(double* res, double alpha, double* x, int n) -{ - int i = blockIdx.x * 1024 + threadIdx.x; - if (i < n) - res[i] += alpha * x[i]; -} - - -__global__ void -update_x(double* x, double alpha, double* y, double omega, double *z) -{ - int i = threadIdx.x; - x[i] += alpha * y[i] + omega * z[i]; -} - -__global__ void -Get_LU_dim(int *n, int* A_tild_i, int *A_tild_p, int *nnz_l, int *nnz_u) -{ - nnz_u[0] = 0; - nnz_l[0] = 0; - for (int i = 0; i < n[0]; i++) - { - for (int j = A_tild_p[i]; j < A_tild_p[i+1]; j++) - { - if (A_tild_i[j] < i) - nnz_l[0]++; - else if (A_tild_i[j] == i) - { - nnz_u[0]++; - //nnz_l[0]++; - } - else - nnz_u[0]++; - } - } -} - -__global__ void -Get_LU1_dim(int* n, int *nnz_l, int *nnz_u) -{ - nnz_u[0] = 3+n[0]; - nnz_l[0] = 1+n[0]; -} - - -__global__ void -Get_L_and_U(int *n, double* A_tild_x, int* A_tild_i, int *A_tild_p, double* Lx, int* Li, int *Lp, double* Ux, int* Ui, int* Up) -{ - int nnz_u = 0, nnz_l = 0; - Lp[0] = 0; - Up[0] = 0; - for (int i = 0; i < n[0]; i++) - { - for (int j = A_tild_p[i]; j < A_tild_p[i+1]; j++) - { - if (A_tild_i[j] < i) - { - Lx[nnz_l] = A_tild_x[j]; - Li[nnz_l] = A_tild_i[j]; - nnz_l++; - } - else if (A_tild_i[j] == i) - { - Ux[nnz_u] = A_tild_x[j]; - Lx[nnz_l] = 1.0; - Li[nnz_l] = Ui[nnz_u] = A_tild_i[j]; - nnz_u++; - //nnz_l++; - } - else - { - Ux[nnz_u] = A_tild_x[j]; - Ui[nnz_u] = A_tild_i[j]; - nnz_u++; - } - } - Lp[i+1] = nnz_l; - Up[i+1] = nnz_u; - } -} -#endif diff --git a/mex/sources/bytecode/bytecode.cc b/mex/sources/bytecode/bytecode.cc index c3ddfdc1f..d338296cd 100644 --- a/mex/sources/bytecode/bytecode.cc +++ b/mex/sources/bytecode/bytecode.cc @@ -57,164 +57,6 @@ Get_Argument(const mxArray *prhs) //#include #include -#ifdef CUDA -int -GPU_Test_and_Info(cublasHandle_t *cublas_handle, cusparseHandle_t *cusparse_handle, cusparseMatDescr_t *descr) -{ - cudaDeviceProp deviceProp; - int device_count, device, version, version_max = 0; - cublasStatus_t cublas_status; - cudaError_t cuda_error; - *descr = 0; - - /* ask cuda how many devices it can find */ - cudaGetDeviceCount(&device_count); - if (device_count < 1) - { - /* if it couldn't find any fail out */ - ostringstream tmp; - tmp << " Unable to find a CUDA device. 
Unable to implement CUDA solvers\n"; - throw FatalExceptionHandling(tmp.str()); - } - else - { - mexPrintf("-----------------------------------------\n"); - for (int i = 0; i < device_count; i++) - { - cudaSetDevice(i); - // Statistics about the GPU device - cuda_error = cudaGetDeviceProperties(&deviceProp, i); - if (cuda_error != cudaSuccess) - { - ostringstream tmp; - tmp << " bytecode cudaGetDeviceProperties failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - mexPrintf("> GPU device %d: \"%s\" has:\n - %d Multi-Processors,\n - %d threads per multiprocessor,\n", i, deviceProp.name, deviceProp.multiProcessorCount, deviceProp.maxThreadsPerMultiProcessor); - mexEvalString("drawnow;"); - version = (deviceProp.major * 0x10 + deviceProp.minor); - if (version >= version_max) - { - device = i; - version_max = version; - } - mexPrintf(" - %4.2fMhz clock rate,\n - %2.0fMb of memory,\n - %d.%d compute capabilities.\n", double (deviceProp.clockRate) / (1024 * 1024), double (deviceProp.totalGlobalMem) / (1024 * 1024), deviceProp.major, deviceProp.minor); - mexEvalString("drawnow;"); - } - } - mexPrintf("> Device %d selected\n", device); - mexEvalString("drawnow;"); - - cuda_error = cudaSetDevice(device); - if (cuda_error != cudaSuccess) - { - ostringstream tmp; - tmp << " bytecode cudaSetDevice failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - - if (version_max < 0x11) - { - ostringstream tmp; - tmp << " bytecode requires a minimum CUDA compute 1.1 capability\n"; - cudaDeviceReset(); - throw FatalExceptionHandling(tmp.str()); - } - - // Initialize CuBlas library - cublas_status = cublasCreate(cublas_handle); - if (cublas_status != CUBLAS_STATUS_SUCCESS) - { - ostringstream tmp; - switch (cublas_status) - { - case CUBLAS_STATUS_NOT_INITIALIZED: - tmp << " the CUBLAS initialization failed.\n"; - break; - case CUBLAS_STATUS_ALLOC_FAILED: - tmp << " the resources could not be allocated.\n"; - break; - default: - tmp << " unknown error during the initialization of cusparse library.\n"; - } - throw FatalExceptionHandling(tmp.str()); - } - - // Initialize the CuSparse library - cusparseStatus_t cusparse_status; - cusparse_status = cusparseCreate(cusparse_handle); - if (cusparse_status != CUSPARSE_STATUS_SUCCESS) - { - ostringstream tmp; - switch (cusparse_status) - { - case CUSPARSE_STATUS_NOT_INITIALIZED: - tmp << " the CUDA Runtime initialization failed.\n"; - break; - case CUSPARSE_STATUS_ALLOC_FAILED: - tmp << " the resources could not be allocated.\n"; - break; - case CUSPARSE_STATUS_ARCH_MISMATCH: - tmp << " the device compute capability (CC) is less than 1.1. 
The CC of at least 1.1 is required.\n"; - break; - default: - tmp << " unknown error during the initialization of cusparse library.\n"; - } - throw FatalExceptionHandling(tmp.str()); - } - - // Create and setup matrix descriptor - cusparse_status = cusparseCreateMatDescr(descr); - if (cusparse_status != CUSPARSE_STATUS_SUCCESS) - { - ostringstream tmp; - tmp << " Matrix descriptor initialization failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - cusparseSetMatType(*descr, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(*descr, CUSPARSE_INDEX_BASE_ZERO); - - mexPrintf("> Driver version:\n"); - int cuda_version; - cuda_error = cudaDriverGetVersion(&cuda_version); - if (cuda_error != cudaSuccess) - { - ostringstream tmp; - tmp << " cudaGetVersion has failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - mexPrintf(" - CUDA version %5.3f\n", double (cuda_version) / 1000); - int cublas_version; - cublas_status = cublasGetVersion(*cublas_handle, &cublas_version); - if (cublas_status != CUBLAS_STATUS_SUCCESS) - { - ostringstream tmp; - tmp << " cublasGetVersion has failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - mexPrintf(" - CUBLAS version %5.3f\n", double (cublas_version) / 1000); - int cusparse_version; - cusparse_status = cusparseGetVersion(*cusparse_handle, &cusparse_version); - if (cusparse_status != CUSPARSE_STATUS_SUCCESS) - { - ostringstream tmp; - tmp << " cusparseGetVersion has failed\n"; - throw FatalExceptionHandling(tmp.str()); - } - mexPrintf(" - CUSPARSE version %5.3f\n", double (cusparse_version) / 1000); - mexPrintf("-----------------------------------------\n"); - return device; -} - -void -GPU_close(cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle, cusparseMatDescr_t descr) -{ - cublasChk(cublasDestroy(cublas_handle), "in bytecode cublasDestroy failed\n"); - cusparseChk(cusparseDestroyMatDescr(descr), "in bytecode cusparseDestroyMatDescr failed\n"); - cusparseChk(cusparseDestroy(cusparse_handle), "in bytecode cusparseDestroy failed\n"); -} - -#endif string deblank(string x) { @@ -437,12 +279,6 @@ main(int nrhs, const char *prhs[]) int max_periods = 0; -#ifdef CUDA - int CUDA_device = -1; - cublasHandle_t cublas_handle; - cusparseHandle_t cusparse_handle; - cusparseMatDescr_t descr; -#endif try { Get_Arguments_and_global_variables(nrhs, prhs, count_array_argument, @@ -1005,20 +841,9 @@ main(int nrhs, const char *prhs[]) mexWarnMsgTxt("Not enough space. Filename is truncated."); string file_name = fname; -#ifdef CUDA - try - { - if (stack_solve_algo == 7 && !steady_state) - CUDA_device = GPU_Test_and_Info(&cublas_handle, &cusparse_handle, &descr); - } - catch (GeneralExceptionHandling &feh) - { - mexErrMsgTxt(feh.GetErrorMsg().c_str()); - } -#else if (stack_solve_algo == 7 && !steady_state) - mexErrMsgTxt("bytecode has not been compiled with CUDA option. 
Bytecode Can't use options_.stack_solve_algo=7\n");
-#endif
+    mexErrMsgTxt("Bytecode: Can't use option stack_solve_algo=7\n");
+
   size_t size_of_direction = col_y*row_y*sizeof(double);
   auto *y = static_cast<double *>(mxMalloc(size_of_direction));
   error_msg.test_mxMalloc(y, __LINE__, __FILE__, __func__, size_of_direction);
@@ -1045,11 +870,7 @@ main(int nrhs, const char *prhs[])
   clock_t t0 = clock();
   Interpreter interprete(params, y, ya, x, steady_yd, steady_xd, direction, y_size, nb_row_x, nb_row_xd, periods, y_kmin, y_kmax, maxit_, solve_tolf, size_of_direction, slowc, y_decal, markowitz_c, file_name, minimal_solving_periods, stack_solve_algo, solve_algo, global_temporary_terms, print, print_error, GlobalTemporaryTerms, steady_state,
-                         print_it, col_x, col_y
-#ifdef CUDA
-                         , CUDA_device, cublas_handle, cusparse_handle, descr
-#endif
-                         );
+                         print_it, col_x, col_y);
   string f(fname);
   mxFree(fname);
   int nb_blocks = 0;
@@ -1078,11 +899,6 @@ main(int nrhs, const char *prhs[])
         }
     }
 
-#ifdef CUDA
-  if (stack_solve_algo == 7 && !steady_state)
-    GPU_close(cublas_handle, cusparse_handle, descr);
-#endif
-
   clock_t t1 = clock();
   if (!steady_state && !evaluate && print)
     mexPrintf("Simulation Time=%f milliseconds\n", 1000.0*(double (t1)-double (t0))/double (CLOCKS_PER_SEC));