dynare/mex/sources/bytecode/SparseMatrix.cc

/*
 * Copyright (C) 2007-2017 Dynare Team
 *
 * This file is part of Dynare.
 *
 * Dynare is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dynare is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dynare.  If not, see <http://www.gnu.org/licenses/>.
 */

//define _GLIBCXX_USE_C99_FENV_TR1 1
//include <cfenv>

#include <cstring>
#include <ctime>
#include <sstream>
//#include <gsl/gsl_min.h>
//#include <minimize.h>
#include "SparseMatrix.hh"

#ifdef CUDA
# include "SparseMatrix_kernel.cu"
#endif

using namespace std;
#ifdef _MSC_VER
/* MSVC build: the UMFPACK routines are not linked in but resolved at run time
   from libmwumfpack.dll (see the LoadLibrary/GetProcAddress calls in the
   dynSparseMatrix constructors). The macros, function-pointer types and
   global function pointers below replace the declarations this file would
   otherwise need. */
# include <windows.h>
/* Handle of libmwumfpack.dll, assigned by LoadLibrary in the constructors. */
HINSTANCE hinstLib;

/* Sizes of the Info and Control arrays used in the signatures below.
   NOTE(review): presumably copied from umfpack.h -- confirm they match the
   UMFPACK version shipped in the DLL. */
# define UMFPACK_INFO 90
# define UMFPACK_CONTROL 20
/* used in all UMFPACK_report_* routines: */
# define UMFPACK_PRL 0                   /* print level */
/* returned by all routines that use Info: */
# define UMFPACK_OK (0)
# define UMFPACK_STATUS 0        /* UMFPACK_OK, or other result */

/* One typedef plus one global function pointer per UMFPACK entry point; each
   pointer is filled in by GetProcAddress in the constructors. */
typedef void (*t_umfpack_dl_free_numeric)(void **Numeric);
t_umfpack_dl_free_numeric umfpack_dl_free_numeric;
typedef void (*t_umfpack_dl_free_symbolic)(void **Symbolic);
t_umfpack_dl_free_symbolic umfpack_dl_free_symbolic;
typedef int64_t (*t_umfpack_dl_solve)(int64_t sys,
                                      const int64_t Ap [],
                                      const int64_t Ai [],
                                      const double Ax [],
                                      double X [],
                                      const double B [],
                                      void *Numeric,
                                      const double Control [UMFPACK_CONTROL],
                                      double Info [UMFPACK_INFO]);
t_umfpack_dl_solve umfpack_dl_solve;
typedef int64_t (*t_umfpack_dl_numeric)(const int64_t Ap [],
                                        const int64_t Ai [],
                                        const double Ax [],
                                        void *Symbolic,
                                        void **Numeric,
                                        const double Control [UMFPACK_CONTROL],
                                        double Info [UMFPACK_INFO]);
t_umfpack_dl_numeric umfpack_dl_numeric;
typedef int64_t (*t_umfpack_dl_symbolic)(int64_t n_row,
                                         int64_t n_col,
                                         const int64_t Ap [],
                                         const int64_t Ai [],
                                         const double Ax [],
                                         void **Symbolic,
                                         const double Control [UMFPACK_CONTROL],
                                         double Info [UMFPACK_INFO]);
t_umfpack_dl_symbolic umfpack_dl_symbolic;
typedef void (*t_umfpack_dl_report_info)(const double Control [UMFPACK_CONTROL],
                                         const double Info [UMFPACK_INFO]);
t_umfpack_dl_report_info umfpack_dl_report_info;
typedef void (*t_umfpack_dl_report_status)(const double Control [UMFPACK_CONTROL],
                                           int64_t status);
t_umfpack_dl_report_status umfpack_dl_report_status;
typedef void (*t_umfpack_dl_defaults)(double Control [UMFPACK_CONTROL]);
t_umfpack_dl_defaults umfpack_dl_defaults;

#endif

#ifdef _MSC_VER
/* Resolves the exported symbol `name` in the already loaded libmwumfpack.dll
   (global handle hinstLib) and stores its address in `target`.
   Throws FatalExceptionHandling when the DLL does not export the symbol. */
template<typename FunctionPointer>
static void
resolve_umfpack_symbol(FunctionPointer &target, const char *name)
{
  target = reinterpret_cast<FunctionPointer>(GetProcAddress(hinstLib, name));
  if (!target)
    {
      ostringstream tmp;
      tmp << " in libmwumfpack.dll, the function " << name << " is not found.";
      throw FatalExceptionHandling(tmp.str());
    }
}

/* Loads libmwumfpack.dll and fills every global umfpack_dl_* function pointer.
   Shared by both constructors so the two code paths cannot diverge (the
   default constructor used to look up the non-existent symbol
   "umfpack_dl_free_solve" and therefore always failed).
   Throws FatalExceptionHandling if the DLL or one of its symbols is missing. */
static void
load_umfpack_library()
{
  // Get a handle to the DLL module.
  hinstLib = LoadLibrary(TEXT("libmwumfpack.dll"));
  if (!hinstLib)
    {
      mexPrintf("library loading error\n");
      ostringstream tmp;
      tmp << " in main, libmwumfpack.dll not found. \n Check that \\Program files\\MATLAB\\RXXXXX\\bin\\win64 is in the current path.";
      throw FatalExceptionHandling(tmp.str());
    }
  // The handle is valid: get the address of every routine used by this file.
  resolve_umfpack_symbol(umfpack_dl_free_numeric, "umfpack_dl_free_numeric");
  resolve_umfpack_symbol(umfpack_dl_free_symbolic, "umfpack_dl_free_symbolic");
  resolve_umfpack_symbol(umfpack_dl_solve, "umfpack_dl_solve");
  resolve_umfpack_symbol(umfpack_dl_numeric, "umfpack_dl_numeric");
  resolve_umfpack_symbol(umfpack_dl_symbolic, "umfpack_dl_symbolic");
  resolve_umfpack_symbol(umfpack_dl_report_info, "umfpack_dl_report_info");
  resolve_umfpack_symbol(umfpack_dl_report_status, "umfpack_dl_report_status");
  resolve_umfpack_symbol(umfpack_dl_defaults, "umfpack_dl_defaults");
}
#endif

/* Default constructor: resets the solver bookkeeping members to their initial
   values and, with MSVC, loads the UMFPACK routines from libmwumfpack.dll.
   Throws FatalExceptionHandling (MSVC only) when the DLL or a symbol is missing. */
dynSparseMatrix::dynSparseMatrix()
{
  pivotva = NULL;
  g_save_op = NULL;
  g_nop_all = 0;
  mem_mngr.init_Mem();
  symbolic = true;
  alt_symbolic = false;
  alt_symbolic_count = 0;
  max_u = 0;
  min_u = 0x7FFFFFFF;
  res1a = 9.0e60;
  tbreak_g = 0;
  start_compare = 0;
  restart = 0;
  IM_i.clear();
  lu_inc_tol = 1e-10;
  Symbolic = NULL;
  Numeric = NULL;
#ifdef _MSC_VER
  load_umfpack_library();
#endif
}

/* Full constructor: forwards the model dimensions and options to the Evaluate
   base class, resets the solver bookkeeping members exactly like the default
   constructor, stores the CUDA handles when built with CUDA and, with MSVC,
   loads the UMFPACK routines from libmwumfpack.dll.
   Throws FatalExceptionHandling (MSVC only) when the DLL or a symbol is missing. */
dynSparseMatrix::dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg,
                                 const int minimal_solving_periods_arg, const double slowc_arg
#ifdef CUDA
                                 , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
#endif
                                 ) :
  Evaluate(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg)
{
  pivotva = NULL;
  g_save_op = NULL;
  g_nop_all = 0;
  mem_mngr.init_Mem();
  symbolic = true;
  alt_symbolic = false;
  alt_symbolic_count = 0;
  max_u = 0;
  min_u = 0x7FFFFFFF;
  res1a = 9.0e60;
  tbreak_g = 0;
  start_compare = 0;
  restart = 0;
  IM_i.clear();
  lu_inc_tol = 1e-10;
  Symbolic = NULL;
  Numeric = NULL;
#ifdef CUDA
  CUDA_device = CUDA_device_arg;
  cublas_handle = cublas_handle_arg;
  cusparse_handle = cusparse_handle_arg;
  CUDA_descr = descr_arg;
#endif
#ifdef _MSC_VER
  load_umfpack_library();
#endif
}

/* Returns the number of non-zero elements recorded for row r (NbNZRow[r]). */
int
dynSparseMatrix::NRow(int r)
{
  const int nonzero_count = NbNZRow[r];
  return nonzero_count;
}

/* Returns the number of non-zero elements recorded for column c (NbNZCol[c]). */
int
dynSparseMatrix::NCol(int c)
{
  const int nonzero_count = NbNZCol[c];
  return nonzero_count;
}

int
dynSparseMatrix::At_Row(int r, NonZeroElem **first)
{
  (*first) = FNZE_R[r];
  return NbNZRow[r];
}

/* Returns the number of distinct column indices present in row1 or row2,
   i.e. the size of the union of their sparsity patterns.
   Both rows are walked in lock-step, relying on the row lists being sorted by
   increasing c_index (which is the order Insert maintains).
   The previous version stopped as soon as one row was exhausted and therefore
   ignored the remaining elements of the longer row; they are now counted. */
int
dynSparseMatrix::Union_Row(int row1, int row2)
{
  NonZeroElem *first1, *first2;
  int n1 = At_Row(row1, &first1);
  int n2 = At_Row(row2, &first2);
  int i1 = 0, i2 = 0, nb_elem = 0;
  while (i1 < n1 && i2 < n2)
    {
      const int c1 = first1->c_index;
      const int c2 = first2->c_index;
      // Each step contributes exactly one column to the union; a column shared
      // by both rows advances both cursors.
      nb_elem++;
      if (c1 <= c2)
        {
          i1++;
          first1 = first1->NZE_R_N;
        }
      if (c2 <= c1)
        {
          i2++;
          first2 = first2->NZE_R_N;
        }
    }
  // At most one of the two rows still has elements left; all of them are new columns.
  nb_elem += (n1 - i1) + (n2 - i2);
  return nb_elem;
}

int
dynSparseMatrix::At_Pos(int r, int c, NonZeroElem **first)
{
  (*first) = FNZE_R[r];
  while ((*first)->c_index != c)
    (*first) = (*first)->NZE_R_N;
  return NbNZRow[r];
}

int
dynSparseMatrix::At_Col(int c, NonZeroElem **first)
{
  (*first) = FNZE_C[c];
  return NbNZCol[c];
}

int
dynSparseMatrix::At_Col(int c, int lag, NonZeroElem **first)
{
  (*first) = FNZE_C[c];
  int i = 0;
  while ((*first)->lag_index != lag && (*first))
    (*first) = (*first)->NZE_C_N;
  if ((*first))
    {
      NonZeroElem *firsta = (*first);
      if (!firsta->NZE_C_N)
        i++;
      else
        {
          while (firsta->lag_index == lag && firsta->NZE_C_N)
            {
              firsta = firsta->NZE_C_N;
              i++;
            }
          if (firsta->lag_index == lag)
            i++;
        }
    }
  return i;
}

void
dynSparseMatrix::Delete(const int r, const int c)
{
  NonZeroElem *first = FNZE_R[r], *firsta = NULL;

  while (first->c_index != c)
    {
      firsta = first;
      first = first->NZE_R_N;
    }
  if (firsta != NULL)
    firsta->NZE_R_N = first->NZE_R_N;
  if (first == FNZE_R[r])
    FNZE_R[r] = first->NZE_R_N;
  NbNZRow[r]--;

  first = FNZE_C[c];
  firsta = NULL;
  while (first->r_index != r)
    {
      firsta = first;
      first = first->NZE_C_N;
    }

  if (firsta != NULL)
    firsta->NZE_C_N = first->NZE_C_N;
  if (first == FNZE_C[c])
    FNZE_C[c] = first->NZE_C_N;

  u_liste.push_back(first->u_index);
  mem_mngr.mxFree_NZE(first);
  NbNZCol[c]--;
}

void
dynSparseMatrix::Print(int Size, int *b)
{
  int a, i, j, k, l;
  mexPrintf("   ");
  for (k = 0; k < Size*periods; k++)
    mexPrintf("%-2d ", k);
  mexPrintf("    |    ");
  for (k = 0; k < Size*periods; k++)
    mexPrintf("%8d", k);
  mexPrintf("\n");
  for (i = 0; i < Size*periods; i++)
    {
      NonZeroElem *first = FNZE_R[i];
      j = NbNZRow[i];
      mexPrintf("%-2d ", i);
      a = 0;
      for (k = 0; k < j; k++)
        {
          for (l = 0; l < (first->c_index-a); l++)
            mexPrintf("   ");
          mexPrintf("%-2d ", first->u_index);
          a = first->c_index+1;
          first = first->NZE_R_N;
        }
      for (k = a; k < Size*periods; k++)
        mexPrintf("   ");
      mexPrintf("%-2d ", b[i]);

      first = FNZE_R[i];
      j = NbNZRow[i];
      mexPrintf(" | %-2d ", i);
      a = 0;
      for (k = 0; k < j; k++)
        {
          for (l = 0; l < (first->c_index-a); l++)
            mexPrintf("        ");
          mexPrintf("%8.4f", double (u[first->u_index]));
          a = first->c_index+1;
          first = first->NZE_R_N;
        }
      for (k = a; k < Size*periods; k++)
        mexPrintf("        ");
      mexPrintf("%8.4f", double (u[b[i]]));
      mexPrintf("\n");
    }
}

void
dynSparseMatrix::Insert(const int r, const int c, const int u_index, const int lag_index)
{
  NonZeroElem *firstn, *first, *firsta, *a;
  firstn = mem_mngr.mxMalloc_NZE();
  first = FNZE_R[r];
  firsta = NULL;
  while (first->c_index < c && (a = first->NZE_R_N))
    {
      firsta = first;
      first = a;
    }
  firstn->u_index = u_index;
  firstn->r_index = r;
  firstn->c_index = c;
  firstn->lag_index = lag_index;
  if (first->c_index > c)
    {
      if (first == FNZE_R[r])
        FNZE_R[r] = firstn;
      if (firsta != NULL)
        firsta->NZE_R_N = firstn;
      firstn->NZE_R_N = first;
    }
  else
    {
      first->NZE_R_N = firstn;
      firstn->NZE_R_N = NULL;
    }
  NbNZRow[r]++;
  first = FNZE_C[c];
  firsta = NULL;
  while (first->r_index < r && (a = first->NZE_C_N))
    {
      firsta = first;
      first = a;
    }
  if (first->r_index > r)
    {
      if (first == FNZE_C[c])
        FNZE_C[c] = firstn;
      if (firsta != NULL)
        firsta->NZE_C_N = firstn;
      firstn->NZE_C_N = first;
    }
  else
    {
      first->NZE_C_N = firstn;
      firstn->NZE_C_N = NULL;
    }

  NbNZCol[c]++;
}

/* Closes the SaveCode stream, i.e. the bytecode file that Read_SparseMatrix
   opens when it is not already open. */
void
dynSparseMatrix::Close_SaveCode()
{
  SaveCode.close();
}

/* Reads one (equation, variable, lag, index in u) record of the Jacobian
   description from the bytecode stream `code`. */
template<typename Stream>
static void
read_jacobian_record(Stream &code, unsigned int &eq, unsigned int &var, int &lag, int &val)
{
  code.read(reinterpret_cast<char *>(&eq), sizeof(eq));
  code.read(reinterpret_cast<char *>(&var), sizeof(var));
  code.read(reinterpret_cast<char *>(&lag), sizeof(lag));
  code.read(reinterpret_cast<char *>(&val), sizeof(val));
}

/* Loads the sparsity description of the current block from the bytecode file
   into IM_i and allocates/fills index_vara and index_equa.

   file_name          base name; <file_name>/model/bytecode/{static,dynamic}.bin
                      is opened when SaveCode is not open yet
   Size               number of equations of the block
   periods, y_kmin,
   y_kmax             used to size index_vara and to build the key of the
                      extra right-hand-side entries in the two-boundaries case
   two_boundaries     selects the two-boundaries layout (u_count_init-Size
                      records followed by Size right-hand-side entries)
   stack_solve_algo,
   solve_algo         select how the IM_i key is laid out

   Throws FatalExceptionHandling when the bytecode file cannot be opened. */
void
dynSparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool two_boundaries, int stack_solve_algo, int solve_algo)
{
  unsigned int eq, var;
  int lag, val;
  mem_mngr.fixe_file_name(file_name);
  if (!SaveCode.is_open())
    {
      const string bin_name = file_name + (steady_state ? "/model/bytecode/static.bin" : "/model/bytecode/dynamic.bin");
      SaveCode.open(bin_name, ios::in | ios::binary);
      if (!SaveCode.is_open())
        {
          ostringstream tmp;
          tmp << " in Read_SparseMatrix, " << bin_name << " cannot be opened\n";
          throw FatalExceptionHandling(tmp.str());
        }
    }
  IM_i.clear();
  if (two_boundaries)
    {
      if (stack_solve_algo == 5)
        {
          // Key layout: ((equation, variable), lag)
          for (int i = 0; i < u_count_init-Size; i++)
            {
              read_jacobian_record(SaveCode, eq, var, lag, val);
              IM_i[make_pair(make_pair(eq, var), lag)] = val;
            }
          for (int j = 0; j < Size; j++)
            IM_i[make_pair(make_pair(j, Size*(periods+y_kmax)), 0)] = j;
        }
      else if (stack_solve_algo >= 0 && stack_solve_algo <= 4)
        {
          // Key layout: ((variable at lag 0, -lag), equation)
          for (int i = 0; i < u_count_init-Size; i++)
            {
              read_jacobian_record(SaveCode, eq, var, lag, val);
              IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = val;
            }
          for (int j = 0; j < Size; j++)
            IM_i[make_pair(make_pair(Size*(periods+y_kmax), 0), j)] = j;
        }
      else if (stack_solve_algo == 7)
        {
          // Key layout: ((equation, lag), variable at lag 0)
          for (int i = 0; i < u_count_init-Size; i++)
            {
              read_jacobian_record(SaveCode, eq, var, lag, val);
              IM_i[make_pair(make_pair(eq, lag), var - lag * Size)] = val;
            }
          for (int j = 0; j < Size; j++)
            IM_i[make_pair(make_pair(Size*(periods+y_kmax), 0), j)] = j;
        }
    }
  else
    {
      if ((stack_solve_algo == 5 && !steady_state) || (solve_algo == 5 && steady_state))
        {
          for (int i = 0; i < u_count_init; i++)
            {
              read_jacobian_record(SaveCode, eq, var, lag, val);
              IM_i[make_pair(make_pair(eq, var), lag)] = val;
            }
        }
      else
        {
          /* NOTE(review): this branch used to be guarded by
             ((stack_solve_algo >= 0 || stack_solve_algo <= 4) && !steady_state)
             || ((solve_algo >= 6 || solve_algo <= 8) && steady_state),
             which is always true because of the `||` in the range tests.
             It is kept as the catch-all it effectively was: skipping the
             records would leave the stream mispositioned for the index_vara
             and index_equa reads below. Confirm the intended algorithm ranges
             before narrowing it. */
          for (int i = 0; i < u_count_init; i++)
            {
              read_jacobian_record(SaveCode, eq, var, lag, val);
              IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = val;
            }
        }
    }
  index_vara = (int *) mxMalloc(Size*(periods+y_kmin+y_kmax)*sizeof(int));
  test_mxMalloc(index_vara, __LINE__, __FILE__, __func__, Size*(periods+y_kmin+y_kmax)*sizeof(int));
  // The file only stores the indices of the first period ...
  for (int j = 0; j < Size; j++)
    SaveCode.read(reinterpret_cast<char *>(&index_vara[j]), sizeof(*index_vara));
  // ... the following periods are obtained by shifting the previous one by y_size.
  for (int i = 1; i < periods+y_kmin+y_kmax; i++)
    for (int j = 0; j < Size; j++)
      index_vara[j+Size*i] = index_vara[j+Size*(i-1)] + y_size;
  index_equa = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(index_equa, __LINE__, __FILE__, __func__, Size*sizeof(int));
  for (int j = 0; j < Size; j++)
    SaveCode.read(reinterpret_cast<char *>(&index_equa[j]), sizeof(*index_equa));
}

void
dynSparseMatrix::Simple_Init(int Size, map<pair<pair<int, int>, int>, int> &IM, bool &zero_solution)
{
  int i, eq, var, lag;
  map<pair<pair<int, int>, int>, int>::iterator it4;
  NonZeroElem *first;
  pivot = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(pivot, __LINE__, __FILE__, __func__, Size*sizeof(int));
  pivot_save = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(pivot_save, __LINE__, __FILE__, __func__, Size*sizeof(int));
  pivotk = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(pivotk, __LINE__, __FILE__, __func__, Size*sizeof(int));
  pivotv = (double *) mxMalloc(Size*sizeof(double));
  test_mxMalloc(pivotv, __LINE__, __FILE__, __func__, Size*sizeof(double));
  pivotva = (double *) mxMalloc(Size*sizeof(double));
  test_mxMalloc(pivotva, __LINE__, __FILE__, __func__, Size*sizeof(double));
  b = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(b, __LINE__, __FILE__, __func__, Size*sizeof(int));
  line_done = (bool *) mxMalloc(Size*sizeof(bool));
  test_mxMalloc(line_done, __LINE__, __FILE__, __func__, Size*sizeof(bool));

  mem_mngr.init_CHUNK_BLCK_SIZE(u_count);
  g_save_op = NULL;
  g_nop_all = 0;
  i = Size*sizeof(NonZeroElem *);
  FNZE_R = (NonZeroElem **) mxMalloc(i);
  test_mxMalloc(FNZE_R, __LINE__, __FILE__, __func__, i);
  FNZE_C = (NonZeroElem **) mxMalloc(i);
  test_mxMalloc(FNZE_C, __LINE__, __FILE__, __func__, i);
  NonZeroElem **temp_NZE_R = (NonZeroElem **) mxMalloc(i);
  test_mxMalloc(temp_NZE_R, __LINE__, __FILE__, __func__, i);
  NonZeroElem **temp_NZE_C = (NonZeroElem **) mxMalloc(i);
  test_mxMalloc(temp_NZE_C, __LINE__, __FILE__, __func__, i);
  i = Size*sizeof(int);
  NbNZRow = (int *) mxMalloc(i);
  test_mxMalloc(NbNZRow, __LINE__, __FILE__, __func__, i);
  NbNZCol = (int *) mxMalloc(i);
  test_mxMalloc(NbNZCol, __LINE__, __FILE__, __func__, i);
  it4 = IM.begin();
  eq = -1;
  for (i = 0; i < Size; i++)
    {
      line_done[i] = 0;
      FNZE_C[i] = NULL;
      FNZE_R[i] = NULL;
      temp_NZE_C[i] = 0;
      temp_NZE_R[i] = 0;
      NbNZRow[i] = 0;
      NbNZCol[i] = 0;
    }
  int u_count1 = Size;
  while (it4 != IM.end())
    {
      var = it4->first.first.second;
      eq = it4->first.first.first;
      lag = it4->first.second;
      if (lag == 0)   /*Build the index for sparse matrix containing the jacobian : u*/
        {
          NbNZRow[eq]++;
          NbNZCol[var]++;
          first = mem_mngr.mxMalloc_NZE();
          first->NZE_C_N = NULL;
          first->NZE_R_N = NULL;
          first->u_index = u_count1;
          first->r_index = eq;
          first->c_index = var;
          first->lag_index = lag;
          if (FNZE_R[eq] == NULL)
            FNZE_R[eq] = first;
          if (FNZE_C[var] == NULL)
            FNZE_C[var] = first;
          if (temp_NZE_R[eq] != NULL)
            temp_NZE_R[eq]->NZE_R_N = first;
          if (temp_NZE_C[var] != NULL)
            temp_NZE_C[var]->NZE_C_N = first;
          temp_NZE_R[eq] = first;
          temp_NZE_C[var] = first;
          u_count1++;
        }
      it4++;
    }
  double cum_abs_sum = 0;
  for (int i = 0; i < Size; i++)
    {
      b[i] = i;
      cum_abs_sum += fabs(u[i]);
    }
  if (cum_abs_sum < 1e-20)
    zero_solution = true;
  else
    zero_solution = false;

  mxFree(temp_NZE_R);
  mxFree(temp_NZE_C);
  u_count = u_count1;
}

void
dynSparseMatrix::Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m)
{
  int eq, var;
  double *b = mxGetPr(b_m);
  if (!b)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse_Simple, can't retrieve b vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  double *x0 = mxGetPr(x0_m);
  if (!x0)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse_Simple, can't retrieve x0 vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mwIndex *Ai = mxGetIr(A_m);
  if (!Ai)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse_Simple, can't allocate Ai index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mwIndex *Aj = mxGetJc(A_m);
  if (!Aj)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse_Simple, can't allocate Aj index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  double *A = mxGetPr(A_m);
  if (!A)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse_Simple, can't retrieve A matrix\n";
      throw FatalExceptionHandling(tmp.str());
    }
  map<pair<pair<int, int>, int>, int>::iterator it4;
  for (int i = 0; i < y_size*(periods+y_kmin); i++)
    ya[i] = y[i];
#ifdef DEBUG
  unsigned int max_nze = mxGetNzmax(A_m);
#endif
  unsigned int NZE = 0;
  int last_var = 0;
  double cum_abs_sum = 0;
  for (int i = 0; i < Size; i++)
    {
      b[i] = u[i];
      cum_abs_sum += fabs(b[i]);
      x0[i] = y[i];
    }
  if (cum_abs_sum < 1e-20)
    zero_solution = true;
  else
    zero_solution = false;

  Aj[0] = 0;
  last_var = 0;
  it4 = IM.begin();
  while (it4 != IM.end())
    {
      var = it4->first.first.first;
      if (var != last_var)
        {
          Aj[1+last_var ] = NZE;
          last_var = var;
        }
      eq = it4->first.second;
      int index = it4->second;
#ifdef DEBUG
      if (index < 0 || index >= u_count_alloc || index > Size + Size*Size)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (NZE >= max_nze)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, exceeds the capacity of A_m sparse matrix\n";
          throw FatalExceptionHandling(tmp.str());
        }
#endif
      A[NZE] = u[index];
      Ai[NZE] = eq;
      NZE++;
#ifdef DEBUG
      if (eq < 0 || eq >= Size)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, index (" << eq << ") out of range for b vector\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (var < 0 || var >= Size)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (index_vara[var] < 0 || index_vara[var] >= y_size)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n";
          throw FatalExceptionHandling(tmp.str());
        }
#endif
      it4++;
    }
  Aj[Size] = NZE;
}

void
dynSparseMatrix::Init_UMFPACK_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m)
{
  int eq, var;
  *b = (double *) mxMalloc(Size * sizeof(double));
  test_mxMalloc(*b, __LINE__, __FILE__, __func__, Size * sizeof(double));
  if (!(*b))
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't retrieve b vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  double *x0 = mxGetPr(x0_m);
  if (!x0)
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse_Simple, can't retrieve x0 vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  *Ap = (SuiteSparse_long *) mxMalloc((Size+1) * sizeof(SuiteSparse_long));
  test_mxMalloc(*Ap, __LINE__, __FILE__, __func__, (Size+1) * sizeof(SuiteSparse_long));
  if (!(*Ap))
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't allocate Ap index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  size_t prior_nz = IM.size();
  *Ai = (SuiteSparse_long *) mxMalloc(prior_nz * sizeof(SuiteSparse_long));
  test_mxMalloc(*Ai, __LINE__, __FILE__, __func__, prior_nz * sizeof(SuiteSparse_long));
  if (!(*Ai))
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't allocate Ai index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  *Ax = (double *) mxMalloc(prior_nz * sizeof(double));
  test_mxMalloc(*Ax, __LINE__, __FILE__, __func__, prior_nz * sizeof(double));
  if (!(*Ax))
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't retrieve Ax matrix\n";
      throw FatalExceptionHandling(tmp.str());
    }
  map<pair<pair<int, int>, int>, int>::iterator it4;
  for (int i = 0; i < Size; i++)
    {
      int eq = index_vara[i];
      ya[eq+it_*y_size] = y[eq+it_*y_size];
    }
#ifdef DEBUG
  unsigned int max_nze = prior_nz;//mxGetNzmax(A_m);
#endif
  unsigned int NZE = 0;
  int last_var = 0;
  double cum_abs_sum = 0;
  for (int i = 0; i < Size; i++)
    {
      (*b)[i] = u[i];
      cum_abs_sum += fabs((*b)[i]);
      x0[i] = y[i];
    }
  if (cum_abs_sum < 1e-20)
    zero_solution = true;
  else
    zero_solution = false;

  (*Ap)[0] = 0;
  last_var = 0;
  it4 = IM.begin();
  while (it4 != IM.end())
    {
      var = it4->first.first.first;
      if (var != last_var)
        {
          (*Ap)[1+last_var ] = NZE;
          last_var = var;
        }
      eq = it4->first.second;
      int index = it4->second;
#ifdef DEBUG
      if (index < 0 || index >= u_count_alloc || index > Size + Size*Size)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (NZE >= max_nze)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, exceeds the capacity of A_m sparse matrix\n";
          throw FatalExceptionHandling(tmp.str());
        }
#endif
      (*Ax)[NZE] = u[index];
      (*Ai)[NZE] = eq;
      NZE++;
#ifdef DEBUG
      if (eq < 0 || eq >= Size)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, index (" << eq << ") out of range for b vector\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (var < 0 || var >= Size)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (index_vara[var] < 0 || index_vara[var] >= y_size)
        {
          ostringstream tmp;
          tmp << " in Init_Matlab_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n";
          throw FatalExceptionHandling(tmp.str());
        }
#endif
      it4++;
    }
  (*Ap)[Size] = NZE;
}

/* Returns the position of the first element of sconstrained_extended_path
   whose exo_num field equals value, or -1 when no element matches. */
int
dynSparseMatrix::find_exo_num(vector<s_plan> sconstrained_extended_path, int value)
{
  const int nb_entries = static_cast<int>(sconstrained_extended_path.size());
  for (int pos = 0; pos < nb_entries; pos++)
    {
      if (sconstrained_extended_path[pos].exo_num == value)
        return pos;
    }
  return -1;
}

/* Returns the position of the first (date, value) pair of per_value whose
   first member equals value, or -1 when no pair matches. */
int
dynSparseMatrix::find_int_date(vector<pair<int, double> > per_value, int value)
{
  const int nb_entries = static_cast<int>(per_value.size());
  for (int pos = 0; pos < nb_entries; pos++)
    {
      if (per_value[pos].first == value)
        return pos;
    }
  return -1;
}

/* Assembles, for all periods at once, the sparse linear system handed to
   UMFPACK.

   IM maps ((variable, lead/lag), equation) to a position in the u vector.
   For every period t the map is walked in key order and each entry is
   dispatched as follows (lag is the negated second key member):
   - when lag lies inside [ti_new_y_kmin, ti_new_y_kmax] the value u[index]
     is appended to (Ai, Ax), with Ai = eq - lag*Size; Ap receives the
     running count each time the variable index changes;
   - when lag lies outside [ti_y_kmin, ti_y_kmax] the term is multiplied by
     the matching element of y and accumulated into b[eq];
   - when var >= (periods+y_kmax)*Size, u[index] is added to b[eq].

   When vector_table_conditional_local is non empty and the entry of a
   variable has is_cond set (first period only), the column of that variable
   is filled from column var_exo of jacobian_exo_block[block_num], u is
   corrected by the corresponding x term, and the endogenous term is moved
   to b instead of Ax.

   Side effects: *b, *Ap, *Ai and *Ax are allocated with mxMalloc (n,
   n+1, IM.size()*periods and IM.size()*periods elements, n = periods*Size)
   and are not freed here; the data of x0_m is overwritten with the current
   values of y; the first y_size*(periods+y_kmin) elements of y are copied
   into ya; u may be modified for conditional variables.

   Throws FatalExceptionHandling when an allocation fails, when the data of
   x0_m cannot be retrieved, or (DEBUG builds) when an index is out of range. */
void
dynSparseMatrix::Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m, vector_table_conditional_local_type vector_table_conditional_local, int block_num)
{
  int t, eq, var, lag, ti_y_kmin, ti_y_kmax;
  double *jacob_exo;
  int row_x = 0;
#ifdef DEBUG
  int col_x;
#endif
  int n = periods * Size;
  *b = (double *) mxMalloc(n * sizeof(double));
  // Same allocation check as for Ap, Ai and Ax below
  test_mxMalloc(*b, __LINE__, __FILE__, __func__, n * sizeof(double));
  if (!(*b))
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't retrieve b vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  double *x0 = mxGetPr(x0_m);
  if (!x0)
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't retrieve x0 vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  *Ap = (SuiteSparse_long *) mxMalloc((n+1) * sizeof(SuiteSparse_long));
  test_mxMalloc(*Ap, __LINE__, __FILE__, __func__, (n+1) * sizeof(SuiteSparse_long));
  if (!(*Ap))
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't allocate Ap index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  // Upper bound used for the number of stored elements: one per IM entry and per period
  size_t prior_nz = IM.size() * periods;
  *Ai = (SuiteSparse_long *) mxMalloc(prior_nz * sizeof(SuiteSparse_long));
  test_mxMalloc(*Ai, __LINE__, __FILE__, __func__, prior_nz * sizeof(SuiteSparse_long));
  if (!(*Ai))
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't allocate Ai index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  *Ax = (double *) mxMalloc(prior_nz * sizeof(double));
  test_mxMalloc(*Ax, __LINE__, __FILE__, __func__, prior_nz * sizeof(double));
  if (!(*Ax))
    {
      ostringstream tmp;
      tmp << " in Init_UMFPACK_Sparse, can't retrieve Ax matrix\n";
      throw FatalExceptionHandling(tmp.str());
    }
  map<pair<pair<int, int>, int>, int>::iterator it4;
  // Keep a copy of the current values of y in ya
  for (int i = 0; i < y_size*(periods+y_kmin); i++)
    ya[i] = y[i];
#ifdef DEBUG
  unsigned int max_nze = prior_nz; //mxGetNzmax(A_m);
#endif
  unsigned int NZE = 0;
  int last_var = 0;
  // Right-hand side starts at zero; the initial guess is the current y
  for (int i = 0; i < periods*Size; i++)
    {
      (*b)[i] = 0;
      x0[i] = y[index_vara[Size*y_kmin+i]];
    }
  if (vector_table_conditional_local.size())
    {
      jacob_exo = mxGetPr(jacobian_exo_block[block_num]);
      row_x = mxGetM(jacobian_exo_block[block_num]);
#ifdef DEBUG
      col_x = mxGetN(jacobian_exo_block[block_num]);
#endif
    }
  else
    {
      jacob_exo = NULL;
    }
#ifdef DEBUG
  int local_index;
#endif

  bool fliped = false;
  bool fliped_exogenous_derivatives_updated = false;
  int flip_exo;
  (*Ap)[0] = 0;
  for (t = 0; t < periods; t++)
    {
      // -1 so that the first variable met in this period writes (*Ap)[t*Size]
      last_var = -1;
      it4 = IM.begin();
      var = 0;
      while (it4 != IM.end())
        {
          var = it4->first.first.first;
#ifdef DEBUG
          if (var < 0 || var >= Size)
            {
              ostringstream tmp;
              tmp << " in Init_UMFPACK_Sparse, var (" << var << ") out of range\n";
              throw FatalExceptionHandling(tmp.str());
            }
#endif
          eq = it4->first.second+Size*t;
#ifdef DEBUG
          // eq already includes the period offset, hence the bound over all periods
          if (eq < 0 || eq >= Size * periods)
            {
              ostringstream tmp;
              tmp << " in Init_UMFPACK_Sparse, eq (" << eq << ") out of range\n";
              throw FatalExceptionHandling(tmp.str());
            }
#endif
          lag = -it4->first.first.second;
          int index = it4->second+ (t-lag) * u_count_init;
          if (var != last_var)
            {
              // New variable: record where its elements start
              (*Ap)[1+last_var + t * Size] = NZE;
              last_var = var;
              if (var < Size*(periods+y_kmax))
                {
                  if (t == 0 && vector_table_conditional_local.size())
                    {
                      fliped = vector_table_conditional_local[var].is_cond;
                      fliped_exogenous_derivatives_updated = false;
                    }
                  else
                    fliped = false;
                }
              else
                fliped = false;
            }
          if (fliped)
            {
              if ((t == 0) && (var < (periods+y_kmax)*Size) && (lag == 0) && (vector_table_conditional_local.size()))
                {
                  flip_exo = vector_table_conditional_local[var].var_exo;
#ifdef DEBUG
                  local_index = eq;
#endif
                  // The exogenous column is inserted only once per conditional variable
                  if (!fliped_exogenous_derivatives_updated)
                    {
                      fliped_exogenous_derivatives_updated = true;
                      for (int k = 0; k < row_x; k++)
                        {
                          if (jacob_exo[k + row_x*flip_exo] != 0)
                            {
                              (*Ax)[NZE] = jacob_exo[k + row_x*flip_exo];
                              (*Ai)[NZE] = k;
                              NZE++;

#ifdef DEBUG
                              if (local_index < 0 || local_index >= Size * periods)
                                {
                                  ostringstream tmp;
                                  tmp << " in Init_UMFPACK_Sparse, index (" << local_index << ") out of range for b vector\n";
                                  throw FatalExceptionHandling(tmp.str());
                                }
                              if (k + row_x*flip_exo < 0 || k + row_x*flip_exo >= row_x * col_x)
                                {
                                  ostringstream tmp;
                                  tmp << " in Init_UMFPACK_Sparse, index (" << var+Size*(y_kmin+t+lag) << ") out of range for jacob_exo vector\n";
                                  throw FatalExceptionHandling(tmp.str());
                                }
                              if (t+y_kmin+flip_exo*nb_row_x < 0 || t+y_kmin+flip_exo*nb_row_x >= nb_row_x * this->col_x)
                                {
                                  ostringstream tmp;
                                  tmp << " in Init_UMFPACK_Sparse, index (" << index_vara[var+Size*(y_kmin+t+lag)] << ") out of range for x vector max=" << nb_row_x * this->col_x << "\n";
                                  throw FatalExceptionHandling(tmp.str());
                                }
#endif
                              u[k] -=  jacob_exo[k + row_x*flip_exo] * x[t+y_kmin+flip_exo*nb_row_x];
                            }
                        }
                    }
                }
            }

          if (var < (periods+y_kmax)*Size)
            {
              ti_y_kmin = -min(t, y_kmin);
              ti_y_kmax = min(periods-(t +1), y_kmax);
              int ti_new_y_kmax = min(t, y_kmax);
              int ti_new_y_kmin = -min(periods-(t+1), y_kmin);
              if (lag <= ti_new_y_kmax && lag >= ti_new_y_kmin)   /*Build the index for sparse matrix containing the jacobian : u*/
                {
#ifdef DEBUG
                  // NOTE(review): the Size + Size*Size bound looks inherited from the
                  // single-period variant; index grows with (t-lag)*u_count_init - confirm
                  if (index < 0 || index >= u_count_alloc || index > Size + Size*Size)
                    {
                      ostringstream tmp;
                      tmp << " in Init_UMFPACK_Sparse, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  if (NZE >= max_nze)
                    {
                      ostringstream tmp;
                      tmp << " in Init_UMFPACK_Sparse, exceeds the capacity of A_m sparse matrix\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
#endif
                  if (!fliped)
                    {
                      (*Ax)[NZE] = u[index];
                      (*Ai)[NZE] = eq - lag * Size;
                      NZE++;
                    }
                  else
                    {
#ifdef DEBUG
                      if (eq - lag * Size < 0 || eq  - lag * Size >= Size * periods)
                        {
                          ostringstream tmp;
                          tmp << " in Init_UMFPACK_Sparse, index (" << eq  - lag * Size << ") out of range for b vector\n";
                          throw FatalExceptionHandling(tmp.str());
                        }
                      if (var+Size*(y_kmin+t) < 0 || var+Size*(y_kmin+t) >= Size*(periods+y_kmin+y_kmax))
                        {
                          ostringstream tmp;
                          tmp << " in Init_UMFPACK_Sparse, index (" << var+Size*(y_kmin+t) << ") out of range for index_vara vector\n";
                          throw FatalExceptionHandling(tmp.str());
                        }
                      if (index_vara[var+Size*(y_kmin+t)] < 0 || index_vara[var+Size*(y_kmin+t)] >= y_size*(periods+y_kmin+y_kmax))
                        {
                          ostringstream tmp;
                          tmp << " in Init_UMFPACK_Sparse, index (" << index_vara[var+Size*(y_kmin+t)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n";
                          throw FatalExceptionHandling(tmp.str());
                        }
#endif
                      // Conditional variable: its term goes to the right-hand side
                      (*b)[eq - lag * Size] += u[index] * y[index_vara[var+Size*(y_kmin+t)]];
                    }

                }
              if (lag > ti_y_kmax || lag < ti_y_kmin)
                {
#ifdef DEBUG
                  if (eq < 0 || eq >= Size * periods)
                    {
                      ostringstream tmp;
                      tmp << " in Init_UMFPACK_Sparse, index (" << eq << ") out of range for b vector\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  if (var+Size*(y_kmin+t+lag) < 0 || var+Size*(y_kmin+t+lag) >= Size*(periods+y_kmin+y_kmax))
                    {
                      ostringstream tmp;
                      tmp << " in Init_UMFPACK_Sparse, index (" << var+Size*(y_kmin+t+lag) << ") out of range for index_vara vector\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  if (index_vara[var+Size*(y_kmin+t+lag)] < 0 || index_vara[var+Size*(y_kmin+t+lag)] >= y_size*(periods+y_kmin+y_kmax))
                    {
                      ostringstream tmp;
                      tmp << " in Init_UMFPACK_Sparse, index (" << index_vara[var+Size*(y_kmin+t+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
#endif
                  // Term referring to a period outside the window: move it to b
                  (*b)[eq]  += u[index+lag*u_count_init]*y[index_vara[var+Size*(y_kmin+t+lag)]];
                }
            }
          else           /* ...and store it in the u vector*/
            {
#ifdef DEBUG
              if (index < 0 || index >= u_count_alloc)
                {
                  ostringstream tmp;
                  tmp << " in Init_UMFPACK_Sparse, index (" << index << ") out of range for u vector\n";
                  throw FatalExceptionHandling(tmp.str());
                }
              if (eq < 0 || eq >= (Size*periods))
                {
                  ostringstream tmp;
                  tmp << " in Init_UMFPACK_Sparse, index (" << eq << ") out of range for b vector\n";
                  throw FatalExceptionHandling(tmp.str());
                }
#endif
              (*b)[eq]  += u[index];
            }
          it4++;
        }
    }
  // Closing entry of the pointer vector: total number of stored elements
  (*Ap)[Size*periods] = NZE;
#ifdef DEBUG
  mexPrintf("*Ax = [");
  for (unsigned int i = 0; i < NZE; i++)
    mexPrintf("%f ", (*Ax)[i]);
  mexPrintf("]\n");

  // SuiteSparse_long is wider than int on most platforms: convert explicitly
  mexPrintf("*Ap = [");
  for (int i = 0; i < n+1; i++)
    mexPrintf("%ld ", static_cast<long>((*Ap)[i]));
  mexPrintf("]\n");

  mexPrintf("*Ai = [");
  for (unsigned int i = 0; i < NZE; i++)
    mexPrintf("%ld ", static_cast<long>((*Ai)[i]));
  mexPrintf("]\n");
#endif
}

void
dynSparseMatrix::Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m)
{
  int eq, var;

  *b = (double *) mxMalloc(Size * sizeof(double));
  test_mxMalloc(*b, __LINE__, __FILE__, __func__, Size * sizeof(double));
  if (!(*b))
    {
      ostringstream tmp;
      tmp << " in Init_CUDA_Sparse, can't retrieve b vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  double *Host_x0 = mxGetPr(x0_m);
  if (!Host_x0)
    {
      ostringstream tmp;
      tmp << " in Init_CUDA_Sparse_Simple, can't retrieve x0 vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  *Ap = (SuiteSparse_long *) mxMalloc((Size+1) * sizeof(SuiteSparse_long));
  test_mxMalloc(*Ap, __LINE__, __FILE__, __func__, (Size+1) * sizeof(SuiteSparse_long));
  if (!(*Ap))
    {
      ostringstream tmp;
      tmp << " in Init_CUDA_Sparse, can't allocate Ap index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  size_t prior_nz = IM.size();
  *Ai = (SuiteSparse_long *) mxMalloc(prior_nz * sizeof(SuiteSparse_long));
  test_mxMalloc(*Ai, __LINE__, __FILE__, __func__, prior_nz * sizeof(SuiteSparse_long));
  if (!(*Ai))
    {
      ostringstream tmp;
      tmp << " in Init_CUDA_Sparse, can't allocate Ai index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  *Ax = (double *) mxMalloc(prior_nz * sizeof(double));
  test_mxMalloc(*Ax, __LINE__, __FILE__, __func__, prior_nz * sizeof(double));
  if (!(*Ax))
    {
      ostringstream tmp;
      tmp << " in Init_CUDA_Sparse, can't retrieve Ax matrix\n";
      throw FatalExceptionHandling(tmp.str());
    }

  map<pair<pair<int, int>, int>, int>::iterator it4;
  for (int i = 0; i < Size; i++)
    {
      int eq = index_vara[i];
      ya[eq+it_*y_size] = y[eq+it_*y_size];
    }

#ifdef DEBUG
  unsigned int max_nze = prior_nz; //mxGetNzmax(A_m);
#endif
  unsigned int NZE = 0;
  int last_var = 0;
  double cum_abs_sum = 0;
  for (int i = 0; i < Size; i++)
    {
      (*b)[i] = u[i];
      cum_abs_sum += fabs((*b)[i]);
      (*x0)[i] = y[i];
    }
  if (cum_abs_sum < 1e-20)
    zero_solution = true;
  else
    zero_solution = false;

  (*Ap)[0] = 0;
  last_var = -1;
  it4 = IM.begin();
  while (it4 != IM.end())
    {
      var = it4->first.first.first;
      if (var != last_var)
        {
          (*Ap)[1+last_var ] = NZE;
          last_var = var;
        }
      eq = it4->first.second;
      int index = it4->second;
#ifdef DEBUG
      if (index < 0 || index >= u_count_alloc || index > Size + Size*Size)
        {
          ostringstream tmp;
          tmp << " in Init_CUDA_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (NZE >= max_nze)
        {
          ostringstream tmp;
          tmp << " in Init_CUDA_Sparse_Simple, exceeds the capacity of A_m sparse matrix\n";
          throw FatalExceptionHandling(tmp.str());
        }
#endif
      (*Ax)[NZE] = u[index];
      (*Ai)[NZE] = eq;
      NZE++;
#ifdef DEBUG
      if (eq < 0 || eq >= Size)
        {
          ostringstream tmp;
          tmp << " in Init_CUDA_Sparse_Simple, index (" << eq << ") out of range for b vector\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (var < 0 || var >= Size)
        {
          ostringstream tmp;
          tmp << " in Init_CUDA_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n";
          throw FatalExceptionHandling(tmp.str());
        }
      if (index_vara[var] < 0 || index_vara[var] >= y_size)
        {
          ostringstream tmp;
          tmp << " in Init_CUDA_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n";
          throw FatalExceptionHandling(tmp.str());
        }
#endif
      it4++;
    }
  (*Ap)[Size] = NZE;
}

#ifdef CUDA
/* Assembles the stacked (all periods) sparse system on the host and uploads
   it to the graphic card.

   IM is read here as ((equation, lead/lag), variable) -> position in u, and
   the matrix is stored by equation (CSR, not CSC): Ap is indexed by
   equation and Ai holds var + lag*Size.  Terms whose lag falls outside
   [ti_y_kmin, ti_y_kmax] are multiplied by the matching element of y and
   accumulated into b instead of being stored.

   The content of A_tild depends on preconditioner:
   - 0: n values; the entries with lag == 0 and variable == equation;
   - 1 or 2: a copy of the stored values;
   - 3: the entries of the first period summed over the lags, with their own
     (Ap_tild, Ai_tild) structure; entries whose absolute value does not
     exceed 1e-20 are dropped.

   Outputs: *b, *x0, *Ap, *Ai, *Ax, *A_tild (and *Ap_tild, *Ai_tild when
   preconditioner == 3) are allocated with cudaMalloc and filled by
   cudaMemcpy; *nnz and *nnz_tild receive the element counts.  The data of
   x0_m is overwritten with the current values of y, and the first
   y_size*(periods+y_kmin) elements of y are copied into ya.

   The host staging buffers are released once the uploads are done.  If an
   exception is thrown before that point they are not released here. */
void
dynSparseMatrix::Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner)
{
  //cudaError_t cuda_error;
  int t, eq, var, lag, ti_y_kmin, ti_y_kmax;
  int n = periods * Size;
  size_t prior_nz = IM.size() * periods;
  size_t preconditioner_size = 0;
  map<pair<int, int>, int> jacob_struct;

  /* ask cuda how many devices it can find */
  int device_count;
  cudaGetDeviceCount(&device_count);

  cudaSetDevice(CUDA_device);

  double *Host_b = (double *) mxMalloc(n * sizeof(double));
  test_mxMalloc(Host_b, __LINE__, __FILE__, __func__, n * sizeof(double));
  cudaChk(cudaMalloc((void **) b, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate b vector on the graphic card\n");

  // Host_x0 points into x0_m: it is owned by the MATLAB array and must not be freed
  double *Host_x0 = mxGetPr(x0_m);
  if (!Host_x0)
    {
      ostringstream tmp;
      tmp << " in Init_Cuda_Sparse, can't retrieve x0 vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  cudaChk(cudaMalloc((void **) x0, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate x0 vector on the graphic card\n");

  int *Host_Ap = (int *) mxMalloc((n+1) * sizeof(int));
  test_mxMalloc(Host_Ap, __LINE__, __FILE__, __func__, (n+1) * sizeof(int));

  int *Host_Ai = (int *) mxMalloc(prior_nz * sizeof(int));
  test_mxMalloc(Host_Ai, __LINE__, __FILE__, __func__, prior_nz * sizeof(int));

  double *Host_Ax = (double *) mxMalloc(prior_nz * sizeof(double));
  test_mxMalloc(Host_Ax, __LINE__, __FILE__, __func__, prior_nz * sizeof(double));

  // Only allocated for preconditioner 3; NULL otherwise so they are never read uninitialised
  int *Host_Ai_tild = NULL, *Host_Ap_tild = NULL;
  if (preconditioner == 3)
    {
      Host_Ap_tild = (int *) mxMalloc((n+1)*sizeof(int));
      test_mxMalloc(Host_Ap_tild, __LINE__, __FILE__, __func__, (n+1)*sizeof(int));
      Host_Ai_tild = (int *) mxMalloc(prior_nz*sizeof(int));
      test_mxMalloc(Host_Ai_tild, __LINE__, __FILE__, __func__, prior_nz*sizeof(int));
      Host_Ap_tild[0] = 0;
    }

  if (preconditioner == 0)
    preconditioner_size = n;
  else if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3)
    preconditioner_size = prior_nz;

  double *Host_A_tild = (double *) mxMalloc(preconditioner_size * sizeof(double));
  test_mxMalloc(Host_A_tild, __LINE__, __FILE__, __func__, preconditioner_size * sizeof(double));

  map<pair<pair<int, int>, int>, int>::iterator it4;
  // Keep a copy of the current values of y in ya
  for (int i = 0; i < y_size*(periods+y_kmin); i++)
    ya[i] = y[i];
  unsigned int NZE = 0, NZE_tild = 0;
  int last_eq = 0;
  // Right-hand side starts at zero; the initial guess is the current y
  for (int i = 0; i < periods*Size; i++)
    {
      Host_b[i] = 0;
      Host_x0[i] = y[index_vara[Size*y_kmin+i]];
    }

  //Ordered in CSR and not in CSC

  Host_Ap[0] = 0;
  for (t = 0; t < periods; t++)
    {
      // -1 so that the first equation met in this period writes Host_Ap[t*Size]
      last_eq = -1;
      it4 = IM.begin();
      while (it4 != IM.end())
        {
          eq = it4->first.first.first;
          if (eq != last_eq)
            {
# ifdef DEBUG
              if (1+last_eq + t * Size > (n + 1))
                {
                  ostringstream tmp;
                  tmp << " in Init_CUDA_Sparse, 1+last_eq + t * Size (" << 1+last_eq + t * Size << ") out of range for Host_Ap vector\n";
                  throw FatalExceptionHandling(tmp.str());
                }
# endif
              Host_Ap[1+last_eq + t * Size] = NZE;
              if (preconditioner == 3 && t == 0)
                Host_Ap_tild[1+last_eq ] = NZE_tild;
              last_eq = eq;
            }
          var = it4->first.second+Size*t;
          lag = it4->first.first.second;
          int index = it4->second+ (t /*+ lag*/) * u_count_init;
          if (eq < (periods+y_kmax)*Size)
            {
              ti_y_kmin = -min(t, y_kmin);
              ti_y_kmax = min(periods-(t + 1), y_kmax);
              if ((lag <= ti_y_kmax && lag >= ti_y_kmin) || preconditioner == 3)  /*Build the index for sparse matrix containing the jacobian : u*/
                {
# ifdef DEBUG
                  if (index < 0 || index >= u_count_alloc || index > (periods-1)* IM.size() + Size * Size + periods * Size)
                    {
                      ostringstream tmp;
                      tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector max = " << (periods-1)* IM.size() + Size * Size + periods * Size << " allocated = " << u_count_alloc << "\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  if (NZE >= prior_nz)
                    {
                      ostringstream tmp;
                      tmp << " in Init_CUDA_Sparse, exceeds the capacity of A_i or A_x sparse matrix\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
# endif
                  bool to_store = true;
                  if (preconditioner == 0)
                    {
                      // Contemporaneous entry of a variable in its own equation
                      if (lag == 0 && it4->first.second == eq)
                        Host_A_tild[var] = u[index];
                    }
                  else if (preconditioner == 1 || preconditioner == 2)
                    Host_A_tild[NZE] = u[index];
                  else if (preconditioner == 3)
                    {
                      // Out-of-window terms reach this branch only for preconditioner 3:
                      // they go to the right-hand side and are not stored in A
                      if (lag > ti_y_kmax || lag < ti_y_kmin)
                        {
                          Host_b[eq + t * Size]  += u[index]*y[index_vara[var+Size*(y_kmin+lag)]];
                          to_store = false;
                        }
                      if (t == 0)
                        {
                          // Entries sharing (equation, variable) across lags are summed
                          map<pair<int, int>, int>::const_iterator it = jacob_struct.find(make_pair(eq + t * Size, var));
                          if (it != jacob_struct.end())
                            Host_A_tild[it->second] += u[index];
                          else
                            {
                              jacob_struct[make_pair(eq, var)] = NZE_tild;
                              Host_A_tild[NZE_tild] = u[index];
                              Host_Ai_tild[NZE_tild] = var;
                              NZE_tild++;
                            }
                        }
                    }
                  if (to_store)
                    {
                      Host_Ax[NZE] = u[index];
                      Host_Ai[NZE] = var + lag * Size;
                      NZE++;
                    }
                }
              else
                {
# ifdef DEBUG
                  if (var < 0 || var >= Size * periods)
                    {
                      ostringstream tmp;
                      tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  // var already includes Size*t: check the index that is actually used below
                  if (var+Size*(y_kmin+lag) < 0 || var+Size*(y_kmin+lag) >= Size*(periods+y_kmin+y_kmax))
                    {
                      ostringstream tmp;
                      tmp << " in Init_CUDA_Sparse, index (" << var+Size*(y_kmin+lag) << ") out of range for index_vara vector max=" << Size*(periods+y_kmin+y_kmax) << "\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  if (index_vara[var+Size*(y_kmin+lag)] < 0 || index_vara[var+Size*(y_kmin+lag)] >= y_size*(periods+y_kmin+y_kmax))
                    {
                      ostringstream tmp;
                      tmp << " in Init_CUDA_Sparse, index (" << index_vara[var+Size*(y_kmin+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
# endif
                  Host_b[eq + t * Size]  += u[index]*y[index_vara[var+Size*(y_kmin+lag)]];
                }
            }
          else           // ...and store it in the u vector
            {
# ifdef DEBUG
              if (index < 0 || index >= u_count_alloc)
                {
                  ostringstream tmp;
                  tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector\n";
                  throw FatalExceptionHandling(tmp.str());
                }
              if (var < 0 || var >= (Size*periods))
                {
                  ostringstream tmp;
                  tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n";
                  throw FatalExceptionHandling(tmp.str());
                }
# endif
              Host_b[var]  += u[index];
            }
          it4++;
        }
    }
  // Closing entry of the pointer vector: total number of stored elements
  Host_Ap[Size*periods] = NZE;
  if (preconditioner == 3)
    {
      // Compact the first-period structure, dropping the (near) zero entries
      int *tmp_Ap_tild = (int *) mxMalloc((Size + 1) * sizeof(int));
      test_mxMalloc(tmp_Ap_tild, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(int));
      int *tmp_Ai_tild = (int *) mxMalloc(NZE_tild * sizeof(int));
      test_mxMalloc(tmp_Ai_tild, __LINE__, __FILE__, __func__, NZE_tild * sizeof(int));
      double *tmp_A_tild = (double *) mxMalloc(NZE_tild * sizeof(double));
      test_mxMalloc(tmp_A_tild, __LINE__, __FILE__, __func__, NZE_tild * sizeof(double));
      memcpy(tmp_Ap_tild, Host_Ap_tild, (Size + 1) * sizeof(int));
      memcpy(tmp_Ai_tild, Host_Ai_tild, NZE_tild * sizeof(int));
      memcpy(tmp_A_tild, Host_A_tild, NZE_tild * sizeof(double));
      //int NZE_tild_old = NZE_tild;
      NZE_tild = 0;
      Host_Ap_tild[0] = NZE_tild;

      for (int i = 0; i < Size; i++)
        {
          for (int j = tmp_Ap_tild[i]; j < tmp_Ap_tild[i+1]; j++)
            // fabs, as elsewhere in this file: never resolves to the integer abs()
            if (fabs(tmp_A_tild[j]) > 1.0e-20)
              {
                Host_A_tild[NZE_tild] = tmp_A_tild[j];
                Host_Ai_tild[NZE_tild] = tmp_Ai_tild[j];
                NZE_tild++;
              }
          Host_Ap_tild[i+1] = NZE_tild;
        }
      mxFree(tmp_Ap_tild);
      mxFree(tmp_Ai_tild);
      mxFree(tmp_A_tild);
    }

  *nnz = NZE;
  *nnz_tild = NZE_tild;
  if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3)
    preconditioner_size = NZE;

# ifdef DEBUG
  mexPrintf("Host_Ax = [");
  for (unsigned int i = 0; i < NZE; i++)
    mexPrintf("%f ", Host_Ax[i]);
  mexPrintf("]\n");

  mexPrintf("Host_Ap = [");
  for (int i = 0; i < n+1; i++)
    mexPrintf("%d ", Host_Ap[i]);
  mexPrintf("]\n");

  mexPrintf("Host_Ai = [");
  for (unsigned int i = 0; i < NZE; i++)
    mexPrintf("%d ", Host_Ai[i]);
  mexPrintf("]\n");
# endif
  cudaChk(cudaMalloc((void **) Ai, NZE * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai index vector on the graphic card\n");
  cudaChk(cudaMalloc((void **) Ax, NZE * sizeof(double)), "  in Init_Cuda_Sparse, can't allocate Ax on the graphic card\n");
  cudaChk(cudaMalloc((void **) Ap, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap index vector on the graphic card\n");
  if (preconditioner == 3)
    {
      cudaChk(cudaMalloc((void **) Ai_tild, NZE_tild * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai_tild index vector on the graphic card\n");
      cudaChk(cudaMalloc((void **) Ap_tild, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap_tild index vector on the graphic card\n");
    }
  cudaChk(cudaMalloc((void **) A_tild, preconditioner_size * sizeof(double)), "  in Init_Cuda_Sparse, can't allocate A_tild on the graphic card\n");

  cudaChk(cudaMemcpy(*x0,     Host_x0,     n *                   sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy x0 = Host_x0 failed");
  cudaChk(cudaMemcpy(*b,      Host_b,      n *                   sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy b = Host_b failed");
  cudaChk(cudaMemcpy(*Ap,     Host_Ap,     (n + 1) *             sizeof(int),    cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap = Host_Ap failed");
  cudaChk(cudaMemcpy(*Ai,     Host_Ai,     NZE *                 sizeof(int),    cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai = Host_Ai failed");
  cudaChk(cudaMemcpy(*Ax,     Host_Ax,     NZE *                 sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ax = Host_Ax failed");
  if (preconditioner == 3)
    {
      cudaChk(cudaMemcpy(*Ap_tild,     Host_Ap_tild,     (n + 1) *             sizeof(int),    cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap_tild = Host_Ap_tild failed");
      cudaChk(cudaMemcpy(*Ai_tild,     Host_Ai_tild,     NZE_tild *                 sizeof(int),    cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai_tild = Host_Ai_til failed");
    }
  cudaChk(cudaMemcpy(*A_tild, Host_A_tild, preconditioner_size * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy A_tild = Host_A_tild failed");

  // The device now holds its own copies: release the host staging buffers
  // instead of keeping them until the MEX call returns
  mxFree(Host_b);
  mxFree(Host_Ap);
  mxFree(Host_Ai);
  mxFree(Host_Ax);
  mxFree(Host_A_tild);
  if (preconditioner == 3)
    {
      mxFree(Host_Ap_tild);
      mxFree(Host_Ai_tild);
    }
}
#endif

/* Debugging helper: expands the n x n sparse matrix given by (Ap, Ai, Ax)
   into a temporary dense array and prints it row by row with mexPrintf.
   Ap[i]..Ap[i+1] delimits the elements attached to index i, and Ai gives
   the row of each element.  A message is printed first when the number of
   elements visited differs from Ap[n]. */
void
dynSparseMatrix::PrintM(int n, double *Ax, mwIndex *Ap, mwIndex *Ai)
{
  const int declared_nnz = Ap[n];

  // Zero-filled dense copy, one row after the other
  double *dense = (double *) mxMalloc(n * n * sizeof(double));
  test_mxMalloc(dense, __LINE__, __FILE__, __func__, n * n * sizeof(double));
  memset(dense, 0, n * n  * sizeof(double));

  // Scatter the stored elements and count them on the way
  int visited = 0;
  for (int col = 0; col < n; col++)
    {
      const int last = static_cast<int>(Ap[col + 1]);
      for (int pos = Ap[col]; pos < last; pos++, visited++)
        {
          const int row = Ai[pos];
          dense[row * n + col] = Ax[pos];
        }
    }

  if (visited != declared_nnz)
    mexPrintf("Problem nnz(%d) != number of elements(%d)\n", declared_nnz, visited);
  mexPrintf("----------------------\n");
  //mexEvalString("drawnow;");

  // The dense array is laid out in printing order: walk it sequentially
  const double *cell = dense;
  for (int row = 0; row < n; row++)
    {
      for (int col = 0; col < n; col++, cell++)
        mexPrintf("%-6.3f ", *cell);
      mexPrintf("\n");
    }
  mxFree(dense);
}

/*
 * Fills the caller-allocated MATLAB arrays describing the linear system stacked
 * over all simulation periods.
 *
 *  - A_m  : sparse matrix; its value (pr), row index (ir) and column pointer (jc)
 *           arrays are overwritten with the Jacobian entries read from u.
 *  - b_m  : dense vector, reset to zero and then accumulated with the terms that
 *           do not belong to the matrix (see the two "b[eq] +=" statements).
 *  - x0_m : dense vector, set to y[index_vara[Size*y_kmin+i]].
 *
 * The current y vector is also copied into ya.
 *
 * IM is read as: key = ((var, -lag), eq), mapped value = position in u for the
 * first period; positions for other periods are obtained by adding multiples of
 * u_count_init.
 *
 * Throws FatalExceptionHandling when one of the MATLAB data pointers cannot be
 * retrieved (and, in DEBUG builds, when an index is out of range).
 */
void
dynSparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m)
{
  int t, eq, var, lag, ti_y_kmin, ti_y_kmax;
  double *b = mxGetPr(b_m);

  if (!b)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse, can't retrieve b vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  double *x0 = mxGetPr(x0_m);
  if (!x0)
    {
      ostringstream tmp;
      // The message used to name Init_Matlab_Sparse_Simple (copy/paste slip).
      tmp << " in Init_Matlab_Sparse, can't retrieve x0 vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mwIndex *Aj = mxGetJc(A_m);
  if (!Aj)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse, can't allocate Aj index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mwIndex *Ai = mxGetIr(A_m);
  if (!Ai)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse, can't allocate Ai index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  double *A = mxGetPr(A_m);
  if (!A)
    {
      ostringstream tmp;
      tmp << " in Init_Matlab_Sparse, can't retrieve A matrix\n";
      throw FatalExceptionHandling(tmp.str());
    }

  map<pair<pair<int, int>, int>, int>::iterator it4;
  // Snapshot of y taken before the solver updates it.
  for (int i = 0; i < y_size*(periods+y_kmin); i++)
    ya[i] = y[i];
#ifdef DEBUG
  unsigned int max_nze = mxGetNzmax(A_m);
#endif
  unsigned int NZE = 0;   // number of entries written to A so far
  int last_var = 0;
  for (int i = 0; i < periods*Size; i++)
    {
      b[i] = 0;
      x0[i] = y[index_vara[Size*y_kmin+i]];
    }
  Aj[0] = 0;
  for (t = 0; t < periods; t++)
    {
      last_var = 0;
      it4 = IM.begin();
      while (it4 != IM.end())
        {
          var = it4->first.first.first;
          if (var != last_var)
            {
              // The variable changed: close the column of the previous one.
              Aj[1+last_var + t * Size] = NZE;
              last_var = var;
            }
          eq = it4->first.second+Size*t;
          lag = -it4->first.first.second;
          int index = it4->second+ (t-lag) * u_count_init;
          if (var < (periods+y_kmax)*Size)
            {
              ti_y_kmin = -min(t, y_kmin);
              ti_y_kmax = min(periods-(t +1), y_kmax);
              int ti_new_y_kmax = min(t, y_kmax);
              int ti_new_y_kmin = -min(periods-(t+1), y_kmin);
              if (lag <= ti_new_y_kmax && lag >= ti_new_y_kmin)   /*Build the index for sparse matrix containing the jacobian : u*/
                {
#ifdef DEBUG
                  if (index < 0 || index >= u_count_alloc || index > Size + Size*Size)
                    {
                      ostringstream tmp;
                      tmp << " in Init_Matlab_Sparse, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  if (NZE >= max_nze)
                    {
                      ostringstream tmp;
                      tmp << " in Init_Matlab_Sparse, exceeds the capacity of A_m sparse matrix\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
#endif
                  A[NZE] = u[index];
                  Ai[NZE] = eq - lag * Size;
                  NZE++;
                }
              if (lag > ti_y_kmax || lag < ti_y_kmin)
                {
                  // The variable falls outside the simulated window: its
                  // contribution (coefficient times value of y) goes to b.
#ifdef DEBUG
                  if (eq < 0 || eq >= Size * periods)
                    {
                      ostringstream tmp;
                      tmp << " in Init_Matlab_Sparse, index (" << eq << ") out of range for b vector\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  if (var+Size*(y_kmin+t+lag) < 0 || var+Size*(y_kmin+t+lag) >= Size*(periods+y_kmin+y_kmax))
                    {
                      ostringstream tmp;
                      tmp << " in Init_Matlab_Sparse, index (" << var+Size*(y_kmin+t+lag) << ") out of range for index_vara vector\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
                  if (index_vara[var+Size*(y_kmin+t+lag)] < 0 || index_vara[var+Size*(y_kmin+t+lag)] >= y_size*(periods+y_kmin+y_kmax))
                    {
                      ostringstream tmp;
                      tmp << " in Init_Matlab_Sparse, index (" << index_vara[var+Size*(y_kmin+t+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n";
                      throw FatalExceptionHandling(tmp.str());
                    }
#endif
                  b[eq]  += u[index+lag*u_count_init]*y[index_vara[var+Size*(y_kmin+t+lag)]];
                }
            }
          else           /* entry beyond the variable range: added directly to b */
            {
#ifdef DEBUG
              if (index < 0 || index >= u_count_alloc)
                {
                  ostringstream tmp;
                  tmp << " in Init_Matlab_Sparse, index (" << index << ") out of range for u vector\n";
                  throw FatalExceptionHandling(tmp.str());
                }
              if (eq < 0 || eq >= (Size*periods))
                {
                  ostringstream tmp;
                  tmp << " in Init_Matlab_Sparse, index (" << eq << ") out of range for b vector\n";
                  throw FatalExceptionHandling(tmp.str());
                }
#endif
              b[eq]  += u[index];
            }
          it4++;
        }
    }
  // Close the last column of the stacked matrix.
  Aj[Size*periods] = NZE;
}

/*
 * Allocates and initialises the working structures used by the symbolic
 * Gaussian elimination over `periods` stacked periods.
 *
 * Allocated here (released by End_GE, except the two temporary arrays freed at
 * the end of this function): pivot, pivot_save, pivotk, pivotv, pivotva, b,
 * line_done, FNZE_R, FNZE_C, NbNZRow, NbNZCol.
 *
 * IM is read as: key = ((eq, var), lag), mapped value = position in u for the
 * first period (u_count_init*t is added for period t).
 *
 * For each period and each IM entry:
 *  - if var is below (periods+y_kmax)*Size and lag lies in
 *    [ti_y_kmin, ti_y_kmax], a NonZeroElem is created and appended to the row
 *    list of eq and to the column list of var;
 *  - if var is below that bound but lag is outside the window, the product
 *    u*y is accumulated in tmp_b;
 *  - otherwise the entry's u position is stored in b[eq] and tmp_b is added to
 *    that u element.
 */
void
dynSparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM)
{
  int t, i, eq, var, lag, ti_y_kmin, ti_y_kmax;
  double tmp_b = 0.0;
  map<pair<pair<int, int>, int>, int>::iterator it4;
  NonZeroElem *first;
  // One entry per row of the stacked system (Size*periods).
  pivot = (int *) mxMalloc(Size*periods*sizeof(int));
  test_mxMalloc(pivot, __LINE__, __FILE__, __func__, Size*periods*sizeof(int));
  pivot_save = (int *) mxMalloc(Size*periods*sizeof(int));
  test_mxMalloc(pivot_save, __LINE__, __FILE__, __func__, Size*periods*sizeof(int));
  pivotk = (int *) mxMalloc(Size*periods*sizeof(int));
  test_mxMalloc(pivotk, __LINE__, __FILE__, __func__, Size*periods*sizeof(int));
  pivotv = (double *) mxMalloc(Size*periods*sizeof(double));
  test_mxMalloc(pivotv, __LINE__, __FILE__, __func__, Size*periods*sizeof(double));
  pivotva = (double *) mxMalloc(Size*periods*sizeof(double));
  test_mxMalloc(pivotva, __LINE__, __FILE__, __func__, Size*periods*sizeof(double));
  b = (int *) mxMalloc(Size*periods*sizeof(int));
  test_mxMalloc(b, __LINE__, __FILE__, __func__, Size*periods*sizeof(int));
  line_done = (bool *) mxMalloc(Size*periods*sizeof(bool));
  test_mxMalloc(line_done, __LINE__, __FILE__, __func__, Size*periods*sizeof(bool));
  mem_mngr.init_CHUNK_BLCK_SIZE(u_count);
  g_save_op = NULL;
  g_nop_all = 0;
  // Heads (FNZE_*) and running tails (temp_NZE_*) of the row and column lists;
  // i temporarily holds the allocation size in bytes.
  i = (periods+y_kmax+1)*Size*sizeof(NonZeroElem *);
  FNZE_R = (NonZeroElem **) mxMalloc(i);
  test_mxMalloc(FNZE_R, __LINE__, __FILE__, __func__, i);
  FNZE_C = (NonZeroElem **) mxMalloc(i);
  test_mxMalloc(FNZE_C, __LINE__, __FILE__, __func__, i);
  NonZeroElem **temp_NZE_R = (NonZeroElem **) mxMalloc(i);
  test_mxMalloc(temp_NZE_R, __LINE__, __FILE__, __func__, i);
  NonZeroElem **temp_NZE_C = (NonZeroElem **) mxMalloc(i);
  test_mxMalloc(temp_NZE_C, __LINE__, __FILE__, __func__, i);
  i = (periods+y_kmax+1)*Size*sizeof(int);
  NbNZRow = (int *) mxMalloc(i);
  test_mxMalloc(NbNZRow, __LINE__, __FILE__, __func__, i);
  NbNZCol = (int *) mxMalloc(i);
  test_mxMalloc(NbNZCol, __LINE__, __FILE__, __func__, i);

  for (int i = 0; i < periods*Size; i++)
    {
      b[i] = 0;
      line_done[i] = 0;
    }
  for (int i = 0; i < (periods+y_kmax+1)*Size; i++)
    {
      FNZE_C[i] = NULL;
      FNZE_R[i] = NULL;
      temp_NZE_C[i] = NULL;
      temp_NZE_R[i] = NULL;
      NbNZRow[i] = 0;
      NbNZCol[i] = 0;
    }
  // nnz is only counted here; it is not read afterwards in this function.
  int nnz = 0;
  //pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) ordered private(it4, ti_y_kmin, ti_y_kmax, eq, var, lag) schedule(dynamic)
  for (t = 0; t < periods; t++)
    {
      // Window of lags/leads that fall inside the simulated periods at time t.
      ti_y_kmin = -min(t, y_kmin);
      ti_y_kmax = min(periods-(t+1), y_kmax);
      it4 = IM.begin();
      eq = -1;
      //pragma omp ordered
      while (it4 != IM.end())
        {
          var = it4->first.first.second;
          // A new equation starts: reset the accumulated out-of-window term.
          if (eq != it4->first.first.first+Size*t)
            tmp_b = 0;
          eq = it4->first.first.first+Size*t;
          lag = it4->first.second;
          if (var < (periods+y_kmax)*Size)
            {
              lag = it4->first.second;
              if (lag <= ti_y_kmax && lag >= ti_y_kmin)   /* Build the index for the sparse matrix containing the Jacobian: u */
                {
                  nnz++;
                  var += Size*t;
                  NbNZRow[eq]++;
                  NbNZCol[var]++;
                  first = mem_mngr.mxMalloc_NZE();
                  first->NZE_C_N = NULL;
                  first->NZE_R_N = NULL;
                  first->u_index = it4->second+u_count_init*t;
                  first->r_index = eq;
                  first->c_index = var;
                  first->lag_index = lag;
                  // First element seen for this row/column becomes the list head.
                  if (FNZE_R[eq] == NULL)
                    FNZE_R[eq] = first;
                  if (FNZE_C[var] == NULL)
                    FNZE_C[var] = first;
                  // Append to the current tail of the row and column lists.
                  if (temp_NZE_R[eq] != NULL)
                    temp_NZE_R[eq]->NZE_R_N = first;
                  if (temp_NZE_C[var] != NULL)
                    temp_NZE_C[var]->NZE_C_N = first;
                  temp_NZE_R[eq] = first;
                  temp_NZE_C[var] = first;
                }
              else       /* Build the additive terms outside the simulation periods related to the first lags and the last leads... */
                {
                  // NOTE(review): both branches below execute the same statement.
                  if (lag < ti_y_kmin)
                    {
                      tmp_b += u[it4->second+u_count_init*t]*y[index_vara[var+Size*(y_kmin+t)]];
                    }
                  else
                    {
                      tmp_b += u[it4->second+u_count_init*t]*y[index_vara[var+Size*(y_kmin+t)]];

                    }
                }
            }
          else           /* ...and store it in the u vector */
            {
              b[eq] = it4->second+u_count_init*t;
              u[b[eq]] += tmp_b;
              tmp_b = 0;
            }
          it4++;
        }
    }
  // The tail arrays are only needed while the lists are being built.
  mxFree(temp_NZE_R);
  mxFree(temp_NZE_C);
}

/*
 * Returns a free position in the u vector.
 *
 * Positions previously released through Delete_u are handed out first (most
 * recently released first). Otherwise the next unused position u_count is
 * returned, after enlarging u by 5*u_count_alloc_save elements when the
 * allocated capacity is exhausted.
 *
 * Throws FatalExceptionHandling if the reallocation returns a null pointer.
 */
int
dynSparseMatrix::Get_u()
{
  if (!u_liste.empty())
    {
      const int recycled = u_liste.back();
      u_liste.pop_back();
      return recycled;
    }

  if (u_count >= u_count_alloc)
    {
      // No room left: grow the storage before handing out a new slot.
      u_count_alloc += 5*u_count_alloc_save;
      u = (double *) mxRealloc(u, u_count_alloc*sizeof(double));
      if (!u)
        {
          ostringstream tmp;
          tmp << " in Get_u, memory exhausted (realloc(" << u_count_alloc*sizeof(double) << "))\n";
          throw FatalExceptionHandling(tmp.str());
        }
    }

  const int fresh = u_count;
  u_count++;
  return fresh;
}

/* Marks position `pos` of the u vector as reusable by a later Get_u call. */
void
dynSparseMatrix::Delete_u(int pos)
{
  u_liste.insert(u_liste.end(), pos);
}

/* Forgets every position of u that was released through Delete_u. */
void
dynSparseMatrix::Clear_u()
{
  u_liste.erase(u_liste.begin(), u_liste.end());
}

/* Prints, on a single line, the positions of u currently held in the free list. */
void
dynSparseMatrix::Print_u()
{
  const size_t nb_free = u_liste.size();
  size_t k = 0;
  while (k < nb_free)
    {
      mexPrintf("%d ", u_liste[k]);
      ++k;
    }
}

/*
 * Releases the structures allocated by Init_GE: the NonZeroElem pool owned by
 * mem_mngr and the row/column, right-hand-side and pivot arrays.
 * The Size argument is not used.
 */
void
dynSparseMatrix::End_GE(int Size)
{
  mem_mngr.Free_All();

  // Freed in the same order as before: list heads, counters, b, flags, pivots.
  void *const to_release[] = {
    FNZE_R, FNZE_C,
    NbNZRow, NbNZCol,
    b, line_done,
    pivot, pivot_save, pivotk, pivotv, pivotva
  };
  const size_t nb_buffers = sizeof(to_release)/sizeof(to_release[0]);
  for (size_t k = 0; k < nb_buffers; ++k)
    mxFree(to_release[k]);
}

/*
 * Compares three recorded operation streams (save_op, save_opa, save_opaa) and,
 * when they are regular, replays save_op for the remaining periods.
 *
 * Each stream is a packed int array of nop4 ints; an operation is read through
 * a t_save_op_s overlay and occupies 2 ints (IFLD, IFDIV) or 3 ints (IFLESS,
 * IFSUB). The streams are considered regular when, for every operation, the
 * operator is the same in the three streams and the index differences
 * save_op - save_opa equal the differences save_opa - save_opaa.
 *
 * When regular:
 *  - pivot is extended for periods beg_t..periods-1 by adding Size to the
 *    previous period's pivots;
 *  - u is enlarged if the largest index that will be touched is not allocated;
 *  - the operations are applied to u for t = 1..periods-beg_t-1 with indices
 *    shifted by t*diff; in the last periods, operations whose lag field is not
 *    below the number of remaining periods are skipped.
 *
 * Returns true when the streams were regular (and the replay was done).
 * Throws FatalExceptionHandling on an unknown operator or if realloc fails.
 */
bool
dynSparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, int periods, long int nop4,  int Size)
{
  long int i, j, nop = nop4/2;
  double r = 0.0;
  bool OK = true;
  t_save_op_s *save_op_s, *save_opa_s, *save_opaa_s;
  // Per-operation index strides for the first and second operands.
  // nop4/2 entries is an upper bound: every operation takes at least 2 ints.
  int *diff1, *diff2;
  diff1 = (int *) mxMalloc(nop*sizeof(int));
  test_mxMalloc(diff1, __LINE__, __FILE__, __func__, nop*sizeof(int));
  diff2 = (int *) mxMalloc(nop*sizeof(int));
  test_mxMalloc(diff2, __LINE__, __FILE__, __func__, nop*sizeof(int));
  // Largest value of first+diff1*(periods-beg_t) seen, used to size u below.
  int max_save_ops_first = -1;
  j = i = 0;
  // i walks the packed int stream, j counts operations.
  while (i < nop4 && OK)
    {
      save_op_s = (t_save_op_s *) &(save_op[i]);
      save_opa_s = (t_save_op_s *) &(save_opa[i]);
      save_opaa_s = (t_save_op_s *) &(save_opaa[i]);
      diff1[j] = save_op_s->first-save_opa_s->first;
      if (max_save_ops_first < save_op_s->first+diff1[j]*(periods-beg_t))
        {
          max_save_ops_first = save_op_s->first+diff1[j]*(periods-beg_t);
        }
      switch (save_op_s->operat)
        {
        case IFLD:
        case IFDIV:
          // One-operand operations: diff2[j] is left unset and never read.
          OK = (save_op_s->operat == save_opa_s->operat && save_opa_s->operat == save_opaa_s->operat
                && diff1[j] == (save_opa_s->first-save_opaa_s->first));
          i += 2;
          break;
        case IFLESS:
        case IFSUB:
          diff2[j] = save_op_s->second-save_opa_s->second;
          OK = (save_op_s->operat == save_opa_s->operat && save_opa_s->operat == save_opaa_s->operat
                && diff1[j] == (save_opa_s->first-save_opaa_s->first)
                && diff2[j] == (save_opa_s->second-save_opaa_s->second));
          i += 3;
          break;
        default:
          // NOTE(review): diff1/diff2 are not released on this path.
          ostringstream tmp;
          tmp << " in compare, unknown operator = " << save_op_s->operat << "\n";
          throw FatalExceptionHandling(tmp.str());
        }
      j++;
    }
  // the same pivot for all remaining periods
  if (OK)
    {
      for (int i = beg_t; i < periods; i++)
        {
          for (int j = 0; j < Size; j++)
            pivot[i*Size+j] = pivot[(i-1)*Size+j]+Size;
        }
      if (max_save_ops_first >= u_count_alloc)
        {
          u_count_alloc += max_save_ops_first;
          u = (double *) mxRealloc(u, u_count_alloc*sizeof(double));
          if (!u)
            {
              ostringstream tmp;
              tmp << " in compare, memory exhausted (realloc(" << u_count_alloc*sizeof(double) << "))\n";
              throw FatalExceptionHandling(tmp.str());
            }
        }
      // Replay without lag filtering for the periods before the last y_kmax ones.
      for (int t = 1; t < periods-beg_t-y_kmax; t++)
        {
          int i = j = 0;
          double *up;
          while (i < nop4)
            {
              t_save_op_s *save_op_s = (t_save_op_s *) (&(save_op[i]));
              up = &u[save_op_s->first+t*diff1[j]];
              switch (save_op_s->operat)
                {
                case IFLD:
                  // Load: r is the value used by the following operations.
                  r = *up;
                  i += 2;
                  break;
                case IFDIV:
                  *up /= r;
                  i += 2;
                  break;
                case IFSUB:
                  *up -= u[save_op_s->second+t*diff2[j]]*r;;
                  i += 3;
                  break;
                case IFLESS:
                  *up = -u[save_op_s->second+t*diff2[j]]*r;
                  i += 3;
                  break;
                }
              j++;
            }
        }
      // Last periods: skip operations whose lag reaches past the final period.
      int t1 = max(1, periods-beg_t-y_kmax);
      int periods_beg_t = periods-beg_t;
      for (int t = t1; t < periods_beg_t; t++)
        {
          int i = j = 0;
          int gap = periods_beg_t-t;
          while (i < nop4)
            {
              t_save_op_s *save_op_s = (t_save_op_s *) (&(save_op[i]));
              if (save_op_s->lag < gap)
                {
                  double *up = &u[save_op_s->first+t*diff1[j]];
                  switch (save_op_s->operat)
                    {
                    case IFLD:
                      r = *up;
                      i += 2;
                      break;
                    case IFDIV:
                      *up /= r;
                      i += 2;
                      break;
                    case IFSUB:
                      *up -= u[save_op_s->second+t*diff2[j]]*r;
                      i += 3;
                      break;
                    case IFLESS:
                      *up = -u[save_op_s->second+t*diff2[j]]*r;
                      i += 3;
                      break;
                    }
                }
              else
                {
                  // Operation skipped: only advance past its encoding.
                  switch (save_op_s->operat)
                    {
                    case IFLD:
                    case IFDIV:
                      i += 2;
                      break;
                    case IFSUB:
                    case IFLESS:
                      i += 3;
                      break;
                    }
                }
              j++;
            }
        }
    }
  mxFree(diff1);
  mxFree(diff2);
  return OK;
}

/*
 * Back-substitutes the periods from the last one down to beg_t by replaying a
 * small instruction list recorded on period beg_t.
 *
 * Steps:
 *  1. For each pivot row of period beg_t (from last to first), record in
 *     save_code a sequence of 4-int instructions: IFLDZ, one IFMUL per
 *     off-pivot element of the row (y index, u index, lag), IFADD (u index of
 *     the right-hand side, b[pos]) and IFSTP (y index to update).
 *  2. Walk the rows of period beg_t-1 in the same order and store in diff the
 *     difference between each recorded index and the corresponding index of
 *     that earlier period (two ints per instruction).
 *  3. For t = periods+y_kmin-1 down to beg_t+y_kmin, execute the instructions
 *     with every index shifted by (t-y_kmin-beg_t)*diff, updating y with the
 *     damping factor slowc.
 *
 * Returns beg_t.
 */
int
dynSparseMatrix::complete(int beg_t, int Size, int periods, int *b)
{
  long int i, j, k, nop, nopa, nop1, cal_y, nb_var, pos, max_var, min_var;
  NonZeroElem *first;
  int *save_code;
  int *diff;
  double yy = 0.0, err;

  int size_of_save_code = (1+y_kmax)*Size*(Size+1+4)/2*4;
  save_code = (int *) mxMalloc(size_of_save_code*sizeof(int));
  test_mxMalloc(save_code, __LINE__, __FILE__, __func__, size_of_save_code*sizeof(int));
  int size_of_diff = (1+y_kmax)*Size*(Size+1+4);
  diff = (int *) mxMalloc(size_of_diff*sizeof(int));
  test_mxMalloc(diff, __LINE__, __FILE__, __func__, size_of_diff*sizeof(int));
  cal_y = y_size*y_kmin;

  // Step 1: record the instructions for the rows of period beg_t.
  // NOTE(review): the DEBUG prints below pass long int values to %d.
  i = (beg_t+1)*Size-1;
  nop = 0;
  for (j = i; j > i-Size; j--)
    {
      pos = pivot[j];
      nb_var = At_Row(pos, &first);
      // Skip the first element of the row; only the remaining ones are recorded.
      first = first->NZE_R_N;
      nb_var--;
      save_code[nop] = IFLDZ;
      save_code[nop+1] = 0;
      save_code[nop+2] = 0;
      save_code[nop+3] = 0;
#ifdef DEBUG
      if ((nop+3) >= size_of_save_code)
        mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code);
#endif
      nop += 4;
      for (k = 0; k < nb_var; k++)
        {
          save_code[nop] = IFMUL;
          save_code[nop+1] = index_vara[first->c_index]+cal_y;
          save_code[nop+2] = first->u_index;
          save_code[nop+3] = first->lag_index;
#ifdef DEBUG
          if ((nop+3) >= size_of_save_code)
            mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code);
#endif
          nop += 4;
          first = first->NZE_R_N;
        }
      save_code[nop] = IFADD;
      save_code[nop+1] = b[pos];
      save_code[nop+2] = 0;
      save_code[nop+3] = 0;
#ifdef DEBUG
      if ((nop+3) >= size_of_save_code)
        mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code);
#endif
      nop += 4;
      save_code[nop] = IFSTP;
      save_code[nop+1] = index_vara[j]+y_size*y_kmin;
      save_code[nop+2] = 0;
      save_code[nop+3] = 0;
#ifdef DEBUG
      if ((nop+2) >= size_of_save_code)
        mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code);
#endif
      nop += 4;
    }
  // Step 2: per-instruction index strides, measured against period beg_t-1.
  // nop1 walks save_code (4 ints per instruction), nopa walks diff (2 ints).
  i = beg_t*Size-1;
  nop1 = nopa = 0;
  for (j = i; j > i-Size; j--)
    {
      pos = pivot[j];
      nb_var = At_Row(pos, &first);
      first = first->NZE_R_N;
      nb_var--;
      // IFLDZ carries no index.
      diff[nopa] = 0;
      diff[nopa+1] = 0;
      nopa += 2;
      nop1 += 4;
      for (k = 0; k < nb_var; k++)
        {
          diff[nopa] = save_code[nop1+1]-(index_vara[first->c_index]+cal_y);
          diff[nopa+1] = save_code[nop1+2]-(first->u_index);
#ifdef DEBUG
          if ((nop1+2) >= size_of_save_code)
            mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code);
          if ((nopa+1) >= size_of_diff)
            mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff);
#endif
          nopa += 2;
          nop1 += 4;
          first = first->NZE_R_N;
        }
      diff[nopa] = save_code[nop1+1]-(b[pos]);
      diff[nopa+1] = 0;
#ifdef DEBUG
      if ((nop1+3) >= size_of_save_code)
        mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code);
      if ((nopa+1) >= size_of_diff)
        mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff);
#endif
      nopa += 2;
      nop1 += 4;
      diff[nopa] = save_code[nop1+1]-(index_vara[j]+y_size*y_kmin);
      diff[nopa+1] = 0;
#ifdef DEBUG
      if ((nop1+4) >= size_of_save_code)
        mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code);
      if ((nopa+1) >= size_of_diff)
        mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff);
#endif
      nopa += 2;
      nop1 += 4;
    }
  // Step 3: replay the instructions from the last period down to beg_t.
  max_var = (periods+y_kmin)*y_size;
  min_var = y_kmin*y_size;
  for (int t = periods+y_kmin-1; t >= beg_t+y_kmin; t--)
    {
      // These j and k shadow the long int variables declared at the top.
      int j = 0, k;
      int ti = t-y_kmin-beg_t;
      for (int i = 0; i < nop; i += 4)
        {
          switch (save_code[i])
            {
            case IFLDZ:
              yy = 0;
              break;
            case IFMUL:
              k = save_code[i+1]+ti*diff[j];
              // Only y entries strictly inside (min_var, max_var) contribute.
              // NOTE(review): the lower bound is strict (k > min_var) - confirm.
              if (k < max_var && k > min_var)
                {
                  yy += y[k]*u[save_code[i+2]+ti*diff[j+1]];
                }
              break;
            case IFADD:
              yy = -(yy+u[save_code[i+1]+ti*diff[j]]);
              break;
            case IFSTP:
              k = save_code[i+1]+ti*diff[j];
              err = yy - y[k];
              y[k] += slowc*(err);
              break;
            }
          j += 2;
        }
    }
  mxFree(save_code);
  mxFree(diff);
  return (beg_t);
}

void
dynSparseMatrix::bksub(int tbreak, int last_period, int Size, double slowc_l)
{
  NonZeroElem *first;
  int i, j, k;
  double yy;
  for (int i = 0; i < y_size*(periods+y_kmin); i++)
    y[i] = ya[i];
  if (symbolic && tbreak)
    last_period = complete(tbreak, Size, periods, b);
  else
    last_period = periods;
  for (int t = last_period+y_kmin-1; t >= y_kmin; t--)
    {
      int ti = (t-y_kmin)*Size;
      int cal = y_kmin*Size;
      int cal_y = y_size*y_kmin;
      for (i = ti-1; i >= ti-Size; i--)
        {
          j = i+cal;
          int pos = pivot[i+Size];
          int nb_var = At_Row(pos, &first);
          first = first->NZE_R_N;
          nb_var--;
          int eq = index_vara[j]+y_size;
          yy = 0;
          for (k = 0; k < nb_var; k++)
            {
              yy += y[index_vara[first->c_index]+cal_y]*u[first->u_index];
              first = first->NZE_R_N;
            }
          yy = -(yy+y[eq]+u[b[pos]]);
          direction[eq] = yy;
          y[eq] += slowc_l*yy;
        }
    }
}

void
dynSparseMatrix::simple_bksub(int it_, int Size, double slowc_l)
{
  int i, k;
  double yy;
  NonZeroElem *first;
  for (int i = 0; i < y_size; i++)
    y[i+it_*y_size] = ya[i+it_*y_size];
  for (i = Size-1; i >= 0; i--)
    {
      int pos = pivot[i];
      int nb_var = At_Row(pos, &first);
      first = first->NZE_R_N;
      nb_var--;
      int eq = index_vara[i];
      yy = 0;
      for (k = 0; k < nb_var; k++)
        {
          yy += y[index_vara[first->c_index]+it_*y_size]*u[first->u_index];
          first = first->NZE_R_N;
        }
      yy = -(yy+y[eq+it_*y_size]+u[b[pos]]);
      direction[eq+it_*y_size] = yy;
      y[eq+it_*y_size] += slowc_l*yy;
    }
}

/*
 * Debugging aid: compares the current sparse system with reference values read
 * from the text file "Result<iter>".
 *
 * File layout as read here: row count, column count, then for each column
 * `row` values, then a row count followed by that many right-hand-side values.
 * Matrix entries are compared in relative terms against u (tolerance 1e-7);
 * the right-hand side is compared through u[b[i]] + B[i]. Every mismatch is
 * printed with mexPrintf.
 *
 * The y_size, y_kmin, y_kmax, Size and periods arguments are not used.
 * Throws FatalExceptionHandling if the file cannot be opened.
 */
void
dynSparseMatrix::CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods)
{
  const double epsilon = 1e-7;
  const int no_more_row = -9999999;   // sentinel: the column list is exhausted

  ostringstream out;
  out << "Result" << iter;
  fstream SaveResult(out.str().c_str(), ios::in);
  if (!SaveResult.is_open())
    {
      ostringstream tmp;
      tmp << " in CheckIt, Result file cannot be opened\n";
      throw FatalExceptionHandling(tmp.str());
    }

  mexPrintf("Reading Result...");
  int row, col;
  SaveResult >> row;
  mexPrintf("row=%d\n", row);
  SaveResult >> col;
  mexPrintf("col=%d\n", col);
  double G1a;
  mexPrintf("Allocated\n");

  NonZeroElem *first;
  for (int c = 0; c < col; c++)
    {
      mexPrintf("j=%d ", c);
      const int nb_equ = At_Col(c, &first);
      mexPrintf("nb_equ=%d\n", nb_equ);
      // Row index of the next stored element of this column.
      int line = first ? first->r_index : no_more_row;
      for (int r = 0; r < row; r++)
        {
          SaveResult >> G1a;
          if (line != r)
            {
              // No stored element here: the reference value must be zero.
              if (G1a != 0.0)
                mexPrintf("Problem at r=%d c=%d G1a[i][j]=%f\n", r, c, G1a);
              continue;
            }
          if (abs(u[first->u_index]/G1a-1) > epsilon)
            mexPrintf("Problem at r=%d c=%d u[first->u_index]=%5.14f G1a[i][j]=%5.14f %f\n", r, c, u[first->u_index], G1a, u[first->u_index]/G1a-1);
          first = first->NZE_C_N;
          line = first ? first->r_index : no_more_row;
        }
    }

  SaveResult >> row;
  mexPrintf("row(2)=%d\n", row);
  double *B = (double *) mxMalloc(row*sizeof(double));
  test_mxMalloc(B, __LINE__, __FILE__, __func__, row*sizeof(double));
  for (int r = 0; r < row; r++)
    SaveResult >> B[r];
  SaveResult.close();
  mexPrintf("done\n");

  mexPrintf("Comparing...");
  for (int r = 0; r < row; r++)
    if (abs(u[b[r]]+B[r]) > epsilon)
      mexPrintf("Problem at i=%d u[b[i]]=%f B[i]=%f\n", r, u[b[r]], B[r]);
  mxFree(B);
}

void
dynSparseMatrix::Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size, double *u, int *pivot, int *b)
{
  const double epsilon = 1e-10;
  Init_GE(periods, y_kmin, y_kmax, Size, IM_i);
  NonZeroElem *first;
  int cal_y = y_kmin*Size;
  mexPrintf("     ");
  for (int i = 0; i < Size; i++)
    mexPrintf(" %8d", i);
  mexPrintf("\n");
  for (int t = y_kmin; t < periods+y_kmin; t++)
    {
      mexPrintf("t=%5d", t);
      for (int i = 0; i < Size; i++)
        mexPrintf(" %d %1.6f", t*y_size+index_vara[i], y[t*y_size+index_vara[i]]);
      mexPrintf("\n");
    }
  for (int i = 0; i < Size*periods; i++)
    {
      double res = 0;
      int pos = pivot[i];
      mexPrintf("pos[%d]=%d", i, pos);
      int nb_var = At_Row(pos, &first);
      mexPrintf(" nb_var=%d\n", nb_var);
      for (int j = 0; j < nb_var; j++)
        {
          mexPrintf("(y[%d]=%f)*(u[%d]=%f)(r=%d, c=%d)\n", index_vara[first->c_index]+cal_y, y[index_vara[first->c_index]+cal_y], first->u_index, u[first->u_index], first->r_index, first->c_index);
          res += y[index_vara[first->c_index]+cal_y]*u[first->u_index];
          first = first->NZE_R_N;
        }
      double tmp_ = res;
      res += u[b[pos]];
      if (abs(res) > epsilon)
        mexPrintf("Error for equation %d => res=%f y[%d]=%f u[b[%d]]=%f somme(y*u)=%f\n", pos, res, pos, y[index_vara[pos]], pos, u[b[pos]], tmp_);
    }
}

/*
 * Returns a newly created dense matrix C = A - B.
 *
 * A_m and B_m are dense real matrices that must have the same dimensions; the
 * result has the dimensions of A_m. The caller owns the returned mxArray.
 *
 * The result used to be allocated with the column count of B_m while being
 * filled over the columns of A_m, which wrote past the end of the buffer
 * whenever B_m had fewer columns than A_m.
 */
mxArray *
dynSparseMatrix::substract_A_B(mxArray *A_m, mxArray *B_m)
{
  size_t n_A = mxGetN(A_m);
  size_t m_A = mxGetM(A_m);
  double *A_d = mxGetPr(A_m);
  double *B_d = mxGetPr(B_m);
  mxArray *C_m = mxCreateDoubleMatrix(m_A, n_A, mxREAL);
  double *C_d = mxGetPr(C_m);
  // Column-major storage: the element-wise difference is a single linear pass.
  const size_t nb_elem = m_A*n_A;
  for (size_t k = 0; k < nb_elem; k++)
    C_d[k] = A_d[k] - B_d[k];
  return C_m;
}

/*
 * Returns a newly created dense matrix C = A - B, where A_m is dense and B_m is
 * sparse. C starts as a copy of A_m and each stored element of B_m is
 * subtracted from the matching position. The caller owns the returned mxArray.
 */
mxArray *
dynSparseMatrix::Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m)
{
  const size_t nb_col_B = mxGetN(B_m);
  const size_t nb_row_B = mxGetM(B_m);
  const mwIndex *row_of = mxGetIr(B_m);
  const mwIndex *col_start = mxGetJc(B_m);
  const double *val_B = mxGetPr(B_m);

  mxArray *C_m = mxDuplicateArray(A_m);
  double *C_d = mxGetPr(C_m);

  // Walk the stored elements of B column by column.
  for (size_t col = 0; col < nb_col_B; col++)
    {
      double *C_col = C_d + col*nb_row_B;
      for (size_t k = col_start[col]; k < col_start[col+1]; k++)
        C_col[row_of[k]] -= val_B[k];
    }
  return C_m;
}

/*
 * Returns a newly created sparse matrix C = A - B, where A_m and B_m are sparse
 * real matrices of identical dimensions. The caller owns the returned mxArray.
 *
 * Each column is produced by merging the (row-sorted) elements of the same
 * column of A and B. An element is stored for every position present in A or
 * in B, even when the difference is zero.
 *
 * The previous implementation merged the two element streams globally; once A
 * was exhausted it kept reading A's row index and could still select the "A"
 * branch, reading past the end of A's arrays without terminating. It also
 * reserved room for a completely dense result.
 */
mxArray *
dynSparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m)
{
  size_t n_A = mxGetN(A_m);
  size_t m_A = mxGetM(A_m);
  mwIndex *A_i = mxGetIr(A_m);
  mwIndex *A_j = mxGetJc(A_m);
  size_t total_nze_A = A_j[n_A];
  double *A_d = mxGetPr(A_m);
  size_t n_B = mxGetN(B_m);
  mwIndex *B_i = mxGetIr(B_m);
  mwIndex *B_j = mxGetJc(B_m);
  size_t total_nze_B = B_j[n_B];
  double *B_d = mxGetPr(B_m);

  // The result holds at most nnz(A)+nnz(B) elements, and never more than a
  // dense matrix would; keep at least one slot.
  size_t nzmax_C = total_nze_A + total_nze_B;
  if (nzmax_C > m_A*n_B)
    nzmax_C = m_A*n_B;
  if (nzmax_C == 0)
    nzmax_C = 1;
  mxArray *C_m = mxCreateSparse(m_A, n_B, nzmax_C, mxREAL);
  mwIndex *C_i = mxGetIr(C_m);
  mwIndex *C_j = mxGetJc(C_m);
  double *C_d = mxGetPr(C_m);

  size_t nze_C = 0;
  for (size_t col = 0; col < n_B; col++)
    {
      C_j[col] = nze_C;
      // Element ranges of this column; A contributes nothing beyond its own
      // last column.
      size_t nze_A = 0, end_A = 0;
      if (col < n_A)
        {
          nze_A = A_j[col];
          end_A = A_j[col+1];
        }
      size_t nze_B = B_j[col];
      const size_t end_B = B_j[col+1];
      while (nze_A < end_A || nze_B < end_B)
        {
          if (nze_B >= end_B || (nze_A < end_A && A_i[nze_A] < B_i[nze_B]))
            {
              // Position present in A only.
              C_i[nze_C] = A_i[nze_A];
              C_d[nze_C] = A_d[nze_A++];
            }
          else if (nze_A >= end_A || B_i[nze_B] < A_i[nze_A])
            {
              // Position present in B only.
              C_i[nze_C] = B_i[nze_B];
              C_d[nze_C] = -B_d[nze_B++];
            }
          else
            {
              // Same row in both matrices.
              C_i[nze_C] = A_i[nze_A];
              C_d[nze_C] = A_d[nze_A++] - B_d[nze_B++];
            }
          nze_C++;
        }
    }
  C_j[n_B] = nze_C;
  mxSetNzmax(C_m, nze_C);
  return C_m;
}

/*
 * Returns a newly created dense matrix C = A' * B, where A_m is sparse
 * (m_A x n_A) and B_m is dense with as many rows as A_m. The result is
 * n_A x n_B. The caller owns the returned mxArray.
 *
 * Element (i, j) of C is the dot product of column i of A with column j of B.
 *
 * Two defects of the previous version are fixed: B was always read in its
 * first column (B_d[i_A] ignored j), and the result was allocated with m_A
 * rows although it is indexed with n_A rows. Results are unchanged when A is
 * square and B has a single column.
 */
mxArray *
dynSparseMatrix::mult_SAT_B(mxArray *A_m, mxArray *B_m)
{
  size_t n_A = mxGetN(A_m);
  mwIndex *A_i = mxGetIr(A_m);
  mwIndex *A_j = mxGetJc(A_m);
  double *A_d = mxGetPr(A_m);
  size_t n_B = mxGetN(B_m);
  size_t m_B = mxGetM(B_m);
  double *B_d = mxGetPr(B_m);
  mxArray *C_m = mxCreateDoubleMatrix(n_A, n_B, mxREAL);
  double *C_d = mxGetPr(C_m);
  for (size_t j = 0; j < n_B; j++)
    {
      const double *B_col = B_d + j*m_B;
      double *C_col = C_d + j*n_A;
      for (size_t i = 0; i < n_A; i++)
        {
          double sum = 0;
          for (size_t nze_A = A_j[i]; nze_A < A_j[i+1]; nze_A++)
            sum += A_d[nze_A] * B_col[A_i[nze_A]];
          C_col[i] = sum;
        }
    }
  return C_m;
}

/*
 * Returns a newly created sparse matrix C = A' * B, where A_m is sparse
 * (m_A x n_A) and B_m is dense with as many rows as A_m. The result is
 * n_A x n_B; products whose absolute value does not exceed 1e-10 are not
 * stored. The caller owns the returned mxArray.
 *
 * Defects of the previous version fixed here:
 *  - B was always read in its first column (B_d[i_A] ignored j);
 *  - the trailing column pointers were filled up to the row count of B instead
 *    of the column count of C, writing past the end of the jc array;
 *  - the result was dimensioned with m_A rows although its row indices range
 *    over the n_A columns of A.
 */
mxArray *
dynSparseMatrix::Sparse_mult_SAT_B(mxArray *A_m, mxArray *B_m)
{
  size_t n_A = mxGetN(A_m);
  mwIndex *A_i = mxGetIr(A_m);
  mwIndex *A_j = mxGetJc(A_m);
  double *A_d = mxGetPr(A_m);
  size_t n_B = mxGetN(B_m);
  size_t m_B = mxGetM(B_m);
  double *B_d = mxGetPr(B_m);
  // No tighter bound on the fill-in is known beforehand: reserve a full matrix.
  mxArray *C_m = mxCreateSparse(n_A, n_B, n_A*n_B, mxREAL);
  mwIndex *C_i = mxGetIr(C_m);
  mwIndex *C_j = mxGetJc(C_m);
  double *C_d = mxGetPr(C_m);
  size_t nze_C = 0;
  //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
  for (size_t j = 0; j < n_B; j++)
    {
      C_j[j] = nze_C;
      const double *B_col = B_d + j*m_B;
      for (size_t i = 0; i < n_A; i++)
        {
          double sum = 0;
          for (size_t nze_A = A_j[i]; nze_A < A_j[i+1]; nze_A++)
            sum += A_d[nze_A] * B_col[A_i[nze_A]];
          if (fabs(sum) > 1e-10)
            {
              C_d[nze_C] = sum;
              C_i[nze_C] = i;
              nze_C++;
            }
        }
    }
  C_j[n_B] = nze_C;
  mxSetNzmax(C_m, nze_C);
  return C_m;
}

/*
 * Returns a newly created sparse matrix C = A' * B, where A_m (m_A x n_A) and
 * B_m (m_A x n_B) are both sparse. The result is n_A x n_B; products whose
 * absolute value does not exceed 1e-10 are not stored. The caller owns the
 * returned mxArray.
 *
 * Element (i, j) is the dot product of column i of A and column j of B,
 * computed by merging their row-sorted element lists.
 *
 * The result used to be dimensioned (and its capacity computed) with m_A rows
 * although its row indices range over the n_A columns of A; this is corrected.
 * Results are unchanged when A is square.
 */
mxArray *
dynSparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m)
{
  size_t n_A = mxGetN(A_m);
  mwIndex *A_i = mxGetIr(A_m);
  mwIndex *A_j = mxGetJc(A_m);
  double *A_d = mxGetPr(A_m);
  size_t n_B = mxGetN(B_m);
  mwIndex *B_i = mxGetIr(B_m);
  mwIndex *B_j = mxGetJc(B_m);
  double *B_d = mxGetPr(B_m);
  // No tighter bound on the fill-in is known beforehand: reserve a full matrix.
  mxArray *C_m = mxCreateSparse(n_A, n_B, n_A*n_B, mxREAL);
  mwIndex *C_i = mxGetIr(C_m);
  mwIndex *C_j = mxGetJc(C_m);
  double *C_d = mxGetPr(C_m);
  size_t nze_C = 0;
  for (size_t j = 0; j < n_B; j++)
    {
      C_j[j] = nze_C;
      const size_t end_B = B_j[j+1];
      for (size_t i = 0; i < n_A; i++)
        {
          double sum = 0;
          size_t nze_A = A_j[i];
          size_t nze_B = B_j[j];
          const size_t end_A = A_j[i+1];
          // Advance whichever list has the smaller row index; multiply on a match.
          while (nze_A < end_A && nze_B < end_B)
            {
              size_t i_A = A_i[nze_A];
              size_t i_B = B_i[nze_B];
              if (i_A == i_B)
                sum += A_d[nze_A++] * B_d[nze_B++];
              else if (i_A < i_B)
                nze_A++;
              else
                nze_B++;
            }
          if (fabs(sum) > 1e-10)
            {
              C_d[nze_C] = sum;
              C_i[nze_C] = i;
              nze_C++;
            }
        }
    }
  C_j[n_B] = nze_C;
  mxSetNzmax(C_m, nze_C);
  return C_m;
}

/*
 * Returns a newly created sparse matrix C = A', where A_m is a sparse real
 * matrix (m_A x n_A). The result is n_A x m_A and holds the same number of
 * stored elements. The caller owns the returned mxArray.
 *
 * The elements are collected in a map keyed by (row of A, column of A); since
 * the map iterates in key order, they come out sorted by column of C and then
 * by row of C, which is the order required by the compressed-column layout.
 *
 * The column-pointer array is now cleared over its full length; the previous
 * memset cleared only m_A bytes instead of (m_A+1) mwIndex entries.
 */
mxArray *
dynSparseMatrix::Sparse_transpose(mxArray *A_m)
{
  size_t n_A = mxGetN(A_m);
  size_t m_A = mxGetM(A_m);
  mwIndex *A_i = mxGetIr(A_m);
  mwIndex *A_j = mxGetJc(A_m);
  size_t total_nze_A = A_j[n_A];
  double *A_d = mxGetPr(A_m);
  mxArray *C_m = mxCreateSparse(n_A, m_A, total_nze_A, mxREAL);
  mwIndex *C_i = mxGetIr(C_m);
  mwIndex *C_j = mxGetJc(C_m);
  double *C_d = mxGetPr(C_m);
  unsigned int nze_C = 0, nze_A = 0;
  memset(C_j, 0, (m_A+1)*sizeof(mwIndex));
  map<pair<mwIndex, unsigned int>, double> B2;
  for (unsigned int i = 0; i < n_A; i++)
    {
      while (nze_A < (unsigned int) A_j[i+1])
        {
          // Count the elements of each row of A (= column of C) one slot ahead,
          // so that the running sum below yields the column pointers.
          C_j[A_i[nze_A]+1]++;
          B2[make_pair(A_i[nze_A], i)] = A_d[nze_A];
          nze_A++;
        }
    }
  for (unsigned int i = 0; i < m_A; i++)
    C_j[i+1] += C_j[i];
  for (map<pair<mwIndex, unsigned int>, double>::const_iterator it = B2.begin(); it != B2.end(); it++)
    {
      C_d[nze_C] = it->second;
      C_i[nze_C++] = it->first.second;
    }
  return C_m;
}

// sign(a, b): magnitude of a with the sign of b (b >= 0 is treated as positive)
#define sign(a, b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
/* Searches for three abscissas *ax, *bx, *cx bracketing a minimum of the
   one-dimensional function evaluated by compute_complete(x, &f).

   On entry *ax and *bx are two distinct starting points. On exit *ax, *bx, *cx
   and the matching function values *fa, *fb, *fc have been updated in place.
   Returns false as soon as one call to compute_complete() fails, true otherwise.
   NOTE(review): the structure looks like the classical "mnbrak" bracketing
   routine (golden-ratio expansion + parabolic extrapolation) -- confirm. */
bool
dynSparseMatrix::mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc)
{
  // Default ratio by which successive intervals are magnified
  const double GOLD = 1.618034;
  // Maximum magnification allowed for a parabolic-fit step
  const double GLIMIT = 100.0;
  // Prevents a division by zero in the parabolic extrapolation
  const double TINY = 1.0e-20;

  double tmp;
  mexPrintf("bracketing *ax=%f, *bx=%f\n", *ax, *bx);
  //mexEvalString("drawnow;");
  double ulim, u, r, q, fu;
  if (!compute_complete(*ax, fa))
    return false;
  if (!compute_complete(*bx, fb))
    return false;
  // Swap the roles of a and b if needed, so that fb <= fa (downhill from a to b)
  if (*fb > *fa)
    {
      tmp = *ax;
      *ax = *bx;
      *bx = tmp;

      tmp = *fa;
      *fa = *fb;
      *fb = tmp;
    }
  // First guess for c
  *cx = (*bx)+GOLD*(*bx-*ax);
  if (!compute_complete(*cx, fc))
    return false;
  // Keep going until the function value at c is no longer below the one at b
  while (*fb > *fc)
    {
      // u is obtained by parabolic extrapolation from (a, b, c)
      r = (*bx-*ax)*(*fb-*fc);
      q = (*bx-*cx)*(*fb-*fa);
      u = (*bx)-((*bx-*cx)*q-(*bx-*ax)*r)
        /(2.0*sign(fmax(fabs(q-r), TINY), q-r));
      // Farthest point the step is allowed to reach
      ulim = (*bx)+GLIMIT*(*cx-*bx);
      if ((*bx-u)*(u-*cx) > 0.0)
        {
          // u lies strictly between b and c
          if (!compute_complete(u, &fu))
            return false;
          if (fu < *fc)
            {
              // Minimum between b and c: the bracket is (b, u, c)
              *ax = (*bx);
              *bx = u;
              *fa = (*fb);
              *fb = fu;
              return true;
            }
          else if (fu > *fb)
            {
              // Minimum between a and u: the bracket is (a, b, u)
              *cx = u;
              *fc = fu;
              return true;
            }
          // The parabolic fit did not help: use the default magnification
          u = (*cx)+GOLD*(*cx-*bx);
          if (!compute_complete(u, &fu))
            return false;
        }
      else if ((*cx-u)*(u-ulim) > 0.0)
        {
          // u lies strictly between c and the allowed limit
          if (!compute_complete(u, &fu))
            return false;
          if (fu < *fc)
            {
              // Shift (b, c, u) <- (c, u, next default-magnification point)
              *bx = *cx;
              *cx = u;
              u = *cx+GOLD*(*cx-*bx);
              *fb = *fc;
              *fc = fu;
              if (!compute_complete(u, &fu))
                return false;
            }
        }
      else if ((u-ulim)*(ulim-*cx) >= 0.0)
        {
          // u is at or beyond the limit: clamp it to ulim
          u = ulim;
          if (!compute_complete(u, &fu))
            return false;
        }
      else
        {
          // Reject the parabolic u and use the default magnification
          u = (*cx)+GOLD*(*cx-*bx);
          if (!compute_complete(u, &fu))
            return false;
        }
      // Drop the oldest point and continue with (b, c, u)
      *ax = *bx;
      *bx = *cx;
      *cx = u;
      *fa = *fb;
      *fb = *fc;
      *fc = fu;
    }
  return true;
}

/* Golden-section search for a minimum of the one-dimensional function
   evaluated by compute_complete(x, &f), starting from the triplet (ax, bx, cx).

   The search stops when one of the following holds:
     - the outer interval |x3-x0| is no larger than tol*(|x1|+|x2|),
     - one of the two inner function values is no larger than solve_tolf,
     - 100 iterations have been performed,
     - the two inner points are closer than 1e-4.
   On success the inner point with the smaller function value is stored in
   *xmin and true is returned. false is returned (and *xmin is left untouched)
   as soon as one call to compute_complete() fails. */
bool
dynSparseMatrix::golden(double ax, double bx, double cx, double tol, double solve_tolf, double *xmin)
{
  const double R = 0.61803399;
  const double C = (1.0-R);
  const int max_iter = 100;
  mexPrintf("golden\n");
  //mexEvalString("drawnow;");
  double f1, f2, x0, x1, x2, x3;
  int iter = 0;
  x0 = ax;
  x3 = cx;
  // Put the new trial point inside the larger of the two segments
  if (fabs(cx-bx) > fabs(bx-ax))
    {
      x1 = bx;
      x2 = bx+C*(cx-bx);
    }
  else
    {
      x2 = bx;
      x1 = bx-C*(bx-ax);
    }
  if (!compute_complete(x1, &f1))
    return false;
  if (!compute_complete(x2, &f2))
    return false;
  // fabs() is used throughout: an unqualified abs() on doubles depends on which
  // overloads happen to be visible and may silently fall back to the integer one
  while ((fabs(x3-x0) > tol*(fabs(x1)+fabs(x2)) && (f1 > solve_tolf && f2 > solve_tolf)) && (iter < max_iter) && (fabs(x1 - x2) > 1e-4))
    {
      if (f2 < f1)
        {
          // The minimum is in (x1, x3): shift the points to the right
          x0 = x1;
          x1 = x2;
          x2 = R*x1+C*x3;
          f1 = f2;
          if (!compute_complete(x2, &f2))
            return false;
        }
      else
        {
          // The minimum is in (x0, x2): shift the points to the left
          x3 = x2;
          x2 = x1;
          x1 = R*x2+C*x0;
          f2 = f1;
          if (!compute_complete(x1, &f1))
            return false;
        }
      iter++;
    }
  // Report the better of the two inner points
  *xmin = (f1 < f2) ? x1 : x2;
  return true;
}

/* Solves the stacked system A_m * x = b_m period by period, working on
   Size x Size blocks extracted from A_m, and then updates y and direction.

   Outline, as visible from the code below:
     1. the first 2*Size columns of A_m are split into the blocks
          B1 C1
          A2 B2
          A3
        and the first 2*Size entries of b_m into b1 and b2;
     2. for each period, B1 is inverted through MATLAB's "inv" (only when the
        test on sumc/C_sumc triggers), S1 and d1 are computed from the inverse,
        B1 and b1 are updated from (A2, S1, B2, b2, d1) and the pair (S1, d1) is
        stored in triangular_form; the values of the next blocks are then read
        from A_m;
     3. a backward pass over triangular_form recovers the solution of each
        period; for each unknown, yy = -(solution + y[eq]) is stored in
        direction[eq] and y[eq] is increased by slowc_l * yy.

   A_m and b_m are destroyed before returning. The parameters
   is_two_boundaries and it_ are not used in this function.
   Throws FatalExceptionHandling if the data of A_m or b_m cannot be retrieved.

   NOTE(review): Sparse_mult_SAT_SB and mult_SAT_B presumably compute A'*B, and
   Sparse_substract_SA_SB / substract_A_B presumably compute A-B -- confirm
   against their definitions.
   NOTE(review): several arrays look like they are never destroyed (the old b1
   when it is reassigned, B1_inv, the last B1_inv_t, S1 and d1 of the last
   period, intermediate d1 of the backward pass) -- confirm ownership before
   changing anything. */
void
dynSparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int Size, double slowc_l, bool is_two_boundaries, int  it_)
{
  mxArray *B1, *C1, *A2, *B2, *A3, *b1, *b2;
  double *b_m_d = mxGetPr(b_m);
  if (!b_m_d)
    {
      ostringstream tmp;
      tmp << " in Solve_Matlab_Relaxation, can't retrieve b_m vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mwIndex *A_m_i = mxGetIr(A_m);
  if (!A_m_i)
    {
      ostringstream tmp;
      tmp << " in Solve_Matlab_Relaxation, can't allocate A_m_i index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mwIndex *A_m_j = mxGetJc(A_m);
  if (!A_m_j)
    {
      ostringstream tmp;
      tmp << " in Solve_Matlab_Relaxation, can't allocate A_m_j index vector\n";
      throw FatalExceptionHandling(tmp.str());
    }
  double *A_m_d = mxGetPr(A_m);
  if (!A_m_d)
    {
      ostringstream tmp;
      tmp << " in Solve_Matlab_Relaxation, can't retrieve A matrix\n";
      throw FatalExceptionHandling(tmp.str());
    }
  // Total number of stored elements in the Size*periods columns of A_m
  size_t max_nze = A_m_j[Size*periods];
  // nze: position of the current element of A_m; var: current column of A_m
  unsigned int nze = 0;
  size_t var = A_m_j[nze];
  // Each block is allocated with room for a full Size x Size matrix; for each of
  // them X_nze counts the stored elements and X_var is the last column whose
  // column pointer has been written
  B1 = mxCreateSparse(Size, Size, Size*Size, mxREAL);
  mwIndex *B1_i = mxGetIr(B1);
  mwIndex *B1_j = mxGetJc(B1);
  double *B1_d = mxGetPr(B1);
  unsigned int B1_nze = 0;
  unsigned int B1_var = 0;
  B1_i[B1_nze] = 0;
  B1_j[B1_var] = 0;
  C1 = mxCreateSparse(Size, Size, Size*Size, mxREAL);
  mwIndex *C1_i = mxGetIr(C1);
  mwIndex *C1_j = mxGetJc(C1);
  double *C1_d = mxGetPr(C1);
  unsigned int C1_nze = 0;
  unsigned int C1_var = 0;
  C1_i[C1_nze] = 0;
  C1_j[C1_var] = 0;
  A2 = mxCreateSparse(Size, Size, Size*Size, mxREAL);
  mwIndex *A2_i = mxGetIr(A2);
  mwIndex *A2_j = mxGetJc(A2);
  double *A2_d = mxGetPr(A2);
  unsigned int A2_nze = 0;
  unsigned int A2_var = 0;
  A2_i[A2_nze] = 0;
  A2_j[A2_var] = 0;
  B2 = mxCreateSparse(Size, Size, Size*Size, mxREAL);
  mwIndex *B2_i = mxGetIr(B2);
  mwIndex *B2_j = mxGetJc(B2);
  double *B2_d = mxGetPr(B2);
  unsigned int B2_nze = 0;
  unsigned int B2_var = 0;
  B2_i[B2_nze] = 0;
  B2_j[B2_var] = 0;
  A3 = mxCreateSparse(Size, Size, Size*Size, mxREAL);
  mwIndex *A3_i = mxGetIr(A3);
  mwIndex *A3_j = mxGetJc(A3);
  double *A3_d = mxGetPr(A3);
  unsigned int A3_nze = 0;
  unsigned int A3_var = 0;
  A3_i[A3_nze] = 0;
  A3_j[A3_var] = 0;
  b1 = mxCreateDoubleMatrix(Size, 1, mxREAL);
  double *b1_d = mxGetPr(b1);
  b2 = mxCreateDoubleMatrix(Size, 1, mxREAL);
  double *b2_d = mxGetPr(b2);
  size_t eq = 0;
  /*B1 C1
    A2 B2
    A3*/
  // Scan the elements of the first 2*Size columns of A_m and dispatch each of
  // them to one of the five blocks according to its column (var) and row (eq)
  while (var < 2*Size && nze < max_nze)
    {
      // All the elements of column var have been consumed: copy the matching
      // right-hand side entry and move on to the next column
      if ((unsigned int) A_m_j[var+1] <= nze)
        {
          if (var < Size)
            b1_d[var] = b_m_d[var];
          else
            b2_d[var - Size] = b_m_d[var];
          var++;
        }
      eq = A_m_i[nze];
      if (var < Size)
        {
          // First block column: rows [0, Size) go to B1, the others to A2
          if (eq < Size)
            {
              while (B1_var < var)
                B1_j[++B1_var] = B1_nze;
              B1_i[B1_nze] = eq;
              B1_d[B1_nze] = A_m_d[nze];
              B1_nze++;
            }
          else
            {
              while (A2_var < var)
                A2_j[++A2_var] = A2_nze;
              A2_i[A2_nze] = eq - Size;
              A2_d[A2_nze] = A_m_d[nze];
              A2_nze++;
            }
        }
      else if (var < 2*Size)
        {
          // Second block column: rows [0, Size) go to C1, [Size, 2*Size) to B2,
          // the others to A3
          if (eq < Size)
            {
              while (C1_var < var - Size)
                C1_j[++C1_var] = C1_nze;
              C1_i[C1_nze] = eq;
              C1_d[C1_nze] = A_m_d[nze];
              C1_nze++;
            }
          else if (eq < 2*Size)
            {
              while (B2_var < var - Size)
                B2_j[++B2_var] = B2_nze;
              B2_i[B2_nze] = eq - Size;
              B2_d[B2_nze] = A_m_d[nze];
              B2_nze++;
            }
          else
            {
              while (A3_var < var - Size)
                A3_j[++A3_var] = A3_nze;
              A3_i[A3_nze] = eq - 2*Size;
              A3_d[A3_nze] = A_m_d[nze];
              A3_nze++;
            }
        }
      nze++;
    }
  // Close the column-pointer arrays of the five blocks
  while (B1_var < Size)
    B1_j[++B1_var] = B1_nze;
  while (C1_var < Size)
    C1_j[++C1_var] = C1_nze;
  while (A2_var < Size)
    A2_j[++A2_var] = A2_nze;
  while (B2_var < Size)
    B2_j[++B2_var] = B2_nze;
  while (A3_var < Size)
    A3_j[++A3_var] = A3_nze;
  mxArray *d1 = NULL;
  // One (S1, d1) pair per period except the last one, consumed by the backward pass
  vector<pair<mxArray *, mxArray *> > triangular_form;
  // sumc: sum of the absolute values of the elements of the current inverse;
  // C_sumc: value sumc had when the inverse was last recomputed
  double sumc = 0, C_sumc = 1000;
  mxArray *B1_inv = NULL;
  mxArray *B1_inv_t = NULL;
  for (int t = 1; t <= periods; t++)
    {
      // NOTE(review): abs() is applied to a double here; which overload is
      // selected depends on the headers in scope -- fabs() may be intended
      if (abs(sumc / C_sumc -1) > 1e-10*res1)
        {
          C_sumc = sumc;
          if (B1_inv)
            mxDestroyArray(B1_inv);
          mexCallMATLAB(1, &B1_inv, 1, &B1, "inv");
          mwIndex *B_inv_j = mxGetJc(B1_inv);
          size_t B_inv_nze = B_inv_j[Size];
          double *B_inv_d = mxGetPr(B1_inv);
          sumc = 0;
          for (unsigned int i = 0; i < B_inv_nze; i++)
            sumc += fabs(B_inv_d[i]);
        }
      B1_inv_t = Sparse_transpose(B1_inv);
      mxArray *S1 = Sparse_mult_SAT_SB(B1_inv_t, C1);

      d1 = mult_SAT_B(B1_inv_t, b1);
      if (t < periods)
        //Computation for the next lines
        {
          mxDestroyArray(B1_inv_t);
          mxArray *A2_t = Sparse_transpose(A2);
          mxDestroyArray(A2);

          mxArray *tmp = Sparse_mult_SAT_SB(A2_t, S1);
          mxDestroyArray(B1);
          B1 = Sparse_substract_SA_SB(B2, tmp);
          mxDestroyArray(tmp);

          tmp = mult_SAT_B(A2_t, d1);
          b1 = substract_A_B(b2, tmp);
          mxDestroyArray(tmp);

          triangular_form.push_back(make_pair(S1, d1));
          mxDestroyArray(A2_t);
        }
      A2 = mxDuplicateArray(A3);

      //I  S1
      //0  B1 C1  =>B1 =
      //   A2 B2  => A2 = A3
      //      A3
      C1_nze = B2_nze = A3_nze = 0;
      C1_var = B2_var = A3_var = 0;

      // Step back by one element before resuming the scan of A_m
      if (nze < max_nze)
        nze--;
      // Read the values of the next C1, B2 and A3 blocks from A_m; only the
      // value arrays are rewritten, the row indices and column pointers set up
      // by the first scan are left as they are
      while (var < (t+2)*Size && nze < max_nze)
        {
          if ((unsigned int) A_m_j[var+1] <= nze)
            {
              b2_d[var - (t+1) * Size] = b_m_d[var];
              var++;
            }
          eq = A_m_i[nze];
          if (eq < (t+1) * Size)
            {
              C1_d[C1_nze] = A_m_d[nze];
              C1_nze++;
            }
          else if (eq < (t+2)*Size)
            {
              B2_d[B2_nze] = A_m_d[nze];
              B2_nze++;
            }
          else
            {
              A3_d[A3_nze] = A_m_d[nze];
              A3_nze++;
            }
          nze++;
        }
    }
  // Last period: d1 directly provides the solution
  double *d1_d = mxGetPr(d1);
  for (unsigned i = 0; i < Size; i++)
    {
      int eq = index_vara[i+Size*(y_kmin+periods-1)];
      double yy = -(d1_d[i] + y[eq]);
      direction[eq] = yy;
      y[eq] += slowc_l * yy;
    }

  // Backward pass over the stored (S1, d1) pairs, from the last stored period
  // down to the first one
  pair<mxArray *, mxArray *> tf;
  for (int t = periods-2; t >= 0; t--)
    {
      mxArray *tmp;
      tf = triangular_form.back();
      triangular_form.pop_back();
      mxArray *tf_first_t = Sparse_transpose(tf.first);
      mxDestroyArray(tf.first);
      tmp = mult_SAT_B(tf_first_t, d1);
      d1 = substract_A_B(tf.second, tmp);
      d1_d = mxGetPr(d1);
      mxDestroyArray(tmp);
      for (unsigned i = 0; i < Size; i++)
        {
          int eq = index_vara[i+Size*(y_kmin+t)];
          double yy = -(d1_d[i] + y[eq]);
          direction[eq] = yy;
          y[eq] += slowc_l * yy;
        }
      mxDestroyArray(tf_first_t);
      mxDestroyArray(tf.second);
    }
  mxDestroyArray(B1);
  mxDestroyArray(C1);
  mxDestroyArray(A2);
  mxDestroyArray(B2);
  mxDestroyArray(A3);
  mxDestroyArray(b1);
  mxDestroyArray(b2);
  // The input arrays are destroyed here as well
  mxDestroyArray(A_m);
  mxDestroyArray(b_m);
}

/* Solves A_m * x = b_m by calling MATLAB's "mldivide", then uses the solution
   to update y and direction: for each row, yy = -(solution + current value) is
   stored in direction and the current value is increased by slowc_l * yy.

   When is_two_boundaries is true the unknowns are located through
   index_vara[i+Size*y_kmin]; otherwise through index_vara[i], with an offset
   of it_*y_size inside y. A_m, b_m and the solution array are destroyed
   before returning. */
void
dynSparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int  it_)
{
  const int nb_rows = (int) mxGetM(A_m);
  mxArray *mldivide_args[2] = { A_m, b_m };
  mxArray *solution = NULL;
  mexCallMATLAB(1, &solution, 2, mldivide_args, "mldivide");
  const double *solution_d = mxGetPr(solution);
  for (int row = 0; row < nb_rows; row++)
    {
      if (is_two_boundaries)
        {
          const int pos = index_vara[row+Size*y_kmin];
          const double step = -(solution_d[row] + y[pos]);
          direction[pos] = step;
          y[pos] += slowc_l * step;
        }
      else
        {
          const int pos = index_vara[row];
          const double step = -(solution_d[row] + y[pos+it_*y_size]);
          direction[pos] = step;
          y[pos+it_*y_size] += slowc_l * step;
        }
    }
  mxDestroyArray(A_m);
  mxDestroyArray(b_m);
  mxDestroyArray(solution);
}

/* Releases the UMFPACK Symbolic and Numeric objects held by this instance,
   skipping each handle that is null. */
void
dynSparseMatrix::End_Matlab_LU_UMFPack()
{
  if (Symbolic != NULL)
    {
      umfpack_dl_free_symbolic(&Symbolic);
    }
  if (Numeric != NULL)
    {
      umfpack_dl_free_numeric(&Numeric);
    }
}

/* Calls End_Matlab_LU_UMFPack() when the active solver settings are one of:
     - not steady_state, with stack_solve_algo equal to 0 or 4;
     - steady_state, with solve_algo equal to 6.
   Does nothing otherwise. */
void
dynSparseMatrix::End_Solver()
{
  const bool dynamic_case = !steady_state && (stack_solve_algo == 0 || stack_solve_algo == 4);
  const bool static_case = steady_state && solve_algo == 6;
  if (dynamic_case || static_case)
    End_Matlab_LU_UMFPack();
}

/* Debugging helper: prints the n x n matrix stored in compressed-column form
   (Ap: column pointers, Ai: row indices, Ax: values) as a dense matrix, one row
   per line, followed on each line by the matching entry of b.

   The dense copy is stored in a vector instead of a variable-length array
   (which is not standard C++ and could exhaust the stack for a large n).
   Values are read with the same index as the row indices. Nothing is printed
   when n is not strictly positive. */
void
dynSparseMatrix::Printfull_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n)
{
  if (n <= 0)
    return;
  const size_t dim = static_cast<size_t>(n);
  // Row-major dense copy, initialised to zero
  vector<double> A(dim*dim, 0.0);
  for (size_t col = 0; col < dim; col++)
    for (SuiteSparse_long j = Ap[col]; j < Ap[col+1]; j++)
      A[static_cast<size_t>(Ai[j]) * dim + col] = Ax[j];
  for (size_t row = 0; row < dim; row++)
    {
      for (size_t col = 0; col < dim; col++)
        mexPrintf("%4.1f ", A[row*dim+col]);
      mexPrintf("     %6.3f\n", b[row]);
    }
}

/* Debugging helper: prints every stored element of the n-column matrix given
   in compressed-column form (Ap: column pointers, Ai: row indices, Ax: values)
   as "(row, column)    value", using 1-based indices.

   The row index is a SuiteSparse_long: it is converted to long and printed with
   %ld, since passing it to %d is a format/argument mismatch. */
void
dynSparseMatrix::Print_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, int n)
{
  for (int i = 0; i < n; i++)
    for (SuiteSparse_long j = Ap[i]; j < Ap[i+1]; j++)
      mexPrintf("(%ld, %d)    %f\n", static_cast<long>(Ai[j]+1), i+1, Ax[j]);
}

/* Solves the n x n system given in compressed-column form (Ap, Ai, Ax) with
   right-hand side b using UMFPACK, then updates y, direction and possibly x.

   The symbolic analysis is only performed when iter == 0; the numeric
   factorization is recomputed at each call (the previous one being released
   when iter > 0). A FatalExceptionHandling is thrown if one UMFPACK step fails.

   Update rules (res being the solution):
     - two boundaries with a non-empty vector_table_conditional_local: the
       n / Size periods are processed in turn; in the first period, an unknown i
       whose entry has is_cond set updates x (column var_exo, row y_kmin) and
       gets a zero direction, every other unknown updates y;
     - two boundaries otherwise: unknown i is located at index_vara[i+Size*y_kmin];
     - one boundary: unknown i is located at index_vara[i], shifted by
       it_*y_size inside y.
   Ap, Ai, Ax and b are released with mxFree before returning. */
void
dynSparseMatrix::Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int  it_, vector_table_conditional_local_type vector_table_conditional_local)
{
  SuiteSparse_long status = 0, sys = 0;
#ifndef _MSC_VER
  double Control [UMFPACK_CONTROL], Info [UMFPACK_INFO], res [n];
#else
  double *Control = (double *) mxMalloc(UMFPACK_CONTROL * sizeof(double));
  test_mxMalloc(Control, __LINE__, __FILE__, __func__, UMFPACK_CONTROL * sizeof(double));
  double *Info = (double *) mxMalloc(UMFPACK_INFO * sizeof(double));
  test_mxMalloc(Info, __LINE__, __FILE__, __func__, UMFPACK_INFO * sizeof(double));
  double *res = (double *) mxMalloc(n * sizeof(double));
  test_mxMalloc(res, __LINE__, __FILE__, __func__, n * sizeof(double));
#endif

  umfpack_dl_defaults(Control);
  Control [UMFPACK_PRL] = 5;

  // Symbolic analysis on the first iteration, release of the previous numeric
  // factorization on the following ones
  if (iter == 0)
    {
      status = umfpack_dl_symbolic(n, n, Ap, Ai, Ax, &Symbolic, Control, Info);
      if (status < 0)
        {
          umfpack_dl_report_info(Control, Info);
          umfpack_dl_report_status(Control, status);
          ostringstream msg;
          msg << " umfpack_dl_symbolic failed\n";
          throw FatalExceptionHandling(msg.str());
        }
    }
  else if (iter > 0)
    umfpack_dl_free_numeric(&Numeric);

  status = umfpack_dl_numeric(Ap, Ai, Ax, Symbolic, &Numeric, Control, Info);
  if (status < 0)
    {
      umfpack_dl_report_info(Control, Info);
      umfpack_dl_report_status(Control, status);
      ostringstream msg;
      msg << " umfpack_dl_numeric failed\n";
      throw FatalExceptionHandling(msg.str());
    }

  status = umfpack_dl_solve(sys, Ap, Ai, Ax, res, b, Numeric, Control, Info);
  if (status != UMFPACK_OK)
    {
      umfpack_dl_report_info(Control, Info);
      umfpack_dl_report_status(Control, status);
      ostringstream msg;
      msg << " umfpack_dl_solve failed\n";
      throw FatalExceptionHandling(msg.str());
    }

  if (is_two_boundaries)
    {
      if (vector_table_conditional_local.size() > 0)
        {
          const int nb_periods = n / Size;
          for (int t = 0; t < nb_periods; t++)
            for (int i = 0; i < Size; i++)
              {
                const int pos = index_vara[i+Size*(t + y_kmin)];
                // Only the first period may contain flipped unknowns
                if (t == 0 && vector_table_conditional_local[i].is_cond)
                  {
                    const int flip_exo = vector_table_conditional_local[i].var_exo;
                    const double step = -(res[i] + x[y_kmin + flip_exo*nb_row_x]);
                    direction[pos] = 0;
                    x[flip_exo*nb_row_x + y_kmin] += slowc_l * step;
                  }
                else
                  {
                    const double step = -(res[i + Size * t] + y[pos]);
                    direction[pos] = step;
                    y[pos] += slowc_l * step;
                  }
              }
        }
      else
        {
          for (int i = 0; i < n; i++)
            {
              const int pos = index_vara[i+Size*y_kmin];
              const double step = -(res[i] + y[pos]);
              direction[pos] = step;
              y[pos] += slowc_l * step;
            }
        }
    }
  else
    {
      // Same update whether or not a conditional table was supplied
      for (int i = 0; i < n; i++)
        {
          const int pos = index_vara[i];
          const double step = -(res[i] + y[pos+it_*y_size]);
          direction[pos] = step;
          y[pos+it_*y_size] += slowc_l * step;
        }
    }

  mxFree(Ap);
  mxFree(Ai);
  mxFree(Ax);
  mxFree(b);
#ifdef _MSC_VER
  mxFree(Control);
  mxFree(Info);
  mxFree(res);
#endif
}

/* Solves the n x n system given in compressed-column form (Ap, Ai, Ax) with
   right-hand side b using UMFPACK, then updates y and direction.

   The symbolic analysis is only performed when iter == 0; the numeric
   factorization is recomputed at each call (the previous one being released
   when iter > 0). A FatalExceptionHandling is thrown if one UMFPACK step fails.

   For each unknown i, yy = -(res[i] + current value) is stored in direction and
   the current value is increased by slowc_l * yy. With two boundaries the
   unknown is located at index_vara[i+Size*y_kmin]; otherwise at index_vara[i],
   shifted by it_*y_size inside y.
   Ap, Ai, Ax and b are released with mxFree before returning. */
void
dynSparseMatrix::Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int  it_)
{
  SuiteSparse_long status = 0, sys = 0;
#ifndef _MSC_VER
  double Control [UMFPACK_CONTROL], Info [UMFPACK_INFO], res [n];
#else
  double *Control = (double *) mxMalloc(UMFPACK_CONTROL * sizeof(double));
  test_mxMalloc(Control, __LINE__, __FILE__, __func__, UMFPACK_CONTROL * sizeof(double));
  double *Info = (double *) mxMalloc(UMFPACK_INFO * sizeof(double));
  test_mxMalloc(Info, __LINE__, __FILE__, __func__, UMFPACK_INFO * sizeof(double));
  double *res = (double *) mxMalloc(n * sizeof(double));
  test_mxMalloc(res, __LINE__, __FILE__, __func__, n * sizeof(double));
#endif

  umfpack_dl_defaults(Control);
  Control [UMFPACK_PRL] = 5;

  // First iteration: symbolic analysis. Later iterations: drop the previous
  // numeric factorization before computing a new one
  if (iter == 0)
    {
      status = umfpack_dl_symbolic(n, n, Ap, Ai, Ax, &Symbolic, Control, Info);
      if (status < 0)
        {
          umfpack_dl_report_info(Control, Info);
          umfpack_dl_report_status(Control, status);
          ostringstream failure;
          failure << " umfpack_dl_symbolic failed\n";
          throw FatalExceptionHandling(failure.str());
        }
    }
  else if (iter > 0)
    umfpack_dl_free_numeric(&Numeric);

  status = umfpack_dl_numeric(Ap, Ai, Ax, Symbolic, &Numeric, Control, Info);
  if (status < 0)
    {
      umfpack_dl_report_info(Control, Info);
      umfpack_dl_report_status(Control, status);
      ostringstream failure;
      failure << " umfpack_dl_numeric failed\n";
      throw FatalExceptionHandling(failure.str());
    }

  status = umfpack_dl_solve(sys, Ap, Ai, Ax, res, b, Numeric, Control, Info);
  if (status != UMFPACK_OK)
    {
      umfpack_dl_report_info(Control, Info);
      umfpack_dl_report_status(Control, status);
      ostringstream failure;
      failure << " umfpack_dl_solve failed\n";
      throw FatalExceptionHandling(failure.str());
    }

  for (int row = 0; row < n; row++)
    {
      if (is_two_boundaries)
        {
          const int pos = index_vara[row+Size*y_kmin];
          const double step = -(res[row] + y[pos]);
          direction[pos] = step;
          y[pos] += slowc_l * step;
        }
      else
        {
          const int pos = index_vara[row];
          const double step = -(res[row] + y[pos+it_*y_size]);
          direction[pos] = step;
          y[pos+it_*y_size] += slowc_l * step;
        }
    }

  mxFree(Ap);
  mxFree(Ai);
  mxFree(Ax);
  mxFree(b);
#ifdef _MSC_VER
  mxFree(Control);
  mxFree(Info);
  mxFree(res);
#endif
}

/* Solves A_m * x = b_m with UMFPACK, reading the compressed-column arrays
   directly from the sparse mxArray A_m, then updates y and direction.

   Unlike the other overloads, this one performs a complete symbolic + numeric
   factorization at each call, in local objects (which shadow the members of the
   same name, as before). These local objects are released before returning:
   they were previously never freed, so every call leaked a factorization.

   A failing UMFPACK step is only reported through umfpack_dl_report_info; the
   update below is still applied.
   NOTE(review): in that case res does not hold a valid solution -- consider
   throwing a FatalExceptionHandling as the other overloads do.

   For each unknown i, yy = -(res[i] + current value) is stored in direction and
   the current value is increased by slowc_l * yy. With two boundaries the
   unknown is located at index_vara[i+Size*y_kmin]; otherwise at index_vara[i],
   shifted by it_*y_size inside y. A_m and b_m are destroyed before returning. */
void
dynSparseMatrix::Solve_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int  it_)
{
  SuiteSparse_long n = mxGetM(A_m);

  SuiteSparse_long *Ap = (SuiteSparse_long *) mxGetJc(A_m);

  SuiteSparse_long *Ai = (SuiteSparse_long *) mxGetIr(A_m);
  double *Ax = mxGetPr(A_m);
  double *B  = mxGetPr(b_m);
  SuiteSparse_long status, sys = 0;
#ifndef _MSC_VER
  double Control [UMFPACK_CONTROL], Info [UMFPACK_INFO], res [n];
#else
  double *Control, *Info, *res;
  Control = (double *) mxMalloc(UMFPACK_CONTROL * sizeof(double));
  test_mxMalloc(Control, __LINE__, __FILE__, __func__, UMFPACK_CONTROL * sizeof(double));
  Info = (double *) mxMalloc(UMFPACK_INFO * sizeof(double));
  test_mxMalloc(Info, __LINE__, __FILE__, __func__, UMFPACK_INFO * sizeof(double));
  res = (double *) mxMalloc(n * sizeof(double));
  test_mxMalloc(res, __LINE__, __FILE__, __func__, n * sizeof(double));
#endif
  // Start from null handles so that they can always be passed safely to the
  // umfpack_dl_free_* routines, whatever the outcome of the steps below
  void *Symbolic = NULL, *Numeric = NULL;
  umfpack_dl_defaults(Control);

  status = umfpack_dl_symbolic(n, n, Ap, Ai, Ax, &Symbolic, Control, Info);
  if (status != UMFPACK_OK)
    umfpack_dl_report_info((double *) NULL, Info);

  status = umfpack_dl_numeric(Ap, Ai, Ax, Symbolic, &Numeric, Control, Info);
  if (status != UMFPACK_OK)
    umfpack_dl_report_info((double *) NULL, Info);

  status = umfpack_dl_solve(sys, Ap, Ai, Ax, res, B, Numeric, Control, Info);
  if (status != UMFPACK_OK)
    umfpack_dl_report_info((double *) NULL, Info);

  // The factorization is not needed once the system is solved
  umfpack_dl_free_symbolic(&Symbolic);
  umfpack_dl_free_numeric(&Numeric);

  if (is_two_boundaries)
    for (int i = 0; i < n; i++)
      {
        int eq = index_vara[i+Size*y_kmin];
        double yy = -(res[i] + y[eq]);
        direction[eq] = yy;
        y[eq] += slowc_l * yy;
      }
  else
    for (int i = 0; i < n; i++)
      {
        int eq = index_vara[i];
        double yy = -(res[i] + y[eq+it_*y_size]);
        direction[eq] = yy;
        y[eq+it_*y_size] += slowc_l * yy;
      }
  mxDestroyArray(A_m);
  mxDestroyArray(b_m);
#ifdef _MSC_VER
  mxFree(Control);
  mxFree(Info);
  mxFree(res);
#endif

}

#ifdef CUDA
/* Debugging helper: converts the n x n sparse matrix (Ax, Ap, Ai) described by
   descrA into a dense matrix with cusparseDcsr2dense, copies it into a
   temporary host buffer and prints the matrix descriptor settings followed by
   the dense matrix, one row per line. Both temporary buffers are released
   before returning. */
void
printM(int n, double *Ax, int *Ap, int *Ai,  cusparseMatDescr_t descrA, cusparseHandle_t cusparse_handle)
{
  // Dense copy allocated with cudaMalloc
  double *dense_dev;
  cudaChk(cudaMalloc((void **) &dense_dev, n * n *sizeof(double)), "A_dense cudaMalloc has failed\n");
  cusparseChk(cusparseDcsr2dense(cusparse_handle, n, n, descrA,
                                 Ax, Ap, Ai, dense_dev, n), "cusparseDcsr2dense has failed\n");

  // Host-side copy used for printing
  double *dense_host = (double *) mxMalloc(n * n * sizeof(double));
  test_mxMalloc(dense_host, __LINE__, __FILE__, __func__, n * n * sizeof(double));
  cudaChk(cudaMemcpy(dense_host, dense_dev, n * n * sizeof(double), cudaMemcpyDeviceToHost), " cudaMemcpy(A_dense_hoste, A_dense) has failed\n");

  mexPrintf("----------------------\n");
  mexPrintf("FillMode=%d, IndexBase=%d, MatType=%d, DiagType=%d\n", cusparseGetMatFillMode(descrA), cusparseGetMatIndexBase(descrA), cusparseGetMatType(descrA), cusparseGetMatDiagType(descrA));
  // The dense matrix has a leading dimension of n: element (row, col) is at row + col * n
  for (int row = 0; row < n; row++)
    {
      for (int col = 0; col < n; col++)
        mexPrintf("%-6.3f ", dense_host[row + col * n]);
      mexPrintf("\n");
    }

  mxFree(dense_host);
  cudaChk(cudaFree(dense_dev), "cudaFree(A_dense) has failed\n");
}

/* Releases the resources handed over by the caller:
     - tmp_vect_host is released with mxFree;
     - every other buffer (p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b,
       A_tild, A_tild_i, A_tild_p) is released with cudaFree;
     - the solve-analysis objects infoL and infoU are destroyed only when
       preconditioner is 1, 2 or 3;
     - the matrix descriptors descrL and descrU are always destroyed.
   Each CUDA/cuSPARSE call is wrapped in cudaChk/cusparseChk together with a
   message identifying the resource. */
void
dynSparseMatrix::Solve_CUDA_BiCGStab_Free(double *tmp_vect_host, double *p, double *r, double *v, double *s, double *t, double *y_, double *z, double *tmp_,
                                          int *Ai, double *Ax, int *Ap, double *x0, double *b, double *A_tild, int *A_tild_i, int *A_tild_p/*, double* Lx, int* Li, int* Lp,
                                                                                                                                             double* Ux, int* Ui, int* Up, int* device_n*/, cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU,
                                          cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner)
{
  //cudaError_t cuda_error;
  //cusparseStatus_t cusparse_status;
  mxFree(tmp_vect_host);
  cudaChk(cudaFree(p), "  in Solve_Cuda_BiCGStab, can't free p\n");
  cudaChk(cudaFree(r), "  in Solve_Cuda_BiCGStab, can't free r\n");
  cudaChk(cudaFree(v), "  in Solve_Cuda_BiCGStab, can't free v\n");
  cudaChk(cudaFree(s), "  in Solve_Cuda_BiCGStab, can't free s\n");
  cudaChk(cudaFree(t), "  in Solve_Cuda_BiCGStab, can't free t\n");
  cudaChk(cudaFree(y_), "  in Solve_Cuda_BiCGStab, can't free y_\n");
  cudaChk(cudaFree(z), "  in Solve_Cuda_BiCGStab, can't free z\n");
  cudaChk(cudaFree(tmp_), "  in Solve_Cuda_BiCGStab, can't free tmp_\n");
  cudaChk(cudaFree(Ai), "  in Solve_Cuda_BiCGStab, can't free Ai\n");
  cudaChk(cudaFree(Ax), "  in Solve_Cuda_BiCGStab, can't free Ax\n");
  cudaChk(cudaFree(Ap), "  in Solve_Cuda_BiCGStab, can't free Ap\n");
  cudaChk(cudaFree(x0), "  in Solve_Cuda_BiCGStab, can't free x0\n");
  cudaChk(cudaFree(b), "  in Solve_Cuda_BiCGStab, can't free b\n");
  // The A_tild buffers are released whatever the preconditioner (the
  // preconditioner-dependent variant is kept below, commented out)
  /*if (preconditioner == 0)
    {*/
  cudaChk(cudaFree(A_tild), "  in Solve_Cuda_BiCGStab, can't free A_tild (1)\n");
  cudaChk(cudaFree(A_tild_i), "  in Solve_Cuda_BiCGStab, can't free A_tild_i (1)\n");
  cudaChk(cudaFree(A_tild_p), "  in Solve_Cuda_BiCGStab, can't free A_tild_p (1)\n");
  /*}
    else
    {
    cudaChk(cudaFree(Lx), "  in Solve_Cuda_BiCGStab, can't free Lx\n");
    cudaChk(cudaFree(Li), "  in Solve_Cuda_BiCGStab, can't free Li\n");
    cudaChk(cudaFree(Lp), "  in Solve_Cuda_BiCGStab, can't free Lp\n");
    cudaChk(cudaFree(Ux), "  in Solve_Cuda_BiCGStab, can't free Ux\n");
    cudaChk(cudaFree(Ui), "  in Solve_Cuda_BiCGStab, can't free Ui\n");
    cudaChk(cudaFree(Up), "  in Solve_Cuda_BiCGStab, can't free Up\n");
    }*/
  //cudaChk(cudaFree(device_n), "  in Solve_Cuda_BiCGStab, can't free device_n\n");
  // The solve-analysis objects are only destroyed for preconditioners 1, 2 and 3
  if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3)
    {
      cusparseChk(cusparseDestroySolveAnalysisInfo(infoL),
                  "  in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoL\n");
      cusparseChk(cusparseDestroySolveAnalysisInfo(infoU),
                  "  in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoU\n");
    }
  cusparseChk(cusparseDestroyMatDescr(descrL),
              " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrL\n");
  cusparseChk(cusparseDestroyMatDescr(descrU),
              " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrU\n");
}
#endif

/* Triangular solve on a matrix stored row by row: the entries of row i are
   Ax[Ap[i]] ... Ax[Ap[i+1]-1] and Ai gives their column indices.

   - Lower == true: forward substitution using only the entries strictly below
     the diagonal (the diagonal is taken to be 1): x[i] = b[i] - sum.
   - Lower == false: backward substitution using the entries strictly above the
     diagonal and dividing by the diagonal entry of the row (1 if the row stores
     no diagonal entry): x[i] = (b[i] - sum) / diagonal.
   The solution is written into x. */
void
Solve(double *Ax, int *Ap, int *Ai, double *b, int n, bool Lower, double *x)
{
  if (!Lower)
    {
      // Backward substitution, from the last row up to the first one
      for (int row = n-1; row >= 0; row--)
        {
          double acc = 0, diag = 1;
          for (int pos = Ap[row]; pos < Ap[row+1]; pos++)
            {
              const int col = Ai[pos];
              if (col == row)
                diag = Ax[pos];
              else if (col > row)
                acc += x[col] * Ax[pos];
            }
          x[row] = (b[row] - acc) / diag;
        }
      return;
    }

  // Forward substitution, from the first row down to the last one
  for (int row = 0; row < n; row++)
    {
      double acc = 0;
      for (int pos = Ap[row]; pos < Ap[row+1]; pos++)
        {
          const int col = Ai[pos];
          if (col < row)
            acc += x[col] * Ax[pos];
        }
      x[row] = b[row] - acc;
    }
}

/* Debugging helper: checks a triangular solve performed on a matrix stored row
   by row (entries of row i at Ax[Ap[i]] ... Ax[Ap[i+1]-1], column indices in Ai)
   and prints "error at i=<row>" for each row whose residual exceeds 1e-10 in
   absolute value.

   - Lower == true: the residual is b[i] - sum - x[i], the sum running over the
     entries strictly below the diagonal.
   - Lower == false: the residual is b[i] - sum, the sum running over the
     diagonal entry and the entries above it.

   The residual is compared through fabs(): an unqualified abs() on a double
   depends on the overloads in scope and may resolve to the integer version,
   which would truncate every small residual to zero. */
void
Check(int n, double *Ax, int *Ap, int *Ai, double *b, double *x, bool Lower)
{
  const double tolerance = 1e-10;
  if (Lower)
    {
      for (int i = 0; i < n; i++)
        {
          double sum = 0;
          for (int j = Ap[i]; j < Ap[i+1]; j++)
            {
              int k = Ai[j];
              if (k < i)
                sum += x[k] * Ax[j];
            }
          double err =  b[i] - sum - x[i];
          if (fabs(err) > tolerance)
            mexPrintf("error at i=%d\n", i);
        }
    }
  else
    {
      for (int i = n-1; i >= 0; i--)
        {
          double sum = 0;
          for (int j = Ap[i]; j < Ap[i+1]; j++)
            {
              int k = Ai[j];
              if (k >= i)
                sum += x[k] * Ax[j];
            }
          double err =  b[i] - sum;
          if (fabs(err) > tolerance)
            mexPrintf("error at i=%d\n", i);
        }
    }
}

#ifdef CUDA
int
dynSparseMatrix::Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries,
                                     int  it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block)
{
  cusparseSolveAnalysisInfo_t info, infoL, infoU;
  cusparseMatDescr_t descrL, descrU;
  const double tol = 1.0e-6;//1.0e-6;
  const double eps = 1.0e-16;
  double *p, *r, *r0, *v, *s, *t, *y_, *z, *tmp_;
  int *A_tild_i, *A_tild_p;
  double *Qx;
  int *Qi, *Qj;
  double *Px;
  int *Pi, *Pj;
  int Q_nnz, P_nnz;
  int W_nnz;
  double bnorm;
  double tmp1, tmp2;
  int refinement_needed = 0, stagnation = 0;
  int max_refinement = min(min(int (floor(double (n)/50)), 10), n-max_iterations), max_stagnation = 3;
  int nblocks = ceil(double (n) / double (1024));
  int n_threads;
  if (nblocks == 0)
    n_threads = n;
  else
    n_threads = 1024;
  int periods = n / Size;

  double *tmp_vect_host = (double *) mxMalloc(n * sizeof(double));
  test_mxMalloc(tmp_vect_host, __LINE__, __FILE__, __func__, n * sizeof(double));

  cublasChk(cublasDnrm2(cublas_handle, n, b, 1, &bnorm),
            "  in Solve_Cuda_BiCGStab, cublasDnrm2(b) has failed\n");

  double tolb = tol * bnorm;

  if (bnorm == 0.0)
    {
      // if b = 0 the A.x = 0 => x = 0
      cudaChk(cudaFree(Ai), "  in Solve_Cuda_BiCGStab, can't free Ai\n");
      cudaChk(cudaFree(Ax), "  in Solve_Cuda_BiCGStab, can't free Ax\n");
      cudaChk(cudaFree(Ap), "  in Solve_Cuda_BiCGStab, can't free Ap\n");
      if (preconditioner == 3)
        {
          cudaChk(cudaFree(Ai_tild), "  in Solve_Cuda_BiCGStab, can't free Ai_tild\n");
          cudaChk(cudaFree(Ap_tild), "  in Solve_Cuda_BiCGStab, can't free Ap_tild\n");
        }
      cudaChk(cudaFree(A_tild), "  in Solve_Cuda_BiCGStab, can't free A_tild\n");
      cudaChk(cudaFree(x0), "  in Solve_Cuda_BiCGStab, can't free x0\n");
      cudaChk(cudaFree(b), "  in Solve_Cuda_BiCGStab, can't free b\n");
      if (is_two_boundaries)
        for (int i = 0; i < n; i++)
          {
            int eq = index_vara[i+Size*y_kmin];
            double yy = -y[eq];
            direction[eq] = yy;
            y[eq] += slowc * yy;
          }
      else
        for (int i = 0; i < n; i++)
          {
            int eq = index_vara[i];
            double yy = -y[eq+it_*y_size];
            direction[eq] = yy;
            y[eq+it_*y_size] += slowc * yy;
          }
      return 0;
    }

  int iteration = 0;
  bool convergence = false;
  double zeros = 0.0, one = 1.0, m_one = -1.0;

  cudaChk(cudaMalloc((void **) &tmp_, n * sizeof(double)), "  in Solve_Cuda_Sparse, can't allocate tmp_ on the graphic card\n");

  cudaChk(cudaMalloc((void **) &r, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate r on the graphic card\n");

  cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n");

  //r = b - A * x0
  cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n,
                             n, nnz, &m_one,
                             CUDA_descr, Ax,
                             Ap, Ai,
                             x0, &one,
                             r), "in Solve_Cuda_BiCGStab, cusparseDcsrmv A * x0 has failed");

  cudaChk(cudaMemcpy(tmp_vect_host, r, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n");
  /*mexPrintf("r\n");
    for (int i = 0; i < n; i++)
    mexPrintf("%f\n",tmp_vect_host[i]);*/

  cudaChk(cudaMalloc((void **) &r0, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate r0 on the graphic card\n");
  cudaChk(cudaMemcpy(r0, r, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy r0 = r has failed\n");

  cublasChk(cublasDnrm2(cublas_handle, n, // numerator
                        r, 1,
                        &tmp1),
            "  in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n");
  double conv_criteria = tmp1;

  convergence = conv_criteria < tolb;
  if (convergence)
    {
      /* the initial value (x0) is solution of A x = b*/
      cudaChk(cudaFree(Ai), "  in Solve_Cuda_BiCGStab, can't free Ai\n");
      cudaChk(cudaFree(Ax), "  in Solve_Cuda_BiCGStab, can't free Ax\n");
      cudaChk(cudaFree(Ap), "  in Solve_Cuda_BiCGStab, can't free Ap\n");
      if (preconditioner == 3)
        {
          cudaChk(cudaFree(Ai_tild), "  in Solve_Cuda_BiCGStab, can't free Ai_tild\n");
          cudaChk(cudaFree(Ap_tild), "  in Solve_Cuda_BiCGStab, can't free Ap_tild\n");
        }
      cudaChk(cudaFree(A_tild), "  in Solve_Cuda_BiCGStab, can't free A_tild\n");
      cudaChk(cudaFree(x0), "  in Solve_Cuda_BiCGStab, can't free x0\n");
      cudaChk(cudaFree(b), "  in Solve_Cuda_BiCGStab, can't free b\n");
      return 0;
    }

  if (preconditioner == 0)
    {
      //Apply the Jacobi preconditioner
      /*VecDiv<<<nblocks, n_threads>>>(r_, A_tild, z_, n);
        cuda_error = cudaMemcpy(zz_, z_, n * sizeof(double), cudaMemcpyDeviceToDevice);*/
    }
  else if (preconditioner == 1)
    {
      //Apply an incomplete LU decomposition of A as preconditioner
      cusparseChk(cusparseCreateSolveAnalysisInfo(&info), "  in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo for info has failed\n");

      cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          n, nnz, CUDA_descr,
                                          A_tild, Ap, Ai,
                                          info),
                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis(info) has failed\n");

      cusparseChk(cusparseDcsrilu0(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                   n, CUDA_descr,
                                   A_tild, Ap, Ai,
                                   info),
                  "  in Solve_Cuda_BiCGStab, cusparseDcsrilu0 has failed\n");

      //Make a copy of the indexes in A_tild_i and A_tild_p to use it the Bicgstab algorithm
      cudaChk(cudaMalloc((void **) &A_tild_i, nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild_i on the graphic card\n");
      cudaChk(cudaMemcpy(A_tild_i, Ai, nnz * sizeof(int), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Ai has failed\n");
      cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n");
      cudaChk(cudaMemcpy(A_tild_p, Ap, (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Ap has failed\n");
    }
  else if (preconditioner == 2)
    {
      //Because the Jacobian matrix A is store in CSC format in matlab
      // we have to transpose it to get a CSR format used by CUDA
      mwIndex *Awi, *Awp;
      double *A_tild_host = (double *) mxMalloc(nnz*sizeof(double));
      test_mxMalloc(A_tild_host, __LINE__, __FILE__, __func__, nnz*sizeof(double));
      Awi = (mwIndex *) mxMalloc(nnz * sizeof(mwIndex));
      test_mxMalloc(Awi, __LINE__, __FILE__, __func__, nnz * sizeof(mwIndex));
      Awp = (mwIndex *) mxMalloc((n + 1) * sizeof(mwIndex));
      test_mxMalloc(Awp, __LINE__, __FILE__, __func__, (n + 1) * sizeof(mwIndex));
      int *Aii = (int *) mxMalloc(nnz * sizeof(int));
      test_mxMalloc(Aii, __LINE__, __FILE__, __func__, nnz * sizeof(int));
      int *Aip = (int *) mxMalloc((n + 1) * sizeof(int));
      test_mxMalloc(Aip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int));
      cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n");
      cudaChk(cudaMemcpy(Aii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai has failed\n");
      cudaChk(cudaMemcpy(Aip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ai has failed\n");
      for (int i = 0; i < nnz; i++)
        Awi[i] = Aii[i];
      for (int i = 0; i < n + 1; i++)
        Awp[i] = Aip[i];
      mxFree(Aii);
      mxFree(Aip);
      mxArray *At_m = mxCreateSparse(n, n, nnz, mxREAL);
      mxSetIr(At_m, Awi);
      mxSetJc(At_m, Awp);
      mxSetPr(At_m, A_tild_host);
      mxArray *A_m;
      mexCallMATLAB(1, &A_m, 1, &At_m, "transpose");
      mxDestroyArray(At_m);

      /*mexPrintf("A_m\n");
        mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/
      /*mxFree(Awi);
        mxFree(Awp);*/

      /*[L1, U1] = ilu(g1a=;*/
      const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"};
      const int type = 0;
      const int droptol = 1;
      const int milu = 2;
      const int udiag = 3;
      const int thresh = 4;
      mwSize dims[1] = {(mwSize) 1 };
      mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names);
      mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp"));
      //mxSetFieldByNumber(Setup, 0, type, mxCreateString("nofill"));
      mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol));
      mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off"));
      mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0));
      mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1));
      //mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1));
      mxArray *lhs0[2];
      mxArray *rhs0[2];
      rhs0[0] = A_m;
      rhs0[1] = Setup;
      ostringstream tmp;
      if (mexCallMATLAB(2, lhs0, 2, rhs0, "ilu"))
        {
          tmp << " In BiCGStab, the incomplet LU decomposition (ilu) ahs failed.\n";
          throw FatalExceptionHandling(tmp.str());
        }
      mxDestroyArray(Setup);

      /*     //ILUT preconditionner computed by Matlab (todo: in futur version of cuda replace it by a new equivalent cuda function)
             const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"};
             const int type = 0;
             const int droptol = 1;
             const int milu = 2;
             const int udiag = 3;
             const int thresh = 4;
             mwSize dims[1] = {(mwSize)1 };
             mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names);
             mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp"));
             mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol));
             mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off"));
             mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0));
             mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0));
             mxArray *lhs0[2], *rhs0[2];
             rhs0[0] = A_m;
             rhs0[1] = Setup;
             mexCallMATLAB(1, lhs0, 2, rhs0, "ilu");
      */
      // To store the resultng matrix in a CSR format we have to transpose it
      mxArray *Wt = lhs0[0];
      mwIndex *Wtj = mxGetJc(Wt);
      nnz = Wtj[n];
      mxArray *W;
      mexCallMATLAB(1, &W, 1, &Wt, "transpose");
      mxDestroyArray(Wt);
      double *pW = mxGetPr(W);
      mwIndex *Wi = mxGetIr(W);
      mwIndex *Wp = mxGetJc(W);
      int *Wii = (int *) mxMalloc(nnz * sizeof(int));
      test_mxMalloc(Wii, __LINE__, __FILE__, __func__, nnz * sizeof(int));
      int *Wip = (int *) mxMalloc((n + 1) * sizeof(int));
      test_mxMalloc(Wip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int));
      for (int i = 0; i < nnz; i++)
        Wii[i] = Wi[i];
      for (int i = 0; i < n + 1; i++)
        Wip[i] = Wp[i];

      //mxFree(A_tild_host);

      cudaChk(cudaFree(A_tild), "cudaFree(A_tild) has failed\n");

      cudaChk(cudaMalloc((void **) &A_tild, nnz * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n");
      cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n");
      cudaChk(cudaMalloc((void **) &A_tild_i, nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n");
      cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = A_tild_i_host has failed\n");
      cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n");
      cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n");
      /*mxFree(pW);
        mxFree(Wi);
        mxFree(Wj);*/
      mxDestroyArray(W);
      mxFree(Wii);
      mxFree(Wip);
    }
  else if (preconditioner == 3)
    {
      mwIndex *Aowi, *Aowp;
      double *A_host = (double *) mxMalloc(nnz*sizeof(double));
      test_mxMalloc(A_host, __LINE__, __FILE__, __func__, nnz*sizeof(double));
      Aowi = (mwIndex *) mxMalloc(nnz * sizeof(mwIndex));
      test_mxMalloc(Aowi, __LINE__, __FILE__, __func__, nnz * sizeof(mwIndex));
      Aowp = (mwIndex *) mxMalloc((n + 1) * sizeof(mwIndex));
      test_mxMalloc(Aowp, __LINE__, __FILE__, __func__, (n + 1) * sizeof(mwIndex));
      int *Aoii = (int *) mxMalloc(nnz * sizeof(int));
      test_mxMalloc(Aoii, __LINE__, __FILE__, __func__, nnz * sizeof(int));
      int *Aoip = (int *) mxMalloc((n + 1) * sizeof(int));
      test_mxMalloc(Aoip, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int));
      cudaChk(cudaMemcpy(A_host, Ax, nnz*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n");
      cudaChk(cudaMemcpy(Aoii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n");
      cudaChk(cudaMemcpy(Aoip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n");
      for (int i = 0; i < nnz; i++)
        Aowi[i] = Aoii[i];
      for (int i = 0; i < n + 1; i++)
        Aowp[i] = Aoip[i];
      mxFree(Aoii);
      mxFree(Aoip);
      mxArray *Ao_m = mxCreateSparse(n, n, nnz, mxREAL);
      mxSetIr(Ao_m, Aowi);
      mxSetJc(Ao_m, Aowp);
      mxSetPr(Ao_m, A_host);
      /*mexPrintf("A_m\n");
        mxArray *Aoo;
        mexCallMATLAB(1, &Aoo, 1, &Ao_m, "transpose");
        mexCallMATLAB(0, NULL, 1, &Aoo, "disp_dense");
        mxDestroyArray(Ao_m);
        mxDestroyArray(Aoo);*/

      //Because the Jacobian matrix A is store in CSC format in matlab
      // we have to transpose it to get a CSR format used by CUDA
      mwIndex *Awi, *Awp;
      double *A_tild_host = (double *) mxMalloc(nnz_tild*sizeof(double));
      test_mxMalloc(A_tild_host, __LINE__, __FILE__, __func__, nnz_tild*sizeof(double));
      Awi = (mwIndex *) mxMalloc(nnz_tild * sizeof(mwIndex));
      test_mxMalloc(Awi, __LINE__, __FILE__, __func__, nnz_tild * sizeof(mwIndex));
      Awp = (mwIndex *) mxMalloc((Size + 1) * sizeof(mwIndex));
      test_mxMalloc(Awp, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(mwIndex));
      int *Aii = (int *) mxMalloc(nnz_tild * sizeof(int));
      test_mxMalloc(Aii, __LINE__, __FILE__, __func__, nnz_tild * sizeof(int));
      int *Aip = (int *) mxMalloc((Size + 1) * sizeof(int));
      test_mxMalloc(Aip, __LINE__, __FILE__, __func__, (Size + 1) * sizeof(int));
      cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz_tild*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n");
      cudaChk(cudaMemcpy(Aii, Ai_tild, nnz_tild*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n");
      cudaChk(cudaMemcpy(Aip, Ap_tild, (Size+1)*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n");
      for (int i = 0; i < nnz_tild; i++)
        Awi[i] = Aii[i];
      for (int i = 0; i < Size + 1; i++)
        Awp[i] = Aip[i];
      /*for (int i = 0; i < nnz_tild; i++)
        mexPrintf("%20.17f\n",A_tild_host[i]);*/
      mxFree(Aii);
      mxFree(Aip);
      mxArray *At_m = mxCreateSparse(Size, Size, nnz_tild, mxREAL);
      mxSetIr(At_m, Awi);
      mxSetJc(At_m, Awp);
      mxSetPr(At_m, A_tild_host);
      mxArray *A_m;
      mexCallMATLAB(1, &A_m, 1, &At_m, "transpose");
      /*mexPrintf("A_tild_m\n");
        mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/
      mxDestroyArray(At_m);
      mxArray *P, *Q, *L, *U;
      mxArray *lhs0[4];
      mexCallMATLAB(4, lhs0, 1, &A_m, "lu");

      mxArray *P0, *Q0, *L0, *U0;
      L0 = lhs0[0];
      U0 = lhs0[1];
      P0 = lhs0[2];
      Q0 = lhs0[3];
      mexCallMATLAB(1, &P, 1, &P0, "transpose");
      mexCallMATLAB(1, &Q, 1, &Q0, "transpose");
      mexCallMATLAB(1, &L, 1, &L0, "transpose");
      mexCallMATLAB(1, &U, 1, &U0, "transpose");
      mxDestroyArray(P0);
      mxDestroyArray(Q0);
      mxDestroyArray(L0);
      mxDestroyArray(U0);
      /*L = lhs0[0];
        U = lhs0[1];
        P = lhs0[2];
        Q = lhs0[3];*/

      /*mexPrintf("L\n");
        mexCallMATLAB(0, NULL, 1, &L, "disp_dense");

        mexPrintf("U\n");
        mexCallMATLAB(0, NULL, 1, &U, "disp_dense");

        mexPrintf("P\n");
        mexCallMATLAB(0, NULL, 1, &P, "disp_dense");

        mexPrintf("Q\n");
        mexCallMATLAB(0, NULL, 1, &Q, "disp_dense");*/

      mwIndex *Qiw_host = mxGetIr(Q);
      mwIndex *Qjw_host = mxGetJc(Q);
      double *Qx_host = mxGetPr(Q);
      Q_nnz = Qjw_host[Size];
      mexPrintf("Q_nnz=%d\n", Q_nnz);
      int *Qi_host = (int *) mxMalloc(Q_nnz * periods * sizeof(int));
      test_mxMalloc(Qi_host, __LINE__, __FILE__, __func__, Q_nnz * periods * sizeof(int));
      double *Q_x_host = (double *) mxMalloc(Q_nnz * periods * sizeof(double));
      test_mxMalloc(Q_x_host, __LINE__, __FILE__, __func__, Q_nnz * periods * sizeof(double));
      int *Qj_host = (int *) mxMalloc((n + 1) * sizeof(int));
      test_mxMalloc(Qj_host, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int));
      for (int t = 0; t < periods; t++)
        {
          for (int i = 0; i < Q_nnz; i++)
            {
              Qi_host[i + t * Q_nnz] = Qiw_host[i] + t * Size;
              Q_x_host[i + t * Q_nnz] = Qx_host[i];
            }
          for (int i = 0; i < Size; i++)
            {
              Qj_host[i + t * Size] = Qjw_host[i] + t * Q_nnz;
            }
        }
      Qj_host[periods * Size] = periods * Q_nnz;

      /*mwIndex *Qtiw_host  = (mwIndex*) mxMalloc(Q_nnz * periods * sizeof(mwIndex));
        double *Qt_x_host = (double*)mxMalloc(Q_nnz * periods * sizeof(double));
        mwIndex *Qtjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex));
        mexPrintf("n = %d\n",n);
        for (int i = 0; i < n + 1; i++)
        Qtjw_host[i] = Qj_host[i];
        for (int i = 0; i < Q_nnz * periods; i++)
        {
        Qtiw_host[i] = Qi_host[i];
        Qt_x_host[i] = Q_x_host[i];
        }
        mxArray* Qt_m = mxCreateSparse(n,n,Q_nnz * periods,mxREAL);
        mxSetIr(Qt_m, Qtiw_host);
        mxSetJc(Qt_m, Qtjw_host);
        mxSetPr(Qt_m, Qt_x_host);
        mexPrintf("Qt_m\n");
        mexCallMATLAB(0, NULL, 1, &Qt_m, "disp_dense");*/

      /*mexPrintf("Qtjw_host[periods * Size=%d]=%d\n", periods * Size, Qtjw_host[periods * Size]);
        for (int i = 0; i < n; i++)
        for (int j = Qtjw_host[i]; j < Qtjw_host[i+1]; j++)
        mexPrintf("(i=%d, j=%d) = %f\n", i, Qtiw_host[j], Qt_x_host[j]);*/
      //mxDestroyArray(Qt_m);

      cudaChk(cudaMalloc((void **) &Qx, Q_nnz * periods * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate Qx on the graphic card\n");
      cudaChk(cudaMemcpy(Qx, Q_x_host, Q_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Qx = Qx_host has failed\n");
      cudaChk(cudaMalloc((void **) &Qi, Q_nnz * periods * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Qi on the graphic card\n");
      cudaChk(cudaMemcpy(Qi, Qi_host, Q_nnz * periods * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Qi = Qi_host has failed\n");
      cudaChk(cudaMalloc((void **) &Qj, (Size * periods + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Qj on the graphic card\n");
      cudaChk(cudaMemcpy(Qj, Qj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Qj = Qj_host has failed\n");
      mxFree(Qi_host);
      mxFree(Qj_host);
      mxFree(Q_x_host);
      mxDestroyArray(Q);

      mwIndex *Piw_host = mxGetIr(P);
      mwIndex *Pjw_host = mxGetJc(P);
      double *Px_host = mxGetPr(P);
      P_nnz = Pjw_host[Size];
      int *Pi_host = (int *) mxMalloc(P_nnz * periods * sizeof(int));
      test_mxMalloc(Pi_host, __LINE__, __FILE__, __func__, P_nnz * periods * sizeof(int));
      double *P_x_host = (double *) mxMalloc(P_nnz * periods * sizeof(double));
      test_mxMalloc(P_x_host, __LINE__, __FILE__, __func__, P_nnz * periods * sizeof(double));
      int *Pj_host = (int *) mxMalloc((n + 1) * sizeof(int));
      test_mxMalloc(Pj_host, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int));
      for (int t = 0; t < periods; t++)
        {
          for (int i = 0; i < P_nnz; i++)
            {
              Pi_host[i + t * P_nnz] = Piw_host[i] + t * Size;
              P_x_host[i + t * P_nnz] = Px_host[i];
            }
          for (int i = 0; i < Size; i++)
            Pj_host[i + t * Size] = Pjw_host[i] + t * P_nnz;
        }
      Pj_host[periods * Size] = periods * P_nnz;

      /*mwIndex *Ptiw_host  = (mwIndex*) mxMalloc(P_nnz * periods * sizeof(mwIndex));
        double *Pt_x_host = (double*)mxMalloc(P_nnz * periods * sizeof(double));
        mwIndex *Ptjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex));
        for (int i = 0; i < n + 1; i++)
        Ptjw_host[i] = Pj_host[i];
        for (int i = 0; i < P_nnz * periods; i++)
        {
        Ptiw_host[i] = Pi_host[i];
        Pt_x_host[i] = P_x_host[i];
        }
        mxArray* Pt_m = mxCreateSparse(n,n,P_nnz * periods,mxREAL);
        mxSetIr(Pt_m, Ptiw_host);
        mxSetJc(Pt_m, Ptjw_host);
        mxSetPr(Pt_m, Pt_x_host);
        mexPrintf("Pt_m\n");
        mexCallMATLAB(0, NULL, 1, &Pt_m, "disp_dense");
        mxDestroyArray(Pt_m);*/

      cudaChk(cudaMalloc((void **) &Px, P_nnz * periods * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n");
      cudaChk(cudaMemcpy(Px, P_x_host, P_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n");
      cudaChk(cudaMalloc((void **) &Pi, P_nnz * periods * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n");
      cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * periods * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n");
      cudaChk(cudaMalloc((void **) &Pj, (Size * periods + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n");
      cudaChk(cudaMemcpy(Pj, Pj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n");
      mxFree(Pi_host);
      mxFree(Pj_host);
      mxFree(P_x_host);
      mxDestroyArray(P);

      /*mwIndex* Piw_host = mxGetIr(P);
        mwIndex* Pjw_host = mxGetJc(P);
        double*  Px_host = mxGetPr(P);
        P_nnz = Pjw_host[Size];
        int *Pi_host = (int*)mxMalloc(P_nnz * sizeof(int));
        int *Pj_host = (int*)mxMalloc((Size + 1) * sizeof(int));
        for (int i = 0; i < P_nnz; i++)
        Pi_host[i] = Piw_host[i];
        for (int i = 0; i < Size + 1; i++)
        Pj_host[i] = Pjw_host[i];

        cudaChk(cudaMalloc((void**)&Px, P_nnz * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n");
        cudaChk(cudaMemcpy(Px, Px_host, P_nnz * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n");
        cudaChk(cudaMalloc((void**)&Pi, P_nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n");
        cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n");
        cudaChk(cudaMalloc((void**)&Pj, (Size + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n");
        cudaChk(cudaMemcpy(Pj, Pj_host, (Size + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n");
        mxFree(Pi_host);
        mxFree(Pj_host);
        mxDestroyArray(P);*/

      /*mexPrintf("L\n");
        mexCallMATLAB(0, NULL, 1, &L, "disp_dense");

        mexPrintf("U\n");
        mexCallMATLAB(0, NULL, 1, &U, "disp_dense");*/

      mwIndex *Liw_host = mxGetIr(L);
      mwIndex *Ljw_host = mxGetJc(L);
      double *Lx_host = mxGetPr(L);
      int L_nnz = Ljw_host[Size];

      mwIndex *Uiw_host = mxGetIr(U);
      mwIndex *Ujw_host = mxGetJc(U);
      double *Ux_host = mxGetPr(U);
      int U_nnz = Ujw_host[Size];

      double *pW = (double *) mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(double));
      test_mxMalloc(pW, __LINE__, __FILE__, __func__, (L_nnz + U_nnz - Size) * periods * sizeof(double));
      int *Wi = (int *) mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(int));
      test_mxMalloc(Wi, __LINE__, __FILE__, __func__, (L_nnz + U_nnz - Size) * periods * sizeof(int));
      int *Wj = (int *) mxMalloc((n + 1) * sizeof(int));
      test_mxMalloc(Wj, __LINE__, __FILE__, __func__, (n + 1) * sizeof(int));
      Wj[0] = 0;
      W_nnz = 0;
      for (int t = 0; t < periods; t++)
        for (int i = 0; i < Size; i++)
          {
            for (mwIndex l  = Ujw_host[i]; l < Ujw_host[i+1]; l++)
              {
                Wi[W_nnz] = Uiw_host[l] + t * Size;
                pW[W_nnz] = Ux_host[l];
                //mexPrintf("Wj[%d] = %d, Wi[%d] = Uiw_host[%d] + t * Size = %d, pW[%d]=%f\n", i + t * Size, Wj[i + t * Size], W_nnz, l, Uiw_host[l] + t * Size, W_nnz, Ux_host[l]);
                W_nnz++;
              }
            for (mwIndex l  = Ljw_host[i]; l < Ljw_host[i+1]; l++)
              {
                if (Liw_host[l] > i)
                  {
                    Wi[W_nnz] = Liw_host[l] + t * Size;
                    pW[W_nnz] = Lx_host[l];
                    //mexPrintf("Wj[%d] = %d, Wi[%d] = Liw_host[%d] + t * Size = %d, pW[%d]=%f\n", i  + t * Size, Wj[i + t * Size], W_nnz, l, Liw_host[l] + t * Size, W_nnz, Lx_host[l]);
                    W_nnz++;
                  }
              }
            Wj[i + 1 + t * Size] = W_nnz;
          }
      //mexPrintf("Wj[%d] = %d, n=%d\n", periods * Size, Wj[periods * Size], n);
      cudaChk(cudaMalloc((void **) &A_tild, W_nnz * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n");
      cudaChk(cudaMemcpy(A_tild, pW, W_nnz * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n");
      cudaChk(cudaMalloc((void **) &A_tild_i, W_nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n");
      cudaChk(cudaMemcpy(A_tild_i, Wi, W_nnz * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Wi has failed\n");
      cudaChk(cudaMalloc((void **) &A_tild_p, (n + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n");
      cudaChk(cudaMemcpy(A_tild_p, Wj, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Wj has failed\n");

      /*mwIndex *Wwi = (mwIndex*)mxMalloc(W_nnz * sizeof(mwIndex));
        mwIndex *Wwj = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex));
        for (int i = 0; i < W_nnz; i++)
        Wwi[i] = Wi[i];
        for (int i = 0; i < n + 1; i++)
        Wwj[i] = Wj[i];
        mxFree(Wi);
        mxFree(Wj);
        mxArray* Ao_tild = mxCreateSparse(n,n,W_nnz,mxREAL);
        mxSetIr(Ao_tild, Wwi);
        mxSetJc(Ao_tild, Wwj);
        mxSetPr(Ao_tild, pW);
        mexPrintf("Ao_tild\n");
        mexCallMATLAB(0, NULL, 1, &Ao_tild, "disp_dense");
        mxDestroyArray(Ao_tild);*/

      /*ostringstream tmp;
        tmp << "debugging";
        mexWarnMsgTxt(tmp.str().c_str());
        return 4;*/

      /* Apply the permutation matrices (P and Q) to the b vector of system to solve :
         b_tild = P-1 . b  = P' . b */
      /*cudaChk(cudaMalloc((void**)&b_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n");
        cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE,
        n, n, nnz, &one, CUDA_descr,
        Px, Pj, Pi,
        b, &zeros,
        b_tild),
        "  in Solve_Cuda_BiCGStab, b_tild = cusparseDcsrmv(P', b) has failed\n");

        cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE,
        n, n, nnz, &one, CUDA_descr,
        Px, Pj, Pi,
        b, &zeros,
        b),
        "  in Solve_Cuda_BiCGStab, b = cusparseDcsrmv(P', b) has failed\n");
      */
      /*mexPrintf("Wt = lu(A_m)\n");
        mexCallMATLAB(0, NULL, 1, &Wt, "disp_dense");*/
      /*ostringstream tmp;
        tmp << "debugging";
        mexWarnMsgTxt(tmp.str().c_str());
        return 4;*/
      // To store the resultng matrix in a CSR format we have to transpose it
      /*mwIndex* Wtj = mxGetJc(Wt);
        nnz = Wtj[n];
        mxArray* W;
        mexCallMATLAB(1, &W, 1, &Wt, "transpose");
        mxDestroyArray(Wt);
        pW = mxGetPr(W);
        Wwi = mxGetIr(W);
        mwIndex* Wp = mxGetJc(W);
        int *Wii = (int*)mxMalloc(nnz * sizeof(int));
        int *Wip = (int*)mxMalloc((n + 1) * sizeof(int));
        for (int i = 0; i < nnz; i++)
        Wii[i] = Wi[i];
        for (int i = 0; i < n + 1; i++)
        Wip[i] = Wp[i];

        //mxFree(A_tild_host);

        cudaChk(cudaFree(Ai_tild), "  in Solve_Cuda_BiCGStab, cudaFree(Ai_tild) has failed\n");
        cudaChk(cudaFree(Ap_tild), "  in Solve_Cuda_BiCGStab, cudaFree(Ap_tild) has failed\n");
        cudaChk(cudaFree(A_tild), "  in Solve_Cuda_BiCGStab, cudaFree(A_tild) has failed\n");

        cudaChk(cudaMalloc((void**)&A_tild, nnz * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n");
        cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n");
        cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n");
        cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = A_tild_i_host has failed\n");
        cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n");
        cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n");
        mxDestroyArray(W);
        mxFree(Wii);
        mxFree(Wip);*/
    }
  if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3)
    {
      cusparseChk(cusparseCreateMatDescr(&descrL),
                  "  in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrL\n");
      cusparseChk(cusparseSetMatIndexBase(descrL, CUSPARSE_INDEX_BASE_ZERO),
                  "  in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrL\n");
      cusparseChk(cusparseSetMatType(descrL, CUSPARSE_MATRIX_TYPE_GENERAL),
                  "  in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrL\n");
      cusparseChk(cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER),
                  "  in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrL\n");
      cusparseChk(cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT),
                  "  in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrL\n");

      cusparseChk(cusparseCreateMatDescr(&descrU),
                  "  in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrU\n");
      cusparseChk(cusparseSetMatIndexBase(descrU, CUSPARSE_INDEX_BASE_ZERO),
                  "  in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrU\n");
      cusparseChk(cusparseSetMatType(descrU, CUSPARSE_MATRIX_TYPE_GENERAL),
                  "  in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrU\n");
      cusparseChk(cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER),
                  "  in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrU\n");
      cusparseChk(cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT),
                  "  in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrU\n");

      int host_nnz_tild;
      if  (preconditioner == 3)
        host_nnz_tild = W_nnz;
      else
        host_nnz_tild = nnz;

      if (preconditioner == 1)
        cusparseChk(cusparseDestroySolveAnalysisInfo(info),
                    "  in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for info\n");

      cusparseChk(cusparseCreateSolveAnalysisInfo(&infoL),
                  "  in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoL\n");
      cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          n, host_nnz_tild, descrL,
                                          A_tild, A_tild_p, A_tild_i,
                                          infoL),
                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoL has failed\n");

      cusparseChk(cusparseCreateSolveAnalysisInfo(&infoU),
                  "  in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoU\n");
      cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          n, host_nnz_tild, descrU,
                                          A_tild, A_tild_p, A_tild_i,
                                          infoU),
                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoU has failed\n");
    }

  cudaChk(cudaMalloc((void **) &v, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate v on the graphic card\n");
  cudaChk(cudaMalloc((void **) &p, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate p on the graphic card\n");
  //cudaChk(cudaMemset(p, 0, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, cudaMemset p = 0 has failed\n");
  cudaChk(cudaMalloc((void **) &s, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate s on the graphic card\n");
  cudaChk(cudaMalloc((void **) &t, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate t on the graphic card\n");
  cudaChk(cudaMalloc((void **) &y_, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate y_ on the graphic card\n");
  cudaChk(cudaMalloc((void **) &z, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate z on the graphic card\n");

  double rho = 1.0, alpha = 1.0, omega = 1.0;

  //residual = P*B*Q - L*U;
  //norm(Z,1) should be close to 0

  while (iteration < 50 /*max_iterations*/ && !convergence)
    {
      double rho_prev = rho;
      /**store in s previous value of r*/
      cudaChk(cudaMemcpy(s, r, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy s = r has failed\n");

      /**rho = r0 . r*/
      cublasChk(cublasDdot(cublas_handle, n, // numerator
                           r0, 1,
                           r, 1,
                           &rho),
                "  in Solve_Cuda_BiCGStab, rho = cublasDdot(r0, r) has failed\n");

      mexPrintf("rho=%f\n", rho);

      double beta;

      if (iteration == 0)
        {
          cudaChk(cudaMemcpy(p, r, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy p = r has failed\n");
        }
      else
        {
          /**beta = (rho / rho_prev) . (alpha / omega);*/
          beta = rho / rho_prev * alpha / omega;

          /**p = r + beta * (p - omega * v)*/
          // tmp_ = p - omega * v
          VecAdd<<< nblocks, n_threads>>> (tmp_, p, -omega, v, n);
          //p = r + beta * tmp_
          VecAdd<<< nblocks, n_threads>>> (p, r, beta, tmp_, n);
        }

      /**y_ solution of A_tild * y_ = p <=> L . U . y_ = p*/
      //  L tmp_ = p => tmp_ = L^-1 p, with tmp_ = U . y_

      if (preconditioner == 3)
        {
          double *p_tild;

          cudaChk(cudaMemcpy(tmp_vect_host, p, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p has failed\n");
          /*mexPrintf("p\n");
            for (int i = 0; i < n; i++)
            mexPrintf("%f\n",tmp_vect_host[i]);*/

          cudaChk(cudaMalloc((void **) &p_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n");
          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, P_nnz * periods, &one, CUDA_descr,
                                     Px, Pj, Pi,
                                     p, &zeros,
                                     p_tild),
                      "  in Solve_Cuda_BiCGStab, p_tild = cusparseDcsrmv(P', p) has failed\n");

          /*mexPrintf("P\n");
            printM(n, Px, Pj, Pi, CUDA_descr, cusparse_handle);*/

          cudaChk(cudaMemcpy(tmp_vect_host, p_tild, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n");
          /*mexPrintf("p_tild\n");
            for (int i = 0; i < n; i++)
            mexPrintf("%f\n",tmp_vect_host[i]);*/

          cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                           n, &one,
                                           descrL,
                                           A_tild, A_tild_p, A_tild_i,
                                           infoL, p_tild,
                                           tmp_),
                      "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p_tild has failed\n");
          cudaChk(cudaFree(p_tild), "  in Solve_Cuda_BiCGStab, can't free p_tild\n");

          cudaChk(cudaMemcpy(tmp_vect_host, tmp_, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n");
          /*mexPrintf("tmp_\n");
            for (int i = 0; i < n; i++)
            mexPrintf("%f\n",tmp_vect_host[i]);*/
        }
      else
        cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                         n, &one,
                                         descrL,
                                         A_tild, A_tild_p, A_tild_i,
                                         infoL, p,
                                         tmp_),
                    "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p has failed\n");

      //  U . y_ = L^-1 p <=> U . y_ = tmp_ => y_ = U^-1 L^-1 p
      cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       n, &one,
                                       descrU,
                                       A_tild, A_tild_p, A_tild_i,
                                       infoU, tmp_,
                                       y_),
                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . y_ = tmp_ has failed\n");

      /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n");
        mexPrintf("y_\n");
        for (int i = 0; i < n; i++)
        mexPrintf("%f\n",tmp_vect_host[i]);*/

      if (preconditioner == 3)
        {
          double *y_tild;
          cudaChk(cudaMalloc((void **) &y_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n");
          cudaChk(cudaMemcpy(y_tild, y_, n  * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy y_tild = y_ has failed\n");
          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, Q_nnz * periods, &one, CUDA_descr,
                                     Qx, Qj, Qi,
                                     y_tild, &zeros,
                                     y_),
                      "  in Solve_Cuda_BiCGStab, y_ = cusparseDcsrmv(Q', y_tild) has failed\n");
          cudaChk(cudaFree(y_tild), "  in Solve_Cuda_BiCGStab, can't free y_tild\n");
        }
      /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n");
        mexPrintf("y_\n");
        for (int i = 0; i < n; i++)
        mexPrintf("%f\n",tmp_vect_host[i]);*/
      /**v = A*y_*/
      cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 n, n, nnz, &one, CUDA_descr,
                                 Ax, Ap, Ai,
                                 y_, &zeros,
                                 v),
                  "  in Solve_Cuda_BiCGStab, v = cusparseDcsrmv(A, y_) has failed\n");
      cudaChk(cudaMemcpy(tmp_vect_host, v, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n");
      /*mexPrintf("v\n");
        for (int i = 0; i < n; i++)
        mexPrintf("%f\n",tmp_vect_host[i]);*/

      /**alpha = rho / (rr0 . v) with rr0 = r0*/
      cublasChk(cublasDdot(cublas_handle, n, // numerator
                           r0, 1,
                           v, 1,
                           &tmp1),
                "  in Solve_Cuda_BiCGStab, cublasDdot(r0, v) has failed\n");

      alpha = rho / tmp1;
      mexPrintf("rho = %f, tmp1 = %f\n", rho, tmp1);
      mexPrintf("alpha = %f\n", alpha);

      if (alpha == 0 || isinf(alpha) || isnan(alpha))
        {
          Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner);
          ostringstream tmp;
          tmp << "one of the scalar quantities (alpha=" << alpha << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1;
          mexWarnMsgTxt(tmp.str().c_str());
          return 4;
        }

      /** Check for potential stagnation*/
      cublasChk(cublasDnrm2(cublas_handle, n, // numerator
                            y_, 1,
                            &tmp1),
                "  in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n");
      cublasChk(cublasDnrm2(cublas_handle, n, // denominator
                            x0, 1,
                            &tmp2),
                "  in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n");
      mexPrintf("abs(alpha)*tmp1  = %f, alpha = %f, tmp1 = %f, tmp2 = %f, eps = %f\n", abs(alpha)*tmp1, alpha, tmp1, tmp2, eps);
      if (abs(alpha)*tmp1  < eps * tmp2)
        stagnation++;
      else
        stagnation = 0;

      /**x = x + alpha * y_*/
      VecInc<<< nblocks, n_threads>>> (x0, alpha, y_, n);

      /**s = r_prev - alpha *v with r_prev = s*/
      VecInc<<< nblocks, n_threads>>> (s, -alpha, v, n);

      /**Has BiCGStab converged?*/
      cublasChk(cublasDnrm2(cublas_handle, n, // numerator
                            s, 1,
                            &tmp1),
                "  in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n");
      conv_criteria = tmp1;
      mexPrintf("conv_criteria = %f, tolb = %f\n", conv_criteria, tolb);
      convergence = conv_criteria < tolb;

      if (convergence || stagnation >= max_stagnation || refinement_needed)
        {
          /**s = b - A * x0*/
          cudaChk(cudaMemcpy(s, b, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy s = b has failed\n");
          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, nnz, &m_one, CUDA_descr,
                                     Ax, Ap, Ai,
                                     x0, &one,
                                     s),
                      "  in Solve_Cuda_BiCGStab, s = b - cusparseDcsrmv(A, x0) has failed\n");
          cublasChk(cublasDnrm2(cublas_handle, n, // numerator
                                s, 1,
                                &tmp1),
                    "  in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n");
          conv_criteria = tmp1;
          convergence = conv_criteria < tolb;
          if (convergence)
            {
              break;
            }
          else
            {
              if (stagnation >= max_stagnation && refinement_needed == 0)
                stagnation = 0;
              refinement_needed++;
              if (refinement_needed > max_refinement)
                {
                  Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner);
                  ostringstream tmp;
                  tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1;
                  mexWarnMsgTxt(tmp.str().c_str());
                  return 3;
                }
            }
        }

      /**z solution of A_tild * z = s*/
      //  L tmp_ = s => tmp_ = L^-1 s, with tmp_ = U . z
      if (preconditioner == 3)
        {
          double *s_tild;
          cudaChk(cudaMalloc((void **) &s_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n");
          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, P_nnz * periods, &one, CUDA_descr,
                                     Px, Pj, Pi,
                                     s, &zeros,
                                     s_tild),
                      "  in Solve_Cuda_BiCGStab, s_tild = cusparseDcsrmv(P', s) has failed\n");
          cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                           n, &one,
                                           descrL,
                                           A_tild, A_tild_p, A_tild_i,
                                           infoL, s_tild,
                                           tmp_),
                      "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s_tild has failed\n");
          cudaChk(cudaFree(s_tild), "  in Solve_Cuda_BiCGStab, can't free s_tild\n");
        }
      else
        cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                         n, &one,
                                         descrL,
                                         //Lx, Lp, Li,
                                         A_tild, A_tild_p, A_tild_i,
                                         infoL, s,
                                         tmp_),
                    "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s has failed\n");
      //  U . z = L^-1 s <=> U . z = tmp_ => z = U^-1 L^-1 s
      cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       n, &one,
                                       descrU,
                                       //Ux, Up, Ui,
                                       A_tild, A_tild_p, A_tild_i,
                                       infoU, tmp_,
                                       z),
                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . z = tmp_ has failed\n");
      if (preconditioner == 3)
        {
          double *z_tild;
          cudaChk(cudaMalloc((void **) &z_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate z_tild on the graphic card\n");
          cudaChk(cudaMemcpy(z_tild, z, n  * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy z_tild = z has failed\n");
          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, Q_nnz * periods, &one, CUDA_descr,
                                     Qx, Qj, Qi,
                                     z_tild, &zeros,
                                     z),
                      "  in Solve_Cuda_BiCGStab, z = cusparseDcsrmv(Q, z_tild) has failed\n");
          cudaChk(cudaFree(z_tild), "  in Solve_Cuda_BiCGStab, can't free x_tild\n");
        }
      /**t = A * z*/
      cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 n, n, nnz, &one, CUDA_descr,
                                 Ax, Ap, Ai,
                                 z, &zeros,
                                 t),
                  "  in Solve_Cuda_BiCGStab, t = cusparseDcsrmv(A, z) has failed\n");

      /** omega = (t' s) / (t' t)*/
      cublasChk(cublasDdot(cublas_handle, n, // numerator
                           t, 1,
                           s, 1,
                           &tmp1),
                "  in Solve_Cuda_BiCGStab, cublasDdot(t, s) has failed\n");

      cublasChk(cublasDdot(cublas_handle, n, // numerator
                           t, 1,
                           t, 1,
                           &tmp2),
                "  in Solve_Cuda_BiCGStab, cublasDdot(t, t) has failed\n");

      omega = tmp1 / tmp2;

      if (omega == 0 || isinf(omega) || isnan(omega))
        {
          Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner);
          ostringstream tmp;
          mexEvalString("diary off;");
          tmp << "one of the scalar quantities (omega=" << omega << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1;
          mexWarnMsgTxt(tmp.str().c_str());
          return 4;
        }

      /**x = x +  omega * z*/
      VecInc<<< nblocks, n_threads>>> (x0, omega, z, n);

      /**r = s - omega * t*/
      VecAdd<<< nblocks, n_threads>>> (r, s, -omega, t, n);

      /**Has BiCGStab converged?*/
      cublasChk(cublasDnrm2(cublas_handle, n, // numerator
                            r, 1,
                            &tmp1),
                "  in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n");
      conv_criteria = tmp1;

      convergence = conv_criteria < tolb;

      if (convergence || stagnation >= max_stagnation || refinement_needed)
        {
          /**r = b - A * x0*/
          cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n");
          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                     n, n, nnz, &m_one, CUDA_descr,
                                     Ax, Ap, Ai,
                                     x0, &one,
                                     r),
                      "  in Solve_Cuda_BiCGStab, r = b - cusparseDcsrmv(A, x0) has failed\n");
          cublasChk(cublasDnrm2(cublas_handle, n, // numerator
                                r, 1,
                                &tmp1),
                    "  in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n");
          conv_criteria = tmp1;
          convergence = conv_criteria < tolb;
          if (convergence)
            {
              mexPrintf("convergence achieved\n");
              break;
            }
          else
            {
              if (stagnation >= max_stagnation && refinement_needed == 0)
                stagnation = 0;
              refinement_needed++;
              if (refinement_needed > max_refinement)
                {
                  Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, /*Lx, Li, Lp, Ux, Ui, Up, device_n, */ infoL, infoU, descrL, descrU, preconditioner);
                  ostringstream tmp;
                  mexEvalString("diary off;");
                  tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1;
                  mexWarnMsgTxt(tmp.str().c_str());
                  return 3;
                }
            }
        }

      iteration++;
    }
  cudaChk(cudaMemcpy(tmp_vect_host, x0, n * sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = x0 has failed\n");

  if (is_two_boundaries)
    for (int i = 0; i < n; i++)
      {
        int eq = index_vara[i+Size*y_kmin];
        double yy = -(tmp_vect_host[i] + y[eq]);
        direction[eq] = yy;
        y[eq] += slowc * yy;
      }
  else
    for (int i = 0; i < n; i++)
      {
        int eq = index_vara[i];
        double yy = -(tmp_vect_host[i] + y[eq+it_*y_size]);
        direction[eq] = yy;
        y[eq+it_*y_size] += slowc * yy;
      }
  Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner);

  if (iteration >= max_iterations)
    {
      ostringstream tmp;
      mexEvalString("diary off;");
      tmp << "Error in bytecode: No convergence inside BiCGStab, in block " << block+1;
      mexWarnMsgTxt(tmp.str().c_str());
      return 1;
    }
  else
    return 0;
}
#endif

void
dynSparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m)
{
  /* Solves the linear system A_m * z = b_m through MATLAB's gmres, using as
     preconditioner the incomplete LU factorisation returned by
     ilu(A_m, struct('droptol', lu_inc_tol, 'type', 'ilutp')).

     A_m, b_m          : matrix and right-hand side; both are destroyed by this
                         function before it returns normally
     Size              : passed to gmres as its third argument (restart) and
                         used to offset index_vara when is_two_boundaries is set
     slowc             : scaling applied to the step added to y
     block             : block index, reported as block+1 in the warnings
     is_two_boundaries : selects how index_vara and y are indexed for the update
     it_               : period used to index y when is_two_boundaries is false
     x0_m              : initial guess handed to gmres (not destroyed here)

     When gmres returns a strictly positive flag, a warning is issued,
     lu_inc_tol is divided by 10 and y is left untouched. Otherwise y and
     direction are updated from the solution.
     Throws FatalExceptionHandling when the call to ilu or gmres fails (and,
     unconditionally, in Octave builds). */
#ifdef OCTAVE_MEX_FILE
  ostringstream tmp;
  if (steady_state)
    tmp << " GMRES method is not implemented in Octave. You cannot use solve_algo=7, change solve_algo.\n";
  else
    tmp << " GMRES method is not implemented in Octave. You cannot use stack_solve_algo=2, change stack_solve_algo.\n";
  throw FatalExceptionHandling(tmp.str());
#endif
  size_t n = mxGetM(A_m);

  /*[L1, U1] = ilu(A, Setup)*/
  const char *field_names[] = {"droptol", "type"};
  mwSize dims[1] = { 1 };
  mxArray *Setup = mxCreateStructArray(1, dims, 2, field_names);
  mxSetFieldByNumber(Setup, 0, 0, mxCreateDoubleScalar(lu_inc_tol));
  mxSetFieldByNumber(Setup, 0, 1, mxCreateString("ilutp"));
  mxArray *lhs0[2];
  mxArray *rhs0[2];
  rhs0[0] = A_m;
  rhs0[1] = Setup;
  const int ilu_failed = mexCallMATLAB(2, lhs0, 2, rhs0, "ilu");
  // The options structure is only needed for the call itself
  mxDestroyArray(Setup);
  if (ilu_failed)
    throw FatalExceptionHandling("In GMRES, the incomplete LU decomposition (ilu) has failed.");
  mxArray *L1 = lhs0[0];
  mxArray *U1 = lhs0[1];

  /*[za,flag1] = gmres(g1a,b,Blck_size,1e-6,Blck_size*periods,L1,U1);*/
  mxArray *rhs[8];
  rhs[0] = A_m;
  rhs[1] = b_m;
  rhs[2] = mxCreateDoubleScalar(Size);
  rhs[3] = mxCreateDoubleScalar(1e-6);
  rhs[4] = mxCreateDoubleScalar((double) n);
  rhs[5] = L1;
  rhs[6] = U1;
  rhs[7] = x0_m;
  mxArray *lhs[2];
  const int gmres_failed = mexCallMATLAB(2, lhs, 8, rhs, "gmres");
  // Temporaries created above are not needed once gmres has returned
  mxDestroyArray(rhs[2]);
  mxDestroyArray(rhs[3]);
  mxDestroyArray(rhs[4]);
  mxDestroyArray(L1);
  mxDestroyArray(U1);
  // lhs[] is only valid when the call succeeded: never read it otherwise
  if (gmres_failed)
    throw FatalExceptionHandling("In GMRES, the call to gmres has failed.");
  mxArray *z = lhs[0];
  mxArray *flag = lhs[1];
  double *flag1 = mxGetPr(flag);

  if (*flag1 > 0)
    {
      ostringstream tmp;
      if (*flag1 == 1)
        {
          tmp << "Error in bytecode: No convergence inside GMRES, in block " << block+1;
          mexWarnMsgTxt(tmp.str().c_str());
        }
      else if (*flag1 == 2)
        {
          tmp << "Error in bytecode: Preconditioner is ill-conditioned, in block " << block+1;
          mexWarnMsgTxt(tmp.str().c_str());
        }
      else if (*flag1 == 3)
        {
          tmp << "Error in bytecode: GMRES stagnated (Two consecutive iterates were the same.), in block " << block+1;
          mexWarnMsgTxt(tmp.str().c_str());
        }
      // Tighten the drop tolerance of the incomplete LU for later calls
      lu_inc_tol /= 10;
    }
  else
    {
      // yy = -(z + y) is stored in direction, and y moves by slowc * yy
      double *res = mxGetPr(z);
      if (is_two_boundaries)
        for (int i = 0; i < (int) n; i++)
          {
            int eq = index_vara[i+Size*y_kmin];
            double yy = -(res[i] + y[eq]);
            direction[eq] = yy;
            y[eq] += slowc * yy;
          }
      else
        for (int i = 0; i < (int) n; i++)
          {
            int eq = index_vara[i];
            double yy = -(res[i] + y[eq+it_*y_size]);
            direction[eq] = yy;
            y[eq+it_*y_size] += slowc * yy;
          }
    }
  mxDestroyArray(A_m);
  mxDestroyArray(b_m);
  mxDestroyArray(z);
  mxDestroyArray(flag);
}

void
dynSparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int preconditioner)
{
  /* Solves the linear system A_m * z = b_m through MATLAB's bicgstab.

     preconditioner = 0  => Jacobi (sparse diagonal matrix built from spdiags(A, 0))
     preconditioner = 1  => incomplete LU decomposition (ilu, type "ilutp",
                            drop tolerance lu_inc_tol)
     For any other value no solver is called: the status stays at 2 and the
     "Preconditioner is ill-conditioned" warning below is issued.

     A_m, b_m          : matrix and right-hand side; both are destroyed by this
                         function before it returns normally
     Size              : used to offset index_vara when is_two_boundaries is set
     slowc             : scaling applied to the step added to y
     block             : block index, reported as block+1 in the warnings
     is_two_boundaries : selects how index_vara and y are indexed for the update
     it_               : period used to index y when is_two_boundaries is false
     x0_m              : initial guess (not destroyed here); only handed to
                         bicgstab with the ILU preconditioner

     When the final status is strictly positive, a warning is issued,
     lu_inc_tol is divided by 10 and y is left untouched. Otherwise y and
     direction are updated from the solution.
     Throws FatalExceptionHandling when a call to spdiags, ilu, mldivide or
     bicgstab fails. */
  size_t n = mxGetM(A_m);
  mxArray *L1 = NULL;
  mxArray *U1 = NULL;
  mxArray *Diag = NULL;

  if (preconditioner == 0)
    {
      /*tmp = spdiags(A, 0)*/
      mxArray *lhs0[1];
      mxArray *rhs0[2];
      rhs0[0] = A_m;
      rhs0[1] = mxCreateDoubleScalar(0);
      const int spdiags_failed = mexCallMATLAB(1, lhs0, 2, rhs0, "spdiags");
      mxDestroyArray(rhs0[1]);
      if (spdiags_failed)
        {
          ostringstream tmp;
          tmp << " In BiCGStab, the extraction of the diagonal (spdiags) has failed.\n";
          throw FatalExceptionHandling(tmp.str());
        }
      mxArray *tmp = lhs0[0];
      double *tmp_val = mxGetPr(tmp);
      // Store the diagonal as an n x n sparse matrix with one entry per column
      Diag = mxCreateSparse(n, n, n, mxREAL);
      mwIndex *Diag_i = mxGetIr(Diag);
      mwIndex *Diag_j = mxGetJc(Diag);
      double *Diag_val = mxGetPr(Diag);
      for (size_t i = 0; i < n; i++)
        {
          Diag_val[i] = tmp_val[i];
          Diag_j[i] = i;
          Diag_i[i] = i;
        }
      Diag_j[n] = n;
      // The values have been copied into Diag
      mxDestroyArray(tmp);
    }
  else if (preconditioner == 1)
    {
      /*[L1, U1] = ilu(g1a, Setup);*/
      const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"};
      const int type = 0;
      const int droptol = 1;
      const int milu = 2;
      const int udiag = 3;
      const int thresh = 4;
      mwSize dims[1] = {(mwSize) 1 };
      mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names);
      mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp"));
      mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol));
      mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off"));
      mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0));
      mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1));
      mxArray *lhs0[2];
      mxArray *rhs0[2];
      rhs0[0] = A_m;
      rhs0[1] = Setup;
      const int ilu_failed = mexCallMATLAB(2, lhs0, 2, rhs0, "ilu");
      mxDestroyArray(Setup);
      if (ilu_failed)
        {
          ostringstream tmp;
          tmp << " In BiCGStab, the incomplete LU decomposition (ilu) has failed.\n";
          throw FatalExceptionHandling(tmp.str());
        }
      L1 = lhs0[0];
      U1 = lhs0[1];
    }

  /* flags holds the solver status; 2 means that bicgstab still has to be called */
  double flags = 2;
  mxArray *z = NULL;
  /* Octave BiCGStab algorithm involves a 0 division in case of a preconditioner
     equal to the LU decomposition of A matrix: in steady state, first try
     z = x0 + U1 \ (L1 \ (b - A * x0)).
     This requires the factors, hence it is skipped when they were not computed
     (they used to be passed to mldivide as NULL pointers in that case). */
  if (steady_state && L1 != NULL && U1 != NULL)
    {
      // NOTE(review): the array returned by Sparse_transpose is never released
      // here; its ownership is not visible from this function - confirm.
      mxArray *res = mult_SAT_B(Sparse_transpose(A_m), x0_m);
      double *resid = mxGetPr(res);
      double *b = mxGetPr(b_m);
      for (int i = 0; i < (int) n; i++)
        resid[i] = b[i] - resid[i];
      mxArray *rhs[2];
      mxArray *lhs[1];
      rhs[0] = L1;
      rhs[1] = res;
      const int lsolve_failed = mexCallMATLAB(1, lhs, 2, rhs, "mldivide");
      mxDestroyArray(res);
      if (lsolve_failed)
        {
          ostringstream tmp;
          tmp << " In BiCGStab, the resolution of the lower triangular system (mldivide) has failed.\n";
          throw FatalExceptionHandling(tmp.str());
        }
      mxArray *L_sol = lhs[0];
      rhs[0] = U1;
      rhs[1] = L_sol;
      const int usolve_failed = mexCallMATLAB(1, lhs, 2, rhs, "mldivide");
      mxDestroyArray(L_sol);
      if (usolve_failed)
        {
          ostringstream tmp;
          tmp << " In BiCGStab, the resolution of the upper triangular system (mldivide) has failed.\n";
          throw FatalExceptionHandling(tmp.str());
        }
      z = lhs[0];
      double *phat = mxGetPr(z);
      double *x0 = mxGetPr(x0_m);
      for (int i = 0; i < (int) n; i++)
        phat[i] = x0[i] + phat[i];

      /*Check the solution*/
      res = mult_SAT_B(Sparse_transpose(A_m), z);
      resid = mxGetPr(res);
      double cum_abs = 0;
      for (int i = 0; i < (int) n; i++)
        {
          resid[i] = b[i] - resid[i];
          cum_abs += fabs(resid[i]);
        }
      if (cum_abs > 1e-7)
        flags = 2;
      else
        flags = 0;
      mxDestroyArray(res);
    }

  if (flags == 2 && (preconditioner == 0 || preconditioner == 1))
    {
      /*[za,flag1] = bicgstab(g1a,b,1e-6,Blck_size*periods,Diag)         (Jacobi)
        [za,flag1] = bicgstab(g1a,b,1e-6,Blck_size*periods,L1,U1,x0)   (ILU)*/
      mxArray *rhs[7];
      int nrhs;
      rhs[0] = A_m;
      rhs[1] = b_m;
      rhs[2] = mxCreateDoubleScalar(1e-6);
      rhs[3] = mxCreateDoubleScalar((double) n);
      if (preconditioner == 0)
        {
          rhs[4] = Diag;
          nrhs = 5;
        }
      else
        {
          rhs[4] = L1;
          rhs[5] = U1;
          rhs[6] = x0_m;
          nrhs = 7;
        }
      // Drop the candidate produced by the direct solve: bicgstab replaces it
      if (z != NULL)
        {
          mxDestroyArray(z);
          z = NULL;
        }
      mxArray *lhs[2];
      const int bicgstab_failed = mexCallMATLAB(2, lhs, nrhs, rhs, "bicgstab");
      mxDestroyArray(rhs[2]);
      mxDestroyArray(rhs[3]);
      // lhs[] is only valid when the call succeeded: never read it otherwise
      if (bicgstab_failed)
        {
          ostringstream tmp;
          tmp << " In BiCGStab, the call to bicgstab has failed.\n";
          throw FatalExceptionHandling(tmp.str());
        }
      z = lhs[0];
      mxArray *flag = lhs[1];
      double *flag1 = mxGetPr(flag);
      flags = flag1[0];
      mxDestroyArray(flag);
    }

  // The preconditioner is released here whichever path produced the solution
  if (Diag != NULL)
    mxDestroyArray(Diag);
  if (L1 != NULL)
    mxDestroyArray(L1);
  if (U1 != NULL)
    mxDestroyArray(U1);

  if (flags > 0)
    {
      ostringstream tmp;
      if (flags == 1)
        {
          tmp << "Error in bytecode: No convergence inside BiCGStab, in block " << block+1;
          mexWarnMsgTxt(tmp.str().c_str());
        }
      else if (flags == 2)
        {
          tmp << "Error in bytecode: Preconditioner is ill-conditioned, in block " << block+1;
          mexWarnMsgTxt(tmp.str().c_str());
        }
      else if (flags == 3)
        {
          tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1;
          mexWarnMsgTxt(tmp.str().c_str());
        }
      // Tighten the drop tolerance of the incomplete LU for later calls
      lu_inc_tol /= 10;
    }
  else
    {
      // yy = -(z + y) is stored in direction, and y moves by slowc * yy
      double *res = mxGetPr(z);
      if (is_two_boundaries)
        for (int i = 0; i < (int) n; i++)
          {
            int eq = index_vara[i+Size*y_kmin];
            double yy = -(res[i] + y[eq]);
            direction[eq] = yy;
            y[eq] += slowc * yy;
          }
      else
        for (int i = 0; i < (int) n; i++)
          {
            int eq = index_vara[i];
            double yy = -(res[i] + y[eq+it_*y_size]);
            direction[eq] = yy;
            y[eq+it_*y_size] += slowc * yy;
          }
    }
  mxDestroyArray(A_m);
  mxDestroyArray(b_m);
  if (z != NULL)
    mxDestroyArray(z);
}

void
dynSparseMatrix::Singular_display(int block, int Size)
{
  /* Reports which equations of a singular block are linearly dependent, then
     aborts.

     The Size x Size matrix is rebuilt as a dense array from the non-zero
     elements returned by At_Col (values read from u), and its singular value
     decomposition is computed with MATLAB's svd. For each singular value
     smaller than 1e-12 in absolute value, the matching column of U, scaled by
     its entry of largest magnitude, is printed as a linear combination of the
     Dequ_<j>_dy terms; coefficients within 1e-10 of zero are omitted.

     block : block index, reported as block+1 in the error message when block > 1
     Size  : number of equations of the block

     Always throws FatalExceptionHandling. */
  bool zero_solution;
  Simple_Init(Size, IM_i, zero_solution);
  NonZeroElem *first;
  mxArray *rhs[1];
  // mxCreateDoubleMatrix zero-initialises the data, only non-zeros are written
  rhs[0] = mxCreateDoubleMatrix(Size, Size, mxREAL);
  double *pind = mxGetPr(rhs[0]);
  for (int ii = 0; ii < Size; ii++)
    {
      int nb_eq = At_Col(ii, &first);
      for (int j = 0; j < nb_eq; j++)
        {
          int k = first->u_index;
          int jj = first->r_index;
          pind[ii * Size + jj ] = u[k];
          first = first->NZE_C_N;
        }
    }
  /*[SVD_u, SVD_s, SVD_v] = svd(A)*/
  mxArray *lhs[3];
  const int svd_failed = mexCallMATLAB(3, lhs, 1, rhs, "svd");
  mxDestroyArray(rhs[0]);
  // lhs[] is only valid when the call succeeded: never read it otherwise
  if (svd_failed)
    {
      ostringstream tmp;
      tmp << " in Singular_display, the singular value decomposition (svd) has failed\n";
      throw FatalExceptionHandling(tmp.str());
    }
  mxArray *SVD_u = lhs[0];
  mxArray *SVD_s = lhs[1];
  //mxArray* SVD_v = lhs[2];
  double *SVD_ps = mxGetPr(SVD_s);
  double *SVD_pu = mxGetPr(SVD_u);
  for (int i = 0; i < Size; i++)
    {
      // fabs rather than abs: the values are doubles and must not go through
      // the integer overload
      if (fabs(SVD_ps[i * (1 + Size)]) < 1e-12)
        {
          mexPrintf(" The following equations form a linear combination:\n    ");
          // Entry of largest magnitude in column i of U, used for normalisation
          double max_u = 0;
          for (int j = 0; j < Size; j++)
            if (fabs(SVD_pu[j + i * Size]) > fabs(max_u))
              max_u = SVD_pu[j + i * Size];
          vector<int> equ_list;
          for (int j = 0; j < Size; j++)
            {
              double rr = SVD_pu[j + i * Size] / max_u;
              if (rr < -1e-10)
                {
                  equ_list.push_back(j);
                  if (rr != -1)
                    mexPrintf(" - %3.2f*Dequ_%d_dy", fabs(rr), j+1);
                  else
                    mexPrintf(" - Dequ_%d_dy", j+1);
                }
              else if (rr > 1e-10)
                {
                  // The leading "+" is dropped for the first term actually
                  // printed, which is not necessarily equation 1
                  const bool first_term = equ_list.empty();
                  equ_list.push_back(j);
                  if (!first_term)
                    {
                      if (rr != 1)
                        mexPrintf(" + %3.2f*Dequ_%d_dy", rr, j+1);
                      else
                        mexPrintf(" + Dequ_%d_dy", j+1);
                    }
                  else
                    {
                      if (rr != 1)
                        mexPrintf(" %3.2f*Dequ_%d_dy", rr, j+1);
                      else
                        mexPrintf(" Dequ_%d_dy", j+1);
                    }
                }
            }
          mexPrintf(" = 0\n");
          /*mexPrintf(" with:\n");
            it_code = get_begin_block(block);
            for (int j=0; j < Size; j++)
            {
            if (find(equ_list.begin(), equ_list.end(), j) != equ_list.end())
            mexPrintf("  equ_%d: %s\n",j, print_expression(it_code_expr, false, Size, block, steady_state, 0, 0, it_code, true).c_str());
            }*/
        }
    }
  mxDestroyArray(lhs[0]);
  mxDestroyArray(lhs[1]);
  mxDestroyArray(lhs[2]);
  ostringstream tmp;
  if (block > 1)
    tmp << " in Solve_ByteCode_Sparse_GaussianElimination, singular system in block " << block+1 << "\n";
  else
    tmp << " in Solve_ByteCode_Sparse_GaussianElimination, singular system\n";
  throw FatalExceptionHandling(tmp.str());
}

/*
 * Gaussian elimination on the sparse linked structure for one block of
 * Size rows/columns, followed by a call to simple_bksub() and End_GE().
 *
 * For each column i, a pivot row is chosen among the rows not yet flagged in
 * line_done, the pivot row is normalised, and the pivot row is subtracted
 * from every other not-yet-done row that has an entry in column i (inserting
 * fill-in elements where needed).
 *
 * Size: number of rows/columns processed.
 * blck: block index, only used to build the error messages.
 * it_:  period index used to address y and ya (offset it_*y_size).
 *
 * Returns false on the normal path. Returns true when a column has no pivot
 * candidate of magnitude >= eps and steady_state is set; in the same
 * situation with steady_state unset, FatalExceptionHandling is thrown.
 * The five work arrays allocated here are released on all of these paths.
 */
bool
dynSparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, int it_)
{
  bool one;
  int pivj = 0, pivk = 0;
  double *piv_v;
  int *pivj_v, *pivk_v, *NR;
  int l, N_max;
  NonZeroElem *first, *firsta, *first_suba;
  double piv_abs;
  NonZeroElem **bc;
  // Work arrays, Size entries each:
  //   bc     - elements of the current column that still have to be eliminated
  //   piv_v  - value of each pivot candidate
  //   pivj_v - row index of each pivot candidate
  //   pivk_v - index into u of each pivot candidate
  //   NR     - NRow() of each candidate's row
  bc = (NonZeroElem **) mxMalloc(Size*sizeof(*bc));
  test_mxMalloc(bc, __LINE__, __FILE__, __func__, Size*sizeof(*bc));
  piv_v = (double *) mxMalloc(Size*sizeof(double));
  test_mxMalloc(piv_v, __LINE__, __FILE__, __func__, Size*sizeof(double));
  pivj_v = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(pivj_v, __LINE__, __FILE__, __func__, Size*sizeof(int));
  pivk_v = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(pivk_v, __LINE__, __FILE__, __func__, Size*sizeof(int));
  NR = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(NR, __LINE__, __FILE__, __func__, Size*sizeof(int));

  for (int i = 0; i < Size; i++)
    {
      /* Step 1: collect the pivot candidates of column i */
      double piv = piv_abs = 0;
      int nb_eq = At_Col(i, &first);
      l = 0;        // number of candidates collected so far
      N_max = 0;
      one = false;  // becomes true once a candidate row with NRow() == 1 is seen
      piv_abs = 0;
      for (int j = 0; j < nb_eq; j++)
        {
          // Only rows that have not been used as a pivot row yet are eligible.
          if (!line_done[first->r_index])
            {
              int k = first->u_index;
              int jj = first->r_index;
              // NOTE(review): NRow(jj) is presumably the number of non-zero
              // elements in row jj -- confirm against its definition.
              int NRow_jj = NRow(jj);

              piv_v[l] = u[k];
              double piv_fabs = fabs(u[k]);
              pivj_v[l] = jj;
              pivk_v[l] = k;
              NR[l] = NRow_jj;
              // First candidate whose row count is 1: from here on piv_abs and
              // N_max are restarted from this candidate and only candidates
              // with a row count of 1 update them.
              if (NRow_jj == 1 && !one)
                {
                  one = true;
                  piv_abs = piv_fabs;
                  N_max = NRow_jj;
                }
              if (!one)
                {
                  // Track the largest magnitude and the largest row count.
                  if (piv_fabs > piv_abs)
                    piv_abs = piv_fabs;
                  if (NRow_jj > N_max)
                    N_max = NRow_jj;
                }
              else
                {
                  if (NRow_jj == 1)
                    {
                      if (piv_fabs > piv_abs)
                        piv_abs = piv_fabs;
                      if (NRow_jj > N_max)
                        N_max = NRow_jj;
                    }
                }
              l++;
            }
          first = first->NZE_C_N;
        }
      /* Step 2: no candidate of magnitude >= eps => the system is reported singular */
      if (piv_abs < eps)
        {
          // Release the work arrays before leaving through either exit below.
          mxFree(piv_v);
          mxFree(pivj_v);
          mxFree(pivk_v);
          mxFree(NR);
          mxFree(bc);
          if (steady_state)
            {
              // Steady-state mode: report on the console and let the caller
              // handle the failure through the return value.
              if (blck > 1)
                mexPrintf("Error: singular system in Simulate_NG in block %d\n", blck+1);
              else
                mexPrintf("Error: singular system in Simulate_NG\n");
              return true;
            }
          else
            {
              ostringstream tmp;
              if (blck > 1)
                tmp << " in Solve_ByteCode_Sparse_GaussianElimination, singular system in block " << blck+1 << "\n";
              else
                tmp << " in Solve_ByteCode_Sparse_GaussianElimination, singular system\n";
              throw FatalExceptionHandling(tmp.str());
            }
        }
      /* Step 3: choose the pivot among the candidates */
      double markovitz = 0, markovitz_max = -9e70;
      if (!one)
        {
          // No candidate row with a count of 1: keep the candidate with the
          // highest score. With markowitz_c > 0 the score equals
          // (|piv|/piv_abs) / (NR/N_max)^markowitz_c; otherwise it is the
          // relative magnitude |piv|/piv_abs.
          for (int j = 0; j < l; j++)
            {
              if (N_max > 0 && NR[j] > 0)
                {
                  if (fabs(piv_v[j]) > 0)
                    {
                      if (markowitz_c > 0)
                        markovitz = exp(log(fabs(piv_v[j])/piv_abs)-markowitz_c*log(double (NR[j])/double (N_max)));
                      else
                        markovitz = fabs(piv_v[j])/piv_abs;
                    }
                  else
                    markovitz = 0;
                }
              else
                markovitz = fabs(piv_v[j])/piv_abs;
              if (markovitz > markovitz_max)
                {
                  piv = piv_v[j];
                  pivj = pivj_v[j];   // row index of the pivot
                  pivk = pivk_v[j];   // index of the pivot value in u
                  markovitz_max = markovitz;
                }
            }
        }
      else
        {
          // At least one candidate row has a count of 1: the score is still
          // computed but the selection only tests NR[j] == 1, so the last
          // such candidate in the list is the one retained.
          // NOTE(review): that candidate is not necessarily the one whose
          // magnitude was stored in piv_abs and checked against eps above.
          for (int j = 0; j < l; j++)
            {
              if (N_max > 0 && NR[j] > 0)
                {
                  if (fabs(piv_v[j]) > 0)
                    {
                      if (markowitz_c > 0)
                        markovitz = exp(log(fabs(piv_v[j])/piv_abs)-markowitz_c*log(double (NR[j])/double (N_max)));
                      else
                        markovitz = fabs(piv_v[j])/piv_abs;
                    }
                  else
                    markovitz = 0;
                }
              else
                markovitz = fabs(piv_v[j])/piv_abs;
              if (NR[j] == 1)
                {
                  piv = piv_v[j];
                  pivj = pivj_v[j];   // row index of the pivot
                  pivk = pivk_v[j];   // index of the pivot value in u
                  markovitz_max = markovitz;
                }
            }
        }
      // Record the pivot chosen for column i and retire its row.
      pivot[i] = pivj;
      pivotk[i] = pivk;
      pivotv[i] = piv;
      line_done[pivj] = true;

      /* Step 4: divide all the non-zero elements of row pivj by the pivot */
      int nb_var = At_Row(pivj, &first);
      for (int j = 0; j < nb_var; j++)
        {
          u[first->u_index] /= piv;
          first = first->NZE_R_N;
        }
      // Same scaling for the entry of u addressed by b[pivj]
      // (presumably the right-hand side of that row -- confirm).
      u[b[pivj]] /= piv;
      /* Step 5: subtract the pivot row from the rows not treated yet */
      nb_eq = At_Col(i, &first);
      NonZeroElem *first_piva;
      int nb_var_piva = At_Row(pivj, &first_piva);
      // Snapshot the column-i elements of the remaining rows first: the
      // elimination below calls Insert/Delete on the structure while running.
      int nb_eq_todo = 0;
      for (int j = 0; j < nb_eq && first; j++)
        {
          if (!line_done[first->r_index])
            bc[nb_eq_todo++] = first;
          first = first->NZE_C_N;
        }
      //pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
      for (int j = 0; j < nb_eq_todo; j++)
        {
          first = bc[j];
          int row = first->r_index;
          // Multiplier: the value of the row being updated in column i.
          double first_elem = u[first->u_index];

          int nb_var_piv = nb_var_piva;
          NonZeroElem *first_piv = first_piva;
          NonZeroElem *first_sub;
          int nb_var_sub = At_Row(row, &first_sub);
          int l_sub = 0, l_piv = 0;
          int sub_c_index = first_sub->c_index, piv_c_index = first_piv->c_index;
          // Merge-walk of the updated row (first_sub) and the pivot row
          // (first_piv) by column index; an exhausted list gets the sentinel
          // column index Size.
          while (l_sub < nb_var_sub || l_piv < nb_var_piv)
            {
              if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv))
                {
                  // Column present only in the updated row: nothing to
                  // subtract, move to its next element.
                  first_sub = first_sub->NZE_R_N;
                  if (first_sub)
                    sub_c_index = first_sub->c_index;
                  else
                    sub_c_index = Size;
                  l_sub++;
                }
              else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub)
                {
                  // Column present only in the pivot row: create a fill-in
                  // element in the updated row, stored at a fresh index of u.
                  int tmp_u_count = Get_u();
                  Insert(row, first_piv->c_index, tmp_u_count, 0);
                  u[tmp_u_count] = -u[first_piv->u_index]*first_elem;
                  first_piv = first_piv->NZE_R_N;
                  if (first_piv)
                    piv_c_index = first_piv->c_index;
                  else
                    piv_c_index = Size;
                  l_piv++;
                }
              else
                {
                  // Column present in both rows.
                  if (i == sub_c_index)
                    {
                      // Column being eliminated: the element is removed from
                      // the structure instead of being updated. Its row
                      // successor is saved before the call to Delete.
                      // NOTE(review): firsta is bc[j], i.e. the element at
                      // (row, i) that Delete is asked to remove, and it is
                      // read again right after; the value written to first is
                      // overwritten by bc[j] at the next iteration -- check
                      // whether this read is safe/needed.
                      firsta = first;
                      first_suba = first_sub->NZE_R_N;
                      Delete(first_sub->r_index, first_sub->c_index);
                      first = firsta->NZE_C_N;
                      first_sub = first_suba;
                      if (first_sub)
                        sub_c_index = first_sub->c_index;
                      else
                        sub_c_index = Size;
                      l_sub++;
                      first_piv = first_piv->NZE_R_N;
                      if (first_piv)
                        piv_c_index = first_piv->c_index;
                      else
                        piv_c_index = Size;
                      l_piv++;
                    }
                  else
                    {
                      // Regular update: row element -= pivot element * multiplier.
                      u[first_sub->u_index] -= u[first_piv->u_index]*first_elem;
                      first_sub = first_sub->NZE_R_N;
                      if (first_sub)
                        sub_c_index = first_sub->c_index;
                      else
                        sub_c_index = Size;
                      l_sub++;
                      first_piv = first_piv->NZE_R_N;
                      if (first_piv)
                        piv_c_index = first_piv->c_index;
                      else
                        piv_c_index = Size;
                      l_piv++;
                    }
                }
            }
          // Same update for the entries of u addressed through b.
          u[b[row]] -= u[b[pivj]]*first_elem;
        }
    }
  double slowc_lbx = slowc;
  // Keep a copy of the current y values of period it_ in ya.
  for (int i = 0; i < y_size; i++)
    ya[i+it_*y_size] = y[i+it_*y_size];

  slowc_save = slowc;
  // NOTE(review): judging by its name, simple_bksub performs the back
  // substitution on the eliminated system -- confirm against its definition.
  simple_bksub(it_, Size, slowc_lbx);
  End_GE(Size);
  mxFree(piv_v);
  mxFree(pivj_v);
  mxFree(pivk_v);
  mxFree(NR);
  mxFree(bc);
  return false;
}

void
dynSparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool symbolic, int Block_number)
{
  /*Triangularisation at each period of a block using a simple gaussian Elimination*/
  t_save_op_s *save_op_s;
  int *save_op = NULL, *save_opa = NULL, *save_opaa = NULL;
  long int nop = 0, nopa = 0;
  bool record = false;
  double *piv_v;
  double piv_abs;
  int *pivj_v, *pivk_v, *NR;
  int pivj = 0, pivk = 0;
  NonZeroElem *first;
  int tmp_u_count, lag;
  int tbreak = 0, last_period = periods;

  piv_v = (double *) mxMalloc(Size*sizeof(double));
  test_mxMalloc(piv_v, __LINE__, __FILE__, __func__, Size*sizeof(double));
  pivj_v = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(pivj_v, __LINE__, __FILE__, __func__, Size*sizeof(int));
  pivk_v = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(pivk_v, __LINE__, __FILE__, __func__, Size*sizeof(int));
  NR = (int *) mxMalloc(Size*sizeof(int));
  test_mxMalloc(NR, __LINE__, __FILE__, __func__, Size*sizeof(int));
  //clock_t time00 = clock();
  NonZeroElem **bc;
  bc = (NonZeroElem **) mxMalloc(Size*sizeof(first));
  test_mxMalloc(bc, __LINE__, __FILE__, __func__, Size*sizeof(first));

  for (int t = 0; t < periods; t++)
    {
      /*clock_t time11 = clock();
        mexPrintf("t=%d, record = %d\n",t, record);*/
#ifdef MATLAB_MEX_FILE
      if (utIsInterruptPending())
        throw UserExceptionHandling();
#endif

      if (record && symbolic)
        {
          /*if (save_op)
            {
            mxFree(save_op);
            save_op = NULL;
            }*/
          save_op = (int *) mxMalloc(nop*sizeof(int));
          test_mxMalloc(save_op, __LINE__, __FILE__, __func__, nop*sizeof(int));
          nopa = nop;
        }
      nop = 0;
      Clear_u();
      int ti = t*Size;
      for (int i = ti; i < Size+ti; i++)
        {
          /*finding the max-pivot*/
          double piv = piv_abs = 0;
          int nb_eq = At_Col(i, 0, &first);
          if ((symbolic && t <= start_compare) || !symbolic)
            {
              int l = 0, N_max = 0;
              bool one = false;
              piv_abs = 0;
              for (int j = 0; j < nb_eq; j++)
                {
                  if (!line_done[first->r_index])
                    {
                      int k = first->u_index;
                      int jj = first->r_index;
                      int NRow_jj = NRow(jj);
                      piv_v[l] = u[k];
                      double piv_fabs = fabs(u[k]);
                      pivj_v[l] = jj;
                      pivk_v[l] = k;
                      NR[l] = NRow_jj;
                      if (NRow_jj == 1 && !one)
                        {
                          one = true;
                          piv_abs = piv_fabs;
                          N_max = NRow_jj;
                        }
                      if (!one)
                        {
                          if (piv_fabs > piv_abs)
                            piv_abs = piv_fabs;
                          if (NRow_jj > N_max)
                            N_max = NRow_jj;
                        }
                      else
                        {
                          if (NRow_jj == 1)
                            {
                              if (piv_fabs > piv_abs)
                                piv_abs = piv_fabs;
                              if (NRow_jj > N_max)
                                N_max = NRow_jj;
                            }
                        }
                      l++;
                    }
                  first = first->NZE_C_N;
                }
              double markovitz = 0, markovitz_max = -9e70;
              int NR_max = 0;
              if (!one)
                {
                  for (int j = 0; j < l; j++)
                    {
                      if (N_max > 0 && NR[j] > 0)
                        {
                          if (fabs(piv_v[j]) > 0)
                            {
                              if (markowitz_c > 0)
                                markovitz = exp(log(fabs(piv_v[j])/piv_abs)-markowitz_c*log(double (NR[j])/double (N_max)));
                              else
                                markovitz = fabs(piv_v[j])/piv_abs;
                            }
                          else
                            markovitz = 0;
                        }
                      else
                        markovitz = fabs(piv_v[j])/piv_abs;
                      if (markovitz > markovitz_max)
                        {
                          piv = piv_v[j];
                          pivj = pivj_v[j];   //Line number
                          pivk = pivk_v[j];   //positi
                          markovitz_max = markovitz;
                          NR_max = NR[j];
                        }
                    }
                }
              else
                {
                  for (int j = 0; j < l; j++)
                    {
                      if (N_max > 0 && NR[j] > 0)
                        {
                          if (fabs(piv_v[j]) > 0)
                            {
                              if (markowitz_c > 0)
                                markovitz = exp(log(fabs(piv_v[j])/piv_abs)-markowitz_c*log(double (NR[j])/double (N_max)));
                              else
                                markovitz = fabs(piv_v[j])/piv_abs;
                            }
                          else
                            markovitz = 0;
                        }
                      else
                        markovitz = fabs(piv_v[j])/piv_abs;
                      if (NR[j] == 1)
                        {
                          piv = piv_v[j];
                          pivj = pivj_v[j];   //Line number
                          pivk = pivk_v[j];   //positi
                          markovitz_max = markovitz;
                          NR_max = NR[j];
                        }
                    }
                }
              if (fabs(piv) < eps)
                mexPrintf("==> Error NR_max=%d, N_max=%d and piv=%f, piv_abs=%f, markovitz_max=%f\n", NR_max, N_max, piv, piv_abs, markovitz_max);
              if (NR_max == 0)
                mexPrintf("==> Error NR_max=0 and piv=%f, markovitz_max=%f\n", piv, markovitz_max);
              pivot[i] = pivj;
              pivot_save[i] = pivj;
              pivotk[i] = pivk;
              pivotv[i] = piv;
            }
          else
            {
              pivj = pivot[i-Size]+Size;
              pivot[i] = pivj;
              At_Pos(pivj, i, &first);
              pivk = first->u_index;
              piv = u[pivk];
              piv_abs = fabs(piv);
            }
          line_done[pivj] = true;

          if (record && symbolic)
            {
              if (nop+1 >= nopa)
                {
                  nopa = long (mem_increasing_factor*(double) nopa);
                  save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                }
              save_op_s = (t_save_op_s *) (&(save_op[nop]));
              save_op_s->operat = IFLD;
              save_op_s->first = pivk;
              save_op_s->lag = 0;
              nop += 2;
              if (piv_abs < eps)
                {
                  ostringstream tmp;
                  if (Block_number > 1)
                    tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system in block " << Block_number+1 << "\n";
                  else
                    tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system\n";
                  throw FatalExceptionHandling(tmp.str());
                }
              /*divide all the non zeros elements of the line pivj by the max_pivot*/
              int nb_var = At_Row(pivj, &first);
              for (int j = 0; j < nb_var; j++)
                {
                  u[first->u_index] /= piv;
                  if (nop+j*2+1 >= nopa)
                    {
                      nopa = long (mem_increasing_factor*(double) nopa);
                      save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                    }
                  save_op_s = (t_save_op_s *) (&(save_op[nop+j*2]));
                  save_op_s->operat = IFDIV;
                  save_op_s->first = first->u_index;
                  save_op_s->lag = first->lag_index;
                  first = first->NZE_R_N;
                }
              nop += nb_var*2;
              u[b[pivj]] /= piv;
              if (nop+1 >= nopa)
                {
                  nopa = long (mem_increasing_factor*(double) nopa);
                  save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                }
              save_op_s = (t_save_op_s *) (&(save_op[nop]));
              save_op_s->operat = IFDIV;
              save_op_s->first = b[pivj];
              save_op_s->lag = 0;
              nop += 2;
              /*substract the elements on the non treated lines*/
              nb_eq = At_Col(i, &first);
              NonZeroElem *first_piva;
              int nb_var_piva = At_Row(pivj, &first_piva);

              int nb_eq_todo = 0;
              for (int j = 0; j < nb_eq && first; j++)
                {
                  if (!line_done[first->r_index])
                    bc[nb_eq_todo++] = first;
                  first = first->NZE_C_N;
                }
              //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop)
              for (int j = 0; j < nb_eq_todo; j++)
                {
                  t_save_op_s *save_op_s_l;
                  NonZeroElem *first = bc[j];
                  int row = first->r_index;
                  double first_elem = u[first->u_index];
                  if (nop+1 >= nopa)
                    {
                      nopa = long (mem_increasing_factor*(double) nopa);
                      save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                    }
                  save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
                  save_op_s_l->operat = IFLD;
                  save_op_s_l->first = first->u_index;
                  save_op_s_l->lag = abs(first->lag_index);
                  nop += 2;

                  int nb_var_piv = nb_var_piva;
                  NonZeroElem *first_piv = first_piva;
                  NonZeroElem *first_sub;
                  int nb_var_sub = At_Row(row, &first_sub);
                  int l_sub = 0;
                  int l_piv = 0;
                  int sub_c_index = first_sub->c_index;
                  int piv_c_index = first_piv->c_index;
                  int tmp_lag = first_sub->lag_index;
                  while (l_sub < (nb_var_sub /*=NRow(row)*/) || l_piv < nb_var_piv)
                    {
                      if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv))
                        {
                          //There is no nonzero element at row pivot for this column=> Nothing to do for the current element got to next column
                          first_sub = first_sub->NZE_R_N;
                          if (first_sub)
                            sub_c_index = first_sub->c_index;
                          else
                            sub_c_index = Size*periods;
                          l_sub++;
                        }
                      else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub)
                        {
                          // There is an nonzero element at row pivot but not at the current row=> insert a negative element in the current row
                          tmp_u_count = Get_u();
                          lag = first_piv->c_index/Size-row/Size;
                          //#pragma omp critical
                          {
                            Insert(row, first_piv->c_index, tmp_u_count, lag);
                          }
                          u[tmp_u_count] = -u[first_piv->u_index]*first_elem;
                          if (nop+2 >= nopa)
                            {
                              nopa = long (mem_increasing_factor*(double) nopa);
                              save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                            }
                          save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
                          save_op_s_l->operat = IFLESS;
                          save_op_s_l->first = tmp_u_count;
                          save_op_s_l->second = first_piv->u_index;
                          save_op_s_l->lag = max(first_piv->lag_index, abs(tmp_lag));
                          nop += 3;
                          first_piv = first_piv->NZE_R_N;
                          if (first_piv)
                            piv_c_index = first_piv->c_index;
                          else
                            piv_c_index = Size*periods;
                          l_piv++;
                        }
                      else /*first_sub->c_index==first_piv->c_index*/
                        {
                          if (i == sub_c_index)
                            {
                              NonZeroElem *firsta = first;
                              NonZeroElem *first_suba = first_sub->NZE_R_N;
                              //#pragma omp critical
                              {
                                Delete(first_sub->r_index, first_sub->c_index);
                              }
                              first = firsta->NZE_C_N;
                              first_sub = first_suba;
                              if (first_sub)
                                sub_c_index = first_sub->c_index;
                              else
                                sub_c_index = Size*periods;
                              l_sub++;
                              first_piv = first_piv->NZE_R_N;
                              if (first_piv)
                                piv_c_index = first_piv->c_index;
                              else
                                piv_c_index = Size*periods;
                              l_piv++;
                            }
                          else
                            {
                              u[first_sub->u_index] -= u[first_piv->u_index]*first_elem;
                              if (nop+3 >= nopa)
                                {
                                  nopa = long (mem_increasing_factor*(double) nopa);
                                  save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                                }
                              save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
                              save_op_s_l->operat = IFSUB;
                              save_op_s_l->first = first_sub->u_index;
                              save_op_s_l->second = first_piv->u_index;
                              save_op_s_l->lag = max(abs(tmp_lag), first_piv->lag_index);
                              nop += 3;
                              first_sub = first_sub->NZE_R_N;
                              if (first_sub)
                                sub_c_index = first_sub->c_index;
                              else
                                sub_c_index = Size*periods;
                              l_sub++;
                              first_piv = first_piv->NZE_R_N;
                              if (first_piv)
                                piv_c_index = first_piv->c_index;
                              else
                                piv_c_index = Size*periods;
                              l_piv++;
                            }
                        }
                    }
                  u[b[row]] -= u[b[pivj]]*first_elem;

                  if (nop+3 >= nopa)
                    {
                      nopa = long (mem_increasing_factor*(double) nopa);
                      save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                    }
                  save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
                  save_op_s_l->operat = IFSUB;
                  save_op_s_l->first = b[row];
                  save_op_s_l->second = b[pivj];
                  save_op_s_l->lag = abs(tmp_lag);
                  nop += 3;
                }
            }
          else if (symbolic)
            {
              nop += 2;
              if (piv_abs < eps)
                {
                  ostringstream tmp;
                  if (Block_number > 1)
                    tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system in block " << Block_number+1 << "\n";
                  else
                    tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system\n";
                  throw FatalExceptionHandling(tmp.str());
                }
              /*divide all the non zeros elements of the line pivj by the max_pivot*/
              int nb_var = At_Row(pivj, &first);
              for (int j = 0; j < nb_var; j++)
                {
                  u[first->u_index] /= piv;
                  first = first->NZE_R_N;
                }
              nop += nb_var*2;
              u[b[pivj]] /= piv;
              nop += 2;
              /*substract the elements on the non treated lines*/
              nb_eq = At_Col(i, &first);
              NonZeroElem *first_piva;
              int nb_var_piva = At_Row(pivj, &first_piva);

              int nb_eq_todo = 0;
              for (int j = 0; j < nb_eq && first; j++)
                {
                  if (!line_done[first->r_index])
                    bc[nb_eq_todo++] = first;
                  first = first->NZE_C_N;
                }
              //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop)
              for (int j = 0; j < nb_eq_todo; j++)
                {
                  NonZeroElem *first = bc[j];
                  int row = first->r_index;
                  double first_elem = u[first->u_index];
                  nop += 2;
                  int nb_var_piv = nb_var_piva;
                  NonZeroElem *first_piv = first_piva;
                  NonZeroElem *first_sub;
                  int nb_var_sub = At_Row(row, &first_sub);
                  int l_sub = 0;
                  int l_piv = 0;
                  int sub_c_index = first_sub->c_index;
                  int piv_c_index = first_piv->c_index;
                  while (l_sub < (nb_var_sub /*= NRow(row)*/) || l_piv < nb_var_piv)
                    {
                      if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv))
                        {
                          //There is no nonzero element at row pivot for this column=> Nothing to do for the current element got to next column
                          first_sub = first_sub->NZE_R_N;
                          if (first_sub)
                            sub_c_index = first_sub->c_index;
                          else
                            sub_c_index = Size*periods;
                          l_sub++;
                        }
                      else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub)
                        {
                          // There is an nonzero element at row pivot but not at the current row=> insert a negative element in the current row
                          tmp_u_count = Get_u();
                          lag = first_piv->c_index/Size-row/Size;
                          //#pragma omp critical
                          {
                            Insert(row, first_piv->c_index, tmp_u_count, lag);
                          }
                          u[tmp_u_count] = -u[first_piv->u_index]*first_elem;
                          nop += 3;
                          first_piv = first_piv->NZE_R_N;
                          if (first_piv)
                            piv_c_index = first_piv->c_index;
                          else
                            piv_c_index = Size*periods;
                          l_piv++;
                        }
                      else /*first_sub->c_index==first_piv->c_index*/
                        {
                          if (i == sub_c_index)
                            {
                              NonZeroElem *firsta = first;
                              NonZeroElem *first_suba = first_sub->NZE_R_N;
                              //#pragma omp critical
                              {
                                Delete(first_sub->r_index, first_sub->c_index);
                              }
                              first = firsta->NZE_C_N;
                              first_sub = first_suba;
                              if (first_sub)
                                sub_c_index = first_sub->c_index;
                              else
                                sub_c_index = Size*periods;
                              l_sub++;
                              first_piv = first_piv->NZE_R_N;
                              if (first_piv)
                                piv_c_index = first_piv->c_index;
                              else
                                piv_c_index = Size*periods;
                              l_piv++;
                            }
                          else
                            {
                              u[first_sub->u_index] -= u[first_piv->u_index]*first_elem;
                              nop += 3;
                              first_sub = first_sub->NZE_R_N;
                              if (first_sub)
                                sub_c_index = first_sub->c_index;
                              else
                                sub_c_index = Size*periods;
                              l_sub++;
                              first_piv = first_piv->NZE_R_N;
                              if (first_piv)
                                piv_c_index = first_piv->c_index;
                              else
                                piv_c_index = Size*periods;
                              l_piv++;
                            }
                        }
                    }
                  u[b[row]] -= u[b[pivj]]*first_elem;
                  nop += 3;
                }
            }
        }
      if (symbolic)
        {
          if (t > int (periods*0.35))
            {
              symbolic = false;
              mxFree(save_opaa);
              mxFree(save_opa);
              mxFree(save_op);
            }
          else if (record && (nop == nop1))
            {
              if (t > int (periods*0.35))
                {
                  symbolic = false;
                  if (save_opaa)
                    {
                      mxFree(save_opaa);
                      save_opaa = NULL;
                    }
                  if (save_opa)
                    {
                      mxFree(save_opa);
                      save_opa = NULL;
                    }
                  if (save_op)
                    {
                      mxFree(save_op);
                      save_op = NULL;
                    }
                }
              else if (save_opa && save_opaa)
                {
                  if (compare(save_op, save_opa, save_opaa, t, periods, nop, Size))
                    {
                      tbreak = t;
                      tbreak_g = tbreak;
                      //mexPrintf("time=%f\n",(1000.0*(double (clock())-double (time11)))/double (CLOCKS_PER_SEC));
                      break;
                    }
                }
              if (save_opa)
                {
                  if (save_opaa)
                    {
                      mxFree(save_opaa);
                      save_opaa = NULL;
                    }
                  save_opaa = save_opa;
                }
              save_opa = save_op;
            }
          else
            {
              if (nop == nop1)
                record = true;
              else
                {
                  record = false;
                  if (save_opa)
                    {
                      mxFree(save_opa);
                      save_opa = NULL;
                    }
                  if (save_opaa)
                    {
                      mxFree(save_opaa);
                      save_opaa = NULL;
                    }
                }
            }
          nop2 = nop1;
          nop1 = nop;
        }
      //mexPrintf("time=%f\n",(1000.0*(double (clock())-double (time11)))/double (CLOCKS_PER_SEC));
    }
  mxFree(bc);
  mxFree(piv_v);
  mxFree(pivj_v);
  mxFree(pivk_v);
  mxFree(NR);
  /*mexPrintf("tbreak=%d, periods=%d time required=%f\n",tbreak,periods, (1000.0*(double (clock())-double (time00)))/double (CLOCKS_PER_SEC));
    mexEvalString("drawnow;");
    time00 = clock();*/
  nop_all += nop;
  if (symbolic)
    {
      if (save_op)
        mxFree(save_op);
      if (save_opa)
        mxFree(save_opa);
      if (save_opaa)
        mxFree(save_opaa);
    }

  /*The backward substitution*/
  double slowc_lbx = slowc;
  for (int i = 0; i < y_size*(periods+y_kmin); i++)
    ya[i] = y[i];
  slowc_save = slowc;
  bksub(tbreak, last_period, Size, slowc_lbx);
  /*mexPrintf("remaining operations and bksub time required=%f\n",tbreak,periods, (1000.0*(double (clock())-double (time00)))/double (CLOCKS_PER_SEC));
    mexEvalString("drawnow;");*/
  End_GE(Size);
}

void
dynSparseMatrix::Grad_f_product(int n, mxArray *b_m, double *vectr, mxArray *A_m, SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b_)
{
  if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state))
    {
      NonZeroElem *first;
      for (int i = 0; i < n; i++)
        {
          double sum = 0;
          first = FNZE_R[i];
          if (first)
            for (int k = 0; k < NbNZRow[i]; k++)
              {
                sum += u[first->u_index] * u[b[first->c_index]];
                first = first->NZE_R_N;
              }
          vectr[i] = sum;
        }
    }
  else
    {
      if (!((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state)))
        {
          mwIndex *Ai = mxGetIr(A_m);
          if (!Ai)
            {
              ostringstream tmp;
              tmp << " in Init_Matlab_Sparse_Simple, can't allocate Ai index vector\n";
              throw FatalExceptionHandling(tmp.str());
            }
          mwIndex *Aj = mxGetJc(A_m);
          if (!Aj)
            {
              ostringstream tmp;
              tmp << " in Init_Matlab_Sparse_Simple, can't allocate Aj index vector\n";
              throw FatalExceptionHandling(tmp.str());
            }
          double *A = mxGetPr(A_m);
          if (!A)
            {
              ostringstream tmp;
              tmp << " in Init_Matlab_Sparse_Simple, can't retrieve A matrix\n";
              throw FatalExceptionHandling(tmp.str());
            }
          b_ = mxGetPr(b_m);
          if (!b_)
            {
              ostringstream tmp;
              tmp << " in Init_Matlab_Sparse_Simple, can't retrieve b matrix\n";
              throw FatalExceptionHandling(tmp.str());
            }
        }
      memset(vectr, 0, n * sizeof(double));
      for (int i = 0; i < n; i++)
        for (SuiteSparse_long j = Ap[i]; j < Ap[i+1]; j++)
          vectr[Ai[j]] += Ax[j] * b_[i];
    }
}

/*
 * Re-applies the previous Newton step with a possibly reduced step length.
 *
 * When the residuals cannot be evaluated (res1 is NaN or infinite) or the
 * squared residual res2 got larger than the previous one (g0) after the first
 * iteration, slowc_save is reduced and y = ya + slowc_save * direction is
 * re-evaluated through compute_complete until the residual is finite and,
 * as long as slowc_save stays above 0.1, no larger than g0. Otherwise the
 * step is simply applied with the current slowc_save.
 * In all cases slowc_save is reset to slowc on exit.
 *
 * Parameters:
 *   block_num     block index (not used in the body)
 *   y_size        number of endogenous variables per period (stride of y)
 *   size          number of variables of the block (entries of index_vara)
 *   crit_opt_old  criterion of the previous iterate, only used by the
 *                 disabled line search below
 */
void
dynSparseMatrix::Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old)
{
  double top = 1.0;
  double bottom = 0.1;
  if (isnan(res1) || isinf(res1) || (res2 > g0 && iter > 0))
    {
      // Shrink the step by 10% at a time until the model can be evaluated
      while ((isnan(res1) || isinf(res1)))
        {
          prev_slowc_save = slowc_save;
          slowc_save /= 1.1;
          for (int i = 0; i < size; i++)
            {
              int eq = index_vara[i];
              y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
            }
          /*mexPrintf("reducing solwc_save = %e, it_=%d, y_size=%d, size=%d, y[%d]=%e, ya[%d]=%e,\n y[%d]=%e, ya[%d]=%e\n",slowc_save, it_, y_size, size-1, index_vara[0]+it_*y_size, y[index_vara[0]+it_*y_size], index_vara[0]+it_*y_size, ya[index_vara[0]+it_*y_size]
            , index_vara[size-1]+it_*y_size, y[index_vara[size-1]+it_*y_size], index_vara[size-1]+it_*y_size, ya[index_vara[size-1]+it_*y_size]);*/
          //mexPrintf("->slowc_save=%f\n",slowc_save);
          compute_complete(true, res1, res2, max_res, max_res_idx);
        }

      // Then shrink faster while the residual is worse than before, but never below 0.1
      while (res2 > g0 && slowc_save > 1e-1)
        {
          prev_slowc_save = slowc_save;
          slowc_save /= 1.5;
          for (int i = 0; i < size; i++)
            {
              int eq = index_vara[i];
              y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
            }
          /*mexPrintf("reducing solwc_save = %e, it_=%d, y_size=%d, size=%d, y[%d]=%e, ya[%d]=%e,\n y[%d]=%e, ya[%d]=%e\n",slowc_save, it_, y_size, size-1, index_vara[0]+it_*y_size, y[index_vara[0]+it_*y_size], index_vara[0]+it_*y_size, ya[index_vara[0]+it_*y_size]                                                                                            , index_vara[size-1]+it_*y_size, y[index_vara[size-1]+it_*y_size], index_vara[size-1]+it_*y_size, ya[index_vara[size-1]+it_*y_size]);*/
          //mexPrintf("->slowc_save=%f\n",slowc_save);
          compute_complete(true, res1, res2, max_res, max_res_idx);
        }
      double ax = slowc_save-0.001, bx = slowc_save+0.001, cx = slowc_save, fa, fb, fc, xmin;
      // Bracketing + golden section search on the step length: currently disabled
      if (false /*slowc_save > 2e-1*/)
        if (mnbrak(&ax, &bx, &cx, &fa, &fb, &fc))
          if (golden(ax, bx, cx, 1e-1, solve_tolf, &xmin))
            slowc_save = xmin;
      //mexPrintf("cx=%f\n", cx);
      //mexPrintf("ax= %f, bx=%f, cx=%f, fa=%f, fb=%f, fc=%d\n", ax, bx, cx, fa, fb, fc);

      // Back-tracking line search with quadratic/cubic interpolation: currently disabled
      //if (!(isnan(res1) || isinf(res1))/* && !(isnan(g0) || isinf(g0))*//*|| (res2 > g0 && iter > 1)*/)
      if (false)
        {

          double *p = (double *) mxMalloc(size * sizeof(double));
          test_mxMalloc(p, __LINE__, __FILE__, __func__, size * sizeof(double));
          Grad_f_product(size, b_m_save, p, A_m_save, Ap_save, Ai_save, Ax_save, b_save);
          double slope = 0.0;
          // NOTE(review): the sum starts at index 1 and indexes direction by i,
          // not by index_vara[i]+it_*y_size as everywhere else - confirm before
          // re-enabling this branch.
          for (int i = 1; i < size; i++)
            slope += -direction[i] * p[i];
          /*if (slope > 0)
            mexPrintf("Roundoff in lnsearch\n");
            else*/
          {
            prev_slowc_save = 1;
            double crit_opt = res2/2;
            double max_try_iteration = 100;
            double small_ = 1.0e-4;
            bool try_at_cvg = false;
            while ((try_at_iteration < max_try_iteration) && (!try_at_cvg) && (abs(prev_slowc_save - slowc_save) > 1e-10))
              {
                crit_opt = res2 / 2;
                if (slowc_save < 1e-7)
                  {
                    // Step length too small: give up
                    try_at_cvg = true;
                    continue;
                  }
                else if ((crit_opt <= crit_opt_old + small_ * slowc_save * slope) && !(isnan(res1) || isinf(res1)))
                  {
                    // Sufficient decrease obtained
                    try_at_cvg = true;
                    continue;
                  }
                else if (try_at_iteration == 0)
                  {
                    prev_slowc_save = slowc_save;
                    //slowc_save = max(- top * slope / ( (crit_opt - crit_opt_old - slope)), bottom);
                    slowc_save /= 1.2;
                  }
                else
                  {
                    // Cubic interpolation using the two most recent step lengths
                    double t1 = crit_opt - slope * slowc_save - crit_opt_old;
                    double t2 = glambda2 - slope * prev_slowc_save - crit_opt_old;
                    double a = (1/(slowc_save * slowc_save) * t1 - 1/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
                    double b = (-prev_slowc_save/(slowc_save * slowc_save) * t1 + slowc_save/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
                    if (a == 0)
                      slowc_save = max(min(-slope/(2 * b), top * slowc_save), bottom * slowc_save);
                    else
                      {
                        double delta = b*b - 3 * a * slope;
                        if (delta <= 0)
                          slowc_save = top * slowc_save;
                        else if (b <= 0)
                          // Minimiser of the cubic: the whole numerator is divided by 3a
                          slowc_save = max(min((-b + sqrt(delta)) / (3 * a), top * slowc_save), bottom * slowc_save);
                        else
                          slowc_save = max(min(-slope / (b + sqrt(delta)), top * slowc_save), bottom * slowc_save);
                      }
                  }
                if (abs(prev_slowc_save - slowc_save) < 1e-10)
                  slowc_save /= 1.1;
                //mexPrintf("=>slowc_save=%f, prev_slowc_save=%f\n",slowc_save, prev_slowc_save);
                // NOTE(review): after this assignment the loop condition
                // abs(prev_slowc_save - slowc_save) > 1e-10 is false, so the
                // loop stops after one pass - confirm the intent.
                prev_slowc_save = slowc_save;
                glambda2 = crit_opt;
                try_at_iteration++;
                for (int i = 0; i < size; i++)
                  {
                    int eq = index_vara[i];
                    y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
                  }
                compute_complete(true, res1, res2, max_res, max_res_idx);
              }
          }
          mxFree(p);
        }
      //if (print_it)
      mexPrintf("Error: Simulation diverging, trying to correct it using slowc=%f\n", slowc_save);
      // Apply the step with the retained step length and re-evaluate
      for (int i = 0; i < size; i++)
        {
          int eq = index_vara[i];
          y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
        }
      compute_complete(false, res1, res2, max_res, max_res_idx);
    }
  else
    {
      // Previous step is acceptable: apply it with the current step length
      //mexPrintf("slowc_save=%f res1=%f\n",slowc_save, res1);
      for (int i = 0; i < size; i++)
        {
          int eq = index_vara[i];
          y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
        }
    }
  slowc_save = slowc;
}

/*
 * Performs one Newton step for a block with a single boundary condition at
 * period it_ (static model or purely forward/backward dynamic block).
 *
 * The routine reports (steady state) or throws (dynamic model) when the
 * residuals are NaN/infinite, optionally prints the iteration summary, builds
 * the linear system in the storage required by the selected algorithm and
 * solves it. When the initialisation routine reports a zero solution, the
 * variables of the block are moved towards zero with step slowc instead.
 *
 * Parameters:
 *   block_num        block index, used in messages and passed to the solvers
 *   y_size           number of endogenous variables per period (stride of y)
 *   y_kmin, y_kmax   not used in the body
 *   size             number of equations/variables of the block
 *   cvg              not used in the body
 *
 * Returns the singular-system flag of the bytecode Gaussian elimination
 * (always false for the other solvers).
 * Throws FatalExceptionHandling on allocation failure or, for dynamic
 * models, when the residuals cannot be evaluated.
 */
bool
dynSparseMatrix::Simulate_One_Boundary(int block_num, int y_size, int y_kmin, int y_kmax, int size, bool cvg)
{
  //int i;
  mxArray *b_m = NULL, *A_m = NULL, *x0_m = NULL;
  SuiteSparse_long *Ap = NULL, *Ai = NULL;
  double *Ax = NULL, *b = NULL;
  int preconditioner = 1;

  try_at_iteration = 0;
  Clear_u();
  bool singular_system = false;
  u_count_alloc_save = u_count_alloc;

  if (isnan(res1) || isinf(res1))
    {
#ifdef DEBUG
      for (int j = 0; j < y_size; j++)
        {
          bool select = false;
          for (int i = 0; i < size; i++)
            if (j == index_vara[i])
              {
                select = true;
                break;
              }
          if (select)
            mexPrintf("-> variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]);
          else
            mexPrintf("   variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]);
        }
#endif
      if (steady_state)
        {
          // Steady state: only warn, the step is still attempted below
          if (iter == 0)
            mexPrintf(" the initial values of endogenous variables are too far from the solution.\nChange them!\n");
          else
            mexPrintf(" dynare cannot improve the simulation in block %d at time %d (variable %d)\n", block_num+1, it_+1, index_vara[max_res_idx]+1);
          mexEvalString("drawnow;");
          //return singular_system;
        }
      else
        {
          ostringstream tmp;
          if (iter == 0)
            tmp << " in Simulate_One_Boundary, The initial values of endogenous variables are too far from the solution.\nChange them!\n";
          else
            // The message is built with a stream: no printf conversion after the variable number
            tmp << " in Simulate_One_Boundary, Dynare cannot improve the simulation in block " << block_num+1 << " at time " << it_+1 << " (variable " << index_vara[max_res_idx]+1 << ")\n";
          throw FatalExceptionHandling(tmp.str());
        }
    }
  if (print_it)
    {
      if (steady_state)
        {
          switch (solve_algo)
            {
            case 0:
              mexPrintf("MODEL STEADY STATE: MATLAB fsolve\n");
              break;
            case 1:
              mexPrintf("MODEL STEADY STATE: MATLAB solve1\n");
              break;
            case 2:
            case 4:
              mexPrintf("MODEL STEADY STATE: block decomposition + MATLAB solve1\n");
              break;
            case 3:
              mexPrintf("MODEL STEADY STATE: MATLAB csolve\n");
              break;
            case 5:
              mexPrintf("MODEL STEADY STATE: (method=ByteCode own solver)\n");
              break;
            case 6:
              mexPrintf("MODEL STEADY STATE: Sparse LU\n");
              break;
            case 7:
              mexPrintf(preconditioner_print_out("MODEL STEADY STATE: (method=GMRES)\n", preconditioner, true).c_str());
              //mexPrintf("MODEL STEADY STATE: (method=GMRES)\n");
              break;
            case 8:
              mexPrintf(preconditioner_print_out("MODEL STEADY STATE: (method=BiCGStab)\n", preconditioner, true).c_str());
              //mexPrintf("MODEL STEADY STATE: (method=BiCGStab)\n");
              break;
            default:
              // Report the value the switch is made on
              mexPrintf("MODEL STEADY STATE: (method=Unknown - %d - )\n", solve_algo);
            }
        }

      mexPrintf("-----------------------------------\n");
      mexPrintf("      Simulate iteration no %d     \n", iter+1);
      mexPrintf("      max. error=%.10e       \n", double (max_res));
      mexPrintf("      sqr. error=%.10e       \n", double (res2));
      mexPrintf("      abs. error=%.10e       \n", double (res1));
      mexPrintf("-----------------------------------\n");
    }
  // Set by the initialisation routines below; false until one of them says otherwise
  bool zero_solution = false;

  if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state))
    Simple_Init(size, IM_i, zero_solution);
  else
    {
      b_m = mxCreateDoubleMatrix(size, 1, mxREAL);
      if (!b_m)
        {
          ostringstream tmp;
          tmp << " in Simulate_One_Boundary, can't allocate b_m vector\n";
          throw FatalExceptionHandling(tmp.str());
        }
      A_m = mxCreateSparse(size, size, min(int (IM_i.size()*2), size * size), mxREAL);
      if (!A_m)
        {
          ostringstream tmp;
          tmp << " in Simulate_One_Boundary, can't allocate A_m matrix\n";
          throw FatalExceptionHandling(tmp.str());
        }
      x0_m = mxCreateDoubleMatrix(size, 1, mxREAL);
      if (!x0_m)
        {
          ostringstream tmp;
          tmp << " in Simulate_One_Boundary, can't allocate x0_m vector\n";
          throw FatalExceptionHandling(tmp.str());
        }
      /* The test must select the same algorithms as the Solve_LU_UMFPack
         branch below (stack_solve_algo 0, 1 and 4). Otherwise
         stack_solve_algo == 1 would reach the UMFPACK solver with
         Ap/Ai/Ax/b still NULL. */
      if (!((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state)))
        {
          Init_Matlab_Sparse_Simple(size, IM_i, A_m, b_m, zero_solution, x0_m);
          A_m_save = mxDuplicateArray(A_m);
          b_m_save = mxDuplicateArray(b_m);
        }
      else
        {
          Init_UMFPACK_Sparse_Simple(size, IM_i, &Ap, &Ai, &Ax, &b, zero_solution, x0_m);
          // Resize the saved copies when the number of nonzero elements changed
          if (Ap_save[size] != Ap[size])
            {
              mxFree(Ai_save);
              mxFree(Ax_save);
              Ai_save = (SuiteSparse_long *) mxMalloc(Ap[size] * sizeof(SuiteSparse_long));
              test_mxMalloc(Ai_save, __LINE__, __FILE__, __func__, Ap[size] * sizeof(SuiteSparse_long));
              Ax_save = (double *) mxMalloc(Ap[size] * sizeof(double));
              test_mxMalloc(Ax_save, __LINE__, __FILE__, __func__, Ap[size] * sizeof(double));
            }
          memcpy(Ap_save, Ap, (size + 1) * sizeof(SuiteSparse_long));
          memcpy(Ai_save, Ai, Ap[size] * sizeof(SuiteSparse_long));
          memcpy(Ax_save, Ax, Ap[size] * sizeof(double));
          memcpy(b_save, b, size * sizeof(double));
        }
    }
  if (zero_solution)
    {
      for (int i = 0; i < size; i++)
        {
          int eq = index_vara[i];
          double yy = -(y[eq+it_*y_size]);
          // NOTE(review): direction is indexed by eq here but by eq+it_*y_size
          // in Check_and_Correct_Previous_Iteration - confirm for it_ > 0.
          direction[eq] = yy;
          y[eq+it_*y_size] += slowc * yy;
        }
    }
  else
    {
      if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state))
        singular_system = Solve_ByteCode_Sparse_GaussianElimination(size, block_num, it_);
      else if ((solve_algo == 7 && steady_state) || (stack_solve_algo == 2 && !steady_state))
        Solve_Matlab_GMRES(A_m, b_m, size, slowc, block_num, false, it_, x0_m);
      else if ((solve_algo == 8 && steady_state) || (stack_solve_algo == 3 && !steady_state))
        Solve_Matlab_BiCGStab(A_m, b_m, size, slowc, block_num, false, it_, x0_m, preconditioner);
      else if ((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state))
        Solve_LU_UMFPack(Ap, Ai, Ax, b, size, size, slowc, true, 0);
    }
  return singular_system;
}

/*
 * Evaluates the block at the current point and, unless it has converged with
 * finite residuals, performs one Newton step.
 *
 * Parameters:
 *   block_num, y_size, y_kmin, y_kmax, size  forwarded to the callees
 *   iter  iteration number; when non-zero the previous step is checked and
 *         possibly corrected before the new step is computed
 *
 * Returns true when the maximum residual is below solve_tolf.
 */
bool
dynSparseMatrix::solve_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size, const int iter)
{
  // Criterion of the previous iterate, taken before the residuals are recomputed
  const double previous_criterion = res2/2;
  compute_complete(false, res1, res2, max_res, max_res_idx);
  const bool converged = (max_res < solve_tolf);

  // Nothing more to do when converged with residuals that are finite
  if (converged && !isnan(res1) && !isinf(res1))
    return converged;

  if (iter != 0)
    Check_and_Correct_Previous_Iteration(block_num, y_size, size, previous_criterion);
  if (Simulate_One_Boundary(block_num, y_size, y_kmin, y_kmax, size, converged))
    Singular_display(block_num, size);
  return converged;
}

void
dynSparseMatrix::solve_non_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size)

{
  max_res_idx = 0;
  bool cvg = false;
  iter = 0;
  glambda2 = g0 = very_big;
  //try_at_iteration = 0;
  while ((!cvg) && (iter < maxit_))
    {
      cvg = solve_linear(block_num, y_size, y_kmin, y_kmax, size, iter);
      g0 = res2;
      iter++;
    }
  if (!cvg)
    {
      ostringstream tmp;
      if (steady_state)
        tmp << " in Solve Forward complete, convergence not achieved in block " << block_num+1 << ", after " << iter << " iterations\n";
      else
        tmp << " in Solve Forward complete, convergence not achieved in block " << block_num+1 << ", at time " << it_ << ", after " << iter << " iterations\n";
      throw FatalExceptionHandling(tmp.str());
    }
}

/*
 * Solves a block with one boundary condition, either for the steady state
 * (single solve at it_ = 0) or period by period for a dynamic model.
 *
 * Parameter:
 *   forward  for a dynamic model, true to iterate from y_kmin up to
 *            periods+y_kmin-1, false to iterate in the reverse order
 *
 * The work arrays g1 and r, and the buffers used to save the UMFPACK system
 * when that solver is selected, are allocated on entry and released on exit.
 * The period loops use the member it_, which Simulate_One_Boundary reads.
 */
void
dynSparseMatrix::Simulate_Newton_One_Boundary(const bool forward)
{
  g1 = (double *) mxMalloc(size*size*sizeof(double));
  test_mxMalloc(g1, __LINE__, __FILE__, __func__, size*size*sizeof(double));
  r = (double *) mxMalloc(size*sizeof(double));
  test_mxMalloc(r, __LINE__, __FILE__, __func__, size*sizeof(double));
  iter = 0;
  // Evaluated once so that allocation and release below always agree
  const bool umfpack_buffers = (solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state);
  if (umfpack_buffers)
    {
      Ap_save = (SuiteSparse_long *) mxMalloc((size + 1) * sizeof(SuiteSparse_long));
      test_mxMalloc(Ap_save, __LINE__, __FILE__, __func__, (size + 1) * sizeof(SuiteSparse_long));
      // No nonzero element saved yet: forces Ai_save/Ax_save to be resized on first use
      Ap_save[size] = 0;
      Ai_save = (SuiteSparse_long *) mxMalloc(1 * sizeof(SuiteSparse_long));
      test_mxMalloc(Ai_save, __LINE__, __FILE__, __func__, 1 * sizeof(SuiteSparse_long));
      Ax_save = (double *) mxMalloc(1 * sizeof(double));
      test_mxMalloc(Ax_save, __LINE__, __FILE__, __func__, 1 * sizeof(double));
      // b_save holds size doubles (it receives a copy of the right-hand side)
      b_save = (double *) mxMalloc((size) * sizeof(double));
      test_mxMalloc(b_save, __LINE__, __FILE__, __func__, (size) * sizeof(double));
    }
  if (steady_state)
    {
      it_ = 0;
      if (!is_linear)
        solve_non_linear(block_num, y_size, 0, 0, size);
      else
        solve_linear(block_num, y_size, 0, 0, size, 0);
    }
  else if (forward)
    {
      if (!is_linear)
        {
          for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
            solve_non_linear(block_num, y_size, y_kmin, y_kmax, size);
        }
      else
        {
          /* The member it_ must be the loop variable: a local declaration
             would shadow it and every period would be solved with the same
             (stale) time index. */
          for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
            solve_linear(block_num, y_size, y_kmin, y_kmax, size, 0);
        }
    }
  else
    {
      if (!is_linear)
        {
          for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
            solve_non_linear(block_num, y_size, y_kmin, y_kmax, size);
        }
      else
        {
          for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
            solve_linear(block_num, y_size, y_kmin, y_kmax, size, 0);
        }
    }
  if (umfpack_buffers)
    {
      mxFree(Ap_save);
      mxFree(Ai_save);
      mxFree(Ax_save);
      mxFree(b_save);
    }
  mxFree(g1);
  mxFree(r);
}

/*
 * Inserts a ", preconditioner=<description>" suffix into a method banner.
 *
 * Parameters:
 *   s               banner expected to end with ")\n"; the suffix is inserted
 *                   just before these two characters
 *   preconditioner  preconditioner code (0 to 3); any other value yields an
 *                   empty description
 *   ss              true for the static (steady state) jacobian, false for
 *                   the dynamic one; only used for codes 0 and 1
 *
 * Returns the completed banner. A banner shorter than two characters gets the
 * suffix inserted at its beginning instead of raising std::out_of_range.
 */
string
dynSparseMatrix::preconditioner_print_out(string s, int preconditioner, bool ss)
{
  string tmp = ", preconditioner=";
  switch (preconditioner)
    {
    case 0:
      if (ss)
        tmp.append("Jacobi on static jacobian");
      else
        tmp.append("Jacobi on dynamic jacobian");
      break;
    case 1:
      if (ss)
        tmp.append("incomplet lutp on static jacobian");
      else
        tmp.append("incomplet lu0 on dynamic jacobian");
      break;
    case 2:
      tmp.append("incomplet lutp on dynamic jacobian");
      break;
    case 3:
      tmp.append("lu on static jacobian");
      break;
    }
  // Keep the position unsigned and inside the string
  const string::size_type n = s.length();
  const string::size_type position = (n >= 2) ? n - 2 : 0;
  s.insert(position, tmp);
  return s;
}

void
dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names, vector_table_conditional_local_type vector_table_conditional_local)
{
  double top = 0.5;
  double bottom = 0.1;
#ifdef CUDA
  int nnz, nnz_tild;
  int *Ap_i, *Ai_i;
  int *Ap_i_tild, *Ai_i_tild;
  double *x0, *A_tild;

#endif
  int preconditioner = 2;
  if (start_compare == 0)
    start_compare = y_kmin;
  u_count_alloc_save = u_count_alloc;
  clock_t t1 = clock();
  nop1 = 0;
  mxArray *b_m = NULL, *A_m = NULL, *x0_m = NULL;
  double *Ax = NULL, *b;
  SuiteSparse_long *Ap = NULL, *Ai = NULL;

  if (iter > 0)
    {
      if (print_it)
        {
          mexPrintf("Sim : %f ms\n", (1000.0*(double (clock())-double (time00)))/double (CLOCKS_PER_SEC));
          mexEvalString("drawnow;");
        }
      time00 = clock();
    }
  if (isnan(res1) || isinf(res1) || (res2 > 12*g0 && iter > 0))
    {
      if (iter == 0 || fabs(slowc_save) < 1e-8)
        {
          mexPrintf("res1 = %f, res2 = %f g0 = %f iter = %d\n", res1, res2, g0, iter);
          for (int j = 0; j < y_size; j++)
            {
              ostringstream res;
              for (unsigned int i = 0; i < endo_name_length; i++)
                if (P_endo_names[CHAR_LENGTH*(j+i*y_size)] != ' ')
                  res << P_endo_names[CHAR_LENGTH*(j+i*y_size)];
              bool select = false;
              for (int i = 0; i < Size; i++)
                if (j == index_vara[i])
                  {
                    select = true;
                    break;
                  }
              if (select)
                mexPrintf("-> variable %s (%d) at time %d = %f direction = %f\n", res.str().c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]);
              else
                mexPrintf("   variable %s (%d) at time %d = %f direction = %f\n", res.str().c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]);
            }
          ostringstream Error;
          if (iter == 0)
            Error << " in Simulate_Newton_Two_Boundaries, the initial values of endogenous variables are too far from the solution.\nChange them!\n";
          else
            Error << " in Simulate_Newton_Two_Boundaries, dynare cannot improve the simulation in block " << blck+1 << " at time " << it_+1 << " (variable " << index_vara[max_res_idx]+1 << " = " << max_res << ")\n";
          throw FatalExceptionHandling(Error.str());
        }
      if (!(isnan(res1) || isinf(res1)) && !(isnan(g0) || isinf(g0)) && (stack_solve_algo == 4 || stack_solve_algo == 5))
        {
          if (try_at_iteration == 0)
            {
              prev_slowc_save = slowc_save;
              slowc_save = max(-gp0 / (2 * (res2 - g0 - gp0)), bottom);
            }
          else
            {
              double t1 = res2 - gp0 * slowc_save - g0;
              double t2 = glambda2 - gp0 * prev_slowc_save - g0;
              double a = (1/(slowc_save * slowc_save) * t1 - 1/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
              double b = (-prev_slowc_save/(slowc_save * slowc_save) * t1 + slowc_save/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
              prev_slowc_save = slowc_save;
              slowc_save = max(min(-b + sqrt(b*b - 3 * a * gp0) / (3 * a), top * slowc_save), bottom * slowc_save);
            }
          glambda2 = res2;
          try_at_iteration++;
          if (slowc_save <= bottom)
            {
              for (int i = 0; i < y_size*(periods+y_kmin); i++)
                y[i] = ya[i]+direction[i];
              g0 = res2;
              gp0 = -res2;
              try_at_iteration = 0;
              iter--;
              return;
            }
        }
      else
        {
          prev_slowc_save = slowc_save;
          slowc_save /= 1.05;
        }
      if (print_it)
        {
          if (isnan(res1) || isinf(res1))
            mexPrintf("The model cannot be evaluated, trying to correct it using slowc=%f\n", slowc_save);
          else
            mexPrintf("Simulation diverging, trying to correct it using slowc=%f\n", slowc_save);
        }
      for (int i = 0; i < y_size*(periods+y_kmin); i++)
        y[i] = ya[i]+slowc_save*direction[i];
      iter--;
      return;
    }
  u_count += u_count_init;
  if (stack_solve_algo == 5)
    {
      if (alt_symbolic && alt_symbolic_count < alt_symbolic_count_max)
        {
          mexPrintf("Pivoting method will be applied only to the first periods.\n");
          alt_symbolic = false;
          symbolic = true;
          markowitz_c = markowitz_c_s;
          alt_symbolic_count++;
        }
      if (((res1/res1a-1) > -0.3) && symbolic && iter > 0)
        {
          if (restart > 2)
            {
              mexPrintf("Divergence or slowdown occurred during simulation.\nIn the next iteration, pivoting method will be applied to all periods.\n");
              symbolic = false;
              alt_symbolic = true;
              markowitz_c_s = markowitz_c;
              markowitz_c = 0;
            }
          else
            {
              mexPrintf("Divergence or slowdown occurred during simulation.\nIn the next iteration, pivoting method will be applied for a longer period.\n");
              start_compare = min(tbreak_g, periods);
              restart++;
            }
        }
      else
        {
          start_compare = max(y_kmin, minimal_solving_periods);
          restart = 0;
        }
    }
  res1a = res1;
  if (print_it)
    {
      if (iter == 0)
        {
          switch (stack_solve_algo)
            {
            case 0:
              mexPrintf("MODEL SIMULATION: (method=Sparse LU)\n");
              break;
            case 1:
              mexPrintf("MODEL SIMULATION: (method=Relaxation)\n");
              break;
            case 2:
              mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=GMRES)\n", preconditioner, false).c_str());
              break;
            case 3:
              mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=BiCGStab)\n", preconditioner, false).c_str());
              break;
            case 4:
              mexPrintf("MODEL SIMULATION: (method=Sparse LU & optimal path length)\n");
              break;
            case 5:
              mexPrintf("MODEL SIMULATION: (method=ByteCode own solver)\n");
              break;
            case 7:
              mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=GPU BiCGStab)\n", preconditioner, false).c_str());
              break;
            default:
              mexPrintf("MODEL SIMULATION: (method=Unknown - %d - )\n", stack_solve_algo);
            }
        }
      mexPrintf("-----------------------------------\n");
      mexPrintf("      Simulate iteration no %d     \n", iter+1);
      mexPrintf("      max. error=%.10e       \n", double (max_res));
      mexPrintf("      sqr. error=%.10e       \n", double (res2));
      mexPrintf("      abs. error=%.10e       \n", double (res1));
      mexPrintf("-----------------------------------\n");
      mexEvalString("drawnow;");
    }
  if (cvg)
    {
      return;
    }
  else
    {
      if (stack_solve_algo == 5)
        Init_GE(periods, y_kmin, y_kmax, Size, IM_i);
      else
        {
          b_m = mxCreateDoubleMatrix(periods*Size, 1, mxREAL);
          if (!b_m)
            {
              ostringstream tmp;
              tmp << " in Simulate_Newton_Two_Boundaries, can't allocate b_m vector\n";
              throw FatalExceptionHandling(tmp.str());
            }
          x0_m = mxCreateDoubleMatrix(periods*Size, 1, mxREAL);
          if (!x0_m)
            {
              ostringstream tmp;
              tmp << " in Simulate_Newton_Two_Boundaries, can't allocate x0_m vector\n";
              throw FatalExceptionHandling(tmp.str());
            }
          if (stack_solve_algo != 0 && stack_solve_algo != 4 && stack_solve_algo != 7)
            {
              A_m = mxCreateSparse(periods*Size, periods*Size, IM_i.size()* periods*2, mxREAL);
              if (!A_m)
                {
                  ostringstream tmp;
                  tmp << " in Simulate_Newton_Two_Boundaries, can't allocate A_m matrix\n";
                  throw FatalExceptionHandling(tmp.str());
                }
            }
          if (stack_solve_algo == 0 || stack_solve_algo == 4)
            Init_UMFPACK_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap, &Ai, &Ax, &b, x0_m, vector_table_conditional_local, blck);
#ifdef CUDA
          else if (stack_solve_algo == 7)
            Init_CUDA_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap_i, &Ai_i, &Ax, &Ap_i_tild, &Ai_i_tild, &A_tild, &b, &x0, x0_m, &nnz, &nnz_tild, preconditioner);
#endif
          else
            Init_Matlab_Sparse(periods, y_kmin, y_kmax, Size, IM_i, A_m, b_m, x0_m);

        }
      if (stack_solve_algo == 0 || stack_solve_algo == 4)
        Solve_LU_UMFPack(Ap, Ai, Ax, b, Size * periods, Size, slowc, true, 0, vector_table_conditional_local);
      else if (stack_solve_algo == 1)
        Solve_Matlab_Relaxation(A_m, b_m, Size, slowc, true, 0);
      else if (stack_solve_algo == 2)
        Solve_Matlab_GMRES(A_m, b_m, Size, slowc, blck, true, 0, x0_m);
      else if (stack_solve_algo == 3)
        Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, true, 0, x0_m, 1);
      else if (stack_solve_algo == 5)
        Solve_ByteCode_Symbolic_Sparse_GaussianElimination(Size, symbolic, blck);
#ifdef CUDA
      else if (stack_solve_algo == 7)
        Solve_CUDA_BiCGStab(Ap_i, Ai_i, Ax, Ap_i_tild, Ai_i_tild, A_tild, b, x0, Size * periods, Size, slowc, true, 0, nnz, nnz_tild, preconditioner, Size * periods, blck);
#endif
    }
  if (print_it)
    {
      clock_t t2 = clock();
      mexPrintf("(** %f milliseconds **)\n", 1000.0*(double (t2) - double (t1))/double (CLOCKS_PER_SEC));
      mexEvalString("drawnow;");
    }
  if ((!steady_state && (stack_solve_algo == 4 /*|| stack_solve_algo == 0*/)) /* || steady_state*/)
    {
      clock_t t2 = clock();
      double ax = -0.1, bx = 1.1, cx = 0.5, fa, fb, fc, xmin;

      if (!mnbrak(&ax, &bx, &cx, &fa, &fb, &fc))
        return;
      //mexPrintf("ax= %f, bx=%f, cx=%f, fa=%f, fb=%f, fc=%d\n", ax, bx, cx, fa, fb, fc);
      if (!golden(ax, bx, cx, 1e-1, solve_tolf, &xmin))
        return;
      slowc = xmin;
      clock_t t3 = clock();
      mexPrintf("(** %f milliseconds **)\n", 1000.0*(double (t3) - double (t2))/double (CLOCKS_PER_SEC));
      mexEvalString("drawnow;");
    }
  time00 = clock();
  if (tbreak_g == 0)
    tbreak_g = periods;
  return;
}

/*
 * Allocates and zero-fills the buffer pointed to by *u, and records its
 * sizing in the object.
 *
 *   u                             [out] receives the pointer to the newly
 *                                 allocated buffer (obtained from mxMalloc)
 *   u_count_int                   per-period element count; multiplied by
 *                                 the member `periods` to obtain u_count
 *   max_lag_plus_max_lead_plus_1  stored unchanged in u_count_init
 *
 * Side effects: sets u_count, u_count_alloc (twice u_count) and u_count_init.
 * Any previous value of *u is overwritten without being freed here.
 */
void
dynSparseMatrix::fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1)
{
  u_count = u_count_int * periods;
  // Twice the computed element count is allocated; presumably headroom for
  // later growth of the buffer -- TODO confirm against the users of u_count_alloc.
  u_count_alloc = 2*u_count;
  const size_t alloc_bytes = u_count_alloc*sizeof(double);
#ifdef DEBUG
  mexPrintf("fixe_u : alloc(%d double)\n", u_count_alloc);
#endif
  (*u) = static_cast<double *>(mxMalloc(alloc_bytes));
  // Allocation check delegated to the project helper (receives the pointer,
  // the call site and the requested size).
  test_mxMalloc(*u, __LINE__, __FILE__, __func__, alloc_bytes);
#ifdef DEBUG
  // A pointer must be printed with %p (and passed as void *); %d with a
  // pointer argument is a format/argument mismatch.
  mexPrintf("*u=%p\n", static_cast<void *>(*u));
#endif
  // mxMalloc does not initialise the memory: clear the whole buffer explicitly.
  memset((*u), 0, alloc_bytes);
  u_count_init = max_lag_plus_max_lead_plus_1;
}