From 03e487a09293cc8bd5b5dff2e53e346729257509 Mon Sep 17 00:00:00 2001
From: Ferhat Mihoubi <ferhat.mihoubi@univ-evry.fr>
Date: Fri, 22 Mar 2013 15:44:34 +0100
Subject: [PATCH] Major update of bytecode:  - Iterative linear solvers using
 CUDA  - interpreter.cc decomposed

---
 mex/build/bytecode.am                 |    4 +-
 mex/sources/bytecode/ErrorHandling.hh |  253 +-
 mex/sources/bytecode/Interpreter.cc   | 2436 ++------------
 mex/sources/bytecode/Interpreter.hh   |   54 +-
 mex/sources/bytecode/SparseMatrix.cc  | 4474 +++++++++++++++++++++----
 mex/sources/bytecode/SparseMatrix.hh  |  133 +-
 mex/sources/bytecode/bytecode.cc      |  669 +++-
 mex/sources/dynblas.h                 |    2 +-
 8 files changed, 5104 insertions(+), 2921 deletions(-)
diff --git a/mex/build/bytecode.am b/mex/build/bytecode.am
index 66a29a3e9..f0563e4f8 100644
--- a/mex/build/bytecode.am
+++ b/mex/build/bytecode.am
@@ -1,6 +1,6 @@
 noinst_PROGRAMS = bytecode
 
-bytecode_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/../../sources/bytecode -I$(top_srcdir)/../../../preprocessor
+bytecode_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/../../sources -I$(top_srcdir)/../../sources/bytecode -I$(top_srcdir)/../../../preprocessor
 
 TOPDIR = $(top_srcdir)/../../sources/bytecode
 
@@ -9,8 +9,10 @@ nodist_bytecode_SOURCES = \
 	$(TOPDIR)/Interpreter.cc \
 	$(TOPDIR)/Mem_Mngr.cc \
 	$(TOPDIR)/SparseMatrix.cc \
+	$(TOPDIR)/Evaluate.cc \
 	$(TOPDIR)/Interpreter.hh \
 	$(TOPDIR)/Mem_Mngr.hh \
 	$(TOPDIR)/SparseMatrix.hh \
+	$(TOPDIR)/Evaluate.hh \
 	$(TOPDIR)/ErrorHandling.hh
 
diff --git a/mex/sources/bytecode/ErrorHandling.hh b/mex/sources/bytecode/ErrorHandling.hh
index 9c6717d0e..b09c62deb 100644
--- a/mex/sources/bytecode/ErrorHandling.hh
+++ b/mex/sources/bytecode/ErrorHandling.hh
@@ -23,24 +23,138 @@
 #include <cstring>
 #include <iostream>
 #include <sstream>
+#include <map>
+#define BYTE_CODE
 #include "CodeInterpreter.hh"
 #ifdef DEBUG_EX
-# include <math>
+# include <math.h>
 # include "mex_interface.hh"
 #endif
+
+#ifdef OCTAVE_MEX_FILE
+# define CHAR_LENGTH 1
+#else
+# define CHAR_LENGTH 2
+#endif
+
+#ifdef _MSC_VER
+#include <limits>
+#define M_E 2.71828182845904523536
+#define M_LOG2E 1.44269504088896340736
+#define M_LOG10E 0.434294481903251827651
+#define M_LN2 0.693147180559945309417
+#define M_LN10 2.30258509299404568402
+#define M_PI 3.14159265358979323846
+#define M_PI_2 1.57079632679489661923
+#define M_PI_4 0.785398163397448309616
+#define M_1_PI 0.318309886183790671538
+#define M_2_PI 0.636619772367581343076
+#define M_1_SQRTPI 0.564189583547756286948
+#define M_2_SQRTPI 1.12837916709551257390
+#define M_SQRT2 1.41421356237309504880
+#define M_SQRT_2 0.707106781186547524401
+#define NAN std::numeric_limits<double>::quiet_NaN()
+// MSVC CRT stand-ins for the C99 classification macros; a NaN is not reported as infinite.
+#define isnan(x) _isnan(x)
+#define isinf(x) (!_finite(x) && !_isnan(x))
+#define fpu_error(x) (isinf(x) || isnan(x))
+
+
+// Stand-ins for C99 math functions missing from the MSVC runtime; base class of ErrorMsg under _MSC_VER.
+class MSVCpp_missings
+{
+  public:
+  // asinh(x) = sign(x)*log(|x|+sqrt(x^2+1)): evaluated on |x| so that large negative
+  // arguments do not cancel down to log(0).
+  inline double
+  asinh(double x) const
+    {
+      if (x == 0.0)
+        return 0.0;
+      const double ax = fabs(x);
+      const double r = log(ax+ax*sqrt(1.+1./(ax*ax)));
+      return (x < 0) ? -r : r;
+    }
+
+  // acosh(x) = log(x+sqrt(x^2-1)); arguments below 1 (0 included) yield NaN.
+  inline double
+  acosh(double x) const
+    {
+      const double ax = fabs(x);
+      return log(x+ax*sqrt(1.-1./(ax*ax)));
+    }
+
+  // atanh(x) = log((1+x)/(1-x))/2.
+  inline double
+  atanh(double x) const
+    {
+      return log((1+x)/(1-x))/2;
+    }
+
+  // Error function returned as 1 - v with v = t*exp(-z^2 + polynomial in t); erf(0) is exactly 0.
+  inline double
+  erf(double x) const
+    {
+      const double a1 = -1.26551223,   a2 = 1.00002368,
+                   a3 =  0.37409196,   a4 = 0.09678418,
+                   a5 = -0.18628806,   a6 = 0.27886807,
+                   a7 = -1.13520398,   a8 = 1.48851587,
+                   a9 = -0.82215223,  a10 = 0.17087277;
+      const double z = fabs(x);
+      if (z <= 0)
+        return 0.0;
+      const double t = 1 / (1 + 0.5 * z);
+      double v = t*exp((-z*z) +a1+t*(a2+t*(a3+t*(a4+t*(a5+t*(a6+t*(a7+t*(a8+t*(a9+t*a10)))))))));
+      if (x < 0)
+        v = 2 - v;
+      return 1 - v;
+    }
+
+  // Nearest integer computed as floor(x + 0.5), i.e. halfway cases go towards +infinity.
+  inline double
+  nearbyint(double x) const
+    {
+      return floor(x + 0.5);
+    }
+
+  // Larger of x and y; y is returned whenever x > y does not hold.
+  inline double
+  fmax(double x, double y) const
+    {
+      return (x > y) ? x : y;
+    }
+
+  // Smaller of x and y; y is returned whenever x < y does not hold.
+  inline double
+  fmin(double x, double y) const
+    {
+      return (x < y) ? x : y;
+    }
+};
+#endif
+
+
 //#define DEBUG
 using namespace std;
 
 const int NO_ERROR_ON_EXIT = 0;
 const int ERROR_ON_EXIT = 1;
 
+
 typedef vector<pair<Tags, void * > > code_liste_type;
 typedef code_liste_type::const_iterator it_code_type;
 
+
 class GeneralExceptionHandling
 {
   string ErrorMsg;
 public:
+#ifdef _MSC_VER_
+  ~GeneralExceptionHandling()
+  {
+    FreeLibrary(hinstLib);
+  };
+#endif
   GeneralExceptionHandling(string ErrorMsg_arg) : ErrorMsg(ErrorMsg_arg)
   {
   };
@@ -121,6 +235,16 @@ public:
   };
 };
 
+// Exception carrying the " User break" message; thrown where utIsInterruptPending() reports an interrupt.
+class UserExceptionHandling : public GeneralExceptionHandling
+{
+public:
+  UserExceptionHandling() : GeneralExceptionHandling("Fatal error in bytecode:")
+  {
+    completeErrorMsg(" User break\n");
+  };
+};
+
 class FatalExceptionHandling : public GeneralExceptionHandling
 {
 public:
@@ -133,16 +257,40 @@ public:
   };
 };
 
-class ErrorMsg
+struct s_plan // element type of the splan and spfplan vectors declared in ErrorMsg
 {
+  string var, exo; // NOTE(review): presumably an endogenous and an exogenous variable name - confirm
+  int var_num, exo_num; // NOTE(review): presumably the matching indices; not initialised here
+  vector<pair<int, double> > per_value; // NOTE(review): looks like (period, value) pairs - confirm
+};
+
+#ifdef MATLAB_MEX_FILE
+extern "C" bool utIsInterruptPending();
+#else
+#include <octave/oct.h>
+#include <octave/unwind-prot.h>
+#endif
+
+#ifdef _MSC_VER
+class ErrorMsg : public MSVCpp_missings
+#else
+class ErrorMsg
+#endif
+{
+private:
+  bool is_load_variable_list;
+
 public:
+  double *y, *ya;
+  int y_size;
   double *T;
-  int nb_row_xd, nb_row_x, y_size;
+  int nb_row_xd, nb_row_x;
   int y_kmin, y_kmax, periods;
   double *x, *params;
-  double *u, *y, *ya;
+  double *u;
   double *steady_y, *steady_x;
-  double *g2, *g1, *r;
+  double *g2, *g1, *r, *res;
+  vector<s_plan> splan, spfplan;
   vector<mxArray *> jacobian_block, jacobian_other_endo_block, jacobian_exo_block, jacobian_det_exo_block;
   map<unsigned int, double> TEF;
   map<pair<unsigned int, unsigned int>, double > TEFD;
@@ -150,11 +298,12 @@ public:
 
   ExpressionType EQN_type;
   it_code_type it_code_expr;
-  unsigned int nb_endo, nb_exo, nb_param;
+  /*unsigned int*/size_t nb_endo, nb_exo, nb_param;
   char *P_endo_names, *P_exo_names, *P_param_names;
-  unsigned int endo_name_length, exo_name_length, param_name_length;
+  size_t/*unsigned int*/ endo_name_length, exo_name_length, param_name_length;
   unsigned int EQN_equation, EQN_block, EQN_block_number;
   unsigned int EQN_dvar1, EQN_dvar2, EQN_dvar3;
+  vector<pair<string, pair<SymbolType, unsigned int> > > Variable_list;
 
   inline
   ErrorMsg()
@@ -169,6 +318,7 @@ public:
     nb_param = mxGetM(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "param_names")));
     param_name_length = mxGetN(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "param_names")));
     P_param_names = (char *) mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "param_names")));
+    is_load_variable_list = false;
   }
 
   inline string
@@ -184,9 +334,9 @@ public:
         else
           {
             if (str[i] == '$')
-              pos1 = temp.length();
+              pos1 = int(temp.length());
             else
-              pos2 = temp.length();
+              pos2 = int(temp.length());
             if (pos1 >= 0 && pos2 >= 0)
               {
                 tmp_n.erase(pos1, pos2-pos1+1);
@@ -199,6 +349,50 @@ public:
     return temp;
   }
 
+  inline void load_variable_list() // appends every endogenous, then exogenous, name to Variable_list
+  {
+    for (unsigned int variable_num = 0; variable_num < (unsigned int)nb_endo; variable_num++)
+      {
+        ostringstream res; // one stream per variable: a shared one would pile up the previous names
+        for (unsigned int i = 0; i < endo_name_length; i++)
+          if (P_endo_names[CHAR_LENGTH*(variable_num+i*nb_endo)] != ' ')
+            res << P_endo_names[CHAR_LENGTH*(variable_num+i*nb_endo)];
+        Variable_list.push_back(make_pair(res.str(), make_pair(eEndogenous, variable_num)));
+      }
+    for (unsigned int variable_num = 0; variable_num < (unsigned int)nb_exo; variable_num++)
+      {
+        ostringstream res; // same per-variable reset for the exogenous names
+        for (unsigned int i = 0; i < exo_name_length; i++)
+          if (P_exo_names[CHAR_LENGTH*(variable_num+i*nb_exo)] != ' ')
+            res << P_exo_names[CHAR_LENGTH*(variable_num+i*nb_exo)];
+        Variable_list.push_back(make_pair(res.str(), make_pair(eExogenous, variable_num)));
+      }
+  }
+
+  // Looks a variable up by name in Variable_list.
+  // Returns the index stored with the name and writes its type to *variable_type;
+  // returns -1 and leaves *variable_type untouched when no entry carries that name.
+  inline int
+  get_ID(const string variable_name, SymbolType *variable_type)
+  {
+    // Variable_list is only built on the first lookup
+    if (!is_load_variable_list)
+      {
+        load_variable_list();
+        is_load_variable_list = true;
+      }
+    // size_t index: no signed/unsigned comparison against size()
+    for (size_t i = 0; i < Variable_list.size(); i++)
+      if (variable_name == Variable_list[i].first)
+        {
+          // first match wins
+          *variable_type = Variable_list[i].second.first;
+          return Variable_list[i].second.second;
+        }
+    // the name matches no listed variable
+    return(-1);
+  }
+
   inline string
   get_variable(const SymbolType variable_type, const unsigned int variable_num) const
   {
@@ -293,7 +487,6 @@ public:
           break;
         default:
           return ("???");
-          break;
         }
     else
       switch (EQN_type)
@@ -342,7 +535,6 @@ public:
           break;
         default:
           return ("???");
-          break;
         }
     it_code_type it_code_ret;
     Error_loc << endl << add_underscore_to_fpe("      " + print_expression(it_code_expr, evaluate, size, block_num, steady_state, Per_u_, it_, it_code_ret, true));
@@ -378,6 +570,12 @@ public:
 
     while (go_on)
       {
+#ifdef OCTAVE_MEX_FILE
+        OCTAVE_QUIT;
+#else
+	      if ( utIsInterruptPending() )
+		      throw UserExceptionHandling();
+#endif
         switch (it_code->first)
           {
           case FNUMEXPR:
@@ -441,7 +639,9 @@ public:
               case eParameter:
                 var = ((FLDV_ *) it_code->second)->get_pos();
 #ifdef DEBUG
-                mexPrintf("FLDV_ Param var=%d", var);
+                mexPrintf("FLDV_ Param var=%d\n", var);
+                mexPrintf("get_variable(eParameter, var)=%s\n",get_variable(eParameter, var).c_str());
+                mexEvalString("drawnow;");
 #endif
                 Stack.push(get_variable(eParameter, var));
                 if (compute)
@@ -451,7 +651,10 @@ public:
                 var = ((FLDV_ *) it_code->second)->get_pos();
                 lag = ((FLDV_ *) it_code->second)->get_lead_lag();
 #ifdef DEBUG
-                mexPrintf("FLDV_ endo var=%d, lag=%d", var, lag);
+                mexPrintf("FLDV_ endo var=%d, lag=%d\n", var, lag);
+                mexPrintf("get_variable(eEndogenous, var)=%s, compute=%d\n",get_variable(eEndogenous, var).c_str(), compute);
+                mexPrintf("it_=%d, lag=%d, y_size=%d, var=%d, y=%x\n", it_, lag, y_size, var, y);
+                mexEvalString("drawnow;");
 #endif
                 tmp_out.str("");
                 if (lag > 0)
@@ -1250,7 +1453,7 @@ public:
                   Stack.pop();
                   if (compute)
                     {
-                      int derivOrder = nearbyint(Stackf.top());
+                      int derivOrder = int(nearbyint(Stackf.top()));
                       Stackf.pop();
                       if (fabs(v1f) < NEAR_ZERO && v2f > 0
                           && derivOrder > v2f
@@ -1570,7 +1773,11 @@ public:
                       }
                     tmp_out.str("");
                     tmp_out << function_name << "(";
+#ifndef _MSC_VER
                     string ss[nb_input_arguments];
+#else
+                    vector<string> ss(nb_input_arguments);
+#endif
                     for (unsigned int i = 0; i < nb_input_arguments; i++)
                       {
                         ss[nb_input_arguments-i-1] = Stack.top();
@@ -1624,7 +1831,11 @@ public:
                     tmp_out.str("");
                     tmp_out << function_name << "(";
                     tmp_out << arg_func_name.c_str() << ", " << fc->get_row() << ", {";
+#ifndef _MSC_VER
                     string ss[nb_add_input_arguments];
+#else
+                    vector<string> ss(nb_add_input_arguments); // same extent as the array branch and the loop below
+#endif
                     for (unsigned int i = 0; i < nb_add_input_arguments; i++)
                       {
                         ss[nb_add_input_arguments-i-1] = Stack.top();
@@ -1655,7 +1866,11 @@ public:
                       }
                     tmp_out.str("");
                     tmp_out << function_name << "(";
+#ifndef _MSC_VER
                     string ss[nb_input_arguments];
+#else
+                    vector<string> ss(nb_input_arguments);
+#endif
                     for (unsigned int i = 0; i < nb_input_arguments; i++)
                       {
                         ss[nb_input_arguments-i-1] = Stack.top();
@@ -1708,7 +1923,11 @@ public:
                     tmp_out.str("");
                     tmp_out << function_name << "(";
                     tmp_out << arg_func_name.c_str() << ", " << fc->get_row() << ", " << fc->get_col() << ", {";
+#ifndef _MSC_VER
                     string ss[nb_add_input_arguments];
+#else
+                    vector<string> ss(nb_add_input_arguments); // same extent as the array branch and the loop below
+#endif
                     for (unsigned int i = 0; i < nb_add_input_arguments; i++)
                       {
                         ss[nb_add_input_arguments-i-1] = Stack.top();
@@ -1739,7 +1958,11 @@ public:
                       }
                     tmp_out.str("");
                     tmp_out << function_name << "(";
+#ifndef _MSC_VER
                     string ss[nb_input_arguments];
+#else
+                    vector<string> ss(nb_input_arguments);
+#endif
                     for (unsigned int i = 0; i < nb_input_arguments; i++)
                       {
                         ss[nb_input_arguments-i-1] = Stack.top();
@@ -1965,7 +2188,7 @@ public:
         it_code++;
       }
 #ifdef DEBUG
-    mexPrintf("print_expression end\n"); mexEvalString("drawnow;");
+    mexPrintf("print_expression end tmp_out.str().c_str()=%s\n", tmp_out.str().c_str()); mexEvalString("drawnow;");
 #endif
     it_code_ret = it_code;
     return (tmp_out.str());
diff --git a/mex/sources/bytecode/Interpreter.cc b/mex/sources/bytecode/Interpreter.cc
index 1587e2d45..c4b677c82 100644
--- a/mex/sources/bytecode/Interpreter.cc
+++ b/mex/sources/bytecode/Interpreter.cc
@@ -22,18 +22,28 @@
 #include "Interpreter.hh"
 #define BIG 1.0e+8;
 #define SMALL 1.0e-5;
-//#define DEBUG
+///#define DEBUG
 
 Interpreter::~Interpreter()
 {
 }
 
 Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, double *x_arg, double *steady_y_arg, double *steady_x_arg,
-                         double *direction_arg, int y_size_arg,
-                         int nb_row_x_arg, int nb_row_xd_arg, int periods_arg, int y_kmin_arg, int y_kmax_arg,
-                         int maxit_arg_, double solve_tolf_arg, int size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
+                         double *direction_arg, size_t y_size_arg,
+                         size_t nb_row_x_arg, size_t nb_row_xd_arg, int periods_arg, int y_kmin_arg, int y_kmax_arg,
+                         int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
                          string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
-                         bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg)
+                         bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
+                         bool steady_state_arg, bool print_it_arg
+#ifdef CUDA
+                         , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
+#endif
+                         )
+                         : dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg
+#ifdef CUDA
+                                        , CUDA_device_arg, cublas_handle_arg, cusparse_handle_arg, descr_arg
+#endif
+                                        )
 {
   params = params_arg;
   y = y_arg;
@@ -42,12 +52,12 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub
   steady_y = steady_y_arg;
   steady_x = steady_x_arg;
   direction = direction_arg;
-  y_size = y_size_arg;
+  //y_size = y_size_arg;
   nb_row_x = nb_row_x_arg;
   nb_row_xd = nb_row_xd_arg;
   periods = periods_arg;
-  y_kmax = y_kmax_arg;
-  y_kmin = y_kmin_arg;
+  //y_kmax = y_kmax_arg;
+  //y_kmin = y_kmin_arg;
   maxit_ = maxit_arg_;
   solve_tolf = solve_tolf_arg;
   size_of_direction = size_of_direction_arg;
@@ -57,7 +67,6 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub
   markowitz_c = markowitz_c_arg;
   filename = filename_arg;
   T = NULL;
-  error_not_printed = true;
   minimal_solving_periods = minimal_solving_periods_arg;
   stack_solve_algo = stack_solve_algo_arg;
   solve_algo = solve_algo_arg;
@@ -65,1393 +74,15 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub
   print = print_arg;
   GlobalTemporaryTerms = GlobalTemporaryTerms_arg;
   print_error = print_error_arg;
-}
+  //steady_state = steady_state_arg;
+  //print_it = print_it_arg;
 
-double
-Interpreter::pow1(double a, double b)
-{
-  double r = pow_(a, b);
-  if (isnan(r) || isinf(r))
-    {
-      res1 = NAN;
-      r = 0.0000000000000000000000001;
-      if (print_error)
-        throw PowExceptionHandling(a, b);
-    }
-  return r;
-}
-
-double
-Interpreter::divide(double a, double b)
-{
-  double r = a / b;
-  if (isnan(r) || isinf(r))
-    {
-      res1 = NAN;
-      r = 1e70;
-      if (print_error)
-        throw DivideExceptionHandling(a, b);
-    }
-  return r;
-}
-
-double
-Interpreter::log1(double a)
-{
-  double r = log(a);
-  if (isnan(r) || isinf(r))
-    {
-      res1 = NAN;
-      r = -1e70;
-      if (print_error)
-        throw LogExceptionHandling(a);
-    }
-  return r;
-}
-
-double
-Interpreter::log10_1(double a)
-{
-  double r = log(a);
-  if (isnan(r) || isinf(r))
-    {
-      res1 = NAN;
-      r = -1e70;
-      if (print_error)
-        throw Log10ExceptionHandling(a);
-    }
-  return r;
 }
 
 void
-Interpreter::compute_block_time(int Per_u_, bool evaluate, int block_num, int size, bool steady_state)
-{
-  int var = 0, lag = 0, op;
-  unsigned int eq, pos_col;
-  ostringstream tmp_out;
-  double v1, v2, v3;
-  bool go_on = true;
-  double ll;
-  double rr;
-  double *jacob = NULL, *jacob_other_endo = NULL, *jacob_exo = NULL, *jacob_exo_det = NULL;
-  EQN_block = block_num;
-  stack<double> Stack;
-  external_function_type function_type = ExternalFunctionWithoutDerivative;
-
-#ifdef DEBUG
-  mexPrintf("compute_block_time\n");
-#endif
-  if (evaluate /*&& !steady_state*/)
-    {
-      jacob = mxGetPr(jacobian_block[block_num]);
-      if (!steady_state)
-        {
-          jacob_other_endo = mxGetPr(jacobian_other_endo_block[block_num]);
-          jacob_exo = mxGetPr(jacobian_exo_block[block_num]);
-          jacob_exo_det = mxGetPr(jacobian_det_exo_block[block_num]);
-        }
-    }
-
-  while (go_on)
-    {
-      switch (it_code->first)
-        {
-        case FNUMEXPR:
-#ifdef DEBUG
-          mexPrintf("FNUMEXPR\n");
-#endif
-          it_code_expr = it_code;
-          switch (((FNUMEXPR_ *) it_code->second)->get_expression_type())
-            {
-            case TemporaryTerm:
-#ifdef DEBUG
-              mexPrintf("TemporaryTerm\n");
-#endif
-              EQN_type = TemporaryTerm;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-#ifdef DEBUG
-              mexPrintf("EQN_equation=%d\n", EQN_equation); mexEvalString("drawnow;");
-#endif
-              break;
-            case ModelEquation:
-#ifdef DEBUG
-              mexPrintf("ModelEquation\n");
-#endif
-              EQN_type = ModelEquation;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              break;
-            case FirstEndoDerivative:
-#ifdef DEBUG
-              mexPrintf("FirstEndoDerivative\n");
-#endif
-              EQN_type = FirstEndoDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              break;
-            case FirstOtherEndoDerivative:
-#ifdef DEBUG
-              mexPrintf("FirstOtherEndoDerivative\n");
-#endif
-              EQN_type = FirstOtherEndoDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              break;
-            case FirstExoDerivative:
-#ifdef DEBUG
-              mexPrintf("FirstExoDerivative\n");
-#endif
-              EQN_type = FirstExoDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              break;
-            case FirstExodetDerivative:
-#ifdef DEBUG
-              mexPrintf("FirstExodetDerivative\n");
-#endif
-              EQN_type = FirstExodetDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              break;
-            case FirstParamDerivative:
-#ifdef DEBUG
-              mexPrintf("FirstParamDerivative\n");
-#endif
-              EQN_type = FirstParamDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              break;
-            case SecondEndoDerivative:
-#ifdef DEBUG
-              mexPrintf("SecondEndoDerivative\n");
-#endif
-              EQN_type = SecondEndoDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2();
-              EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2();
-              break;
-            case SecondExoDerivative:
-#ifdef DEBUG
-              mexPrintf("SecondExoDerivative\n");
-#endif
-              EQN_type = SecondExoDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2();
-              EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2();
-              break;
-            case SecondExodetDerivative:
-#ifdef DEBUG
-              mexPrintf("SecondExodetDerivative\n");
-#endif
-              EQN_type = SecondExodetDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2();
-              EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2();
-              break;
-            case SecondParamDerivative:
-#ifdef DEBUG
-              mexPrintf("SecondParamDerivative\n");
-#endif
-              EQN_type = SecondParamDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2();
-              break;
-            case ThirdEndoDerivative:
-#ifdef DEBUG
-              mexPrintf("ThirdEndoDerivative\n");
-#endif
-              EQN_type = ThirdEndoDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2();
-              EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2();
-              EQN_dvar3 = ((FNUMEXPR_ *) it_code->second)->get_dvariable3();
-              EQN_lag3 = ((FNUMEXPR_ *) it_code->second)->get_lag3();
-              break;
-            case ThirdExoDerivative:
-#ifdef DEBUG
-              mexPrintf("ThirdExoDerivative\n");
-#endif
-              EQN_type = ThirdExoDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2();
-              EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2();
-              EQN_dvar3 = ((FNUMEXPR_ *) it_code->second)->get_dvariable3();
-              EQN_lag3 = ((FNUMEXPR_ *) it_code->second)->get_lag3();
-              break;
-            case ThirdExodetDerivative:
-#ifdef DEBUG
-              mexPrintf("ThirdExodetDerivative\n");
-#endif
-              EQN_type = ThirdExodetDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1();
-              EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2();
-              EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2();
-              EQN_dvar3 = ((FNUMEXPR_ *) it_code->second)->get_dvariable3();
-              EQN_lag3 = ((FNUMEXPR_ *) it_code->second)->get_lag3();
-              break;
-            case ThirdParamDerivative:
-#ifdef DEBUG
-              mexPrintf("ThirdParamDerivative\n");
-#endif
-              EQN_type = ThirdParamDerivative;
-              EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation();
-              EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1();
-              EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2();
-              EQN_dvar3 = ((FNUMEXPR_ *) it_code->second)->get_dvariable3();
-              break;
-            }
-          break;
-        case FLDV:
-          //load a variable in the processor
-          switch (((FLDV_ *) it_code->second)->get_type())
-            {
-            case eParameter:
-              var = ((FLDV_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FLDV Param[var=%d]\n", var);
-              tmp_out << " params[" << var << "](" << params[var] << ")";
-#endif
-              Stack.push(params[var]);
-              break;
-            case eEndogenous:
-              var = ((FLDV_ *) it_code->second)->get_pos();
-              lag = ((FLDV_ *) it_code->second)->get_lead_lag();
-#ifdef DEBUG
-              mexPrintf("FLDV y[var=%d, lag=%d, it_=%d], y_size=%d evaluate=%d\n", var, lag, it_, y_size, evaluate);
-#endif
-              if (evaluate)
-                Stack.push(ya[(it_+lag)*y_size+var]);
-              else
-                Stack.push(y[(it_+lag)*y_size+var]);
-#ifdef DEBUG
-              tmp_out << " y[" << it_+lag << ", " << var << "](" << y[(it_+lag)*y_size+var] << ")";
-#endif
-              break;
-            case eExogenous:
-              var = ((FLDV_ *) it_code->second)->get_pos();
-              lag = ((FLDV_ *) it_code->second)->get_lead_lag();
-#ifdef DEBUG
-              mexPrintf("FLDV x[var=%d, lag=%d, it_=%d], nb_row_x=%d evaluate=%d\n", var, lag, it_, nb_row_x, evaluate);
-              tmp_out << " x[" << it_+lag << ", " << var << "](" << x[it_+lag+var*nb_row_x] << ")";
-#endif
-              Stack.push(x[it_+lag+var*nb_row_x]);
-              break;
-            case eExogenousDet:
-              var = ((FLDV_ *) it_code->second)->get_pos();
-              lag = ((FLDV_ *) it_code->second)->get_lead_lag();
-              Stack.push(x[it_+lag+var*nb_row_xd]);
-              break;
-            case eModelLocalVariable:
-#ifdef DEBUG
-              mexPrintf("FLDV a local variable in Block %d Stack.size()=%d", block_num, Stack.size());
-              mexPrintf(" value=%f\n", Stack.top());
-#endif
-              break;
-            default:
-              mexPrintf("FLDV: Unknown variable type\n");
-            }
-          break;
-        case FLDSV:
-          //load a variable in the processor
-          switch (((FLDSV_ *) it_code->second)->get_type())
-            {
-            case eParameter:
-              var = ((FLDSV_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FLDSV Param[var=%d]=%f\n", var, params[var]);
-              tmp_out << " params[" << var << "](" << params[var] << ")";
-#endif
-              Stack.push(params[var]);
-              break;
-            case eEndogenous:
-              var = ((FLDSV_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FLDSV y[var=%d]=%f\n", var, ya[var]);
-              tmp_out << " y[" << var << "](" << y[var] << ")";
-#endif
-              if (evaluate)
-                Stack.push(ya[var]);
-              else
-                Stack.push(y[var]);
-              break;
-            case eExogenous:
-              var = ((FLDSV_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FLDSV x[var=%d]\n", var);
-              tmp_out << " x[" << var << "](" << x[var] << ")";
-#endif
-              Stack.push(x[var]);
-              break;
-            case eExogenousDet:
-              var = ((FLDSV_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FLDSV xd[var=%d]\n", var);
-#endif
-              Stack.push(x[var]);
-              break;
-            case eModelLocalVariable:
-#ifdef DEBUG
-              mexPrintf("FLDSV a local variable in Block %d Stack.size()=%d", block_num, Stack.size());
-              mexPrintf(" value=%f\n", Stack.top());
-#endif
-              break;
-            default:
-              mexPrintf("FLDSV: Unknown variable type\n");
-            }
-          break;
-        case FLDVS:
-          //load a variable in the processor
-          switch (((FLDVS_ *) it_code->second)->get_type())
-            {
-            case eParameter:
-              var = ((FLDVS_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("params[%d]\n", var);
-#endif
-              Stack.push(params[var]);
-              break;
-            case eEndogenous:
-              var = ((FLDVS_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FLDVS steady_y[%d]\n", var);
-#endif
-              Stack.push(steady_y[var]);
-              break;
-            case eExogenous:
-              var = ((FLDVS_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FLDVS x[%d] \n", var);
-#endif
-              Stack.push(x[var]);
-              break;
-            case eExogenousDet:
-              var = ((FLDVS_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FLDVS xd[%d]\n", var);
-#endif
-              Stack.push(x[var]);
-              break;
-            case eModelLocalVariable:
-#ifdef DEBUG
-              mexPrintf("FLDVS a local variable in Block %d Stack.size()=%d", block_num, Stack.size());
-              mexPrintf(" value=%f\n", Stack.top());
-#endif
-              break;
-            default:
-              mexPrintf("FLDVS: Unknown variable type\n");
-            }
-          break;
-        case FLDT:
-          //load a temporary variable in the processor
-          var = ((FLDT_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-          mexPrintf("T[it_=%d var=%d, y_kmin=%d, y_kmax=%d == %d]=>%f\n", it_, var, y_kmin, y_kmax, var*(periods+y_kmin+y_kmax)+it_, var);
-          tmp_out << " T[" << it_ << ", " << var << "](" << T[var*(periods+y_kmin+y_kmax)+it_] << ")";
-#endif
-          Stack.push(T[var*(periods+y_kmin+y_kmax)+it_]);
-          break;
-        case FLDST:
-          //load a temporary variable in the processor
-          var = ((FLDST_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-          mexPrintf("FLDST T[%d]", var);
-#endif
-          Stack.push(T[var]);
-#ifdef DEBUG
-          mexPrintf("=%f\n", T[var]);
-          tmp_out << " T[" << var << "](" << T[var] << ")";
-#endif
-          break;
-        case FLDU:
-          //load u variable in the processor
-          var = ((FLDU_ *) it_code->second)->get_pos();
-          var += Per_u_;
-#ifdef DEBUG
-          mexPrintf("FLDU u[%d]\n", var);
-          tmp_out << " u[" << var << "](" << u[var] << ")";
-#endif
-          Stack.push(u[var]);
-          break;
-        case FLDSU:
-          //load u variable in the processor
-          var = ((FLDSU_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-          mexPrintf("FLDSU u[%d]\n", var);
-          tmp_out << " u[" << var << "](" << u[var] << ")";
-#endif
-          Stack.push(u[var]);
-          break;
-        case FLDR:
-          //load u variable in the processor
-          var = ((FLDR_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-          mexPrintf("FLDR r[%d]\n", var);
-#endif
-          Stack.push(r[var]);
-          break;
-        case FLDZ:
-          //load 0 in the processor
-#ifdef DEBUG
-          mexPrintf("FLDZ\n");
-#endif
-          Stack.push(0.0);
-#ifdef DEBUG
-          tmp_out << " 0";
-#endif
-          break;
-        case FLDC:
-          //load a numerical constant in the processor
-          ll = ((FLDC_ *) it_code->second)->get_value();
-#ifdef DEBUG
-          mexPrintf("FLDC = %f\n", ll);
-          tmp_out << " " << ll;
-#endif
-
-          Stack.push(ll);
-          break;
-        case FSTPV:
-          //load a variable in the processor
-          switch (((FSTPV_ *) it_code->second)->get_type())
-            {
-            case eParameter:
-              var = ((FSTPV_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-              mexPrintf("FSTPV params[%d]\n", var);
-#endif
-              params[var] = Stack.top();
-              Stack.pop();
-              break;
-            case eEndogenous:
-              var = ((FSTPV_ *) it_code->second)->get_pos();
-              lag = ((FSTPV_ *) it_code->second)->get_lead_lag();
-              y[(it_+lag)*y_size+var] = Stack.top();
-#ifdef DEBUG
-              tmp_out << "=>";
-              mexPrintf(" y[%d, %d](%f)=%s\n", it_+lag, var, y[(it_+lag)*y_size+var], tmp_out.str().c_str());
-              tmp_out.str("");
-#endif
-              Stack.pop();
-              break;
-            case eExogenous:
-              var = ((FSTPV_ *) it_code->second)->get_pos();
-              lag = ((FSTPV_ *) it_code->second)->get_lead_lag();
-              x[it_+lag+var*nb_row_x]  = Stack.top();
-#ifdef DEBUG
-              tmp_out << "=>";
-              mexPrintf(" x[%d, %d](%f)=%s\n", it_+lag, var, x[it_+lag+var*nb_row_x], tmp_out.str().c_str());
-              tmp_out.str("");
-#endif
-
-              Stack.pop();
-              break;
-            case eExogenousDet:
-              var = ((FSTPV_ *) it_code->second)->get_pos();
-              lag = ((FSTPV_ *) it_code->second)->get_lead_lag();
-              x[it_+lag+var*nb_row_xd] = Stack.top();
-#ifdef DEBUG
-              tmp_out << "=>";
-              mexPrintf(" x[%d, %d](%f)=%s\n", it_+lag, var, x[it_+lag+var*nb_row_xd], tmp_out.str().c_str());
-              tmp_out.str("");
-#endif
-              Stack.pop();
-              break;
-            default:
-              mexPrintf("FSTPV: Unknown variable type\n");
-            }
-          break;
-        case FSTPSV:
-          //load a variable in the processor
-          switch (((FSTPSV_ *) it_code->second)->get_type())
-            {
-            case eParameter:
-              var = ((FSTPSV_ *) it_code->second)->get_pos();
-              params[var] = Stack.top();
-              Stack.pop();
-              break;
-            case eEndogenous:
-              var = ((FSTPSV_ *) it_code->second)->get_pos();
-              y[var] = Stack.top();
-#ifdef DEBUG
-              tmp_out << "=>";
-              mexPrintf(" y[%d](%f)=%s\n", var, y[var], tmp_out.str().c_str());
-              tmp_out.str("");
-#endif
-              Stack.pop();
-              break;
-            case eExogenous:
-            case eExogenousDet:
-              var = ((FSTPSV_ *) it_code->second)->get_pos();
-              x[var]  = Stack.top();
-#ifdef DEBUG
-              tmp_out << "=>";
-              mexPrintf(" x[%d, %d](%f)=%s\n", it_+lag, var, x[var], tmp_out.str().c_str());
-              tmp_out.str("");
-#endif
-              Stack.pop();
-              break;
-            default:
-              mexPrintf("FSTPSV: Unknown variable type\n");
-            }
-          break;
-        case FSTPT:
-          //store in a temporary variable from the processor
-#ifdef DEBUG
-          mexPrintf("FSTPT\n");
-#endif
-          var = ((FSTPT_ *) it_code->second)->get_pos();
-          T[var*(periods+y_kmin+y_kmax)+it_] = Stack.top();
-#ifdef DEBUG
-          tmp_out << "=>";
-          mexPrintf(" T[%d, %d](%f)=%s\n", it_, var, T[var*(periods+y_kmin+y_kmax)+it_], tmp_out.str().c_str());
-          tmp_out.str("");
-#endif
-
-          Stack.pop();
-          break;
-        case FSTPST:
-          //store in a temporary variable from the processor
-#ifdef DEBUG
-          mexPrintf("FSTPST\n");
-#endif
-          var = ((FSTPST_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-          mexPrintf("var=%d\n", var);
-#endif
-          T[var] = Stack.top();
-#ifdef DEBUG
-          tmp_out << "=>";
-          mexPrintf(" T[%d](%f)=%s\n", var, T[var], tmp_out.str().c_str());
-          tmp_out.str("");
-#endif
-          Stack.pop();
-          break;
-        case FSTPU:
-          //store in u variable from the processor
-          var = ((FSTPU_ *) it_code->second)->get_pos();
-          var += Per_u_;
-#ifdef DEBUG
-          mexPrintf("FSTPU\n");
-          mexPrintf("var=%d\n", var);
-#endif
-          u[var] = Stack.top();
-#ifdef DEBUG
-          tmp_out << "=>";
-          mexPrintf(" u[%d](%f)=%s\n", var, u[var], tmp_out.str().c_str());
-          tmp_out.str("");
-#endif
-          Stack.pop();
-          break;
-        case FSTPSU:
-          //store in u variable from the processor
-          var = ((FSTPSU_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-          if (var >= u_count_alloc || var < 0)
-            mexPrintf("Erreur var=%d\n", var);
-#endif
-          u[var] = Stack.top();
-#ifdef DEBUG
-          tmp_out << "=>";
-          mexPrintf(" u[%d](%f)=%s\n", var, u[var], tmp_out.str().c_str());
-          tmp_out.str("");
-#endif
-          Stack.pop();
-          break;
-        case FSTPR:
-          //store in residual variable from the processor
-          var = ((FSTPR_ *) it_code->second)->get_pos();
-#ifdef DEBUG
-          tmp_out << "=>";
-          mexPrintf("FSTPR r[%d]", var);
-          tmp_out.str("");
-#endif
-          r[var] = Stack.top();
-#ifdef DEBUG
-          tmp_out << "=>";
-          mexPrintf("(%f)=%s\n", r[var], tmp_out.str().c_str());
-          tmp_out.str("");
-#endif
-          Stack.pop();
-          break;
-        case FSTPG:
-          //store in derivative (g) variable from the processor
-#ifdef DEBUG
-          mexPrintf("FSTPG\n");
-          mexEvalString("drawnow;");
-#endif
-          var = ((FSTPG_ *) it_code->second)->get_pos();
-          g1[var] = Stack.top();
-#ifdef DEBUG
-          tmp_out << "=>";
-          mexPrintf(" g1[%d](%f)=%s\n", var, g1[var], tmp_out.str().c_str());
-          tmp_out.str("");
-#endif
-          Stack.pop();
-          break;
-
-        case FSTPG2:
-          //store in the jacobian matrix
-          rr = Stack.top();
-          if (EQN_type != FirstEndoDerivative)
-            {
-              ostringstream tmp;
-              tmp << " in compute_block_time, impossible case " << EQN_type << " not implement in static jacobian\n";
-              throw FatalExceptionHandling(tmp.str());
-            }
-          eq = ((FSTPG2_ *) it_code->second)->get_row();
-          var = ((FSTPG2_ *) it_code->second)->get_col();
-#ifdef DEBUG
-          mexPrintf("FSTPG2 eq=%d, var=%d\n", eq, var);
-          mexEvalString("drawnow;");
-#endif
-          jacob[eq + size*var] = rr;
-          break;
-        case FSTPG3:
-          //store in derivative (g) variable from the processor
-#ifdef DEBUG
-          mexPrintf("FSTPG3\n");
-          mexEvalString("drawnow;");
-#endif
-          rr = Stack.top();
-          switch (EQN_type)
-            {
-            case FirstEndoDerivative:
-              eq = ((FSTPG3_ *) it_code->second)->get_row();
-              var = ((FSTPG3_ *) it_code->second)->get_col();
-              lag = ((FSTPG3_ *) it_code->second)->get_lag();
-              pos_col = ((FSTPG3_ *) it_code->second)->get_col_pos();
-#ifdef DEBUG
-              mexPrintf("Endo eq=%d, pos_col=%d, size=%d\n", eq, pos_col, size);
-#endif
-              jacob[eq + size*pos_col] = rr;
-              break;
-            case FirstOtherEndoDerivative:
-              //eq = ((FSTPG3_ *) it_code->second)->get_row();
-              eq = EQN_equation;
-              var = ((FSTPG3_ *) it_code->second)->get_col();
-              lag = ((FSTPG3_ *) it_code->second)->get_lag();
-              pos_col = ((FSTPG3_ *) it_code->second)->get_col_pos();
-              jacob_other_endo[eq + size*pos_col] = rr;
-              break;
-            case FirstExoDerivative:
-              //eq = ((FSTPG3_ *) it_code->second)->get_row();
-              eq = EQN_equation;
-              var = ((FSTPG3_ *) it_code->second)->get_col();
-              lag = ((FSTPG3_ *) it_code->second)->get_lag();
-              pos_col = ((FSTPG3_ *) it_code->second)->get_col_pos();
-#ifdef DEBUG
-              mexPrintf("Exo eq=%d, pos_col=%d, size=%d\n", eq, pos_col, size);
-#endif
-              jacob_exo[eq + size*pos_col] = rr;
-              break;
-            case FirstExodetDerivative:
-              //eq = ((FSTPG3_ *) it_code->second)->get_row();
-              eq = EQN_equation;
-              var = ((FSTPG3_ *) it_code->second)->get_col();
-              lag = ((FSTPG3_ *) it_code->second)->get_lag();
-              pos_col = ((FSTPG3_ *) it_code->second)->get_col_pos();
-              jacob_exo_det[eq + size*pos_col] = rr;
-              break;
-            default:
-              ostringstream tmp;
-              tmp << " in compute_block_time, variable " << EQN_type << " not used yet\n";
-              throw FatalExceptionHandling(tmp.str());
-            }
-#ifdef DEBUG
-          tmp_out << "=>";
-          mexPrintf(" g1[%d](%f)=%s\n", var, g1[var], tmp_out.str().c_str());
-          tmp_out.str("");
-#endif
-          Stack.pop();
-          break;
-
-        case FBINARY:
-          op = ((FBINARY_ *) it_code->second)->get_op_type();
-#ifdef DEBUG
-          mexPrintf("FBINARY, op=%d\n", op);
-#endif
-          v2 = Stack.top();
-          Stack.pop();
-          v1 = Stack.top();
-          Stack.pop();
-          switch (op)
-            {
-            case oPlus:
-              Stack.push(v1 + v2);
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "+" << v2 << "|";
-#endif
-              break;
-            case oMinus:
-              Stack.push(v1 - v2);
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "-" << v2 << "|";
-#endif
-              break;
-            case oTimes:
-              Stack.push(v1 * v2);
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "*" << v2 << "|";
-#endif
-              break;
-            case oDivide:
-              double tmp;
-#ifdef DEBUG
-              mexPrintf("v1=%f / v2=%f\n", v1, v2);
-#endif
-              try
-                {
-                  tmp = divide(v1, v2);
-                }
-              catch (FloatingPointExceptionHandling &fpeh)
-                {
-                  mexPrintf("%s      %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str());
-                  go_on = false;
-                }
-              Stack.push(tmp);
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "/" << v2 << "|";
-#endif
-              break;
-            case oLess:
-              Stack.push(double (v1 < v2));
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "<" << v2 << "|";
-#endif
-              break;
-            case oGreater:
-              Stack.push(double (v1 > v2));
-#ifdef DEBUG
-              tmp_out << " |" << v1 << ">" << v2 << "|";
-#endif
-              break;
-            case oLessEqual:
-              Stack.push(double (v1 <= v2));
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "<=" << v2 << "|";
-#endif
-              break;
-            case oGreaterEqual:
-              Stack.push(double (v1 >= v2));
-#ifdef DEBUG
-              tmp_out << " |" << v1 << ">=" << v2 << "|";
-#endif
-              break;
-            case oEqualEqual:
-              Stack.push(double (v1 == v2));
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "==" << v2 << "|";
-#endif
-              break;
-            case oDifferent:
-              Stack.push(double (v1 != v2));
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "!=" << v2 << "|";
-#endif
-              break;
-            case oPower:
-#ifdef DEBUG
-              mexPrintf("pow\n");
-#endif
-              try
-                {
-                  tmp = pow1(v1, v2);
-                }
-              catch (FloatingPointExceptionHandling &fpeh)
-                {
-                  mexPrintf("%s      %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str());
-                  go_on = false;
-                }
-              Stack.push(tmp);
-
-#ifdef DEBUG
-              tmp_out << " |" << v1 << "^" << v2 << "|";
-#endif
-              break;
-            case oPowerDeriv:
-              {
-                int derivOrder = nearbyint(Stack.top());
-                Stack.pop();
-                try
-                  {
-                    if (fabs(v1) < NEAR_ZERO && v2 > 0
-                        && derivOrder > v2
-                        && fabs(v2-nearbyint(v2)) < NEAR_ZERO)
-                      Stack.push(0.0);
-                    else
-                      {
-                        double dxp = pow1(v1, v2-derivOrder);
-                        for (int i = 0; i < derivOrder; i++)
-                          dxp *= v2--;
-                        Stack.push(dxp);
-                      }
-                  }
-                catch (FloatingPointExceptionHandling &fpeh)
-                  {
-                    mexPrintf("%s      %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str());
-                    go_on = false;
-                  }
-              }
-
-#ifdef DEBUG
-              tmp_out << " |PowerDeriv(" << v1 << ", " << v2 << ")|";
-#endif
-              break;
-            case oMax:
-              Stack.push(max(v1, v2));
-#ifdef DEBUG
-              tmp_out << " |max(" << v1 << "," << v2 << ")|";
-#endif
-              break;
-            case oMin:
-              Stack.push(min(v1, v2));
-#ifdef DEBUG
-              tmp_out << " |min(" << v1 << "," << v2 << ")|";
-#endif
-              break;
-            case oEqual:
-              // Nothing to do
-              break;
-            default:
-              {
-                mexPrintf("Error\n");
-                ostringstream tmp;
-                tmp << " in compute_block_time, unknown binary operator " << op << "\n";
-                throw FatalExceptionHandling(tmp.str());
-              }
-            }
-          break;
-        case FUNARY:
-          op = ((FUNARY_ *) it_code->second)->get_op_type();
-          v1 = Stack.top();
-          Stack.pop();
-#ifdef DEBUG
-          mexPrintf("FUNARY, op=%d\n", op);
-#endif
-          switch (op)
-            {
-            case oUminus:
-              Stack.push(-v1);
-#ifdef DEBUG
-              tmp_out << " |-(" << v1 << ")|";
-#endif
-
-              break;
-            case oExp:
-              Stack.push(exp(v1));
-#ifdef DEBUG
-              tmp_out << " |exp(" << v1 << ")|";
-#endif
-              break;
-            case oLog:
-              double tmp;
-              try
-                {
-                  tmp = log1(v1);
-                }
-              catch (FloatingPointExceptionHandling &fpeh)
-                {
-                  mexPrintf("%s      %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str());
-                  go_on = false;
-                }
-              Stack.push(tmp);
-              //if (isnan(res1))
-
-#ifdef DEBUG
-              tmp_out << " |log(" << v1 << ")|";
-#endif
-              break;
-            case oLog10:
-              try
-                {
-                  tmp = log10_1(v1);
-                }
-              catch (FloatingPointExceptionHandling &fpeh)
-                {
-                  mexPrintf("%s      %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str());
-                  go_on = false;
-                }
-              Stack.push(tmp);
-#ifdef DEBUG
-              tmp_out << " |log10(" << v1 << ")|";
-#endif
-              break;
-            case oCos:
-              Stack.push(cos(v1));
-#ifdef DEBUG
-              tmp_out << " |cos(" << v1 << ")|";
-#endif
-              break;
-            case oSin:
-              Stack.push(sin(v1));
-#ifdef DEBUG
-              tmp_out << " |sin(" << v1 << ")|";
-#endif
-              break;
-            case oTan:
-              Stack.push(tan(v1));
-#ifdef DEBUG
-              tmp_out << " |tan(" << v1 << ")|";
-#endif
-              break;
-            case oAcos:
-              Stack.push(acos(v1));
-#ifdef DEBUG
-              tmp_out << " |acos(" << v1 << ")|";
-#endif
-              break;
-            case oAsin:
-              Stack.push(asin(v1));
-#ifdef DEBUG
-              tmp_out << " |asin(" << v1 << ")|";
-#endif
-              break;
-            case oAtan:
-              Stack.push(atan(v1));
-#ifdef DEBUG
-              tmp_out << " |atan(" << v1 << ")|";
-#endif
-              break;
-            case oCosh:
-              Stack.push(cosh(v1));
-#ifdef DEBUG
-              tmp_out << " |cosh(" << v1 << ")|";
-#endif
-              break;
-            case oSinh:
-              Stack.push(sinh(v1));
-#ifdef DEBUG
-              tmp_out << " |sinh(" << v1 << ")|";
-#endif
-              break;
-            case oTanh:
-              Stack.push(tanh(v1));
-#ifdef DEBUG
-              tmp_out << " |tanh(" << v1 << ")|";
-#endif
-              break;
-            case oAcosh:
-              Stack.push(acosh(v1));
-#ifdef DEBUG
-              tmp_out << " |acosh(" << v1 << ")|";
-#endif
-              break;
-            case oAsinh:
-              Stack.push(asinh(v1));
-#ifdef DEBUG
-              tmp_out << " |asinh(" << v1 << ")|";
-#endif
-              break;
-            case oAtanh:
-              Stack.push(atanh(v1));
-#ifdef DEBUG
-              tmp_out << " |atanh(" << v1 << ")|";
-#endif
-              break;
-            case oSqrt:
-              Stack.push(sqrt(v1));
-#ifdef DEBUG
-              tmp_out << " |sqrt(" << v1 << ")|";
-#endif
-              break;
-            case oErf:
-              Stack.push(erf(v1));
-#ifdef DEBUG
-              tmp_out << " |erf(" << v1 << ")|";
-
-#endif
-              break;
-            default:
-              {
-                mexPrintf("Error\n");
-                ostringstream tmp;
-                tmp << " in compute_block_time, unknown unary operator " << op << "\n";
-                throw FatalExceptionHandling(tmp.str());
-              }
-            }
-          break;
-        case FTRINARY:
-          op = ((FTRINARY_ *) it_code->second)->get_op_type();
-          v3 = Stack.top();
-          Stack.pop();
-          v2 = Stack.top();
-          Stack.pop();
-          v1 = Stack.top();
-          Stack.pop();
-          switch (op)
-            {
-            case oNormcdf:
-              Stack.push(0.5*(1+erf((v1-v2)/v3/M_SQRT2)));
-#ifdef DEBUG
-              tmp_out << " |normcdf(" << v1 << ", " << v2 << ", " << v3 << ")|";
-#endif
-              break;
-            case oNormpdf:
-              Stack.push(1/(v3*sqrt(2*M_PI)*exp(pow((v1-v2)/v3, 2)/2)));
-#ifdef DEBUG
-              tmp_out << " |normpdf(" << v1 << ", " << v2 << ", " << v3 << ")|";
-#endif
-              break;
-            default:
-              {
-                mexPrintf("Error\n");
-                ostringstream tmp;
-                tmp << " in compute_block_time, unknown trinary operator " << op << "\n";
-                throw FatalExceptionHandling(tmp.str());
-              }
-            }
-          break;
-
-        case FPUSH:
-          break;
-
-        case FCALL:
-          {
-#ifdef DEBUG
-            mexPrintf("------------------------------\n");
-            mexPrintf("CALL "); mexEvalString("drawnow;");
-#endif
-            FCALL_ *fc = (FCALL_ *) it_code->second;
-            string function_name = fc->get_function_name();
-#ifdef DEBUG
-            mexPrintf("function_name=%s ", function_name.c_str()); mexEvalString("drawnow;");
-#endif
-            unsigned int nb_input_arguments = fc->get_nb_input_arguments();
-#ifdef DEBUG
-            mexPrintf("nb_input_arguments=%d ", nb_input_arguments); mexEvalString("drawnow;");
-#endif
-            unsigned int nb_output_arguments = fc->get_nb_output_arguments();
-#ifdef DEBUG
-            mexPrintf("nb_output_arguments=%d\n", nb_output_arguments); mexEvalString("drawnow;");
-#endif
-
-            mxArray *output_arguments[3];
-            string arg_func_name = fc->get_arg_func_name();
-#ifdef DEBUG
-            mexPrintf("arg_func_name.length() = %d\n", arg_func_name.length());
-            mexPrintf("arg_func_name.c_str() = %s\n", arg_func_name.c_str());
-#endif
-            unsigned int nb_add_input_arguments = fc->get_nb_add_input_arguments();
-            function_type = fc->get_function_type();
-#ifdef DEBUG
-            mexPrintf("function_type=%d ExternalFunctionWithoutDerivative=%d\n", function_type, ExternalFunctionWithoutDerivative);
-            mexEvalString("drawnow;");
-#endif
-            mxArray **input_arguments;
-            switch (function_type)
-              {
-              case ExternalFunctionWithoutDerivative:
-              case ExternalFunctionWithFirstDerivative:
-              case ExternalFunctionWithFirstandSecondDerivative:
-                {
-                  input_arguments = (mxArray **) mxMalloc(nb_input_arguments * sizeof(mxArray *));
-#ifdef DEBUG
-                  mexPrintf("Stack.size()=%d\n", Stack.size());
-                  mexEvalString("drawnow;");
-#endif
-                  for (unsigned int i = 0; i < nb_input_arguments; i++)
-                    {
-                      mxArray *vv = mxCreateDoubleScalar(Stack.top());
-                      input_arguments[nb_input_arguments - i - 1] = vv;
-                      Stack.pop();
-                    }
-                  mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str());
-                  double *rr = mxGetPr(output_arguments[0]);
-                  Stack.push(*rr);
-                  if (function_type == ExternalFunctionWithFirstDerivative || function_type == ExternalFunctionWithFirstandSecondDerivative)
-                    {
-                      unsigned int indx = fc->get_indx();
-                      double *FD1 = mxGetPr(output_arguments[1]);
-                      unsigned int rows = mxGetN(output_arguments[1]);
-                      for (unsigned int i = 0; i < rows; i++)
-                        TEFD[make_pair(indx, i)] = FD1[i];
-                    }
-                  if (function_type == ExternalFunctionWithFirstandSecondDerivative)
-                    {
-                      unsigned int indx = fc->get_indx();
-                      double *FD2 = mxGetPr(output_arguments[2]);
-                      unsigned int rows = mxGetM(output_arguments[2]);
-                      unsigned int cols = mxGetN(output_arguments[2]);
-                      unsigned int k = 0;
-                      for (unsigned int j = 0; j < cols; j++)
-                        for (unsigned int i = 0; i < rows; i++)
-                          TEFDD[make_pair(indx, make_pair(i, j))] = FD2[k++];
-                    }
-                }
-                break;
-              case ExternalFunctionNumericalFirstDerivative:
-                {
-                  input_arguments = (mxArray **) mxMalloc((nb_input_arguments+1+nb_add_input_arguments) * sizeof(mxArray *));
-                  mxArray *vv = mxCreateString(arg_func_name.c_str());
-                  input_arguments[0] = vv;
-                  vv = mxCreateDoubleScalar(fc->get_row());
-                  input_arguments[1] = vv;
-                  vv = mxCreateCellMatrix(1, nb_add_input_arguments);
-                  for (unsigned int i = 0; i < nb_add_input_arguments; i++)
-                    {
-                      double rr = Stack.top();
-#ifdef DEBUG
-                      mexPrintf("i=%d rr = %f Stack.size()=%d\n", i, rr, Stack.size());
-#endif
-                      mxSetCell(vv, nb_add_input_arguments - (i+1), mxCreateDoubleScalar(rr));
-                      Stack.pop();
-                    }
-                  input_arguments[nb_input_arguments+nb_add_input_arguments] = vv;
-#ifdef DEBUG
-                  mexCallMATLAB(0, NULL, 1, &input_arguments[0], "disp");
-                  mexCallMATLAB(0, NULL, 1, &input_arguments[1], "disp");
-                  mexCallMATLAB(0, NULL, 1, &input_arguments[2], "celldisp");
-                  mexPrintf("OK\n");
-                  mexEvalString("drawnow;");
-#endif
-                  nb_input_arguments = 3;
-                  mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str());
-                  double *rr = mxGetPr(output_arguments[0]);
-#ifdef DEBUG
-                  mexPrintf("*rr=%f\n", *rr);
-#endif
-                  Stack.push(*rr);
-                }
-                break;
-              case ExternalFunctionFirstDerivative:
-                {
-                  input_arguments = (mxArray **) mxMalloc(nb_input_arguments * sizeof(mxArray *));
-                  for (unsigned int i = 0; i < nb_input_arguments; i++)
-                    {
-                      mxArray *vv = mxCreateDoubleScalar(Stack.top());
-                      input_arguments[(nb_input_arguments - 1) - i] = vv;
-                      Stack.pop();
-                    }
-                  mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str());
-                  unsigned int indx = fc->get_indx();
-                  double *FD1 = mxGetPr(output_arguments[0]);
-                  //mexPrint
-                  unsigned int rows = mxGetN(output_arguments[0]);
-                  for (unsigned int i = 0; i < rows; i++)
-                    TEFD[make_pair(indx, i)] = FD1[i];
-                }
-                break;
-              case ExternalFunctionNumericalSecondDerivative:
-                {
-                  input_arguments = (mxArray **) mxMalloc((nb_input_arguments+1+nb_add_input_arguments) * sizeof(mxArray *));
-                  mxArray *vv = mxCreateString(arg_func_name.c_str());
-                  input_arguments[0] = vv;
-                  vv = mxCreateDoubleScalar(fc->get_row());
-                  input_arguments[1] = vv;
-                  vv = mxCreateDoubleScalar(fc->get_col());
-                  input_arguments[2] = vv;
-                  vv = mxCreateCellMatrix(1, nb_add_input_arguments);
-                  for (unsigned int i = 0; i < nb_add_input_arguments; i++)
-                    {
-                      double rr = Stack.top();
-#ifdef DEBUG
-                      mexPrintf("i=%d rr = %f\n", i, rr);
-#endif
-                      mxSetCell(vv, (nb_add_input_arguments - 1) - i, mxCreateDoubleScalar(rr));
-                      Stack.pop();
-                    }
-                  input_arguments[nb_input_arguments+nb_add_input_arguments] = vv;
-#ifdef DEBUG
-                  mexCallMATLAB(0, NULL, 1, &input_arguments[0], "disp");
-                  mexCallMATLAB(0, NULL, 1, &input_arguments[1], "disp");
-                  mexCallMATLAB(0, NULL, 1, &input_arguments[2], "celldisp");
-                  mexPrintf("OK\n");
-                  mexEvalString("drawnow;");
-#endif
-                  nb_input_arguments = 3;
-                  mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str());
-                  double *rr = mxGetPr(output_arguments[0]);
-                  Stack.push(*rr);
-                }
-                break;
-              case ExternalFunctionSecondDerivative:
-                {
-                  input_arguments = (mxArray **) mxMalloc(nb_input_arguments * sizeof(mxArray *));
-                  for (unsigned int i = 0; i < nb_input_arguments; i++)
-                    {
-                      mxArray *vv = mxCreateDoubleScalar(Stack.top());
-                      input_arguments[i] = vv;
-                      Stack.pop();
-                    }
-                  mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str());
-                  unsigned int indx = fc->get_indx();
-                  double *FD2 = mxGetPr(output_arguments[2]);
-                  unsigned int rows = mxGetM(output_arguments[0]);
-                  unsigned int cols = mxGetN(output_arguments[0]);
-                  unsigned int k = 0;
-                  for (unsigned int j = 0; j < cols; j++)
-                    for (unsigned int i = 0; i < rows; i++)
-                      TEFDD[make_pair(indx, make_pair(i, j))] = FD2[k++];
-                }
-                break;
-              }
-          }
-          break;
-        case FSTPTEF:
-          var = ((FSTPTEF_ *) it_code->second)->get_number();
-#ifdef DEBUG
-          mexPrintf("FSTPTEF\n");
-          mexPrintf("var=%d Stack.size()=%d\n", var, Stack.size());
-#endif
-          TEF[var-1] = Stack.top();
-#ifdef DEBUG
-          mexPrintf("FSTP TEF[var-1]=%f done\n", TEF[var-1]);
-          mexEvalString("drawnow;");
-#endif
-          Stack.pop();
-          break;
-        case FLDTEF:
-          var = ((FLDTEF_ *) it_code->second)->get_number();
-#ifdef DEBUG
-          mexPrintf("FLDTEF\n");
-          mexPrintf("var=%d Stack.size()=%d\n", var, Stack.size());
-          mexPrintf("FLD TEF[var-1]=%f done\n", TEF[var-1]);
-          mexEvalString("drawnow;");
-#endif
-          Stack.push(TEF[var-1]);
-          break;
-        case FSTPTEFD:
-          {
-            unsigned int indx = ((FSTPTEFD_ *) it_code->second)->get_indx();
-            unsigned int row = ((FSTPTEFD_ *) it_code->second)->get_row();
-#ifdef DEBUG
-            mexPrintf("FSTPTEFD\n");
-            mexPrintf("indx=%d Stack.size()=%d\n", indx, Stack.size());
-#endif
-            if (function_type == ExternalFunctionNumericalFirstDerivative)
-              {
-                TEFD[make_pair(indx, row-1)] = Stack.top();
-#ifdef DEBUG
-                mexPrintf("FSTP TEFD[make_pair(indx, row)]=%f done\n", TEFD[make_pair(indx, row-1)]);
-                mexEvalString("drawnow;");
-#endif
-                Stack.pop();
-              }
-          }
-
-          break;
-        case FLDTEFD:
-          {
-            unsigned int indx = ((FLDTEFD_ *) it_code->second)->get_indx();
-            unsigned int row = ((FLDTEFD_ *) it_code->second)->get_row();
-#ifdef DEBUG
-            mexPrintf("FLDTEFD\n");
-            mexPrintf("indx=%d row=%d Stack.size()=%d\n", indx, row, Stack.size());
-            mexPrintf("FLD TEFD[make_pair(indx, row)]=%f done\n", TEFD[make_pair(indx, row-1)]);
-            mexEvalString("drawnow;");
-#endif
-            Stack.push(TEFD[make_pair(indx, row-1)]);
-          }
-          break;
-        case FSTPTEFDD:
-          {
-            unsigned int indx = ((FSTPTEFDD_ *) it_code->second)->get_indx();
-            unsigned int row = ((FSTPTEFDD_ *) it_code->second)->get_row();
-            unsigned int col = ((FSTPTEFDD_ *) it_code->second)->get_col();
-#ifdef DEBUG
-            mexPrintf("FSTPTEFD\n");
-            mexPrintf("indx=%d Stack.size()=%d\n", indx, Stack.size());
-#endif
-            if (function_type == ExternalFunctionNumericalSecondDerivative)
-              {
-                TEFDD[make_pair(indx, make_pair(row-1, col-1))] = Stack.top();
-#ifdef DEBUG
-                mexPrintf("FSTP TEFDD[make_pair(indx, make_pair(row, col))]=%f done\n", TEFDD[make_pair(indx, make_pair(row, col))]);
-                mexEvalString("drawnow;");
-#endif
-                Stack.pop();
-              }
-          }
-
-          break;
-        case FLDTEFDD:
-          {
-            unsigned int indx = ((FLDTEFDD_ *) it_code->second)->get_indx();
-            unsigned int row = ((FLDTEFDD_ *) it_code->second)->get_row();
-            unsigned int col = ((FSTPTEFDD_ *) it_code->second)->get_col();
-#ifdef DEBUG
-            mexPrintf("FLDTEFD\n");
-            mexPrintf("indx=%d Stack.size()=%d\n", indx, Stack.size());
-            mexPrintf("FLD TEFD[make_pair(indx, make_pair(row, col))]=%f done\n", TEFDD[make_pair(indx, make_pair(row, col))]);
-            mexEvalString("drawnow;");
-#endif
-            Stack.push(TEFDD[make_pair(indx, make_pair(row-1, col-1))]);
-          }
-          break;
-        case FCUML:
-          v1 = Stack.top();
-          Stack.pop();
-          v2 = Stack.top();
-          Stack.pop();
-          Stack.push(v1+v2);
-          break;
-        case FENDBLOCK:
-          //it's the block end
-#ifdef DEBUG
-          mexPrintf("FENDBLOCK\n");
-#endif
-          go_on = false;
-          break;
-        case FENDEQU:
-          break;
-        case FJMPIFEVAL:
-          if (evaluate)
-            {
-#ifdef DEBUG
-              mexPrintf("FJMPIFEVAL length=%d\n", ((FJMPIFEVAL_ *) it_code->second)->get_pos());
-              mexEvalString("drawnow;");
-#endif
-              it_code += ((FJMPIFEVAL_ *) it_code->second)->get_pos() /* - 1*/;
-            }
-          break;
-        case FJMP:
-#ifdef DEBUG
-          mexPrintf("FJMP length=%d\n", ((FJMP_ *) it_code->second)->get_pos());
-          mexEvalString("drawnow;");
-#endif
-          it_code += ((FJMP_ *) it_code->second)->get_pos() /*- 1 */;
-          break;
-        case FOK:
-          op = ((FOK_ *) it_code->second)->get_arg();
-          if (Stack.size() > 0)
-            {
-              ostringstream tmp;
-              tmp << " in compute_block_time, stack not empty\n";
-              throw FatalExceptionHandling(tmp.str());
-            }
-          break;
-        default:
-          ostringstream tmp;
-          tmp << " in compute_block_time, unknown opcode " << it_code->first << "\n";
-          throw FatalExceptionHandling(tmp.str());
-        }
-      it_code++;
-    }
-#ifdef DEBUG
-  mexPrintf("==> end of compute_block_time Block = %d\n", block_num);
-  mexEvalString("drawnow;");
-#endif
-}
-
-void
-Interpreter::evaluate_a_block(const int size, const int type, string bin_basename, bool steady_state, int block_num,
+Interpreter::evaluate_a_block(/*const int size, const int type, string bin_basename, int block_num,
                               const bool is_linear, const int symbol_table_endo_nbr, const int Block_List_Max_Lag,
-                              const int Block_List_Max_Lead, const int u_count_int, int block)
+                              const int Block_List_Max_Lead, const int u_count_int, int block*/)
 {
   it_code_type begining;
 
@@ -1460,16 +91,19 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
     case EVALUATE_FORWARD:
       if (steady_state)
         {
-          compute_block_time(0, true, block_num, size, steady_state);
+          compute_block_time(0, true, /*block_num, size, steady_state, */false);
           if (block >= 0)
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
               residual[j] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable];
           else
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
-              {
-                //mexPrintf("=>residual[Block_Contain[%d].Equation = %d]=%g (y = %g, ya = %g)\n", j, Block_Contain[j].Equation, y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable], y[Block_Contain[j].Variable], ya[Block_Contain[j].Variable]);
-                residual[Block_Contain[j].Equation] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable];
-              }
+              residual[Block_Contain[j].Equation] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable];
         }
       else
         {
@@ -1478,11 +112,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
             {
               it_code = begining;
               Per_y_ = it_*y_size;
-              compute_block_time(0, true, block_num, size, steady_state);
+              compute_block_time(0, true, /*block_num, size, steady_state, */false);
               if (block >= 0)
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
                 for (int j = 0; j < size; j++)
                   residual[it_*size+j] = y[it_*y_size+Block_Contain[j].Variable] - ya[it_*y_size+Block_Contain[j].Variable];
               else
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
                 for (int j = 0; j < size; j++)
                   residual[it_*size+Block_Contain[j].Equation] = y[it_*y_size+Block_Contain[j].Variable] - ya[it_*y_size+Block_Contain[j].Variable];
             }
@@ -1493,64 +133,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
       r = (double *) mxMalloc(size*sizeof(double));
       if (steady_state)
         {
-          compute_block_time(0, true, block_num, size, steady_state);
-          if (block < 0)
-            {
-              for (int j = 0; j < size; j++)
-                {
-                  //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, r[j]);
-                  residual[Block_Contain[j].Equation] = r[j];
-                }
-            }
-          else
-            {
-              for (int j = 0; j < size; j++)
-                residual[j] = r[j];
-            }
-        }
-      else
-        {
-          begining = it_code;
-          for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
-            {
-              it_code = begining;
-              Per_y_ = it_*y_size;
-              compute_block_time(0, true, block_num, size, steady_state);
-              if (block < 0)
-                {
-                  for (int j = 0; j < size; j++)
-                    {
-                      //mexPrintf("residual[Per_y + Block_Contain[%d].Equation = %d]=%g\n", j, Per_y_ + Block_Contain[j].Equation, r[j]);
-                      residual[Per_y_+Block_Contain[j].Equation] = r[j];
-                    }
-                }
-              else
-                {
-                  for (int j = 0; j < size; j++)
-                    residual[it_*size+j] = r[j];
-                }
-            }
-        }
-      mxFree(g1);
-      mxFree(r);
-      break;
-    case SOLVE_FORWARD_COMPLETE:
-      fixe_u(&u, u_count_int, u_count_int);
-      Read_SparseMatrix(bin_basename, size, 1, 0, 0, steady_state, false, stack_solve_algo, solve_algo);
-#ifdef DEBUG
-      mexPrintf("in SOLVE_FORWARD_COMPLETE r = mxMalloc(%d*sizeof(double))\n", size);
-#endif
-      r = (double *) mxMalloc(size*sizeof(double));
-      if (steady_state)
-        {
-          compute_block_time(0, true, block_num, size, steady_state);
+          compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
           if (block < 0)
+             #ifdef USE_OMP
+             #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+             #endif
             for (int j = 0; j < size; j++)
-              {
-                //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, r[j]);
-                residual[Block_Contain[j].Equation] = r[j];
-              }
+              residual[Block_Contain[j].Equation] = r[j];
           else
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
               residual[j] = r[j];
         }
@@ -1561,14 +154,65 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
             {
               it_code = begining;
               Per_y_ = it_*y_size;
-              compute_block_time(0, true, block_num, size, steady_state);
+              compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
               if (block < 0)
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
                 for (int j = 0; j < size; j++)
-                  {
-                    //mexPrintf("residual[it_*y_size+Block_Contain[%d].Equation = %d]=%g\n", j, it_*y_size+Block_Contain[j].Equation, r[j]);
-                    residual[it_*y_size+Block_Contain[j].Equation] = r[j];
-                  }
+                  residual[Per_y_+Block_Contain[j].Equation] = r[j];
               else
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
+                for (int j = 0; j < size; j++)
+                  residual[it_*size+j] = r[j];
+            }
+        }
+      mxFree(g1);
+      mxFree(r);
+      break;
+    case SOLVE_FORWARD_COMPLETE:
+      fixe_u(&u, u_count_int, u_count_int);
+      Read_SparseMatrix(bin_base_name, size, 1, 0, 0, false, stack_solve_algo, solve_algo);
+#ifdef DEBUG
+      mexPrintf("in SOLVE_FORWARD_COMPLETE r = mxMalloc(%d*sizeof(double))\n", size);
+#endif
+      r = (double *) mxMalloc(size*sizeof(double));
+      if (steady_state)
+        {
+          compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
+          if (block < 0)
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
+            for (int j = 0; j < size; j++)
+              residual[Block_Contain[j].Equation] = r[j];
+          else
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
+            for (int j = 0; j < size; j++)
+              residual[j] = r[j];
+        }
+      else
+        {
+          begining = it_code;
+          for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
+            {
+              it_code = begining;
+              Per_y_ = it_*y_size;
+              compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
+              if (block < 0)
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
+                for (int j = 0; j < size; j++)
+                  residual[it_*y_size+Block_Contain[j].Equation] = r[j];
+              else
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
                 for (int j = 0; j < size; j++)
                   residual[it_*size+j] = r[j];
             }
@@ -1578,16 +222,19 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
     case EVALUATE_BACKWARD:
       if (steady_state)
         {
-          compute_block_time(0, true, block_num, size, steady_state);
+          compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
           if (block >= 0)
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
               residual[j] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable];
           else
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
-              {
-                //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable]);
-                residual[Block_Contain[j].Equation] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable];
-              }
+              residual[Block_Contain[j].Equation] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable];
         }
       else
         {
@@ -1596,11 +243,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
             {
               it_code = begining;
               Per_y_ = it_*y_size;
-              compute_block_time(0, true, block_num, size, steady_state);
+              compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
               if (block >= 0)
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
                 for (int j = 0; j < size; j++)
                   residual[it_*size+j] = y[it_*y_size+Block_Contain[j].Variable] - ya[it_*y_size+Block_Contain[j].Variable];
               else
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
                 for (int j = 0; j < size; j++)
                   residual[it_*size+Block_Contain[j].Equation] = y[it_*y_size+Block_Contain[j].Variable] - ya[it_*y_size+Block_Contain[j].Variable];
             }
@@ -1611,61 +264,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
       r = (double *) mxMalloc(size*sizeof(double));
       if (steady_state)
         {
-          compute_block_time(0, true, block_num, size, steady_state);
-          if (block < 0)
-            {
-              for (int j = 0; j < size; j++)
-                {
-                  //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, r[j]);
-                  residual[Block_Contain[j].Equation] = r[j];
-                }
-            }
-          else
-            {
-              for (int j = 0; j < size; j++)
-                residual[j] = r[j];
-            }
-        }
-      else
-        {
-          begining = it_code;
-          for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
-            {
-              it_code = begining;
-              Per_y_ = it_*y_size;
-              compute_block_time(0, true, block_num, size, steady_state);
-              if (block < 0)
-                {
-                  for (int j = 0; j < size; j++)
-                    {
-                      //mexPrintf("residual[Per_y_+Block_Contain[%d].Equation = %d]=%g\n", j, Per_y_+Block_Contain[j].Equation, r[j]);
-                      residual[Per_y_+Block_Contain[j].Equation] = r[j];
-                    }
-                }
-              else
-                {
-                  for (int j = 0; j < size; j++)
-                    residual[it_*size+j] = r[j];
-                }
-            }
-        }
-      mxFree(g1);
-      mxFree(r);
-      break;
-    case SOLVE_BACKWARD_COMPLETE:
-      fixe_u(&u, u_count_int, u_count_int);
-      Read_SparseMatrix(bin_basename, size, 1, 0, 0, steady_state, false, stack_solve_algo, solve_algo);
-      r = (double *) mxMalloc(size*sizeof(double));
-      if (steady_state)
-        {
-          compute_block_time(0, true, block_num, size, steady_state);
+          compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
           if (block < 0)
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
-              {
-                //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, r[j]);
-                residual[Block_Contain[j].Equation] = r[j];
-              }
+              residual[Block_Contain[j].Equation] = r[j];
           else
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
               residual[j] = r[j];
         }
@@ -1676,14 +285,62 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
             {
               it_code = begining;
               Per_y_ = it_*y_size;
-              compute_block_time(0, true, block_num, size, steady_state);
+              compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
               if (block < 0)
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
                 for (int j = 0; j < size; j++)
-                  {
-                    //mexPrintf("residual[Per_y_+Block_Contain[%d].Equation = %d]=%g\n", j, Per_y_+Block_Contain[j].Equation, r[j]);
-                    residual[Per_y_+Block_Contain[j].Equation] = r[j];
-                  }
+                  residual[Per_y_+Block_Contain[j].Equation] = r[j];
               else
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
+                for (int j = 0; j < size; j++)
+                  residual[it_*size+j] = r[j];
+            }
+        }
+      mxFree(g1);
+      mxFree(r);
+      break;
+    case SOLVE_BACKWARD_COMPLETE:
+      fixe_u(&u, u_count_int, u_count_int);
+      Read_SparseMatrix(bin_base_name, size, 1, 0, 0, false, stack_solve_algo, solve_algo);
+      r = (double *) mxMalloc(size*sizeof(double));
+      if (steady_state)
+        {
+          compute_block_time(0, true, /*block_num, size, steady_state,*/ false);
+          if (block < 0)
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
+            for (int j = 0; j < size; j++)
+              residual[Block_Contain[j].Equation] = r[j];
+          else
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
+            for (int j = 0; j < size; j++)
+              residual[j] = r[j];
+        }
+      else
+        {
+          begining = it_code;
+          for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
+            {
+              it_code = begining;
+              Per_y_ = it_*y_size;
+              compute_block_time(0, true, /*block_num, size, steady_state, */false);
+              if (block < 0)
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
+                for (int j = 0; j < size; j++)
+                  residual[Per_y_+Block_Contain[j].Equation] = r[j];
+              else
+                #ifdef USE_OMP
+                #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+                #endif
                 for (int j = 0; j < size; j++)
                   residual[it_*size+j] = r[j];
             }
@@ -1693,7 +350,7 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
     case SOLVE_TWO_BOUNDARIES_SIMPLE:
     case SOLVE_TWO_BOUNDARIES_COMPLETE:
       fixe_u(&u, u_count_int, u_count_int);
-      Read_SparseMatrix(bin_basename, size, periods, y_kmin, y_kmax, steady_state, true, stack_solve_algo, solve_algo);
+      Read_SparseMatrix(bin_base_name, size, periods, y_kmin, y_kmax, true, stack_solve_algo, solve_algo);
       u_count = u_count_int*(periods+y_kmax+y_kmin);
       r = (double *) mxMalloc(size*sizeof(double));
       begining = it_code;
@@ -1702,14 +359,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
           Per_u_ = (it_-y_kmin)*u_count_int;
           Per_y_ = it_*y_size;
           it_code = begining;
-          compute_block_time(Per_u_, true, block_num, size, steady_state);
+          compute_block_time(Per_u_, true, /*block_num, size, steady_state,*/ false);
           if (block < 0)
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
-              {
-                //mexPrintf("residual[it_*y_size+Block_Contain[%d].Equation = %d]=%g\n", j, it_*y_size+Block_Contain[j].Equation, r[j]);
-                residual[it_*y_size+Block_Contain[j].Equation] = r[j];
-              }
+              residual[it_*y_size+Block_Contain[j].Equation] = r[j];
           else
+            #ifdef USE_OMP
+            #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+            #endif
             for (int j = 0; j < size; j++)
               residual[it_*size+j] = r[j];
         }
@@ -1719,650 +379,88 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam
 }
 
 
-void
-Interpreter::SingularDisplay(int Per_u_, bool evaluate, int Block_Count, int size, bool steady_state, it_code_type begining)
-{
-  it_code = begining;
-  compute_block_time(Per_u_, evaluate, Block_Count, size, steady_state);
-  Singular_display(Block_Count, size, steady_state, begining);
-}
-
 
 int
-Interpreter::simulate_a_block(const int size, const int type, string file_name, string bin_basename, bool Gaussian_Elimination, bool steady_state, bool print_it, int block_num,
-                              const bool is_linear, const int symbol_table_endo_nbr, const int Block_List_Max_Lag, const int Block_List_Max_Lead, const int u_count_int)
+Interpreter::simulate_a_block()
 {
   it_code_type begining;
-  int i;
   bool cvg;
-  bool result = true;
-  bool singular_system;
   double *y_save;
-  res1 = 0;
 #ifdef DEBUG
-  mexPrintf("simulate_a_block\n");
+  mexPrintf("simulate_a_block type = %d, periods=%d, y_kmin=%d, y_kmax=%d\n", type, periods, y_kmin, y_kmax);
+  mexEvalString("drawnow;");
 #endif
   switch (type)
     {
     case EVALUATE_FORWARD:
 #ifdef DEBUG
       mexPrintf("EVALUATE_FORWARD\n");
+      mexEvalString("drawnow;");
 #endif
-      if (steady_state)
-        compute_block_time(0, false, block_num, size, steady_state);
-      else
-        {
-          begining = it_code;
-          for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
-            {
-              it_code = begining;
-              Per_y_ = it_*y_size;
-              compute_block_time(0, false, block_num, size, steady_state);
-            }
-        }
+        evaluate_over_periods(true);
       break;
     case EVALUATE_BACKWARD:
 #ifdef DEBUG
       mexPrintf("EVALUATE_BACKWARD\n");
+      mexEvalString("drawnow;");
 #endif
-      if (steady_state)
-        compute_block_time(0, false, block_num, size, steady_state);
-      else
-        {
-          begining = it_code;
-          for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
-            {
-              it_code = begining;
-              Per_y_ = it_*y_size;
-              compute_block_time(0, false, block_num, size, steady_state);
-            }
-        }
+        evaluate_over_periods(false);
       break;
     case SOLVE_FORWARD_SIMPLE:
 #ifdef DEBUG
       mexPrintf("SOLVE_FORWARD_SIMPLE size=%d\n", size);
+      mexEvalString("drawnow;");
 #endif
-      g1 = (double *) mxMalloc(size*size*sizeof(double));
-      r = (double *) mxMalloc(size*sizeof(double));
-      begining = it_code;
-      if (steady_state)
-        {
-          cvg = false;
-          iter = 0;
-          while (!(cvg || (iter > maxit_)))
-            {
-              it_code = begining;
-              Per_y_ = it_*y_size;
-              compute_block_time(0, false, block_num, size, steady_state);
-              double rr;
-              rr = r[0];
-              cvg = (fabs(rr) < solve_tolf);
-              if (cvg)
-                continue;
-
-              try
-                {
-                  y[Block_Contain[0].Variable] += -divide(r[0], g1[0]);
-                }
-              catch (FloatingPointExceptionHandling &fpeh)
-                {
-                  mexPrintf("%s      \n", fpeh.GetErrorMsg().c_str());
-                  mexPrintf("      Singularity in block %d", block_num+1);
-                }
-              iter++;
-            }
-          if (!cvg)
-            {
-              ostringstream tmp;
-              tmp << " in Solve Forward simple, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n";
-              throw FatalExceptionHandling(tmp.str());
-            }
-        }
-      else
-        {
-          for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
-            {
-              cvg = false;
-              iter = 0;
-              Per_y_ = it_*y_size;
-              while (!(cvg || (iter > maxit_)))
-                {
-                  it_code = begining;
-                  Per_y_ = it_*y_size;
-                  compute_block_time(0, false, block_num, size, steady_state);
-                  double rr;
-                  if (fabs(1+y[Per_y_+Block_Contain[0].Variable]) > eps)
-                    rr = r[0]/(1+y[Per_y_+Block_Contain[0].Variable]);
-                  else
-                    rr = r[0];
-                  cvg = (fabs(rr) < solve_tolf);
-                  if (cvg)
-                    continue;
-                  try
-                    {
-                      y[Per_y_+Block_Contain[0].Variable] += -divide(r[0], g1[0]);
-                    }
-                  catch (FloatingPointExceptionHandling &fpeh)
-                    {
-                      mexPrintf("%s      \n", fpeh.GetErrorMsg().c_str());
-                      mexPrintf("      Singularity in block %d", block_num+1);
-                    }
-                  iter++;
-                }
-              if (!cvg)
-                {
-                  ostringstream tmp;
-                  tmp << " in Solve Forward simple, convergence not achieved in block " << Block_Count+1 << ", at time " << it_ << ", after " << iter << " iterations\n";
-                  throw FatalExceptionHandling(tmp.str());
-                }
-            }
-        }
-      mxFree(g1);
-      mxFree(r);
+      solve_simple_over_periods(true);
       break;
     case SOLVE_BACKWARD_SIMPLE:
 #ifdef DEBUG
       mexPrintf("SOLVE_BACKWARD_SIMPLE\n");
+      mexEvalString("drawnow;");
 #endif
-      g1 = (double *) mxMalloc(size*size*sizeof(double));
-      r = (double *) mxMalloc(size*sizeof(double));
-      begining = it_code;
-      if (steady_state)
-        {
-          cvg = false;
-          iter = 0;
-          while (!(cvg || (iter > maxit_)))
-            {
-              it_code = begining;
-              Per_y_ = it_*y_size;
-              compute_block_time(0, false, block_num, size, steady_state);
-              double rr;
-              rr = r[0];
-              cvg = (fabs(rr) < solve_tolf);
-              if (cvg)
-                continue;
-              try
-                {
-                  y[Block_Contain[0].Variable] += -divide(r[0], g1[0]);
-                }
-              catch (FloatingPointExceptionHandling &fpeh)
-                {
-                  mexPrintf("%s      \n", fpeh.GetErrorMsg().c_str());
-                  mexPrintf("      Singularity in block %d", block_num+1);
-                }
-              iter++;
-            }
-          if (!cvg)
-            {
-              ostringstream tmp;
-              tmp << " in Solve Backward simple, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n";
-              throw FatalExceptionHandling(tmp.str());
-            }
-        }
-      else
-        {
-          for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
-            {
-              cvg = false;
-              iter = 0;
-              Per_y_ = it_*y_size;
-              while (!(cvg || (iter > maxit_)))
-                {
-                  it_code = begining;
-                  Per_y_ = it_*y_size;
-                  compute_block_time(0, false, block_num, size, steady_state);
-                  double rr;
-                  if (fabs(1+y[Per_y_+Block_Contain[0].Variable]) > eps)
-                    rr = r[0]/(1+y[Per_y_+Block_Contain[0].Variable]);
-                  else
-                    rr = r[0];
-                  cvg = (fabs(rr) < solve_tolf);
-                  if (cvg)
-                    continue;
-                  try
-                    {
-                      y[Per_y_+Block_Contain[0].Variable] += -divide(r[0], g1[0]);
-                    }
-                  catch (FloatingPointExceptionHandling &fpeh)
-                    {
-                      mexPrintf("%s      \n", fpeh.GetErrorMsg().c_str());
-                      mexPrintf("      Singularity in block %d", block_num+1);
-                    }
-
-                  iter++;
-                }
-              if (!cvg)
-                {
-                  ostringstream tmp;
-                  tmp << " in Solve Backward simple, convergence not achieved in block " << Block_Count+1 << ", at time " << it_ << ", after " << iter << " iterations\n";
-                  throw FatalExceptionHandling(tmp.str());
-                }
-            }
-        }
-      mxFree(g1);
-      mxFree(r);
+      solve_simple_over_periods(false);
       break;
     case SOLVE_FORWARD_COMPLETE:
 #ifdef DEBUG
       mexPrintf("SOLVE_FORWARD_COMPLETE\n");
+      mexEvalString("drawnow;");
 #endif
       fixe_u(&u, u_count_int, u_count_int);
-      Read_SparseMatrix(bin_basename, size, 1, 0, 0, steady_state, false, stack_solve_algo, solve_algo);
-      g1 = (double *) mxMalloc(size*size*sizeof(double));
-      r = (double *) mxMalloc(size*sizeof(double));
-      begining = it_code;
+      Read_SparseMatrix(bin_base_name, size, 1, 0, 0, false, stack_solve_algo, solve_algo);
+      start_code = it_code;
       Per_u_ = 0;
-      if (steady_state)
-        {
-          if (!is_linear)
-            {
-              max_res_idx = 0;
-              cvg = false;
-              iter = 0;
-              glambda2 = g0 = very_big;
-              try_at_iteration = 0;
-              while (!(cvg || (iter > maxit_)))
-                {
-                  it_code = begining;
-                  error_not_printed = true;
-                  res2 = 0;
-                  res1 = 0;
-                  max_res = 0;
-                  compute_block_time(0, false, block_num, size, steady_state);
-                  if (!(isnan(res1) || isinf(res1)))
-                    {
-                      for (i = 0; i < size; i++)
-                        {
-                          double rr;
-                          rr = r[i];
-                          if (max_res < fabs(rr))
-                            {
-                              max_res = fabs(rr);
-                              max_res_idx = i;
-                            }
-                          res2 += rr*rr;
-                          res1 += fabs(rr);
-                        }
-                      cvg = (max_res < solve_tolf);
-                    }
-                  else
-                    cvg = false;
-                  if (cvg)
-                    continue;
-                  int prev_iter = iter;
-                  singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, 0, 0, 0, size, print_it, cvg, iter, true, stack_solve_algo, solve_algo);
-                  if (singular_system)
-                    SingularDisplay(0, false, block_num, size, steady_state, begining);
 
-                  iter++;
-                  if (iter > prev_iter)
-                    {
-                      g0 = res2;
-                      gp0 = -res2;
-                      try_at_iteration = 0;
-                    }
-                }
-              if (!cvg || !result)
-                {
-                  ostringstream tmp;
-                  tmp << " in Solve Forward complete, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n";
-                  throw FatalExceptionHandling(tmp.str());
-                }
-            }
-          else
-            {
-              it_code = begining;
-              Per_y_ = it_*y_size;
-              iter = 0;
-              res1 = 0;
-              res2 = 0;
-              max_res = 0;
-              max_res_idx = 0;
-              error_not_printed = true;
-              compute_block_time(0, false, block_num, size, steady_state);
-              if (!(isnan(res1) || isinf(res1)))
-                {
-                  for (i = 0; i < size; i++)
-                    {
-                      double rr;
-                      rr = r[i];
-                      if (max_res < fabs(rr))
-                        {
-                          max_res = fabs(rr);
-                          max_res_idx = i;
-                        }
-                      res2 += rr*rr;
-                      res1 += fabs(rr);
-                    }
-                  cvg = (max_res < solve_tolf);
-                }
-              else
-                cvg = false;
-              singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, 0, 0, 0, size, print_it, cvg, iter, true, stack_solve_algo, solve_algo);
-              if (singular_system)
-                SingularDisplay(0, false, block_num, size, steady_state, begining);
-              if (!result)
-                {
-                  mexPrintf(" in Solve Forward complete, convergence not achieved in block %d\n", Block_Count+1);
-                  return ERROR_ON_EXIT;
-                }
-            }
-        }
-      else
-        {
-          if (!is_linear)
-            {
-              max_res_idx = 0;
-              for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
-                {
-                  cvg = false;
-                  iter = 0;
-                  glambda2 = g0 = very_big;
-                  try_at_iteration = 0;
-                  Per_y_ = it_*y_size;
-                  while (!(cvg || (iter > maxit_)))
-                    {
-                      it_code = begining;
-                      error_not_printed = true;
-                      res2 = 0;
-                      res1 = 0;
-                      max_res = 0;
-                      compute_block_time(0, false, block_num, size, steady_state);
-                      if (!(isnan(res1) || isinf(res1)))
-                        {
-                          for (i = 0; i < size; i++)
-                            {
-                              double rr;
-                              if (fabs(1+y[Per_y_+Block_Contain[i].Variable]) > eps)
-                                rr = r[i]/(1+y[Per_y_+Block_Contain[i].Variable]);
-                              else
-                                rr = r[i];
-                              if (max_res < fabs(rr))
-                                {
-                                  max_res = fabs(rr);
-                                  max_res_idx = i;
-                                }
-                              res2 += rr*rr;
-                              res1 += fabs(rr);
-                            }
-                          cvg = (max_res < solve_tolf);
-                        }
-                      else
-                        cvg = false;
-                      if (cvg)
-                        continue;
-                      int prev_iter = iter;
-                      singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, print_it, cvg, iter, false, stack_solve_algo, solve_algo);
-                      if (singular_system)
-                        SingularDisplay(0, false, block_num, size, steady_state, begining);
-                      iter++;
-                      if (iter > prev_iter)
-                        {
-                          g0 = res2;
-                          gp0 = -res2;
-                          try_at_iteration = 0;
-                        }
-                    }
-                  if (!cvg)
-                    {
-                      ostringstream tmp;
-                      tmp << " in Solve Forward complete, convergence not achieved in block " << Block_Count+1 << ", at time " << it_ << ", after " << iter << " iterations\n";
-                      throw FatalExceptionHandling(tmp.str());
-                    }
-                }
-            }
-          else
-            {
-              for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
-                {
-                  it_code = begining;
-                  Per_y_ = it_*y_size;
-                  iter = 0;
-                  res1 = 0;
-                  res2 = 0;
-                  max_res = 0; max_res_idx = 0;
-                  error_not_printed = true;
-                  compute_block_time(0, false, block_num, size, steady_state);
-                  if (!(isnan(res1) || isinf(res1)))
-                    {
-                      for (i = 0; i < size; i++)
-                        {
-                          double rr;
-                          rr = r[i];
-                          if (max_res < fabs(rr))
-                            {
-                              max_res = fabs(rr);
-                              max_res_idx = i;
-                            }
-                          res2 += rr*rr;
-                          res1 += fabs(rr);
-                        }
-                      cvg = (max_res < solve_tolf);
-                    }
-                  else
-                    cvg = false;
-                  singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, print_it, cvg, iter, false, stack_solve_algo, solve_algo);
-                  if (singular_system)
-                    SingularDisplay(0, false, block_num, size, steady_state, begining);
-                }
-            }
-        }
+      Simulate_Newton_One_Boundary(true);
+
+      mxFree(u);
       mxFree(index_equa);
       mxFree(index_vara);
       memset(direction, 0, size_of_direction);
-      mxFree(g1);
-      mxFree(r);
-      mxFree(u);
+      End_Solver();
       break;
     case SOLVE_BACKWARD_COMPLETE:
 #ifdef DEBUG
       mexPrintf("SOLVE_BACKWARD_COMPLETE\n");
+      mexEvalString("drawnow;");
 #endif
       fixe_u(&u, u_count_int, u_count_int);
-      Read_SparseMatrix(bin_basename, size, 1, 0, 0, steady_state, false, stack_solve_algo, solve_algo);
-      g1 = (double *) mxMalloc(size*size*sizeof(double));
-      r = (double *) mxMalloc(size*sizeof(double));
-      begining = it_code;
-      if (steady_state)
-        {
-          if (!is_linear)
-            {
-              max_res_idx = 0;
-              cvg = false;
-              iter = 0;
-              glambda2 = g0 = very_big;
-              try_at_iteration = 0;
-              while (!(cvg || (iter > maxit_)))
-                {
-                  it_code = begining;
-                  error_not_printed = true;
-                  res2 = 0;
-                  res1 = 0;
-                  max_res = 0;
-                  compute_block_time(0, false, block_num, size, steady_state);
-                  if (!(isnan(res1) || isinf(res1)))
-                    {
-                      for (i = 0; i < size; i++)
-                        {
-                          double rr;
-                          rr = r[i];
-                          if (max_res < fabs(rr))
-                            {
-                              max_res = fabs(rr);
-                              max_res_idx = i;
-                            }
-                          res2 += rr*rr;
-                          res1 += fabs(rr);
-                        }
-                      cvg = (max_res < solve_tolf);
-                    }
-                  else
-                    cvg = false;
-                  if (cvg)
-                    continue;
-                  int prev_iter = iter;
-                  singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, 0, 0, 0, size, print_it, cvg, iter, true, stack_solve_algo, solve_algo);
-                  if (singular_system)
-                    SingularDisplay(0, false, block_num, size, steady_state, begining);
-                  iter++;
-                  if (iter > prev_iter)
-                    {
-                      g0 = res2;
-                      gp0 = -res2;
-                      try_at_iteration = 0;
-                    }
-                }
-              if (!cvg || !result)
-                {
-                  ostringstream tmp;
-                  tmp << " in Solve Backward complete, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n";
-                  throw FatalExceptionHandling(tmp.str());
-                }
-            }
-          else
-            {
-              it_code = begining;
-              Per_y_ = it_*y_size;
-              iter = 0;
-              res1 = 0;
-              res2 = 0;
-              max_res = 0; max_res_idx = 0;
-              error_not_printed = true;
-              compute_block_time(0, false, block_num, size, steady_state);
-              if (!(isnan(res1) || isinf(res1)))
-                {
-                  for (i = 0; i < size; i++)
-                    {
-                      double rr;
-                      rr = r[i];
-                      if (max_res < fabs(rr))
-                        {
-                          max_res = fabs(rr);
-                          max_res_idx = i;
-                        }
-                      res2 += rr*rr;
-                      res1 += fabs(rr);
-                    }
-                  cvg = (max_res < solve_tolf);
-                }
-              else
-                cvg = false;
-              singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, 0, 0, 0, size, print_it, cvg, iter, true, stack_solve_algo, solve_algo);
-              if (singular_system)
-                SingularDisplay(0, false, block_num, size, steady_state, begining);
-              if (!result)
-                {
-                  mexPrintf(" in Solve Backward complete, convergence not achieved in block %d\n", Block_Count+1);
-                  return ERROR_ON_EXIT;
-                }
-            }
-        }
-      else
-        {
-          if (!is_linear)
-            {
-              max_res_idx = 0;
-              for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
-                {
-                  cvg = false;
-                  iter = 0;
-                  glambda2 = g0 = very_big;
-                  try_at_iteration = 0;
-                  Per_y_ = it_*y_size;
-                  while (!(cvg || (iter > maxit_)))
-                    {
-                      it_code = begining;
-                      error_not_printed = true;
-                      res2 = 0;
-                      res1 = 0;
-                      max_res = 0;
-                      compute_block_time(0, false, block_num, size, steady_state);
-                      if (!(isnan(res1) || isinf(res1)))
-                        {
-                          for (i = 0; i < size; i++)
-                            {
-                              double rr;
-                              if (fabs(1+y[Per_y_+Block_Contain[i].Variable]) > eps)
-                                rr = r[i]/(1+y[Per_y_+Block_Contain[i].Variable]);
-                              else
-                                rr = r[i];
-                              if (max_res < fabs(rr))
-                                {
-                                  max_res = fabs(rr);
-                                  max_res_idx = i;
-                                }
-                              res2 += rr*rr;
-                              res1 += fabs(rr);
-                            }
-                          cvg = (max_res < solve_tolf);
-                        }
-                      else
-                        cvg = false;
-                      if (cvg)
-                        continue;
-                      int prev_iter = iter;
-                      singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, print_it, cvg, iter, false, stack_solve_algo, solve_algo);
-                      if (singular_system)
-                        SingularDisplay(0, false, block_num, size, steady_state, begining);
-                      iter++;
-                      if (iter > prev_iter)
-                        {
-                          g0 = res2;
-                          gp0 = -res2;
-                          try_at_iteration = 0;
-                        }
-                    }
-                  if (!cvg)
-                    {
-                      ostringstream tmp;
-                      tmp << " in Solve Backward complete, convergence not achieved in block " << Block_Count+1 << ", at time " << it_ << ", after " << iter << " iterations\n";
-                      throw FatalExceptionHandling(tmp.str());
-                    }
-                }
-            }
-          else
-            {
-              for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
-                {
-                  it_code = begining;
-                  Per_y_ = it_*y_size;
-                  error_not_printed = true;
-                  compute_block_time(0, false, block_num, size, steady_state);
-                  if (!(isnan(res1) || isinf(res1)))
-                    {
-                      for (i = 0; i < size; i++)
-                        {
-                          double rr;
-                          rr = r[i];
-                          if (max_res < fabs(rr))
-                            {
-                              max_res = fabs(rr);
-                              max_res_idx = i;
-                            }
-                          res2 += rr*rr;
-                          res1 += fabs(rr);
-                        }
-                      cvg = (max_res < solve_tolf);
-                    }
-                  else
-                    cvg = false;
-                  singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, print_it, cvg, iter, false, stack_solve_algo, solve_algo);
-                  if (singular_system)
-                    SingularDisplay(0, false, block_num, size, steady_state, begining);
-                }
-            }
-        }
+      Read_SparseMatrix(bin_base_name, size, 1, 0, 0, false, stack_solve_algo, solve_algo);
+      start_code = it_code;
+      Per_u_ = 0;
+
+      Simulate_Newton_One_Boundary(false);
+
       mxFree(index_equa);
       mxFree(index_vara);
       memset(direction, 0, size_of_direction);
-      mxFree(g1);
-      mxFree(r);
       mxFree(u);
+      End_Solver();
       break;
     case SOLVE_TWO_BOUNDARIES_SIMPLE:
     case SOLVE_TWO_BOUNDARIES_COMPLETE:
 #ifdef DEBUG
       mexPrintf("SOLVE_TWO_BOUNDARIES\n");
+      mexEvalString("drawnow;");
 #endif
       if (steady_state)
         {
@@ -2370,11 +468,12 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name,
           return ERROR_ON_EXIT;
         }
       fixe_u(&u, u_count_int, u_count_int);
-      Read_SparseMatrix(bin_basename, size, periods, y_kmin, y_kmax, steady_state, true, stack_solve_algo, solve_algo);
+      Read_SparseMatrix(bin_base_name, size, periods, y_kmin, y_kmax, true, stack_solve_algo, solve_algo);
       u_count = u_count_int*(periods+y_kmax+y_kmin);
       r = (double *) mxMalloc(size*sizeof(double));
+      res = (double *) mxMalloc(size*periods*sizeof(double));
       y_save = (double *) mxMalloc(y_size*sizeof(double)*(periods+y_kmax+y_kmin));
-      begining = it_code;
+      start_code = it_code;
       iter = 0;
       if (!is_linear)
         {
@@ -2384,47 +483,25 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name,
           int u_count_saved = u_count;
           while (!(cvg || (iter > maxit_)))
             {
-
               res2 = 0;
               res1 = 0;
               max_res = 0;
               max_res_idx = 0;
+              double res1_, res2_, max_res_;
+              int max_res_idx_;
               memcpy(y_save, y, y_size*sizeof(double)*(periods+y_kmax+y_kmin));
-              for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
-                {
-                  Per_u_ = (it_-y_kmin)*u_count_int;
-                  Per_y_ = it_*y_size;
-                  it_code = begining;
-                  error_not_printed = true;
-                  compute_block_time(Per_u_, false, block_num, size, steady_state);
-                  if (isnan(res1) || isinf(res1))
-                    {
-                      memcpy(y, y_save, y_size*sizeof(double)*(periods+y_kmax+y_kmin));
-                      break;
-                    }
-                  for (i = 0; i < size; i++)
-                    {
-                      double rr;
-                      if (fabs(1+y[Per_y_+Block_Contain[i].Variable]) > eps)
-                        rr = r[i]/(1+y[Per_y_+Block_Contain[i].Variable]);
-                      else
-                        rr = r[i];
-                      if (max_res < fabs(rr))
-                        {
-                          max_res = fabs(rr);
-                          max_res_idx = i;
-                        }
-                      res2 += rr*rr;
-                      res1 += fabs(rr);
-                    }
-                }
-              if (isnan(res1) || isinf(res1))
-                cvg = false;
-              else
+
+              compute_complete_2b(false, &res1, &res2, &max_res, &max_res_idx);
+              end_code = it_code;
+
+              if (!(isnan(res1) || isinf(res1)))
                 cvg = (max_res < solve_tolf);
+              if (isnan(res1) || isinf(res1) || (stack_solve_algo == 4 && iter > 0))
+                  memcpy(y, y_save, y_size*sizeof(double)*(periods+y_kmax+y_kmin));
               u_count = u_count_saved;
               int prev_iter = iter;
-              Simulate_Newton_Two_Boundaries(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, periods, print_it, cvg, iter, minimal_solving_periods, stack_solve_algo, endo_name_length, P_endo_names);
+
+              Simulate_Newton_Two_Boundaries(block_num, symbol_table_endo_nbr, y_kmin, y_kmax, size, periods, cvg, minimal_solving_periods, stack_solve_algo, endo_name_length, P_endo_names);
               iter++;
               if (iter > prev_iter)
                 {
@@ -2437,7 +514,7 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name,
           if (!cvg)
             {
               ostringstream tmp;
-              tmp << " in Solve two boundaries, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n";
+              tmp << " in Solve two boundaries, convergence not achieved in block " << block_num+1 << ", after " << iter << " iterations\n";
               throw FatalExceptionHandling(tmp.str());
             }
         }
@@ -2446,34 +523,24 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name,
           res1 = 0;
           res2 = 0;
           max_res = 0; max_res_idx = 0;
-          for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
-            {
-              Per_u_ = (it_-y_kmin)*u_count_int;
-              Per_y_ = it_*y_size;
-              it_code = begining;
-              compute_block_time(Per_u_, false, block_num, size, steady_state);
-              for (i = 0; i < size; i++)
-                {
-                  double rr;
-                  rr = r[i];
-                  if (max_res < fabs(rr))
-                    {
-                      max_res = fabs(rr);
-                      max_res_idx = i;
-                    }
-                  res2 += rr*rr;
-                  res1 += fabs(rr);
-                }
-            }
+          double res1_, res2_, max_res_;
+          int max_res_idx_;
+
+          compute_complete_2b(false, &res1, &res2, &max_res, &max_res_idx);
+          end_code = it_code;
+
           cvg = false;
-          Simulate_Newton_Two_Boundaries(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, periods, print_it, cvg, iter, minimal_solving_periods, stack_solve_algo, endo_name_length, P_endo_names);
+          Simulate_Newton_Two_Boundaries(block_num, symbol_table_endo_nbr, y_kmin, y_kmax, size, periods, cvg, minimal_solving_periods, stack_solve_algo, endo_name_length, P_endo_names);
         }
+      it_code = end_code;
       mxFree(r);
       mxFree(y_save);
       mxFree(u);
       mxFree(index_vara);
       mxFree(index_equa);
+      mxFree(res);
       memset(direction, 0, size_of_direction);
+      End_Solver();
       break;
     default:
       ostringstream tmp;
@@ -2485,12 +552,13 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name,
 }
 
 void
-Interpreter::print_a_block(const int size, const int type, string bin_basename, bool steady_state, int block_num,
-                           const bool is_linear, const int symbol_table_endo_nbr, const int Block_List_Max_Lag,
-                           const int Block_List_Max_Lead, const int u_count_int, int block)
+Interpreter::print_a_block()
 {
   it_code_type begining;
-  mexPrintf("\nBlock %d\n", block_num+1);
+  if (block < 0)
+    mexPrintf("\nBlock %d\n", block_num+1);
+  else
+    mexPrintf("\nBlock %d\n", block+1);
   mexPrintf("----------\n");
   if (steady_state)
     residual = vector<double>(size);
@@ -2525,7 +593,7 @@ Interpreter::print_a_block(const int size, const int type, string bin_basename,
 }
 
 bool
-Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_state, bool evaluate, int block, int &nb_blocks, bool print_it)
+Interpreter::compute_blocks(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks)
 {
   bool result = true;
 
@@ -2582,9 +650,9 @@ Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_s
             FBEGINBLOCK_ *fb = (FBEGINBLOCK_ *) it_code->second;
             Block_Contain = fb->get_Block_Contain();
             it_code++;
+            set_block(fb->get_size(), fb->get_type(), file_name, bin_basename, Block_Count, fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int(), block);
             if (print)
-              print_a_block(fb->get_size(), fb->get_type(), bin_basename, steady_state, Block_Count,
-                            fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int(), block);
+              print_a_block();
             else if (evaluate)
               {
 #ifdef DEBUG
@@ -2607,8 +675,7 @@ Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_s
                     else
                       residual = vector<double>(fb->get_size()*(periods+y_kmin));
                   }
-                evaluate_a_block(fb->get_size(), fb->get_type(), bin_basename, steady_state, Block_Count,
-                                 fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int(), block);
+                evaluate_a_block();
               }
             else
               {
@@ -2616,8 +683,7 @@ Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_s
                 mexPrintf("endo in block=%d, type=%d, steady_state=%d, print_it=%d, Block_Count=%d, fb->get_is_linear()=%d, fb->get_endo_nbr()=%d, fb->get_Max_Lag()=%d, fb->get_Max_Lead()=%d, fb->get_u_count_int()=%d\n",
                           fb->get_size(), fb->get_type(), steady_state, print_it, Block_Count, fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int());
 #endif
-                result = simulate_a_block(fb->get_size(), fb->get_type(), file_name, bin_basename, true, steady_state, print_it,Block_Count,
-                                          fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int());
+                result = simulate_a_block();
                 if (result == ERROR_ON_EXIT)
                   return ERROR_ON_EXIT;
               }
@@ -2678,16 +744,14 @@ Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_s
             it_code++;
           break;
         default:
-          mexPrintf("Error\n");
           ostringstream tmp;
-          tmp << " in compute_blocks, unknown command " << it_code->first << "\n";
+          tmp << " in compute_blocks, unknown command " << it_code->first << " (block=" << Block_Count << ")\n";
           throw FatalExceptionHandling(tmp.str());
         }
     }
-
   mxFree(Init_Code->second);
   nb_blocks = Block_Count+1;
-  if (T and !global_temporary_terms)
+  if (T && !global_temporary_terms)
     mxFree(T);
   return result;
 }
diff --git a/mex/sources/bytecode/Interpreter.hh b/mex/sources/bytecode/Interpreter.hh
index 18fefd89c..b33b608d8 100644
--- a/mex/sources/bytecode/Interpreter.hh
+++ b/mex/sources/bytecode/Interpreter.hh
@@ -27,6 +27,7 @@
 #define BYTE_CODE
 #include "CodeInterpreter.hh"
 #include "SparseMatrix.hh"
+#include "Evaluate.hh"
 #ifdef LINBCG
 # include "linbcg.hh"
 #endif
@@ -40,50 +41,29 @@
 
 using namespace std;
 
-#define pow_ pow
 
-class Interpreter : public SparseMatrix
+class Interpreter : public dynSparseMatrix
 {
 private:
-  unsigned int EQN_dvar1, EQN_dvar2, EQN_dvar3;
-  int EQN_lag1, EQN_lag2, EQN_lag3;
-  mxArray *GlobalTemporaryTerms;
 protected:
-  double pow1(double a, double b);
-  double divide(double a, double b);
-  double log1(double a);
-  double log10_1(double a);
-  void compute_block_time(int Per_u_, bool evaluate, int block_num, int size, bool steady_state);
-  void evaluate_a_block(const int size, const int type, string bin_basename, bool steady_state, int block_num,
-                        const bool is_linear = false, const int symbol_table_endo_nbr = 0, const int Block_List_Max_Lag = 0, const int Block_List_Max_Lead = 0, const int u_count_int = 0, int block = -1);
-  int simulate_a_block(const int size, const int type, string file_name, string bin_basename, bool Gaussian_Elimination, bool steady_state, bool print_it, int block_num,
-                       const bool is_linear = false, const int symbol_table_endo_nbr = 0, const int Block_List_Max_Lag = 0, const int Block_List_Max_Lead = 0, const int u_count_int = 0);
-  void print_a_block(const int size, const int type, string bin_basename, bool steady_state, int block_num,
-                     const bool is_linear, const int symbol_table_endo_nbr, const int Block_List_Max_Lag,
-                     const int Block_List_Max_Lead, const int u_count_int, int block);
-  void SingularDisplay(int Per_u_, bool evaluate, int Block_Count, int size, bool steady_state, it_code_type begining);
-  vector<Block_contain_type> Block_Contain;
-  code_liste_type code_liste;
-  it_code_type it_code;
-  int Block_Count, Per_u_, Per_y_;
-  int it_, maxit_, size_of_direction;
-  double solve_tolf;
-  bool GaussSeidel;
-  map<pair<pair<int, int>, int>, int> IM_i;
-  int equation, derivative_equation, derivative_variable;
-  string filename;
-  int minimal_solving_periods;
-  int stack_solve_algo, solve_algo;
-  bool global_temporary_terms;
-  bool print, print_error;
+  void evaluate_a_block();
+  int simulate_a_block();
+  void print_a_block();
 public:
   ~Interpreter();
   Interpreter(double *params_arg, double *y_arg, double *ya_arg, double *x_arg, double *steady_y_arg, double *steady_x_arg,
-              double *direction_arg, int y_size_arg, int nb_row_x_arg,
-              int nb_row_xd_arg, int periods_arg, int y_kmin_arg, int y_kmax_arg, int maxit_arg_, double solve_tolf_arg, int size_o_direction_arg,
-              double slowc_arg, int y_decal_arg, double markowitz_c_arg, string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
-              bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg);
-  bool compute_blocks(string file_name, string bin_basename, bool steady_state, bool evaluate, int block, int &nb_blocks, bool print_it);
+              double *direction_arg, size_t y_size_arg,
+              size_t nb_row_x_arg, size_t nb_row_xd_arg, int periods_arg, int y_kmin_arg, int y_kmax_arg,
+              int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
+              string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
+              bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
+              bool steady_state_arg, bool print_it_arg
+#ifdef CUDA
+              , const int CUDA_device, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
+#endif
+              );
+  bool compute_blocks(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks);
+
   inline mxArray *
   get_jacob(int block_num)
   {
diff --git a/mex/sources/bytecode/SparseMatrix.cc b/mex/sources/bytecode/SparseMatrix.cc
index 692a708bc..131c4314a 100644
--- a/mex/sources/bytecode/SparseMatrix.cc
+++ b/mex/sources/bytecode/SparseMatrix.cc
@@ -1,4 +1,4 @@
-/*
+  /*
  * Copyright (C) 2007-2012 Dynare Team
  *
  * This file is part of Dynare.
@@ -17,15 +17,88 @@
  * along with Dynare.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-//#define _GLIBCXX_USE_C99_FENV_TR1 1
-//#include <cfenv>
+//define _GLIBCXX_USE_C99_FENV_TR1 1
+//include <cfenv>
 
 #include <cstring>
 #include <ctime>
 #include <sstream>
+//#include <gsl/gsl_min.h>
+//#include <minimize.h>
 #include "SparseMatrix.hh"
 
-SparseMatrix::SparseMatrix()
+#ifdef CUDA
+#include "SparseMatrix_kernel.cu"
+#endif
+
+using namespace std;
+#if (defined _MSC_VER || defined OCTAVE_MEX_FILE)
+
+#ifdef _MSC_VER
+#include <windows.h>
+HINSTANCE hinstLib;
+#else
+#include <sys/types.h>
+#include <dlfcn.h>
+void* hinstLib;
+#endif
+#define UMFPACK_INFO 90
+#define UMFPACK_CONTROL 20
+/* used in all UMFPACK_report_* routines: */
+#define UMFPACK_PRL 0			/* print level */
+/* returned by all routines that use Info: */
+#define UMFPACK_OK (0)
+#define UMFPACK_STATUS 0	/* UMFPACK_OK, or other result */
+
+
+
+
+typedef void (*t_umfpack_dl_free_numeric)(void **Numeric);
+t_umfpack_dl_free_numeric umfpack_dl_free_numeric;
+typedef void (*t_umfpack_dl_free_symbolic)(void **Symbolic);
+t_umfpack_dl_free_symbolic umfpack_dl_free_symbolic;
+typedef int64_t (*t_umfpack_dl_solve)(int64_t sys,
+                                      const int64_t Ap [ ],
+                                      const int64_t Ai [ ],
+                                      const double Ax [ ],
+                                      double X [ ],
+                                      const double B [ ],
+                                      void *Numeric,
+                                      const double Control [UMFPACK_CONTROL],
+                                      double Info [UMFPACK_INFO]);
+t_umfpack_dl_solve umfpack_dl_solve;
+typedef int64_t (*t_umfpack_dl_numeric)(const int64_t Ap [ ],
+                                        const int64_t Ai [ ],
+                                        const double Ax [ ],
+                                        void *Symbolic,
+                                        void **Numeric,
+                                        const double Control [UMFPACK_CONTROL],
+                                        double Info [UMFPACK_INFO]);
+t_umfpack_dl_numeric umfpack_dl_numeric;
+typedef int64_t (*t_umfpack_dl_symbolic)(int64_t n_row,
+    int64_t n_col,
+    const int64_t Ap [ ],
+    const int64_t Ai [ ],
+    const double Ax [ ],
+    void **Symbolic,
+    const double Control [UMFPACK_CONTROL],
+    double Info [UMFPACK_INFO]);
+t_umfpack_dl_symbolic umfpack_dl_symbolic;
+typedef void (*t_umfpack_dl_report_info)(const double Control [UMFPACK_CONTROL],
+    const double Info [UMFPACK_INFO]);
+t_umfpack_dl_report_info umfpack_dl_report_info;
+typedef void (*t_umfpack_dl_report_status)(const double Control [UMFPACK_CONTROL],
+    int64_t status);
+t_umfpack_dl_report_status umfpack_dl_report_status;
+typedef void (*t_umfpack_dl_defaults)(double Control [UMFPACK_CONTROL]);
+t_umfpack_dl_defaults umfpack_dl_defaults;
+
+#endif
+
+
+
+
+dynSparseMatrix::dynSparseMatrix()
 {
   pivotva = NULL;
   g_save_op = NULL;
@@ -42,29 +115,304 @@ SparseMatrix::SparseMatrix()
   restart = 0;
   IM_i.clear();
   lu_inc_tol = 1e-10;
+  Symbolic = NULL;
+  Numeric = NULL;
+#if (defined _MSC_VER || defined OCTAVE_MEX_FILE)
+  // Get a handle to the DLL module.
+#ifdef _MSC_VER
+  hinstLib = LoadLibrary(TEXT("libmwumfpack.dll"));
+#else
+  hinstLib = dlopen("libmwumfpack.dll",RTLD_LAZY);
+#endif
+  // If the handle is valid, try to get the function address.
+  if (hinstLib)
+    {
+#ifdef _MSC_VER
+      umfpack_dl_free_numeric = (t_umfpack_dl_free_numeric) GetProcAddress(hinstLib, "umfpack_dl_free_numeric");
+      if (!umfpack_dl_free_numeric)
+        {
+          mexPrintf("umfpack_dl_free_numeric not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_free_numeric is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_free_symbolic = (t_umfpack_dl_free_symbolic) GetProcAddress(hinstLib, "umfpack_dl_free_symbolic");
+      if (!umfpack_dl_free_symbolic)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_free_symbolic is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_solve = (t_umfpack_dl_solve) GetProcAddress(hinstLib, "umfpack_dl_solve"); // the requested symbol must be the one named in the error message below
+      if (!umfpack_dl_solve)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_solve is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_numeric = (t_umfpack_dl_numeric) GetProcAddress(hinstLib, "umfpack_dl_numeric");
+      if (!umfpack_dl_numeric)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_numeric is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_symbolic = (t_umfpack_dl_symbolic) GetProcAddress(hinstLib, "umfpack_dl_symbolic");
+      if (!umfpack_dl_symbolic)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_symbolic is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_report_info = (t_umfpack_dl_report_info) GetProcAddress(hinstLib, "umfpack_dl_report_info");
+      if (!umfpack_dl_report_info)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_report_info is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_report_status = (t_umfpack_dl_report_status) GetProcAddress(hinstLib, "umfpack_dl_report_status");
+      if (!umfpack_dl_report_status)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_report_status is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_defaults = (t_umfpack_dl_defaults) GetProcAddress(hinstLib, "umfpack_dl_defaults");
+      if (!umfpack_dl_defaults)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_defaults is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+/*#else
+      mexPrintf("loading libmwumpfpack\n");
+      // reset errors
+      dlerror();
+      umfpack_dl_free_numeric = (t_umfpack_dl_free_numeric)  dlsym(hinstLib, "umfpack_dl_free_numeric");
+      const char* dlsym_error = dlerror();
+      if (dlsym_error)
+        {
+          mexPrintf("umfpack_dl_free_numeric not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_free_numeric is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      // reset errors
+      dlerror();
+      umfpack_dl_free_symbolic = (t_umfpack_dl_free_symbolic)  dlsym(hinstLib, "umfpack_dl_free_symbolic");
+      dlsym_error = dlerror();
+      if (dlsym_error)
+        {
+          mexPrintf("umfpack_dl_free_symbolic not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_free_symbolic is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      // reset errors
+      dlerror();
+      umfpack_dl_solve = (t_umfpack_dl_solve)  dlsym(hinstLib, "umfpack_dl_solve");
+      dlsym_error = dlerror();
+      if (dlsym_error)
+        {
+          mexPrintf("umfpack_dl_solve not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_solve is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      // reset errors
+      dlerror();
+      umfpack_dl_numeric = (t_umfpack_dl_numeric)  dlsym(hinstLib, "umfpack_dl_numeric");
+      dlsym_error = dlerror();
+      if (dlsym_error)
+        {
+          mexPrintf("umfpack_dl_numeric not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_numeric is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      // reset errors
+      dlerror();
+      umfpack_dl_symbolic = (t_umfpack_dl_symbolic)  dlsym(hinstLib, "umfpack_dl_symbolic");
+      dlsym_error = dlerror();
+      if (dlsym_error)
+        {
+          mexPrintf("umfpack_dl_symbolic not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_symbolic is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      // reset errors
+      dlerror();
+      umfpack_dl_report_info = (t_umfpack_dl_report_info)  dlsym(hinstLib, "umfpack_dl_report_info");
+      dlsym_error = dlerror();
+      if (dlsym_error)
+        {
+          mexPrintf("umfpack_dl_report_info not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_report_info is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      // reset errors
+      dlerror();
+      umfpack_dl_report_status = (t_umfpack_dl_report_status)  dlsym(hinstLib, "umfpack_dl_report_status");
+      dlsym_error = dlerror();
+      if (dlsym_error)
+        {
+          mexPrintf("umfpack_dl_report_status not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_report_status is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      // reset errors
+      dlerror();
+      umfpack_dl_defaults = (t_umfpack_dl_defaults)  dlsym(hinstLib, "umfpack_dl_defaults");
+      dlsym_error = dlerror();
+      if (dlsym_error)
+        {
+          mexPrintf("umfpack_dl_defaults not found\n");
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_defaults is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }*/
+#endif
+    }
+  else
+    {
+      mexPrintf("library loading error\n");
+      ostringstream tmp;
+      tmp << " in main, libmwumfpack.dll not found. \n Check that \\Program files\\MATLAB\\RXXXXX\\bin\\win64 is in the current path.";
+      throw FatalExceptionHandling(tmp.str());
+    }
+#endif
 }
 
+dynSparseMatrix::dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg,
+                           const int minimal_solving_periods_arg
+#ifdef CUDA
+                           , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
+#endif
+                           ):
+  Evaluate(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg)
+{
+  pivotva = NULL;
+  g_save_op = NULL;
+  g_nop_all = 0;
+  mem_mngr.init_Mem();
+  symbolic = true;
+  alt_symbolic = false;
+  alt_symbolic_count = 0;
+  max_u = 0;
+  min_u = 0x7FFFFFFF;
+  res1a = 9.0e60;
+  tbreak_g = 0;
+  start_compare = 0;
+  restart = 0;
+  IM_i.clear();
+  lu_inc_tol = 1e-10;
+  Symbolic = NULL;
+  Numeric = NULL;
+#ifdef CUDA
+  CUDA_device = CUDA_device_arg;
+  cublas_handle = cublas_handle_arg;
+  cusparse_handle = cusparse_handle_arg;
+  CUDA_descr = descr_arg;
+#endif
+#ifdef _MSC_VER
+  // Get a handle to the DLL module.
+  hinstLib = LoadLibrary(TEXT("libmwumfpack.dll"));
+  // If the handle is valid, try to get the function address.
+  if (hinstLib != NULL)
+    {
+      umfpack_dl_free_numeric = (t_umfpack_dl_free_numeric) GetProcAddress(hinstLib, "umfpack_dl_free_numeric");
+      if (!umfpack_dl_free_numeric)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_free_numeric is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_free_symbolic = (t_umfpack_dl_free_symbolic) GetProcAddress(hinstLib, "umfpack_dl_free_symbolic");
+      if (!umfpack_dl_free_symbolic)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_free_symbolic is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_report_info = (t_umfpack_dl_report_info) GetProcAddress(hinstLib, "umfpack_dl_report_info");
+      if (!umfpack_dl_report_info)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_report_info is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_solve = (t_umfpack_dl_solve) GetProcAddress(hinstLib, "umfpack_dl_solve");
+      if (!umfpack_dl_solve)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_solve is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_numeric = (t_umfpack_dl_numeric) GetProcAddress(hinstLib, "umfpack_dl_numeric");
+      if (!umfpack_dl_numeric)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_numeric is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_symbolic = (t_umfpack_dl_symbolic) GetProcAddress(hinstLib, "umfpack_dl_symbolic");
+      if (!umfpack_dl_symbolic)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_symbolic is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_report_status = (t_umfpack_dl_report_status) GetProcAddress(hinstLib, "umfpack_dl_report_status");
+      if (!umfpack_dl_report_status)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_report_status is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      umfpack_dl_defaults = (t_umfpack_dl_defaults) GetProcAddress(hinstLib, "umfpack_dl_defaults");
+      if (!umfpack_dl_defaults)
+        {
+          ostringstream tmp;
+          tmp << " in libmwumfpack.dll, the function umfpack_dl_defaults is not found.";
+          throw FatalExceptionHandling(tmp.str());
+        }
+    }
+  else // LoadLibrary returned NULL: libmwumfpack.dll could not be loaded
+    {
+      mexPrintf("library loading error\n");
+      ostringstream tmp;
+      tmp << " in main, libmwumfpack.dll not found. \n Check that \\Program files\\MATLAB\\RXXXXX\\bin\\win64 is in the current path.";
+      throw FatalExceptionHandling(tmp.str());
+    }
+#endif
+}
+
+
 int
-SparseMatrix::NRow(int r)
+dynSparseMatrix::NRow(int r)
 {
   return NbNZRow[r];
 }
 
 int
-SparseMatrix::NCol(int c)
+dynSparseMatrix::NCol(int c)
 {
   return NbNZCol[c];
 }
 
 int
-SparseMatrix::At_Row(int r, NonZeroElem **first)
+dynSparseMatrix::At_Row(int r, NonZeroElem **first)
 {
   (*first) = FNZE_R[r];
   return NbNZRow[r];
 }
 
 int
-SparseMatrix::Union_Row(int row1, int row2)
+dynSparseMatrix::Union_Row(int row1, int row2)
 {
   NonZeroElem *first1, *first2;
   int n1 = At_Row(row1, &first1);
@@ -97,7 +445,7 @@ SparseMatrix::Union_Row(int row1, int row2)
 }
 
 int
-SparseMatrix::At_Pos(int r, int c, NonZeroElem **first)
+dynSparseMatrix::At_Pos(int r, int c, NonZeroElem **first)
 {
   (*first) = FNZE_R[r];
   while ((*first)->c_index != c)
@@ -106,14 +454,14 @@ SparseMatrix::At_Pos(int r, int c, NonZeroElem **first)
 }
 
 int
-SparseMatrix::At_Col(int c, NonZeroElem **first)
+dynSparseMatrix::At_Col(int c, NonZeroElem **first)
 {
   (*first) = FNZE_C[c];
   return NbNZCol[c];
 }
 
 int
-SparseMatrix::At_Col(int c, int lag, NonZeroElem **first)
+dynSparseMatrix::At_Col(int c, int lag, NonZeroElem **first)
 {
   (*first) = FNZE_C[c];
   int i = 0;
@@ -139,7 +487,7 @@ SparseMatrix::At_Col(int c, int lag, NonZeroElem **first)
 }
 
 void
-SparseMatrix::Delete(const int r, const int c)
+dynSparseMatrix::Delete(const int r, const int c)
 {
   NonZeroElem *first = FNZE_R[r], *firsta = NULL;
 
@@ -173,7 +521,7 @@ SparseMatrix::Delete(const int r, const int c)
 }
 
 void
-SparseMatrix::Print(int Size, int *b)
+dynSparseMatrix::Print(int Size, int *b)
 {
   int a, i, j, k, l;
   mexPrintf("   ");
@@ -221,7 +569,7 @@ SparseMatrix::Print(int Size, int *b)
 }
 
 void
-SparseMatrix::Insert(const int r, const int c, const int u_index, const int lag_index)
+dynSparseMatrix::Insert(const int r, const int c, const int u_index, const int lag_index)
 {
   NonZeroElem *firstn, *first, *firsta, *a;
   firstn = mem_mngr.mxMalloc_NZE();
@@ -275,12 +623,14 @@ SparseMatrix::Insert(const int r, const int c, const int u_index, const int lag_
 }
 
 void
-SparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool steady_state, bool two_boundaries, int stack_solve_algo, int solve_algo)
+dynSparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool two_boundaries, int stack_solve_algo, int solve_algo)
 {
   unsigned int eq, var;
-  int i, j, lag;
+  int lag;
   filename = file_name;
   mem_mngr.fixe_file_name(file_name);
+  /*mexPrintf("steady_state=%d, size=%d, solve_algo=%d, stack_solve_algo=%d, two_boundaries=%d\n",steady_state, Size, solve_algo, stack_solve_algo, two_boundaries);
+  mexEvalString("drawnow;");*/
   if (!SaveCode.is_open())
     {
       if (steady_state)
@@ -302,28 +652,44 @@ SparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, i
     {
       if (stack_solve_algo == 5)
         {
-          for (i = 0; i < u_count_init-Size; i++)
+          for (int i = 0; i < u_count_init-Size; i++)
             {
+              int val;
               SaveCode.read(reinterpret_cast<char *>(&eq), sizeof(eq));
               SaveCode.read(reinterpret_cast<char *>(&var), sizeof(var));
               SaveCode.read(reinterpret_cast<char *>(&lag), sizeof(lag));
-              SaveCode.read(reinterpret_cast<char *>(&j), sizeof(j));
-              IM_i[make_pair(make_pair(eq, var), lag)] = j;
+              SaveCode.read(reinterpret_cast<char *>(&val), sizeof(val));
+              IM_i[make_pair(make_pair(eq, var), lag)] = val;
             }
-          for (j = 0; j < Size; j++)
+          for (int j = 0; j < Size; j++)
             IM_i[make_pair(make_pair(j, Size*(periods+y_kmax)), 0)] = j;
         }
-      else if (stack_solve_algo >= 0 || stack_solve_algo <= 4)
+      else if (stack_solve_algo >= 0 && stack_solve_algo <= 4)
         {
-          for (i = 0; i < u_count_init-Size; i++)
+          for (int i = 0; i < u_count_init-Size; i++)
             {
+              int val;
               SaveCode.read(reinterpret_cast<char *>(&eq), sizeof(eq));
               SaveCode.read(reinterpret_cast<char *>(&var), sizeof(var));
               SaveCode.read(reinterpret_cast<char *>(&lag), sizeof(lag));
-              SaveCode.read(reinterpret_cast<char *>(&j), sizeof(j));
-              IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = j;
+              SaveCode.read(reinterpret_cast<char *>(&val), sizeof(val));
+              IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = val;
             }
-          for (j = 0; j < Size; j++)
+          for (int j = 0; j < Size; j++)
+            IM_i[make_pair(make_pair(Size*(periods+y_kmax), 0), j)] = j;
+        }
+      else if (stack_solve_algo == 7)
+        {
+          for (int i = 0; i < u_count_init-Size; i++)
+            {
+              int val;
+              SaveCode.read(reinterpret_cast<char *>(&eq), sizeof(eq));
+              SaveCode.read(reinterpret_cast<char *>(&var), sizeof(var));
+              SaveCode.read(reinterpret_cast<char *>(&lag), sizeof(lag));
+              SaveCode.read(reinterpret_cast<char *>(&val), sizeof(val));
+              IM_i[make_pair(make_pair(eq, lag), var - lag * Size)] = val;
+            }
+          for (int j = 0; j < Size; j++)
             IM_i[make_pair(make_pair(Size*(periods+y_kmax), 0), j)] = j;
         }
 
@@ -332,41 +698,48 @@ SparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, i
     {
       if ((stack_solve_algo == 5 && !steady_state) || (solve_algo == 5 && steady_state))
         {
-          for (i = 0; i < u_count_init; i++)
+          for (int i = 0; i < u_count_init; i++)
             {
+              int val;
               SaveCode.read(reinterpret_cast<char *>(&eq), sizeof(eq));
               SaveCode.read(reinterpret_cast<char *>(&var), sizeof(var));
               SaveCode.read(reinterpret_cast<char *>(&lag), sizeof(lag));
-              SaveCode.read(reinterpret_cast<char *>(&j), sizeof(j));
-              IM_i[make_pair(make_pair(eq, var), lag)] = j;
+              SaveCode.read(reinterpret_cast<char *>(&val), sizeof(val));
+              IM_i[make_pair(make_pair(eq, var), lag)] = val;
             }
         }
       else if (((stack_solve_algo >= 0 || stack_solve_algo <= 4) && !steady_state) || ((solve_algo >= 6 || solve_algo <= 8) && steady_state))
         {
-          for (i = 0; i < u_count_init; i++)
+          for (int i = 0; i < u_count_init; i++)
             {
+              int val;
               SaveCode.read(reinterpret_cast<char *>(&eq), sizeof(eq));
               SaveCode.read(reinterpret_cast<char *>(&var), sizeof(var));
               SaveCode.read(reinterpret_cast<char *>(&lag), sizeof(lag));
-              SaveCode.read(reinterpret_cast<char *>(&j), sizeof(j));
-              IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = j;
+              SaveCode.read(reinterpret_cast<char *>(&val), sizeof(val));
+              IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = val;
             }
         }
     }
   index_vara = (int *) mxMalloc(Size*(periods+y_kmin+y_kmax)*sizeof(int));
-  for (j = 0; j < Size; j++)
+  for (int j = 0; j < Size; j++)
     SaveCode.read(reinterpret_cast<char *>(&index_vara[j]), sizeof(*index_vara));
   if (periods+y_kmin+y_kmax > 1)
-    for (i = 1; i < periods+y_kmin+y_kmax; i++)
-      for (j = 0; j < Size; j++)
-        index_vara[j+Size*i] = index_vara[j+Size*(i-1)]+y_size;
+    for (int i = 1; i < periods+y_kmin+y_kmax; i++)
+      {
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+        for (int j = 0; j < Size; j++)
+          index_vara[j+Size*i] = index_vara[j+Size*(i-1)] + y_size;
+      }
   index_equa = (int *) mxMalloc(Size*sizeof(int));
-  for (j = 0; j < Size; j++)
+  for (int j = 0; j < Size; j++)
     SaveCode.read(reinterpret_cast<char *>(&index_equa[j]), sizeof(*index_equa));
 }
 
 void
-SparseMatrix::Simple_Init(int Size, map<pair<pair<int, int>, int>, int> &IM, bool &zero_solution)
+dynSparseMatrix::Simple_Init(int Size, map<pair<pair<int, int>, int>, int> &IM, bool &zero_solution)
 {
   int i, eq, var, lag;
   map<pair<pair<int, int>, int>, int>::iterator it4;
@@ -392,12 +765,14 @@ SparseMatrix::Simple_Init(int Size, map<pair<pair<int, int>, int>, int> &IM, boo
   NbNZCol = (int *) mxMalloc(i);
   it4 = IM.begin();
   eq = -1;
-  //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
   for (i = 0; i < Size; i++)
     {
       line_done[i] = 0;
-      FNZE_C[i] = 0;
-      FNZE_R[i] = 0;
+      FNZE_C[i] = NULL;
+      FNZE_R[i] = NULL;
       temp_NZE_C[i] = 0;
       temp_NZE_R[i] = 0;
       NbNZRow[i] = 0;
@@ -434,9 +809,11 @@ SparseMatrix::Simple_Init(int Size, map<pair<pair<int, int>, int>, int> &IM, boo
         }
       it4++;
     }
-  //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
   double cum_abs_sum = 0;
-  for (i = 0; i < Size; i++)
+#ifdef USE_OMP // tested with #ifdef so that a value-less definition of USE_OMP is accepted too
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs_sum)
+#endif
+  for (int i = 0; i < Size; i++)
     {
       b[i] = i;
       cum_abs_sum += fabs(u[i]);
@@ -452,9 +829,9 @@ SparseMatrix::Simple_Init(int Size, map<pair<pair<int, int>, int>, int> &IM, boo
 }
 
 void
-SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m)
+dynSparseMatrix::Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m)
 {
-  int i, eq, var;
+  int eq, var;
   double *b = mxGetPr(b_m);
   if (!b)
     {
@@ -491,7 +868,10 @@ SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>,
       throw FatalExceptionHandling(tmp.str());
     }
   map<pair<pair<int, int>, int>, int>::iterator it4;
-  for (i = 0; i < y_size*(periods+y_kmin); i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < y_size*(periods+y_kmin); i++)
     ya[i] = y[i];
 #ifdef DEBUG
   unsigned int max_nze = mxGetNzmax(A_m);
@@ -499,7 +879,10 @@ SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>,
   unsigned int NZE = 0;
   int last_var = 0;
   double cum_abs_sum = 0;
-  for (i = 0; i < Size; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs_sum)
+#endif
+  for (int i = 0; i < Size; i++)
     {
       b[i] = u[i];
       cum_abs_sum += fabs(b[i]);
@@ -511,7 +894,7 @@ SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>,
     zero_solution = false;
 
   Aj[0] = 0;
-  last_var = -1;
+  last_var = 0;
   it4 = IM.begin();
   while (it4 != IM.end())
     {
@@ -565,11 +948,757 @@ SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>,
   Aj[Size] = NZE;
 }
 
+
 void
-SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m)
+dynSparseMatrix::Init_UMFPACK_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m)
 {
-  int t, i, eq, var, lag, ti_y_kmin, ti_y_kmax;
+  int eq, var; // eq is stored in *Ai, var drives the *Ap pointers, for the IM entry being copied
+  // Allocates *b, *Ap, *Ai and *Ax with mxMalloc and fills them from u and IM; x0 receives y[0..Size-1].
+  *b = (double*)mxMalloc(Size * sizeof(double));
+  if (!(*b))
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse_Simple, can't allocate b vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  double *x0 = mxGetPr(x0_m);
+  if (!x0)
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse_Simple, can't retrieve x0 vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  *Ap = (SuiteSparse_long*)mxMalloc((Size+1) * sizeof(SuiteSparse_long));
+  if (!(*Ap))
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse_Simple, can't allocate Ap index vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  size_t prior_nz = IM.size(); // one Ai/Ax slot per entry of IM
+  *Ai = (SuiteSparse_long*)mxMalloc(prior_nz * sizeof(SuiteSparse_long));
+  if (!(*Ai))
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse_Simple, can't allocate Ai index vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  *Ax = (double*)mxMalloc(prior_nz * sizeof(double));
+  if (!(*Ax))
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse_Simple, can't allocate Ax matrix\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+
+  map<pair<pair<int, int>, int>, int>::iterator it4;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < Size; i++)
+    {
+      int var_pos = index_vara[i]; // named differently from the function-level eq, which holds row indices below
+      ya[var_pos+it_*y_size] = y[var_pos+it_*y_size];
+    }
+#ifdef DEBUG
+  const size_t max_nze = prior_nz; // capacity of the Ai/Ax buffers allocated above
+#endif
+  unsigned int NZE = 0;
+  int last_var = 0;
+  double cum_abs_sum = 0;
+
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs_sum)
+#endif
+  for (int i = 0; i < Size; i++)
+    {
+      (*b)[i] = u[i];
+      cum_abs_sum += fabs((*b)[i]);
+      x0[i] = y[i];
+    }
+  if (cum_abs_sum < 1e-20) // every entry of b is numerically zero
+    zero_solution = true;
+  else
+    zero_solution = false;
+
+  (*Ap)[0] = 0;
+  last_var = 0;
+  it4 = IM.begin();
+  while (it4 != IM.end())
+    {
+      var = it4->first.first.first;
+      if (var != last_var) // var changed: record where the entries of the previous var end
+        {
+          (*Ap)[1+last_var ] = NZE;
+          last_var = var;
+        }
+      eq = it4->first.second;
+      int index = it4->second;
+#ifdef DEBUG
+      if (index < 0 || index >= u_count_alloc || index > Size + Size*Size)
+        {
+          ostringstream tmp;
+          tmp << " in Init_UMFPACK_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      if (NZE >= max_nze)
+        {
+          ostringstream tmp;
+          tmp << " in Init_UMFPACK_Sparse_Simple, exceeds the capacity allocated for the Ai and Ax vectors\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+#endif
+      (*Ax)[NZE] = u[index];
+      (*Ai)[NZE] = eq;
+      NZE++;
+#ifdef DEBUG
+      if (eq < 0 || eq >= Size)
+        {
+          ostringstream tmp;
+          tmp << " in Init_UMFPACK_Sparse_Simple, index (" << eq << ") out of range for b vector\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      if (var < 0 || var >= Size)
+        {
+          ostringstream tmp;
+          tmp << " in Init_UMFPACK_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      if (index_vara[var] < 0 || index_vara[var] >= y_size)
+        {
+          ostringstream tmp;
+          tmp << " in Init_UMFPACK_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+#endif
+      it4++;
+    }
+  (*Ap)[Size] = NZE; // closing pointer: total number of stored entries
+}
+
+
+void
+dynSparseMatrix::Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m)
+{
+  /* Assembles, for the problem stacked over `periods` periods, the column
+     pointers (*Ap), row indices (*Ai) and values (*Ax) of the sparse Jacobian
+     read from the member vector u through the structure map IM, and the
+     vector *b. All four arrays are allocated here with mxMalloc and are not
+     freed by this function. The data of x0_m is overwritten with the current
+     values of y, and y is copied into ya. Throws FatalExceptionHandling when
+     a buffer is unavailable or, in DEBUG builds, when an index is out of range. */
+  int t, eq, var, lag, ti_y_kmin, ti_y_kmax;
+  int n = periods * Size;
+  *b = (double*)mxMalloc(n * sizeof(double));
+  if (!(*b))
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse, can't retrieve b vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  double *x0 = mxGetPr(x0_m);
+  if (!x0)
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse, can't retrieve x0 vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  *Ap = (SuiteSparse_long*)mxMalloc((n+1) * sizeof(SuiteSparse_long));
+  if (!(*Ap))
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse, can't allocate Ap index vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  size_t prior_nz = IM.size() * periods;  // upper bound on the number of stored entries
+  *Ai = (SuiteSparse_long*)mxMalloc(prior_nz * sizeof(SuiteSparse_long));
+  if (!(*Ai))
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse, can't allocate Ai index vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  *Ax = (double*)mxMalloc(prior_nz * sizeof(double));
+  if (!(*Ax))
+    {
+      ostringstream tmp;
+      tmp << " in Init_UMFPACK_Sparse, can't retrieve Ax matrix\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  map<pair<pair<int, int>, int>, int>::iterator it4;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < y_size*(periods+y_kmin); i++)
+    ya[i] = y[i];
+#ifdef DEBUG
+  size_t max_nze = prior_nz;  // capacity of *Ai and *Ax, the arrays actually written below
+#endif
+  unsigned int NZE = 0;
+  int last_var = 0;
+
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < periods*Size; i++)
+    {
+      (*b)[i] = 0;
+      x0[i] = y[index_vara[Size*y_kmin+i]];
+    }
+  (*Ap)[0] = 0;
+  /* Walk IM once per period t. IM is keyed ((var, -lag), eq); a column pointer
+     is written into *Ap each time the variable (column) changes. */
+  for (t = 0; t < periods; t++)
+    {
+      last_var = -1;
+      it4 = IM.begin();
+      while (it4 != IM.end())
+        {
+          var = it4->first.first.first;
+          if (var != last_var)
+            {
+              (*Ap)[1+last_var + t * Size] = NZE;
+              last_var = var;
+            }
+          eq = it4->first.second+Size*t;
+          lag = -it4->first.first.second;
+          int index = it4->second+ (t-lag) * u_count_init;
+          if (var < (periods+y_kmax)*Size)
+            {
+              ti_y_kmin = -min(t, y_kmin);
+              ti_y_kmax = min(periods-(t +1), y_kmax);
+              int ti_new_y_kmax = min(t, y_kmax);
+              int ti_new_y_kmin = -min(periods-(t+1), y_kmin);
+              if (lag <= ti_new_y_kmax && lag >= ti_new_y_kmin)   /*Build the index for sparse matrix containing the jacobian : u*/
+                {
+#ifdef DEBUG
+                  if (index < 0 || index >= u_count_alloc || index > Size + Size*Size)
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_UMFPACK_Sparse, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+                  if (NZE >= max_nze)
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_UMFPACK_Sparse, exceeds the capacity of A_m sparse matrix\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+#endif
+                  (*Ax)[NZE] = u[index];
+                  (*Ai)[NZE] = eq - lag * Size;
+                  NZE++;
+                }
+              if (lag > ti_y_kmax || lag < ti_y_kmin)  // lag leaves the window available around period t: fold u*y into *b
+                {
+#ifdef DEBUG
+                  if (eq < 0 || eq >= Size * periods)
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_UMFPACK_Sparse, index (" << eq << ") out of range for b vector\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+                  if (var+Size*(y_kmin+t+lag) < 0 || var+Size*(y_kmin+t+lag) >= Size*(periods+y_kmin+y_kmax))
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_UMFPACK_Sparse, index (" << var+Size*(y_kmin+t+lag) << ") out of range for index_vara vector\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+                  if (index_vara[var+Size*(y_kmin+t+lag)] < 0 || index_vara[var+Size*(y_kmin+t+lag)] >= y_size*(periods+y_kmin+y_kmax))
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_UMFPACK_Sparse, index (" << index_vara[var+Size*(y_kmin+t+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+#endif
+                  (*b)[eq]  += u[index+lag*u_count_init]*y[index_vara[var+Size*(y_kmin+t+lag)]];
+                }
+            }
+          else           /* ...and store it in the u vector*/
+            {
+#ifdef DEBUG
+              if (index < 0 || index >= u_count_alloc)
+                {
+                  ostringstream tmp;
+                  tmp << " in Init_UMFPACK_Sparse, index (" << index << ") out of range for u vector\n";
+                  throw FatalExceptionHandling(tmp.str());
+                }
+              if (eq < 0 || eq >= (Size*periods))
+                {
+                  ostringstream tmp;
+                  tmp << " in Init_UMFPACK_Sparse, index (" << eq << ") out of range for b vector\n";
+                  throw FatalExceptionHandling(tmp.str());
+                }
+#endif
+              (*b)[eq]  += u[index];
+            }
+          it4++;
+        }
+    }
+  (*Ap)[Size*periods] = NZE;  // closing column pointer: total number of stored entries
+
+#ifdef DEBUG
+  mexPrintf("*Ax = [");
+  for (unsigned int i = 0; i < NZE; i++)
+    mexPrintf("%f ",(*Ax)[i]);
+  mexPrintf("]\n");
+
+  mexPrintf("*Ap = [");
+  for (int i = 0; i < n+1; i++)
+    mexPrintf("%ld ", (long) (*Ap)[i]);  // SuiteSparse_long is not an int: cast to match the format
+  mexPrintf("]\n");
+
+  mexPrintf("*Ai = [");
+  for (unsigned int i = 0; i < NZE; i++)
+    mexPrintf("%ld ", (long) (*Ai)[i]);
+  mexPrintf("]\n");
+#endif
+}
+
+/* Single-period variant: allocates and fills the column pointers (*Ap), row
+   indices (*Ai) and values (*Ax) of the Size x Size Jacobian described by IM
+   (values read from the member vector u) and the vector *b, copied from the
+   first Size entries of u. zero_solution is set to true when the sum of |b[i]|
+   is below 1e-20. Throws FatalExceptionHandling if a buffer is unavailable. */
+void
+dynSparseMatrix::Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m)
+{
+  int eq, var;
+
+  *b = (double*)mxMalloc(Size * sizeof(double));
+  if (!(*b))
+    {
+      ostringstream tmp;
+      tmp << " in Init_CUDA_Sparse_Simple, can't retrieve b vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  double *Host_x0 = mxGetPr(x0_m);
+  if (!Host_x0)
+    {
+      ostringstream tmp;
+      tmp << " in Init_CUDA_Sparse_Simple, can't retrieve x0 vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  *Ap = (SuiteSparse_long*)mxMalloc((Size+1) * sizeof(SuiteSparse_long));
+  if (!(*Ap))
+    {
+      ostringstream tmp;
+      tmp << " in Init_CUDA_Sparse_Simple, can't allocate Ap index vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  size_t prior_nz = IM.size();  // one stored entry per element of IM
+  *Ai = (SuiteSparse_long*)mxMalloc(prior_nz * sizeof(SuiteSparse_long));
+  if (!(*Ai))
+    {
+      ostringstream tmp;
+      tmp << " in Init_CUDA_Sparse_Simple, can't allocate Ai index vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  *Ax = (double*)mxMalloc(prior_nz * sizeof(double));
+  if (!(*Ax))
+    {
+      ostringstream tmp;
+      tmp << " in Init_CUDA_Sparse_Simple, can't retrieve Ax matrix\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  map<pair<pair<int, int>, int>, int>::iterator it4;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < Size; i++)
+    {
+      int pos = index_vara[i];  // distinct name: does not shadow the function-level eq
+      ya[pos+it_*y_size] = y[pos+it_*y_size];
+    }
+
+#ifdef DEBUG
+  size_t max_nze = prior_nz;  // capacity of *Ai and *Ax, the arrays actually written below
+#endif
+  unsigned int NZE = 0;
+  int last_var = -1;
+  double cum_abs_sum = 0;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs_sum)
+#endif
+  for (int i = 0; i < Size; i++)
+    {
+      (*b)[i] = u[i];
+      cum_abs_sum += fabs((*b)[i]);
+      // NOTE(review): *x0 is written but never allocated in this function, and
+      // Host_x0 is only null-checked -- confirm the caller provides *x0.
+      (*x0)[i] = y[i];
+    }
+  zero_solution = (cum_abs_sum < 1e-20);
+  (*Ap)[0] = 0;
+  it4 = IM.begin();
+  while (it4 != IM.end())
+    {
+      var = it4->first.first.first;
+      if (var != last_var)
+        {
+          (*Ap)[1+last_var ] = NZE;
+          last_var = var;
+        }
+      eq = it4->first.second;
+      int index = it4->second;
+#ifdef DEBUG
+      if (index < 0 || index >= u_count_alloc || index > Size + Size*Size)
+        {
+          ostringstream tmp;
+          tmp << " in Init_CUDA_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      if (NZE >= max_nze)
+        {
+          ostringstream tmp;
+          tmp << " in Init_CUDA_Sparse_Simple, exceeds the capacity of A_m sparse matrix\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+#endif
+      (*Ax)[NZE] = u[index];
+      (*Ai)[NZE] = eq;
+      NZE++;
+#ifdef DEBUG
+      if (eq < 0 || eq >= Size)
+        {
+          ostringstream tmp;
+          tmp << " in Init_CUDA_Sparse_Simple, index (" << eq << ") out of range for b vector\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      if (var < 0 || var >= Size)
+        {
+          ostringstream tmp;
+          tmp << " in Init_CUDA_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+      if (index_vara[var] < 0 || index_vara[var] >= y_size)
+        {
+          ostringstream tmp;
+          tmp << " in Init_CUDA_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
+#endif
+      it4++;
+    }
+  (*Ap)[Size] = NZE;  // closing column pointer: total number of stored entries
+}
+
+#ifdef CUDA
+/* Host-side assembly and upload to the CUDA device of the linear system stacked
+   over `periods` periods (n = periods * Size rows). The Jacobian is laid out
+   row by row (CSR: Ap row pointers, Ai column indices, Ax values) from IM and
+   the member vector u; preconditioner (0..3) selects what is stored in A_tild,
+   with Ap_tild and Ai_tild filled only for preconditioner == 3. The device
+   arrays returned in *Ap, *Ai, *Ax, *Ap_tild, *Ai_tild, *A_tild, *b and *x0 are
+   allocated with cudaMalloc and are not freed here; *nnz and *nnz_tild receive
+   the entry counts. The temporary host buffers are released before returning. */
+void
+dynSparseMatrix::Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner)
+{
+  int t, eq, var, lag, ti_y_kmin, ti_y_kmax;
+  int n = periods * Size;
+  size_t prior_nz = IM.size() * periods;
+  size_t preconditioner_size = 0;
+  map<pair<int, int>, int> jacob_struct;
+
+  /* ask cuda how many devices it can find */
+  int device_count;
+  cudaGetDeviceCount(&device_count);
+
+  cudaSetDevice(CUDA_device);
+
+  double *Host_b = (double*)mxMalloc(n * sizeof(double));
+  cudaChk(cudaMalloc((void**)b, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate b vector on the graphic card\n");
+  double *Host_x0 = mxGetPr(x0_m);
+  if (!Host_x0)
+    {
+      ostringstream tmp;
+      tmp << " in Init_Cuda_Sparse, can't retrieve x0 vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  cudaChk(cudaMalloc((void**)x0, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate x0 vector on the graphic card\n");
+  int* Host_Ap = (int*)mxMalloc((n+1) * sizeof(int));
+  int* Host_Ai = (int*)mxMalloc(prior_nz * sizeof(int));
+  double* Host_Ax = (double*)mxMalloc(prior_nz * sizeof(double));
+  int *Host_Ai_tild = NULL, *Host_Ap_tild = NULL;  // stay NULL unless preconditioner == 3
+  if (preconditioner == 3)
+    {
+      Host_Ap_tild = (int*) mxMalloc((n+1)*sizeof(int));
+      Host_Ai_tild = (int*) mxMalloc(prior_nz*sizeof(int));
+      Host_Ap_tild[0] = 0;
+    }
+
+  if (preconditioner == 0)
+    preconditioner_size = n;
+  else if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3)
+    preconditioner_size = prior_nz;
+  double *Host_A_tild = (double*)mxMalloc(preconditioner_size * sizeof(double));
+
+  map<pair<pair<int, int>, int>, int>::iterator it4;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < y_size*(periods+y_kmin); i++)
+    ya[i] = y[i];
+  unsigned int NZE = 0, NZE_tild = 0;
+  int last_eq = 0;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < periods*Size; i++)
+    {
+      Host_b[i] = 0;
+      Host_x0[i] = y[index_vara[Size*y_kmin+i]];
+    }
+
+  //Ordered in CSR and not in CSC
+
+  Host_Ap[0] = 0;
+  for (t = 0; t < periods; t++)
+    {
+      last_eq = -1;
+      it4 = IM.begin();
+      while (it4 != IM.end())
+        {
+          eq = it4->first.first.first;
+          if (eq != last_eq)
+            {
+#ifdef DEBUG
+              if (1+last_eq + t * Size > n)  // Host_Ap holds n+1 entries: the last valid index is n
+                {
+                  ostringstream tmp;
+                  tmp << " in Init_CUDA_Sparse, 1+last_eq + t * Size (" << 1+last_eq + t * Size << ") out of range for Host_Ap vector\n";
+                  throw FatalExceptionHandling(tmp.str());
+                }
+#endif
+              Host_Ap[1+last_eq + t * Size] = NZE;
+              if (preconditioner == 3 && t == 0)
+                 Host_Ap_tild[1+last_eq ] = NZE_tild;
+              last_eq = eq;
+            }
+          var = it4->first.second+Size*t;
+          lag = it4->first.first.second;
+          int index = it4->second+ (t /*+ lag*/) * u_count_init;
+          if (eq < (periods+y_kmax)*Size)
+            {
+              ti_y_kmin = -min(t, y_kmin);
+              ti_y_kmax = min(periods-(t + 1), y_kmax);
+              if ((lag <= ti_y_kmax && lag >= ti_y_kmin) || preconditioner == 3)  /*Build the index for sparse matrix containing the jacobian : u*/
+                {
+#ifdef DEBUG
+                  if (index < 0 || index >= u_count_alloc || index > (periods-1)* IM.size() + Size * Size + periods * Size)
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector max = " << (periods-1)* IM.size() + Size * Size + periods * Size << " allocated = " << u_count_alloc << "\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+                  if (NZE >= prior_nz)
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_CUDA_Sparse, exceeds the capacity of A_i or A_x sparse matrix\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+#endif
+                  bool to_store = true;
+                  if (preconditioner == 0)
+                    {
+                      if (lag == 0 && it4->first.second == eq)
+                        Host_A_tild[var] = u[index];
+                    }
+                  else if (preconditioner == 1 || preconditioner == 2)
+                    Host_A_tild[NZE] = u[index];
+                  else if (preconditioner == 3)
+                    {
+                      if (lag > ti_y_kmax || lag < ti_y_kmin)
+                        {
+                          Host_b[eq + t * Size]  += u[index]*y[index_vara[var+Size*(y_kmin+lag)]];
+                          to_store = false;
+                        }
+                      if (t == 0)
+                        {
+                           map<pair<int, int>, int>::const_iterator it = jacob_struct.find(make_pair(eq + t * Size, var));
+                           if (it != jacob_struct.end())
+                             Host_A_tild[it->second] += u[index];
+                           else
+                            {
+                              jacob_struct[make_pair(eq, var)] = NZE_tild;
+                              Host_A_tild[NZE_tild] = u[index];
+                              Host_Ai_tild[NZE_tild] = var;
+                              NZE_tild++;
+                            }
+                        }
+                    }
+                  if (to_store)
+                    {
+                      Host_Ax[NZE] = u[index];
+                      Host_Ai[NZE] = var + lag * Size;
+                      NZE++;
+                    }
+                }
+              else
+                {
+#ifdef DEBUG
+                  if (var < 0 || var >= Size * periods)
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+                  if (var+Size*(y_kmin+lag) < 0 || var+Size*(y_kmin+lag) >= Size*(periods+y_kmin+y_kmax))  // var already includes Size*t
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_CUDA_Sparse, index (" << var+Size*(y_kmin+lag) << ") out of range for index_vara vector max=" << Size*(periods+y_kmin+y_kmax) << "\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+                  if (index_vara[var+Size*(y_kmin+lag)] < 0 || index_vara[var+Size*(y_kmin+lag)] >= y_size*(periods+y_kmin+y_kmax))
+                    {
+                      ostringstream tmp;
+                      tmp << " in Init_CUDA_Sparse, index (" << index_vara[var+Size*(y_kmin+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n";
+                      throw FatalExceptionHandling(tmp.str());
+                    }
+#endif
+                  Host_b[eq + t * Size]  += u[index]*y[index_vara[var+Size*(y_kmin+lag)]];
+                }
+            }
+          else           // ...and store it in the u vector
+            {
+#ifdef DEBUG
+              if (index < 0 || index >= u_count_alloc)
+                {
+                  ostringstream tmp;
+                  tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector\n";
+                  throw FatalExceptionHandling(tmp.str());
+                }
+              if (var < 0 || var >= (Size*periods))
+                {
+                  ostringstream tmp;
+                  tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n";
+                  throw FatalExceptionHandling(tmp.str());
+                }
+#endif
+              Host_b[var]  += u[index];
+            }
+          it4++;
+        }
+    }
+  Host_Ap[Size*periods] = NZE;
+  if (preconditioner == 3)  // compact the t == 0 block: drop entries whose magnitude is <= 1e-20
+    {
+      int* tmp_Ap_tild = (int*) mxMalloc((Size + 1) * sizeof(int) );
+      int* tmp_Ai_tild = (int*) mxMalloc(NZE_tild * sizeof(int) );
+      double* tmp_A_tild = (double*) mxMalloc(NZE_tild * sizeof(double) );
+      memcpy(tmp_Ap_tild, Host_Ap_tild, (Size + 1) * sizeof(int));
+      memcpy(tmp_Ai_tild, Host_Ai_tild, NZE_tild * sizeof(int));
+      memcpy(tmp_A_tild, Host_A_tild, NZE_tild * sizeof(double));
+      //int NZE_tild_old = NZE_tild;
+      NZE_tild = 0;
+      Host_Ap_tild[0] = NZE_tild;
+
+      for (int i = 0; i < Size; i++)
+        {
+          for(int j = tmp_Ap_tild[i]; j < tmp_Ap_tild[i+1]; j++)
+            if (fabs(tmp_A_tild[j]) > 1.0e-20 )  // fabs: never the integer abs() overload
+              {
+                Host_A_tild[NZE_tild] = tmp_A_tild[j];
+                Host_Ai_tild[NZE_tild] = tmp_Ai_tild[j];
+                NZE_tild++;
+              }
+          Host_Ap_tild[i+1] = NZE_tild;
+        }
+      mxFree(tmp_Ap_tild);
+      mxFree(tmp_Ai_tild);
+      mxFree(tmp_A_tild);
+    }
+
+  *nnz = NZE;
+  *nnz_tild = NZE_tild;
+  if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3)
+    preconditioner_size = NZE;
+
+#ifdef DEBUG
+  mexPrintf("Host_Ax = [");
+  for (unsigned int i = 0; i < NZE; i++)
+    mexPrintf("%f ",Host_Ax[i]);
+  mexPrintf("]\n");
+
+  mexPrintf("Host_Ap = [");
+  for (int i = 0; i < n+1; i++)
+    mexPrintf("%d ",Host_Ap[i]);
+  mexPrintf("]\n");
+
+  mexPrintf("Host_Ai = [");
+  for (unsigned int i = 0; i < NZE; i++)
+    mexPrintf("%d ",Host_Ai[i]);
+  mexPrintf("]\n");
+#endif
+  cudaChk(cudaMalloc((void**)Ai, NZE * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai index vector on the graphic card\n");
+  cudaChk(cudaMalloc((void**)Ax, NZE * sizeof(double)), "  in Init_Cuda_Sparse, can't allocate Ax on the graphic card\n");
+  cudaChk(cudaMalloc((void**)Ap, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap index vector on the graphic card\n");
+  if (preconditioner == 3)
+    {
+      cudaChk(cudaMalloc((void**)Ai_tild, NZE_tild * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai_tild index vector on the graphic card\n");
+      cudaChk(cudaMalloc((void**)Ap_tild, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap_tild index vector on the graphic card\n");
+    }
+  cudaChk(cudaMalloc((void**)A_tild, preconditioner_size * sizeof(double)), "  in Init_Cuda_Sparse, can't allocate A_tild on the graphic card\n");
+
+  cudaChk(cudaMemcpy(*x0,     Host_x0,     n *                   sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy x0 = Host_x0 failed");
+  cudaChk(cudaMemcpy(*b,      Host_b,      n *                   sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy b = Host_b failed");
+  cudaChk(cudaMemcpy(*Ap,     Host_Ap,     (n + 1) *             sizeof(int),    cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap = Host_Ap failed");
+  cudaChk(cudaMemcpy(*Ai,     Host_Ai,     NZE *                 sizeof(int),    cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai = Host_Ai failed");
+  cudaChk(cudaMemcpy(*Ax,     Host_Ax,     NZE *                 sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ax = Host_Ax failed");
+  if (preconditioner == 3)
+    {
+      cudaChk(cudaMemcpy(*Ap_tild,     Host_Ap_tild,     (n + 1) *             sizeof(int),    cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap_tild = Host_Ap_tild failed");
+      cudaChk(cudaMemcpy(*Ai_tild,     Host_Ai_tild,     NZE_tild *                 sizeof(int),    cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai_tild = Host_Ai_til failed");
+    }
+  cudaChk(cudaMemcpy(*A_tild, Host_A_tild, preconditioner_size * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy A_tild = Host_A_tild failed");
+  // Everything is on the device now: release the host staging buffers (Host_x0 belongs to x0_m).
+  mxFree(Host_b);
+  mxFree(Host_Ap);
+  mxFree(Host_Ai);
+  mxFree(Host_Ax);
+  mxFree(Host_A_tild);
+  mxFree(Host_Ap_tild);
+  mxFree(Host_Ai_tild);
+}
+#endif
+
+
+/* Debug helper: expands the n x n matrix held in compressed-column form (Ap
+   column pointers, Ai row indices, Ax values) into a dense buffer and prints it
+   row by row with mexPrintf; reports when the number of entries visited differs
+   from Ap[n]. Does nothing when n <= 0. Row indices are not range-checked. */
+void
+PrintM(int n, double* Ax, mwIndex *Ap, mwIndex *Ai)
+{
+  if (n <= 0)
+    return;
+  const size_t dim = n;  // size_t arithmetic so that dim * dim cannot overflow an int
+  const int nnz = (int) Ap[n];
+  double *A = (double*)mxMalloc(dim * dim * sizeof(double));
+  memset(A, 0, dim * dim * sizeof(double));
+  int k = 0;
+  for (size_t col = 0; col < dim; col++)
+    for (mwIndex j = Ap[col]; j < Ap[col + 1]; j++, k++)
+      A[Ai[j] * dim + col] = Ax[j];
+  if (nnz != k)
+    mexPrintf("Problem nnz(%d) != number of elements(%d)\n", nnz, k);
+  mexPrintf("----------------------\n");
+  for (size_t i = 0; i < dim ; i++)
+    {
+      for (size_t j = 0; j < dim; j++)
+        mexPrintf("%-6.3f ",A[i * dim + j]);
+      mexPrintf("\n");
+    }
+  mxFree(A);
+}
+
+void
+dynSparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m)
+{
+  int t, eq, var, lag, ti_y_kmin, ti_y_kmax;
   double *b = mxGetPr(b_m);
+
   if (!b)
     {
       ostringstream tmp;
@@ -583,13 +1712,6 @@ SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size,
       tmp << " in Init_Matlab_Sparse_Simple, can't retrieve x0 vector\n";
       throw FatalExceptionHandling(tmp.str());
     }
-  mwIndex *Ai = mxGetIr(A_m);
-  if (!Ai)
-    {
-      ostringstream tmp;
-      tmp << " in Init_Matlab_Sparse, can't allocate Ai index vector\n";
-      throw FatalExceptionHandling(tmp.str());
-    }
   mwIndex *Aj = mxGetJc(A_m);
   if (!Aj)
     {
@@ -597,6 +1719,13 @@ SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size,
       tmp << " in Init_Matlab_Sparse, can't allocate Aj index vector\n";
       throw FatalExceptionHandling(tmp.str());
     }
+  mwIndex *Ai = mxGetIr(A_m);
+  if (!Ai)
+    {
+      ostringstream tmp;
+      tmp << " in Init_Matlab_Sparse, can't allocate Ai index vector\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
   double *A = mxGetPr(A_m);
   if (!A)
     {
@@ -604,15 +1733,22 @@ SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size,
       tmp << " in Init_Matlab_Sparse, can't retrieve A matrix\n";
       throw FatalExceptionHandling(tmp.str());
     }
+
   map<pair<pair<int, int>, int>, int>::iterator it4;
-  for (i = 0; i < y_size*(periods+y_kmin); i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < y_size*(periods+y_kmin); i++)
     ya[i] = y[i];
 #ifdef DEBUG
   unsigned int max_nze = mxGetNzmax(A_m);
 #endif
   unsigned int NZE = 0;
   int last_var = 0;
-  for (i = 0; i < periods*Size; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < periods*Size; i++)
     {
       b[i] = 0;
       x0[i] = y[index_vara[Size*y_kmin+i]];
@@ -709,7 +1845,7 @@ SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size,
 }
 
 void
-SparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM)
+dynSparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM)
 {
   int t, i, eq, var, lag, ti_y_kmin, ti_y_kmax;
   double tmp_b = 0.0;
@@ -734,31 +1870,35 @@ SparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pa
   NbNZRow = (int *) mxMalloc(i);
   NbNZCol = (int *) mxMalloc(i);
 
-  //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
-  for (i = 0; i < periods*Size; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < periods*Size; i++)
     {
       b[i] = 0;
       line_done[i] = 0;
     }
-  //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
-  for (i = 0; i < (periods+y_kmax+1)*Size; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < (periods+y_kmax+1)*Size; i++)
     {
-      FNZE_C[i] = 0;
-      FNZE_R[i] = 0;
+      FNZE_C[i] = NULL;
+      FNZE_R[i] = NULL;
       temp_NZE_C[i] = NULL;
       temp_NZE_R[i] = NULL;
       NbNZRow[i] = 0;
       NbNZCol[i] = 0;
     }
-
-  //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) ordered private(it4, ti_y_kmin, ti_y_kmax, eq, var, lag) schedule(dynamic)
+  int nnz = 0;
+  //pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) ordered private(it4, ti_y_kmin, ti_y_kmax, eq, var, lag) schedule(dynamic)
   for (t = 0; t < periods; t++)
     {
       ti_y_kmin = -min(t, y_kmin);
       ti_y_kmax = min(periods-(t+1), y_kmax);
       it4 = IM.begin();
       eq = -1;
-      //#pragma omp ordered
+      //pragma omp ordered
       while (it4 != IM.end())
         {
           var = it4->first.first.second;
@@ -771,6 +1911,7 @@ SparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pa
               lag = it4->first.second;
               if (lag <= ti_y_kmax && lag >= ti_y_kmin)   /*Build the index for sparse matrix containing the jacobian : u*/
                 {
+                  nnz++;
                   var += Size*t;
                   NbNZRow[eq]++;
                   NbNZCol[var]++;
@@ -814,12 +1955,13 @@ SparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pa
           it4++;
         }
     }
+  //mexPrintf("nnz/n=%f\n", double(nnz)/double(periods*Size));
   mxFree(temp_NZE_R);
   mxFree(temp_NZE_C);
 }
 
 int
-SparseMatrix::Get_u()
+dynSparseMatrix::Get_u()
 {
   if (!u_liste.empty())
     {
@@ -853,26 +1995,26 @@ SparseMatrix::Get_u()
 }
 
 void
-SparseMatrix::Delete_u(int pos)
+dynSparseMatrix::Delete_u(int pos)
 {
   u_liste.push_back(pos);
 }
 
 void
-SparseMatrix::Clear_u()
+dynSparseMatrix::Clear_u()
 {
   u_liste.clear();
 }
 
 void
-SparseMatrix::Print_u()
+dynSparseMatrix::Print_u()
 {
   for (unsigned int i = 0; i < u_liste.size(); i++)
     mexPrintf("%d ", u_liste[i]);
 }
 
 void
-SparseMatrix::End_GE(int Size)
+dynSparseMatrix::End_GE(int Size)
 {
   mem_mngr.Free_All();
   mxFree(FNZE_R);
@@ -889,9 +2031,9 @@ SparseMatrix::End_GE(int Size)
 }
 
 bool
-SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, int periods, long int nop4,  int Size)
+dynSparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, int periods, long int nop4,  int Size)
 {
-  long int i, j, nop = nop4/2, t, k;
+  long int i, j, nop = nop4/2;
   double r = 0.0;
   bool OK = true;
   t_save_op_s *save_op_s, *save_opa_s, *save_opaa_s;
@@ -899,7 +2041,7 @@ SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, in
   diff1 = (int *) mxMalloc(nop*sizeof(int));
   diff2 = (int *) mxMalloc(nop*sizeof(int));
   int max_save_ops_first = -1;
-  j = k = i = 0;
+  j = i = 0;
   while (i < nop4 && OK)
     {
       save_op_s = (t_save_op_s *) &(save_op[i]);
@@ -936,14 +2078,13 @@ SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, in
   // the same pivot for all remaining periods
   if (OK)
     {
-      //#pragma omp parallel for  num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) ordered private(j) schedule(dynamic)
-      for (i = beg_t; i < periods; i++)
+      for (int i = beg_t; i < periods; i++)
         {
-          for (j = 0; j < Size; j++)
-            {
-              ///#pragma omp ordered
-              pivot[i*Size+j] = pivot[(i-1)*Size+j]+Size;
-            }
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+          for (int j = 0; j < Size; j++)
+            pivot[i*Size+j] = pivot[(i-1)*Size+j]+Size;
         }
       if (max_save_ops_first >= u_count_alloc)
         {
@@ -956,13 +2097,16 @@ SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, in
               throw FatalExceptionHandling(tmp.str());
             }
         }
-      double *up;
-      for (t = 1; t < periods-beg_t-y_kmax; t++)
+      /*#ifdef USE_OMP
+      #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+      #endif*/
+      for (int t = 1; t < periods-beg_t-y_kmax; t++)
         {
-          i = j = 0;
+          int i = j = 0;
+          double *up;
           while (i < nop4)
             {
-              save_op_s = (t_save_op_s *) (&(save_op[i]));
+              t_save_op_s *save_op_s = (t_save_op_s *) (&(save_op[i]));
               up = &u[save_op_s->first+t*diff1[j]];
               switch (save_op_s->operat)
                 {
@@ -988,15 +2132,22 @@ SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, in
         }
       int t1 = max(1, periods-beg_t-y_kmax);
       int periods_beg_t = periods-beg_t;
-      for (t = t1; t < periods_beg_t; t++)
+      /*#ifdef USE_OMP
+      #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+      #endif*/
+      for (int t = t1; t < periods_beg_t; t++)
         {
-          i = j = 0;
+          int i = j = 0;
+          int gap = periods_beg_t-t;
+          /*#ifdef USE_OMP
+          #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+          #endif*/
           while (i < nop4)
             {
-              save_op_s = (t_save_op_s *) (&(save_op[i]));
-              if (save_op_s->lag < (periods_beg_t-t))
+              t_save_op_s *save_op_s = (t_save_op_s *) (&(save_op[i]));
+              if (save_op_s->lag < gap)
                 {
-                  up = &u[save_op_s->first+t*diff1[j]];
+                  double *up = &u[save_op_s->first+t*diff1[j]];
                   switch (save_op_s->operat)
                     {
                     case IFLD:
@@ -1041,9 +2192,9 @@ SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, in
 }
 
 int
-SparseMatrix::complete(int beg_t, int Size, int periods, int *b)
+dynSparseMatrix::complete(int beg_t, int Size, int periods, int *b)
 {
-  long int i, j, k, nop, nopa, nop1, cal_y, nb_var, pos, t, ti, max_var, min_var;
+  long int i, j, k, nop, nopa, nop1, cal_y, nb_var, pos, max_var, min_var;
   NonZeroElem *first;
   int *save_code;
   int *diff;
@@ -1067,8 +2218,10 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b)
       save_code[nop+1] = 0;
       save_code[nop+2] = 0;
       save_code[nop+3] = 0;
+#ifdef DEBUG
       if ((nop+3) >= size_of_save_code)
         mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code);
+#endif
       nop += 4;
       for (k = 0; k < nb_var; k++)
         {
@@ -1076,8 +2229,10 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b)
           save_code[nop+1] = index_vara[first->c_index]+cal_y;
           save_code[nop+2] = first->u_index;
           save_code[nop+3] = first->lag_index;
+#ifdef DEBUG
           if ((nop+3) >= size_of_save_code)
             mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code);
+#endif
           nop += 4;
           first = first->NZE_R_N;
         }
@@ -1085,15 +2240,19 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b)
       save_code[nop+1] = b[pos];
       save_code[nop+2] = 0;
       save_code[nop+3] = 0;
+#ifdef DEBUG
       if ((nop+3) >= size_of_save_code)
         mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code);
+#endif
       nop += 4;
       save_code[nop] = IFSTP;
       save_code[nop+1] = index_vara[j]+y_size*y_kmin;
       save_code[nop+2] = 0;
       save_code[nop+3] = 0;
+#ifdef DEBUG
       if ((nop+2) >= size_of_save_code)
         mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code);
+#endif
       nop += 4;
     }
   i = beg_t*Size-1;
@@ -1112,38 +2271,47 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b)
         {
           diff[nopa] = save_code[nop1+1]-(index_vara[first->c_index]+cal_y);
           diff[nopa+1] = save_code[nop1+2]-(first->u_index);
+#ifdef DEBUG
           if ((nop1+2) >= size_of_save_code)
             mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code);
           if ((nopa+1) >= size_of_diff)
             mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff);
+#endif
           nopa += 2;
           nop1 += 4;
           first = first->NZE_R_N;
         }
       diff[nopa] = save_code[nop1+1]-(b[pos]);
       diff[nopa+1] = 0;
+#ifdef DEBUG
       if ((nop1+3) >= size_of_save_code)
         mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code);
       if ((nopa+1) >= size_of_diff)
         mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff);
+#endif
       nopa += 2;
       nop1 += 4;
       diff[nopa] = save_code[nop1+1]-(index_vara[j]+y_size*y_kmin);
       diff[nopa+1] = 0;
+#ifdef DEBUG
       if ((nop1+4) >= size_of_save_code)
         mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code);
       if ((nopa+1) >= size_of_diff)
         mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff);
+#endif
       nopa += 2;
       nop1 += 4;
     }
   max_var = (periods+y_kmin)*y_size;
   min_var = y_kmin*y_size;
-  for (t = periods+y_kmin-1; t >= beg_t+y_kmin; t--)
+  /*#ifdef USE_OMP
+  #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+  #endif*/
+  for (int t = periods+y_kmin-1; t >= beg_t+y_kmin; t--)
     {
-      j = 0;
-      ti = t-y_kmin-beg_t;
-      for (i = 0; i < nop; i += 4)
+      int j = 0, k;
+      int ti = t-y_kmin-beg_t;
+      for (int i = 0; i < nop; i += 4)
         {
           switch (save_code[i])
             {
@@ -1175,12 +2343,15 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b)
 }
 
 void
-SparseMatrix::bksub(int tbreak, int last_period, int Size, double slowc_l)
+dynSparseMatrix::bksub(int tbreak, int last_period, int Size, double slowc_l)
 {
   NonZeroElem *first;
   int i, j, k;
   double yy;
-  for (i = 0; i < y_size*(periods+y_kmin); i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < y_size*(periods+y_kmin); i++)
     y[i] = ya[i];
   if (symbolic && tbreak)
     last_period = complete(tbreak, Size, periods, b);
@@ -1213,12 +2384,15 @@ SparseMatrix::bksub(int tbreak, int last_period, int Size, double slowc_l)
 }
 
 void
-SparseMatrix::simple_bksub(int it_, int Size, double slowc_l)
+dynSparseMatrix::simple_bksub(int it_, int Size, double slowc_l)
 {
   int i, k;
   double yy;
   NonZeroElem *first;
-  for (i = 0; i < y_size; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int i = 0; i < y_size; i++)
     y[i+it_*y_size] = ya[i+it_*y_size];
   for (i = Size-1; i >= 0; i--)
     {
@@ -1240,7 +2414,7 @@ SparseMatrix::simple_bksub(int it_, int Size, double slowc_l)
 }
 
 void
-SparseMatrix::CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods, int iter)
+dynSparseMatrix::CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods)
 {
   const double epsilon = 1e-7;
   fstream SaveResult;
@@ -1311,7 +2485,7 @@ SparseMatrix::CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods,
 }
 
 void
-SparseMatrix::Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size, double *u, int *pivot, int *b)
+dynSparseMatrix::Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size, double *u, int *pivot, int *b)
 {
   const double epsilon = 1e-10;
   Init_GE(periods, y_kmin, y_kmax, Size, IM_i);
@@ -1349,32 +2523,35 @@ SparseMatrix::Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size,
 }
 
 mxArray *
-SparseMatrix::substract_A_B(mxArray *A_m, mxArray *B_m)
+dynSparseMatrix::substract_A_B(mxArray *A_m, mxArray *B_m) // returns a newly created dense matrix holding A - B, element by element
 {
-  unsigned int n_A = mxGetN(A_m);
-  unsigned int m_A = mxGetM(A_m);
+  size_t n_A = mxGetN(A_m);
+  size_t m_A = mxGetM(A_m);
   double *A_d = mxGetPr(A_m);
-  unsigned int n_B = mxGetN(B_m);
+  size_t n_B = mxGetN(B_m); // NOTE(review): C is created as m_A x n_B but the loops cover m_A x n_A; assumes n_A == n_B - confirm with callers
   double *B_d = mxGetPr(B_m);
   mxArray *C_m = mxCreateDoubleMatrix(m_A, n_B, mxREAL);
   double *C_d = mxGetPr(C_m);
-  for (unsigned int j = 0; j < n_A; j++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int j = 0; j < n_A; j++) // each j writes the disjoint range [j*m_A, (j+1)*m_A) of C_d
     for (unsigned int i = 0; i < m_A; i++)
       {
-        unsigned int index = j*m_A+i;
+        size_t index = j*m_A+i; // same linear offset is used for A, B and C
         C_d[index] = A_d[index] - B_d[index];
       }
   return C_m;
 }
 
 mxArray *
-SparseMatrix::Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m)
+dynSparseMatrix::Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m)
 {
-  unsigned int n_B = mxGetN(B_m);
-  unsigned int m_B = mxGetM(B_m);
+  size_t n_B = mxGetN(B_m);
+  size_t m_B = mxGetM(B_m);
   mwIndex *B_i = mxGetIr(B_m);
   mwIndex *B_j = mxGetJc(B_m);
-  unsigned int total_nze_B = B_j[n_B];
+  size_t total_nze_B = B_j[n_B];
   double *B_d = mxGetPr(B_m);
   mxArray *C_m = mxDuplicateArray(A_m);
   double *C_d = mxGetPr(C_m);
@@ -1391,18 +2568,18 @@ SparseMatrix::Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m)
 }
 
 mxArray *
-SparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m)
+dynSparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m)
 {
-  unsigned int n_A = mxGetN(A_m);
-  unsigned int m_A = mxGetM(A_m);
+  size_t n_A = mxGetN(A_m);
+  size_t m_A = mxGetM(A_m);
   mwIndex *A_i = mxGetIr(A_m);
   mwIndex *A_j = mxGetJc(A_m);
-  unsigned int total_nze_A = A_j[n_A];
+  size_t total_nze_A = A_j[n_A];
   double *A_d = mxGetPr(A_m);
-  unsigned int n_B = mxGetN(B_m);
+  size_t n_B = mxGetN(B_m);
   mwIndex *B_i = mxGetIr(B_m);
   mwIndex *B_j = mxGetJc(B_m);
-  unsigned int total_nze_B = B_j[n_B];
+  size_t total_nze_B = B_j[n_B];
   double *B_d = mxGetPr(B_m);
   mxArray *C_m = mxCreateSparse(m_A, n_B, m_A*n_B, mxREAL);
   mwIndex *C_i = mxGetIr(C_m);
@@ -1415,10 +2592,10 @@ SparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m)
     {
       while (nze_A >= (unsigned int) A_j[A_col+1] && (nze_A < total_nze_A))
         A_col++;
-      int A_row = A_i[nze_A];
+      size_t A_row = A_i[nze_A];
       while (nze_B >= (unsigned int) B_j[B_col+1] && (nze_B < total_nze_B))
         B_col++;
-      int B_row = B_i[nze_B];
+      size_t B_row = B_i[nze_B];
       if (A_col == B_col)
         {
           if (A_row == B_row && (nze_B < total_nze_B && nze_A < total_nze_A))
@@ -1475,27 +2652,30 @@ SparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m)
 }
 
 mxArray *
-SparseMatrix::mult_SAT_B(mxArray *A_m, mxArray *B_m)
+dynSparseMatrix::mult_SAT_B(mxArray *A_m, mxArray *B_m)
 {
-  unsigned int n_A = mxGetN(A_m);
-  unsigned int m_A = mxGetM(A_m);
+  size_t n_A = mxGetN(A_m);
+  size_t m_A = mxGetM(A_m);
   mwIndex *A_i = mxGetIr(A_m);
   mwIndex *A_j = mxGetJc(A_m);
   double *A_d = mxGetPr(A_m);
-  unsigned int n_B = mxGetN(B_m);
+  size_t n_B = mxGetN(B_m);
   double *B_d = mxGetPr(B_m);
   mxArray *C_m = mxCreateDoubleMatrix(m_A, n_B, mxREAL);
   double *C_d = mxGetPr(C_m);
-  unsigned int nze_A = 0;
-  for (unsigned int j = 0; j < n_B; j++)
+  //unsigned int nze_A = 0;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+  for (int j = 0; j < (int)n_B; j++)
     {
       for (unsigned int i = 0; i < n_A; i++)
         {
           double sum = 0;
-          nze_A = A_j[i];
+          size_t nze_A = A_j[i];
           while (nze_A < (unsigned int) A_j[i+1])
             {
-              unsigned int i_A = A_i[nze_A];
+              size_t i_A = A_i[nze_A];
               sum += A_d[nze_A++] * B_d[i_A];
             }
           C_d[j*n_A+i] = sum;
@@ -1505,32 +2685,34 @@ SparseMatrix::mult_SAT_B(mxArray *A_m, mxArray *B_m)
 }
 
 mxArray *
-SparseMatrix::Sparse_mult_SAT_B(mxArray *A_m, mxArray *B_m)
+dynSparseMatrix::Sparse_mult_SAT_B(mxArray *A_m, mxArray *B_m)
 {
-  unsigned int n_A = mxGetN(A_m);
-  unsigned int m_A = mxGetM(A_m);
+  size_t n_A = mxGetN(A_m);
+  size_t m_A = mxGetM(A_m);
   mwIndex *A_i = mxGetIr(A_m);
   mwIndex *A_j = mxGetJc(A_m);
   double *A_d = mxGetPr(A_m);
-  unsigned int n_B = mxGetN(B_m);
-  unsigned int m_B = mxGetM(B_m);
+  size_t n_B = mxGetN(B_m);
+  size_t m_B = mxGetM(B_m);
   double *B_d = mxGetPr(B_m);
   mxArray *C_m = mxCreateSparse(m_A, n_B, m_A*n_B, mxREAL);
   mwIndex *C_i = mxGetIr(C_m);
   mwIndex *C_j = mxGetJc(C_m);
   double *C_d = mxGetPr(C_m);
-  unsigned int nze_C = 0, nze_A = 0;
+  unsigned int nze_C = 0;
+  //unsigned int nze_A = 0;
   unsigned int C_col = 0;
   C_j[C_col] = 0;
+  //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
   for (unsigned int j = 0; j < n_B; j++)
     {
       for (unsigned int i = 0; i < n_A; i++)
         {
           double sum = 0;
-          nze_A = A_j[i];
+          size_t nze_A = A_j[i];
           while (nze_A < (unsigned int) A_j[i+1])
             {
-              unsigned int i_A = A_i[nze_A];
+              size_t i_A = A_i[nze_A];
               sum += A_d[nze_A++] * B_d[i_A];
             }
           if (fabs(sum) > 1e-10)
@@ -1550,14 +2732,14 @@ SparseMatrix::Sparse_mult_SAT_B(mxArray *A_m, mxArray *B_m)
 }
 
 mxArray *
-SparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m)
+dynSparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m)
 {
-  unsigned int n_A = mxGetN(A_m);
-  unsigned int m_A = mxGetM(A_m);
+  size_t n_A = mxGetN(A_m);
+  size_t m_A = mxGetM(A_m);
   mwIndex *A_i = mxGetIr(A_m);
   mwIndex *A_j = mxGetJc(A_m);
   double *A_d = mxGetPr(A_m);
-  unsigned int n_B = mxGetN(B_m);
+  size_t n_B = mxGetN(B_m);
   mwIndex *B_i = mxGetIr(B_m);
   mwIndex *B_j = mxGetJc(B_m);
   double *B_d = mxGetPr(B_m);
@@ -1565,7 +2747,7 @@ SparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m)
   mwIndex *C_i = mxGetIr(C_m);
   mwIndex *C_j = mxGetJc(C_m);
   double *C_d = mxGetPr(C_m);
-  unsigned int nze_B = 0, nze_C = 0, nze_A = 0;
+  size_t nze_B = 0, nze_C = 0, nze_A = 0;
   unsigned int C_col = 0;
   C_j[C_col] = 0;
   for (unsigned int j = 0; j < n_B; j++)
@@ -1577,8 +2759,8 @@ SparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m)
           nze_A = A_j[i];
           while (nze_A < (unsigned int) A_j[i+1] && nze_B < (unsigned int) B_j[j+1])
             {
-              unsigned int i_A = A_i[nze_A];
-              unsigned int i_B = B_i[nze_B];
+              size_t i_A = A_i[nze_A];
+              size_t i_B = B_i[nze_B];
               if (i_A == i_B)
                 sum += A_d[nze_A++] * B_d[nze_B++];
               else if (i_A < i_B)
@@ -1603,13 +2785,13 @@ SparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m)
 }
 
 mxArray *
-SparseMatrix::Sparse_transpose(mxArray *A_m)
+dynSparseMatrix::Sparse_transpose(mxArray *A_m)
 {
-  unsigned int n_A = mxGetN(A_m);
-  unsigned int m_A = mxGetM(A_m);
+  size_t n_A = mxGetN(A_m);
+  size_t m_A = mxGetM(A_m);
   mwIndex *A_i = mxGetIr(A_m);
   mwIndex *A_j = mxGetJc(A_m);
-  unsigned int total_nze_A = A_j[n_A];
+  size_t total_nze_A = A_j[n_A];
   double *A_d = mxGetPr(A_m);
   mxArray *C_m = mxCreateSparse(n_A, m_A, total_nze_A, mxREAL);
   mwIndex *C_i = mxGetIr(C_m);
@@ -1617,7 +2799,7 @@ SparseMatrix::Sparse_transpose(mxArray *A_m)
   double *C_d = mxGetPr(C_m);
   unsigned int nze_C = 0, nze_A = 0;
   memset(C_j, 0, m_A);
-  map<pair<unsigned int, unsigned int>, double> B2;
+  map<pair<mwIndex, unsigned int>, double> B2;
   for (unsigned int i = 0; i < n_A; i++)
     {
       while (nze_A < (unsigned int) A_j[i+1])
@@ -1629,7 +2811,7 @@ SparseMatrix::Sparse_transpose(mxArray *A_m)
     }
   for (unsigned int i = 0; i < m_A; i++)
     C_j[i+1] += C_j[i];
-  for (map<pair<unsigned int, unsigned int>, double>::const_iterator it = B2.begin(); it != B2.end(); it++)
+  for (map<pair<mwIndex, unsigned int>, double>::const_iterator it = B2.begin(); it != B2.end(); it++)
     {
       C_d[nze_C] = it->second;
       C_i[nze_C++] = it->first.second;
@@ -1637,8 +2819,163 @@ SparseMatrix::Sparse_transpose(mxArray *A_m)
   return C_m;
 }
 
+
+#define sign(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a)) // |a| carrying the sign of b
+bool
+dynSparseMatrix::mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc)
+{ // Starting from *ax and *bx, moves the triplet (a, b, c) until *fb <= *fc; returns false as soon as compute_complete() fails.
+  const double GOLD=1.618034;  // magnification applied to default steps
+  const double GLIMIT=100.0;   // bounds the parabolic step: ulim = b + GLIMIT*(c-b)
+  const double TINY=1.0e-20;   // floor on |q-r| so the division below cannot be by zero
+  // compute_complete(x, &f) presumably evaluates the objective for step x and stores it in f - confirm.
+  double tmp;
+  mexPrintf("bracketing *ax=%f, *bx=%f\n",*ax, *bx);
+  //mexEvalString("drawnow;");
+  double ulim,u,r,q,fu;
+  if (!compute_complete(*ax, fa))
+    return false;
+  if (!compute_complete(*bx, fb))
+    return false;
+  if (*fb > *fa) // swap a and b so that the value at b is not above the value at a
+    {
+      tmp = *ax;
+      *ax = *bx;
+      *bx = tmp;
+
+      tmp = *fa;
+      *fa = *fb;
+      *fb = tmp;
+    }
+  *cx=(*bx)+GOLD*(*bx-*ax); // first guess for c: extrapolate beyond b
+  if (!compute_complete(*cx, fc))
+    return false;
+  while (*fb > *fc) // keep going while the value at c is still below the value at b
+    {
+      r=(*bx-*ax)*(*fb-*fc);
+      q=(*bx-*cx)*(*fb-*fa);
+      u=(*bx)-((*bx-*cx)*q-(*bx-*ax)*r)/
+        (2.0*sign(fmax(fabs(q-r),TINY),q-r));
+      ulim=(*bx)+GLIMIT*(*cx-*bx);
+      if ((*bx-u)*(u-*cx) > 0.0) // u lies strictly between b and c
+        {
+          if (!compute_complete(u, &fu))
+            return false;
+          if (fu < *fc) // bracket is (b, u, c)
+            {
+              *ax=(*bx);
+              *bx=u;
+              *fa=(*fb);
+              *fb=fu;
+              return true;
+            }
+          else if (fu > *fb) // bracket is (a, b, u)
+            {
+              *cx=u;
+              *fc=fu;
+              return true;
+            }
+          u=(*cx)+GOLD*(*cx-*bx); // interpolated point did not help: take a default step past c
+          if (!compute_complete(u, &fu))
+            return false;
+        }
+      else if ((*cx-u)*(u-ulim) > 0.0) // u lies strictly between c and ulim
+        {
+          if (!compute_complete(u, &fu))
+            return false;
+          if (fu < *fc)
+            {
+              *bx = *cx;
+              *cx = u;
+              u = *cx+GOLD*(*cx-*bx); // uses the values of *bx and *cx assigned just above
+              *fb = *fc;
+              *fc = fu;
+              if (!compute_complete(u, &fu))
+                return false;
+            }
+        }
+      else if ((u-ulim)*(ulim-*cx) >= 0.0) // u is at or beyond ulim: clamp it
+        {
+          u=ulim;
+          if (!compute_complete(u, &fu))
+            return false;
+        }
+      else // remaining case: discard u and take a default step past c
+        {
+          u=(*cx)+GOLD*(*cx-*bx);
+          if (!compute_complete(u, &fu))
+            return false;
+        }
+      *ax = *bx; // shift the triplet: (a, b, c) <- (b, c, u)
+      *bx = *cx;
+      *cx = u;
+      *fa = *fb;
+      *fb = *fc;
+      *fc = fu;
+    }
+  return true;
+}
+
+bool
+dynSparseMatrix::golden(double ax, double bx, double cx, double tol, double solve_tolf, double *xmin)
+{
+  // Golden-section search inside the triplet (ax, bx, cx); the objective is
+  // obtained through compute_complete(). Returns false as soon as
+  // compute_complete() fails; otherwise stores in *xmin the interior point with
+  // the lower objective value and returns true. The loop ends when the outer
+  // points are within tol*(|x1|+|x2|), when f1 or f2 is not above solve_tolf,
+  // after max_iter iterations, or when x1 and x2 are within 1e-4 of each other.
+  const double R=0.61803399;
+  const double C=(1.0-R);
+  mexPrintf("golden\n");
+  //mexEvalString("drawnow;");
+  double f1,f2,x0,x1,x2,x3;
+  int iter= 0, max_iter= 100;
+  x0=ax;
+  x3=cx;
+  if (fabs(cx-bx) > fabs(bx-ax))
+    {
+      x1=bx;
+      x2=bx+C*(cx-bx);
+    }
+  else
+    {
+      x2=bx;
+      x1=bx-C*(bx-ax);
+    }
+  if (!compute_complete(x1, &f1))
+    return false;
+  if (!compute_complete(x2, &f2))
+    return false;
+  // fabs (not abs) on x1 - x2: the integer abs() would truncate the difference to 0.
+  while ((fabs(x3-x0) > tol*(fabs(x1)+fabs(x2)) && (f1 > solve_tolf && f2 > solve_tolf)) && (iter < max_iter) && (fabs(x1 - x2) > 1e-4))
+    {
+      if (f2 < f1)
+        {
+          x0 = x1;
+          x1 = x2;
+          x2 = R*x1+C*x3;
+          f1 = f2;
+          if (!compute_complete(x2, &f2))
+            return false;
+        }
+      else
+        {
+          x3 = x2;
+          x2 = x1;
+          x1 = R*x2+C*x0;
+          f2 = f1;
+          if (!compute_complete(x1, &f1))
+            return false;
+        }
+      iter++;
+    }
+  // x2 is reported on ties, as before.
+  *xmin = (f1 < f2) ? x1 : x2;
+  return true;
+}
+
 void
-SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int Size, double slowc_l, bool is_two_boundaries, int  it_)
+dynSparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int Size, double slowc_l, bool is_two_boundaries, int  it_)
 {
   mxArray *B1, *C1, *A2, *B2, *A3, *b1, *b2;
   double *b_m_d = mxGetPr(b_m);
@@ -1669,9 +3006,9 @@ SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int S
       tmp << " in Solve_Matlab_Relaxation, can't retrieve A matrix\n";
       throw FatalExceptionHandling(tmp.str());
     }
-  unsigned int max_nze = A_m_j[Size*periods];
+  size_t max_nze = A_m_j[Size*periods];
   unsigned int nze = 0;
-  unsigned int var = A_m_j[nze];
+  size_t var = A_m_j[nze];
   B1 = mxCreateSparse(Size, Size, Size*Size, mxREAL);
   mwIndex *B1_i = mxGetIr(B1);
   mwIndex *B1_j = mxGetJc(B1);
@@ -1716,7 +3053,7 @@ SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int S
   double *b1_d = mxGetPr(b1);
   b2 = mxCreateDoubleMatrix(Size, 1, mxREAL);
   double *b2_d = mxGetPr(b2);
-  unsigned int eq = 0;
+  size_t eq = 0;
   /*B1 C1
     A2 B2
     A3*/
@@ -1803,7 +3140,7 @@ SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int S
             mxDestroyArray(B1_inv);
           mexCallMATLAB(1, &B1_inv, 1, &B1, "inv");
           mwIndex *B_inv_j = mxGetJc(B1_inv);
-          unsigned int B_inv_nze = B_inv_j[Size];
+          size_t B_inv_nze = B_inv_j[Size];
           double *B_inv_d = mxGetPr(B1_inv);
           sumc = 0;
           for (unsigned int i = 0; i < B_inv_nze; i++)
@@ -1912,9 +3249,9 @@ SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int S
 }
 
 void
-SparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int  it_)
+dynSparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int  it_)
 {
-  int n = mxGetM(A_m);
+  size_t n = mxGetM(A_m);
   mxArray *z;
   mxArray *rhs[2];
   rhs[0] = A_m;
@@ -1922,6 +3259,9 @@ SparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, doub
   mexCallMATLAB(1, &z, 2, rhs, "mldivide");
   double *res = mxGetPr(z);
   if (is_two_boundaries)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
     for (int i = 0; i < n; i++)
       {
         int eq = index_vara[i+Size*y_kmin];
@@ -1930,6 +3270,9 @@ SparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, doub
         y[eq] += slowc_l * yy;
       }
   else
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
     for (int i = 0; i < n; i++)
       {
         int eq = index_vara[i];
@@ -1943,7 +3286,1404 @@ SparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, doub
 }
 
 void
-SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, bool steady_state, mxArray *x0_m)
+dynSparseMatrix::End_Matlab_LU_UMFPack() // releases the UMFPACK objects held in the Symbolic and Numeric members
+{
+  if (Symbolic) // only free a handle that is set
+    umfpack_dl_free_symbolic (&Symbolic) ;
+  if (Numeric)
+    umfpack_dl_free_numeric (&Numeric) ;
+}
+
+
+void
+dynSparseMatrix::End_Solver() // solver clean-up entry point; currently only the UMFPACK objects are released
+{
+  if (((stack_solve_algo == 0 || stack_solve_algo == 4) && !steady_state) || (solve_algo == 6 && steady_state)) // presumably the settings that route through Solve_LU_UMFPack - confirm
+    End_Matlab_LU_UMFPack();
+}
+
+void
+dynSparseMatrix::Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int  it_)
+{ // Solves the n x n system given by (Ap, Ai, Ax) and b with UMFPACK, applies the damped update to y, then frees Ap, Ai, Ax and b with mxFree.
+  SuiteSparse_long status, sys = 0;
+#ifndef _MSC_VER
+  double Control [UMFPACK_CONTROL], Info [UMFPACK_INFO], res [n]; // res [n] has a run-time length; the MSVC branch below uses mxMalloc instead
+#else
+  double *Control, *Info, *res;
+  Control = (double*)mxMalloc(UMFPACK_CONTROL * sizeof(double));
+  Info = (double*)mxMalloc(UMFPACK_INFO * sizeof(double));
+  res = (double*)mxMalloc(n * sizeof(double));
+#endif
+  // Every UMFPACK failure below is reported and then raised as a FatalExceptionHandling.
+  umfpack_dl_defaults(Control);
+  Control [UMFPACK_PRL] = 5;
+  status = 0;
+  if (iter == 0) // the symbolic object is only built when the iter member is 0 and is kept in the Symbolic member
+    {
+      status = umfpack_dl_symbolic(n, n, Ap, Ai, Ax, &Symbolic, Control, Info);
+      if (status < 0) // only negative status values are treated as failures here
+        {
+          umfpack_dl_report_info(Control, Info);
+          umfpack_dl_report_status(Control, status);
+          ostringstream  Error;
+          Error << " umfpack_dl_symbolic failed\n";
+          throw FatalExceptionHandling(Error.str());
+        }
+    }
+  if (iter > 0) // drop the numeric object left by the previous call before building a new one
+    umfpack_dl_free_numeric(&Numeric) ;
+  status = umfpack_dl_numeric (Ap, Ai, Ax, Symbolic, &Numeric, Control, Info);
+  if (status < 0)
+    {
+      umfpack_dl_report_info(Control, Info);
+      umfpack_dl_report_status(Control, status);
+      ostringstream  Error;
+      Error << " umfpack_dl_numeric failed\n";
+      throw FatalExceptionHandling(Error.str());
+    }
+  status = umfpack_dl_solve(sys, Ap, Ai, Ax, res, b, Numeric, Control, Info);
+  if (status != UMFPACK_OK) // stricter than above: any status other than UMFPACK_OK aborts
+    {
+      umfpack_dl_report_info(Control, Info);
+      umfpack_dl_report_status(Control, status);
+      ostringstream  Error;
+      Error << " umfpack_dl_solve failed\n";
+      throw FatalExceptionHandling(Error.str());
+    }
+  // Update step: yy = -(res[i] + y[...]) is stored in direction and slowc_l * yy is added to y.
+  if (is_two_boundaries)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+    for (int i = 0; i < n; i++)
+      {
+        int eq = index_vara[i+Size*y_kmin]; // index_vara is read with an offset of Size*y_kmin in this branch
+        double yy = -(res[i] + y[eq]);
+        direction[eq] = yy;
+        y[eq] += slowc_l * yy;
+      }
+  else
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+    for (int i = 0; i < n; i++)
+      {
+        int eq = index_vara[i];
+        double yy = -(res[i] + y[eq+it_*y_size]); // y is addressed at offset it_*y_size, direction is not
+        direction[eq] = yy;
+        y[eq+it_*y_size] += slowc_l * yy;
+      }
+
+  mxFree(Ap); // NOTE(review): these frees are skipped when one of the exceptions above is thrown
+  mxFree(Ai);
+  mxFree(Ax);
+  mxFree(b);
+#ifdef _MSC_VER
+  mxFree(Control);
+  mxFree(Info);
+  mxFree(res);
+#endif
+}
+
+
+void
+dynSparseMatrix::Solve_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int  it_)
+{
+  // One-shot UMFPACK solve of A_m * res = b_m followed by the damped update of y.
+  // A_m and b_m are destroyed before returning; UMFPACK failures are only reported.
+  SuiteSparse_long n = mxGetM(A_m);
+  SuiteSparse_long *Ap = (SuiteSparse_long*)mxGetJc (A_m); // NOTE(review): the casts assume mwIndex and SuiteSparse_long have the same size - confirm
+  SuiteSparse_long *Ai = (SuiteSparse_long*)mxGetIr(A_m);
+  double*  Ax = mxGetPr(A_m);
+  double*  B  = mxGetPr(b_m);
+  SuiteSparse_long status, sys = 0;
+#ifndef _MSC_VER
+  double Control [UMFPACK_CONTROL], Info [UMFPACK_INFO], res [n];
+#else
+  double *Control, *Info, *res;
+  Control = (double*)mxMalloc(UMFPACK_CONTROL * sizeof(double));
+  Info = (double*)mxMalloc(UMFPACK_INFO * sizeof(double));
+  res = (double*)mxMalloc(n * sizeof(double));
+#endif
+  void *Symbolic = NULL, *Numeric = NULL; // locals hiding the members of the same name; NULL keeps the frees below safe after a failure
+  umfpack_dl_defaults (Control) ;
+
+  status = umfpack_dl_symbolic (n, n, Ap, Ai, Ax, &Symbolic, Control, Info) ;
+  if (status != UMFPACK_OK)
+    umfpack_dl_report_info ((double*) NULL, Info) ;
+
+  status = umfpack_dl_numeric (Ap, Ai, Ax, Symbolic, &Numeric, Control, Info) ;
+  if (status != UMFPACK_OK)
+    umfpack_dl_report_info ((double*) NULL, Info) ;
+  umfpack_dl_free_symbolic (&Symbolic) ; // not needed once the numeric factorization has been attempted
+  status = umfpack_dl_solve (sys, Ap, Ai, Ax, res, B, Numeric, Control, Info) ;
+  if (status != UMFPACK_OK)
+    umfpack_dl_report_info ((double*) NULL, Info) ;
+  umfpack_dl_free_numeric (&Numeric) ; // the local factorization used to be leaked on every call
+  if (is_two_boundaries)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+    for (int i = 0; i < n; i++)
+      {
+        int eq = index_vara[i+Size*y_kmin];
+        double yy = -(res[i] + y[eq]);
+        direction[eq] = yy;
+        y[eq] += slowc_l * yy;
+      }
+  else
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+    for (int i = 0; i < n; i++)
+      {
+        int eq = index_vara[i];
+        double yy = -(res[i] + y[eq+it_*y_size]);
+        direction[eq] = yy;
+        y[eq+it_*y_size] += slowc_l * yy;
+      }
+  mxDestroyArray(A_m);
+  mxDestroyArray(b_m);
+#ifdef _MSC_VER
+  mxFree(Control);
+  mxFree(Info);
+  mxFree(res);
+#endif
+
+}
+
+
+#ifdef CUDA
+void
+printM(int n,double *Ax, int* Ap, int* Ai,  cusparseMatDescr_t descrA, cusparseHandle_t cusparse_handle)
+{
+  // Debug helper: prints the n x n sparse matrix (Ax, Ap, Ai) described by descrA as a dense table.
+  // NOTE(review): Ax, Ap and Ai are presumably device pointers, since they are handed to cuSPARSE directly - confirm.
+  double * A_dense;
+  cudaChk(cudaMalloc((void**) &A_dense, n * n *sizeof(double)), "A_dense cudaMalloc has failed\n");
+  // Expand the CSR matrix into the dense buffer (leading dimension n), then copy
+  // that buffer back with cudaMemcpyDeviceToHost so it can be printed.
+  cusparseChk(cusparseDcsr2dense(cusparse_handle, n, n, descrA,
+                                 Ax, Ap,Ai, A_dense, n), "cusparseDcsr2dense has failed\n");
+  double *A_dense_hoste = (double*)mxMalloc(n * n * sizeof(double));
+  cudaChk(cudaMemcpy(A_dense_hoste, A_dense, n * n * sizeof(double),cudaMemcpyDeviceToHost), " cudaMemcpy(A_dense_hoste, A_dense) has failed\n");
+  mexPrintf("----------------------\n");
+  mexPrintf("FillMode=%d, IndexBase=%d, MatType=%d, DiagType=%d\n",cusparseGetMatFillMode(descrA), cusparseGetMatIndexBase(descrA), cusparseGetMatType(descrA), cusparseGetMatDiagType(descrA));
+  //mexEvalString("drawnow;");
+  for (int i = 0; i < n ; i++) // one printed line per i
+    {
+      for (int j = 0; j < n; j++)
+        mexPrintf("%-6.3f ",A_dense_hoste[i + j * n]); // element (i, j) of the dense buffer written with leading dimension n
+      mexPrintf("\n");
+    }
+  mxFree(A_dense_hoste);
+  cudaChk(cudaFree(A_dense), "cudaFree(A_dense) has failed\n");
+}
+
+
+
+void
+dynSparseMatrix::Solve_CUDA_BiCGStab_Free(double* tmp_vect_host, double* p, double* r, double* v, double* s, double* t, double* y_, double* z, double* tmp_,
+                                       int* Ai, double* Ax, int* Ap, double* x0, double* b, double* A_tild, int* A_tild_i, int* A_tild_p/*, double* Lx, int* Li, int* Lp,
+                                       double* Ux, int* Ui, int* Up, int* device_n*/, cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU,
+                                       cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner)
+{
+  // Releases the buffers and cuSPARSE objects passed in: tmp_vect_host goes through mxFree, every other
+  // pointer through cudaFree. The error strings name Solve_Cuda_BiCGStab, presumably the caller - confirm.
+  mxFree(tmp_vect_host);
+  cudaChk(cudaFree(p), "  in Solve_Cuda_BiCGStab, can't free p\n");
+  cudaChk(cudaFree(r), "  in Solve_Cuda_BiCGStab, can't free r\n");
+  cudaChk(cudaFree(v), "  in Solve_Cuda_BiCGStab, can't free v\n");
+  cudaChk(cudaFree(s), "  in Solve_Cuda_BiCGStab, can't free s\n");
+  cudaChk(cudaFree(t), "  in Solve_Cuda_BiCGStab, can't free t\n");
+  cudaChk(cudaFree(y_), "  in Solve_Cuda_BiCGStab, can't free y_\n");
+  cudaChk(cudaFree(z), "  in Solve_Cuda_BiCGStab, can't free z\n");
+  cudaChk(cudaFree(tmp_), "  in Solve_Cuda_BiCGStab, can't free tmp_\n");
+  cudaChk(cudaFree(Ai), "  in Solve_Cuda_BiCGStab, can't free Ai\n");
+  cudaChk(cudaFree(Ax), "  in Solve_Cuda_BiCGStab, can't free Ax\n");
+  cudaChk(cudaFree(Ap), "  in Solve_Cuda_BiCGStab, can't free Ap\n");
+  cudaChk(cudaFree(x0), "  in Solve_Cuda_BiCGStab, can't free x0\n");
+  cudaChk(cudaFree(b), "  in Solve_Cuda_BiCGStab, can't free b\n");
+  /*if (preconditioner == 0)
+    {*/
+      cudaChk(cudaFree(A_tild), "  in Solve_Cuda_BiCGStab, can't free A_tild (1)\n"); // the A_tild arrays are freed whatever the preconditioner (the test around them is commented out)
+      cudaChk(cudaFree(A_tild_i), "  in Solve_Cuda_BiCGStab, can't free A_tild_i (1)\n");
+      cudaChk(cudaFree(A_tild_p), "  in Solve_Cuda_BiCGStab, can't free A_tild_p (1)\n");
+    /*}
+  else
+    {
+      cudaChk(cudaFree(Lx), "  in Solve_Cuda_BiCGStab, can't free Lx\n");
+      cudaChk(cudaFree(Li), "  in Solve_Cuda_BiCGStab, can't free Li\n");
+      cudaChk(cudaFree(Lp), "  in Solve_Cuda_BiCGStab, can't free Lp\n");
+      cudaChk(cudaFree(Ux), "  in Solve_Cuda_BiCGStab, can't free Ux\n");
+      cudaChk(cudaFree(Ui), "  in Solve_Cuda_BiCGStab, can't free Ui\n");
+      cudaChk(cudaFree(Up), "  in Solve_Cuda_BiCGStab, can't free Up\n");
+    }*/
+  //cudaChk(cudaFree(device_n), "  in Solve_Cuda_BiCGStab, can't free device_n\n");
+  if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) // the analysis infos are destroyed only for these preconditioner values
+    {
+      cusparseChk(cusparseDestroySolveAnalysisInfo(infoL),
+                  "  in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoL\n");
+      cusparseChk(cusparseDestroySolveAnalysisInfo(infoU),
+                  "  in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoU\n");
+    }
+  cusparseChk(cusparseDestroyMatDescr(descrL), // the two descriptors are destroyed unconditionally
+              " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrL\n");
+  cusparseChk(cusparseDestroyMatDescr(descrU),
+              " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrU\n");
+}
+#endif
+
+void
+Solve(double* Ax, int* Ap, int* Ai, double *b, int n, bool Lower, double *x)
+{
+  // Triangular solve on a compressed sparse matrix: the slice Ap[cur]..Ap[cur+1]
+  // of Ai/Ax holds the indices and values attached to unknown cur.
+  if (!Lower)
+    {
+      // Backward sweep: entries with a larger index are eliminated, then the
+      // result is divided by the diagonal entry (1 when none is stored).
+      for (int cur = n; cur-- > 0;)
+        {
+          double acc = 0, diag = 1;
+          for (int pos = Ap[cur]; pos < Ap[cur+1]; pos++)
+            {
+              const int other = Ai[pos];
+              if (other == cur)
+                diag = Ax[pos];
+              else if (other > cur)
+                acc += x[other] * Ax[pos];
+            }
+          x[cur] = (b[cur] - acc) / diag;
+        }
+      return;
+    }
+  // Forward sweep: only entries with a smaller index are used (unit diagonal).
+  for (int cur = 0; cur < n; cur++)
+    {
+      double acc = 0;
+      for (int pos = Ap[cur]; pos < Ap[cur+1]; pos++)
+        if (Ai[pos] < cur)
+          acc += x[Ai[pos]] * Ax[pos];
+      x[cur] = b[cur] - acc;
+    }
+}
+
+void
+Check(int n, double* Ax, int* Ap, int* Ai, double* b, double *x, bool Lower)
+{
+  // Debug helper: recomputes the residual of x for the compressed triangular matrix (Ap, Ai, Ax)
+  // and prints every index i whose residual exceeds 1e-10 in absolute value.
+  //   Lower:  residual = b[i] - sum over k <  i of Ax*x[k] - x[i]  (stored entries with k >= i are ignored)
+  //   !Lower: residual = b[i] - sum over k >= i of Ax*x[k]
+  if (Lower)
+    for (int i = 0; i < n; i++)
+      {
+        double sum = 0;
+        for(int j = Ap[i]; j < Ap[i+1]; j++)
+          {
+            int k = Ai[j];
+            if (k < i)
+              sum += x[k] * Ax[j];
+          }
+        double err =  b[i] - sum - x[i];
+        if (fabs(err) > 1e-10) // fabs, not abs: the integer abs() would truncate err to 0
+          mexPrintf("error at i=%d\n",i);
+      }
+  else
+    for (int i = n-1 ; i >= 0; i--)
+      {
+        double sum = 0;
+        for(int j = Ap[i]; j < Ap[i+1]; j++)
+          {
+            int k = Ai[j];
+            if (k >= i)
+              sum += x[k] * Ax[j];
+          }
+        double err =  b[i] - sum;
+        if (fabs(err) > 1e-10)
+          mexPrintf("error at i=%d\n",i);
+      }
+}
+
+#ifdef CUDA
+int
+dynSparseMatrix::Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries,
+                                  int  it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block)
+{
+  cusparseSolveAnalysisInfo_t info, infoL, infoU;
+  cusparseMatDescr_t descrL, descrU;
+  const double tol = 1.0e-6;//1.0e-6;
+  const double eps = 1.0e-16;
+  double *p, *r, *r0, *v, *s, *t, *y_, *z, *tmp_;
+  int *A_tild_i, *A_tild_p;
+  double *Qx;
+  int *Qi, *Qj;
+  double *Px;
+  int *Pi, *Pj;
+  int Q_nnz, P_nnz;
+  int W_nnz;
+  double bnorm;
+  double tmp1, tmp2;
+  int refinement_needed = 0, stagnation = 0;
+  int max_refinement = min(min(int(floor(double(n)/50)),10),n-max_iterations), max_stagnation = 3;
+  int nblocks = ceil(double(n) / double(1024));
+  int n_threads;
+  if (nblocks == 0)
+    n_threads = n;
+  else
+    n_threads = 1024;
+  int periods = n / Size;
+
+  double * tmp_vect_host = (double*)mxMalloc(n * sizeof(double));
+
+  cublasChk(cublasDnrm2(cublas_handle, n,b, 1, &bnorm),
+            "  in Solve_Cuda_BiCGStab, cublasDnrm2(b) has failed\n");
+
+  double tolb = tol * bnorm;
+
+  if (bnorm == 0.0)
+    {
+      // if b = 0 the A.x = 0 => x = 0
+      cudaChk(cudaFree(Ai), "  in Solve_Cuda_BiCGStab, can't free Ai\n");
+      cudaChk(cudaFree(Ax), "  in Solve_Cuda_BiCGStab, can't free Ax\n");
+      cudaChk(cudaFree(Ap), "  in Solve_Cuda_BiCGStab, can't free Ap\n");
+      if (preconditioner == 3)
+        {
+          cudaChk(cudaFree(Ai_tild), "  in Solve_Cuda_BiCGStab, can't free Ai_tild\n");
+          cudaChk(cudaFree(Ap_tild), "  in Solve_Cuda_BiCGStab, can't free Ap_tild\n");
+        }
+      cudaChk(cudaFree(A_tild), "  in Solve_Cuda_BiCGStab, can't free A_tild\n");
+      cudaChk(cudaFree(x0), "  in Solve_Cuda_BiCGStab, can't free x0\n");
+      cudaChk(cudaFree(b), "  in Solve_Cuda_BiCGStab, can't free b\n");
+      if (is_two_boundaries)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+        for (int i = 0; i < n; i++)
+          {
+            int eq = index_vara[i+Size*y_kmin];
+            double yy = -y[eq];
+            direction[eq] = yy;
+            y[eq] += slowc * yy;
+          }
+      else
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+        for (int i = 0; i < n; i++)
+          {
+            int eq = index_vara[i];
+            double yy = -y[eq+it_*y_size];
+            direction[eq] = yy;
+            y[eq+it_*y_size] += slowc * yy;
+          }
+      return 0;
+    }
+
+  int iteration = 0;
+  bool convergence = false;
+  double zeros = 0.0, one = 1.0, m_one = -1.0;
+
+  cudaChk(cudaMalloc((void**)&tmp_, n * sizeof(double)), "  in Solve_Cuda_Sparse, can't allocate tmp_ on the graphic card\n");
+
+  cudaChk(cudaMalloc((void**)&r, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate r on the graphic card\n");
+
+  cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n");
+
+  //r = b - A * x0
+  cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n,
+                                   n, nnz, &m_one,
+                                   CUDA_descr, Ax,
+                                   Ap, Ai,
+                                   x0, &one,
+                                   r), "in Solve_Cuda_BiCGStab, cusparseDcsrmv A * x0 has failed");
+
+  cudaChk(cudaMemcpy(tmp_vect_host, r, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n");
+  /*mexPrintf("r\n");
+  for (int i = 0; i < n; i++)
+    mexPrintf("%f\n",tmp_vect_host[i]);*/
+
+  cudaChk(cudaMalloc((void**)&r0, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate r0 on the graphic card\n");
+  cudaChk(cudaMemcpy(r0, r, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy r0 = r has failed\n");
+
+  cublasChk(cublasDnrm2(cublas_handle, n, // numerator
+                        r, 1,
+                        &tmp1),
+            "  in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n");
+  double conv_criteria = tmp1;
+
+  convergence = conv_criteria < tolb;
+  if (convergence)
+    {
+      /* the initial value (x0) is a solution of A x = b */
+      cudaChk(cudaFree(Ai), "  in Solve_Cuda_BiCGStab, can't free Ai\n");
+      cudaChk(cudaFree(Ax), "  in Solve_Cuda_BiCGStab, can't free Ax\n");
+      cudaChk(cudaFree(Ap), "  in Solve_Cuda_BiCGStab, can't free Ap\n");
+      if (preconditioner == 3)
+        {
+          cudaChk(cudaFree(Ai_tild), "  in Solve_Cuda_BiCGStab, can't free Ai_tild\n");
+          cudaChk(cudaFree(Ap_tild), "  in Solve_Cuda_BiCGStab, can't free Ap_tild\n");
+        }
+      cudaChk(cudaFree(A_tild), "  in Solve_Cuda_BiCGStab, can't free A_tild\n");
+      cudaChk(cudaFree(x0), "  in Solve_Cuda_BiCGStab, can't free x0\n");
+      cudaChk(cudaFree(b), "  in Solve_Cuda_BiCGStab, can't free b\n");
+      return 0;
+    }
+
+
+  if (preconditioner == 0)
+    {
+      //Apply the Jacobi preconditioner
+      /*VecDiv<<<nblocks, n_threads>>>(r_, A_tild, z_, n);
+      cuda_error = cudaMemcpy(zz_, z_, n * sizeof(double), cudaMemcpyDeviceToDevice);*/
+    }
+  else if (preconditioner == 1)
+    {
+      //Apply an incomplete LU decomposition of A as preconditioner
+      cusparseChk(cusparseCreateSolveAnalysisInfo(&info), "  in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo for info has failed\n");
+
+      cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                          n, nnz, CUDA_descr,
+                                          A_tild, Ap, Ai,
+                                          info),
+                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis(info) has failed\n");
+
+      cusparseChk(cusparseDcsrilu0(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                   n, CUDA_descr,
+                                   A_tild, Ap, Ai,
+                                   info),
+                  "  in Solve_Cuda_BiCGStab, cusparseDcsrilu0 has failed\n");
+
+      //Make a copy of the indexes in A_tild_i and A_tild_p to use them in the BiCGStab algorithm
+      cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild_i on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild_i, Ai, nnz * sizeof(int), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Ai has failed\n");
+      cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild_p, Ap, (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Ap has failed\n");
+    }
+  else if (preconditioner == 2)
+    {
+      //Because the Jacobian matrix A is stored in CSC format in Matlab,
+      // we have to transpose it to get the CSR format used by CUDA
+      mwIndex* Awi, *Awp;
+      double* A_tild_host = (double*)mxMalloc(nnz*sizeof(double));
+      Awi = (mwIndex*)mxMalloc(nnz * sizeof(mwIndex));
+      Awp = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex));
+      int* Aii = (int*)mxMalloc(nnz * sizeof(int));
+      int* Aip = (int*)mxMalloc((n + 1) * sizeof(int));
+      cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n");
+      cudaChk(cudaMemcpy(Aii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai has failed\n");
+      cudaChk(cudaMemcpy(Aip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ai has failed\n");
+      for (int i = 0; i < nnz; i++)
+        Awi[i] = Aii[i];
+      for (int i = 0; i < n + 1; i++)
+        Awp[i] = Aip[i];
+      mxFree(Aii);
+      mxFree(Aip);
+      mxArray * At_m = mxCreateSparse(n,n,nnz,mxREAL);
+      mxSetIr(At_m, Awi);
+      mxSetJc(At_m, Awp);
+      mxSetPr(At_m, A_tild_host);
+      mxArray *A_m;
+      mexCallMATLAB(1, &A_m, 1, &At_m, "transpose");
+      mxDestroyArray(At_m);
+
+      /*mexPrintf("A_m\n");
+      mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/
+      /*mxFree(Awi);
+      mxFree(Awp);*/
+
+      /*[L1, U1] = ilu(g1a=;*/
+      const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"};
+      const int type = 0;
+      const int droptol = 1;
+      const int milu = 2;
+      const int udiag = 3;
+      const int thresh = 4;
+      mwSize dims[1] = {(mwSize)1 };
+      mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names);
+      mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp"));
+      //mxSetFieldByNumber(Setup, 0, type, mxCreateString("nofill"));
+      mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol));
+      mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off"));
+      mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0));
+      mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0));
+      //mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1));
+      mxArray *lhs0[2];
+      mxArray *rhs0[2];
+      rhs0[0] = A_m;
+      rhs0[1] = Setup;
+      mexCallMATLAB(2, lhs0, 2, rhs0, "ilu");
+      L1 = lhs0[0];
+      U1 = lhs0[1];
+      mxDestroyArray(Setup);
+
+
+ /*     //ILUT preconditioner computed by Matlab (todo: in a future version of cuda replace it by a new equivalent cuda function)
+      const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"};
+      const int type = 0;
+      const int droptol = 1;
+      const int milu = 2;
+      const int udiag = 3;
+      const int thresh = 4;
+      mwSize dims[1] = {(mwSize)1 };
+      mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names);
+      mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp"));
+      mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol));
+      mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off"));
+      mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0));
+      mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0));
+      mxArray *lhs0[2], *rhs0[2];
+      rhs0[0] = A_m;
+      rhs0[1] = Setup;
+      mexCallMATLAB(1, lhs0, 2, rhs0, "ilu");
+*/
+      // To store the resulting matrix in a CSR format we have to transpose it
+      mxArray *Wt = lhs0[0];
+      mwIndex* Wtj = mxGetJc(Wt);
+      nnz = Wtj[n];
+      mxArray* W;
+      mexCallMATLAB(1, &W, 1, &Wt, "transpose");
+      mxDestroyArray(Wt);
+      double* pW = mxGetPr(W);
+      mwIndex* Wi = mxGetIr(W);
+      mwIndex* Wp = mxGetJc(W);
+      int *Wii = (int*)mxMalloc(nnz * sizeof(int));
+      int *Wip = (int*)mxMalloc((n + 1) * sizeof(int));
+      for (int i = 0; i < nnz; i++)
+        Wii[i] = Wi[i];
+      for (int i = 0; i < n + 1; i++)
+        Wip[i] = Wp[i];
+
+      //mxFree(A_tild_host);
+
+      cudaChk(cudaFree(A_tild), "cudaFree(A_tild) has failed\n");
+
+      cudaChk(cudaMalloc((void**)&A_tild, nnz * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n");
+      cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = A_tild_i_host has failed\n");
+      cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n");
+      /*mxFree(pW);
+      mxFree(Wi);
+      mxFree(Wj);*/
+      mxDestroyArray(W);
+      mxFree(Wii);
+      mxFree(Wip);
+    }
+  else if (preconditioner == 3)
+    {
+      mwIndex* Aowi, *Aowp;
+      double* A_host = (double*)mxMalloc(nnz*sizeof(double));
+      Aowi = (mwIndex*)mxMalloc(nnz * sizeof(mwIndex));
+      Aowp = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex));
+      int* Aoii = (int*)mxMalloc(nnz * sizeof(int));
+      int* Aoip = (int*)mxMalloc((n + 1) * sizeof(int));
+      cudaChk(cudaMemcpy(A_host, Ax, nnz*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n");
+      cudaChk(cudaMemcpy(Aoii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n");
+      cudaChk(cudaMemcpy(Aoip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n");
+      for (int i = 0; i < nnz; i++)
+        Aowi[i] = Aoii[i];
+      for (int i = 0; i < n + 1; i++)
+        Aowp[i] = Aoip[i];
+      mxFree(Aoii);
+      mxFree(Aoip);
+      mxArray * Ao_m = mxCreateSparse(n,n,nnz,mxREAL);
+      mxSetIr(Ao_m, Aowi);
+      mxSetJc(Ao_m, Aowp);
+      mxSetPr(Ao_m, A_host);
+      /*mexPrintf("A_m\n");
+      mxArray *Aoo;
+      mexCallMATLAB(1, &Aoo, 1, &Ao_m, "transpose");
+      mexCallMATLAB(0, NULL, 1, &Aoo, "disp_dense");
+      mxDestroyArray(Ao_m);
+      mxDestroyArray(Aoo);*/
+
+      //Because the Jacobian matrix A is stored in CSC format in Matlab,
+      // we have to transpose it to get the CSR format used by CUDA
+      mwIndex* Awi, *Awp;
+      double* A_tild_host = (double*)mxMalloc(nnz_tild*sizeof(double));
+      Awi = (mwIndex*)mxMalloc(nnz_tild * sizeof(mwIndex));
+      Awp = (mwIndex*)mxMalloc((Size + 1) * sizeof(mwIndex));
+      int* Aii = (int*)mxMalloc(nnz_tild * sizeof(int));
+      int* Aip = (int*)mxMalloc((Size + 1) * sizeof(int));
+      cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz_tild*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n");
+      cudaChk(cudaMemcpy(Aii, Ai_tild, nnz_tild*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n");
+      cudaChk(cudaMemcpy(Aip, Ap_tild, (Size+1)*sizeof(int), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n");
+      for (int i = 0; i < nnz_tild; i++)
+        Awi[i] = Aii[i];
+      for (int i = 0; i < Size + 1; i++)
+        Awp[i] = Aip[i];
+      /*for (int i = 0; i < nnz_tild; i++)
+        mexPrintf("%20.17f\n",A_tild_host[i]);*/
+      mxFree(Aii);
+      mxFree(Aip);
+      mxArray * At_m = mxCreateSparse(Size,Size,nnz_tild,mxREAL);
+      mxSetIr(At_m, Awi);
+      mxSetJc(At_m, Awp);
+      mxSetPr(At_m, A_tild_host);
+      mxArray *A_m;
+      mexCallMATLAB(1, &A_m, 1, &At_m, "transpose");
+      /*mexPrintf("A_tild_m\n");
+      mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/
+      mxDestroyArray(At_m);
+      mxArray *P, *Q, *L, *U;
+      mxArray *lhs0[4];
+      mexCallMATLAB(4, lhs0, 1, &A_m, "lu");
+
+      mxArray *P0, *Q0, *L0, *U0;
+      L0 = lhs0[0];
+      U0 = lhs0[1];
+      P0 = lhs0[2];
+      Q0 = lhs0[3];
+      mexCallMATLAB(1, &P, 1, &P0, "transpose");
+      mexCallMATLAB(1, &Q, 1, &Q0, "transpose");
+      mexCallMATLAB(1, &L, 1, &L0, "transpose");
+      mexCallMATLAB(1, &U, 1, &U0, "transpose");
+      mxDestroyArray(P0);
+      mxDestroyArray(Q0);
+      mxDestroyArray(L0);
+      mxDestroyArray(U0);
+      /*L = lhs0[0];
+      U = lhs0[1];
+      P = lhs0[2];
+      Q = lhs0[3];*/
+
+      /*mexPrintf("L\n");
+      mexCallMATLAB(0, NULL, 1, &L, "disp_dense");
+
+      mexPrintf("U\n");
+      mexCallMATLAB(0, NULL, 1, &U, "disp_dense");
+
+      mexPrintf("P\n");
+      mexCallMATLAB(0, NULL, 1, &P, "disp_dense");
+
+      mexPrintf("Q\n");
+      mexCallMATLAB(0, NULL, 1, &Q, "disp_dense");*/
+
+      mwIndex* Qiw_host = mxGetIr(Q);
+      mwIndex* Qjw_host = mxGetJc(Q);
+      double*  Qx_host = mxGetPr(Q);
+      Q_nnz = Qjw_host[Size];
+      mexPrintf("Q_nnz=%d\n",Q_nnz);
+      int *Qi_host = (int*)mxMalloc(Q_nnz * periods * sizeof(int));
+      double *Q_x_host = (double*)mxMalloc(Q_nnz * periods * sizeof(double));
+      int *Qj_host = (int*)mxMalloc((n + 1) * sizeof(int));
+      for (int t = 0; t < periods; t++)
+        {
+          for (int i = 0; i < Q_nnz; i++)
+            {
+              Qi_host[i + t * Q_nnz] = Qiw_host[i] + t * Size;
+              Q_x_host[i + t * Q_nnz] = Qx_host[i];
+            }
+          for (int i = 0; i < Size; i++)
+            {
+              Qj_host[i + t * Size] = Qjw_host[i] + t * Q_nnz;
+            }
+        }
+      Qj_host[periods * Size] = periods * Q_nnz;
+
+
+      /*mwIndex *Qtiw_host  = (mwIndex*) mxMalloc(Q_nnz * periods * sizeof(mwIndex));
+      double *Qt_x_host = (double*)mxMalloc(Q_nnz * periods * sizeof(double));
+      mwIndex *Qtjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex));
+      mexPrintf("n = %d\n",n);
+      for (int i = 0; i < n + 1; i++)
+        Qtjw_host[i] = Qj_host[i];
+      for (int i = 0; i < Q_nnz * periods; i++)
+        {
+          Qtiw_host[i] = Qi_host[i];
+          Qt_x_host[i] = Q_x_host[i];
+        }
+      mxArray* Qt_m = mxCreateSparse(n,n,Q_nnz * periods,mxREAL);
+      mxSetIr(Qt_m, Qtiw_host);
+      mxSetJc(Qt_m, Qtjw_host);
+      mxSetPr(Qt_m, Qt_x_host);
+      mexPrintf("Qt_m\n");
+      mexCallMATLAB(0, NULL, 1, &Qt_m, "disp_dense");*/
+
+
+      /*mexPrintf("Qtjw_host[periods * Size=%d]=%d\n", periods * Size, Qtjw_host[periods * Size]);
+      for (int i = 0; i < n; i++)
+        for (int j = Qtjw_host[i]; j < Qtjw_host[i+1]; j++)
+           mexPrintf("(i=%d, j=%d) = %f\n", i, Qtiw_host[j], Qt_x_host[j]);*/
+      //mxDestroyArray(Qt_m);
+
+
+      cudaChk(cudaMalloc((void**)&Qx, Q_nnz * periods * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate Qx on the graphic card\n");
+      cudaChk(cudaMemcpy(Qx, Q_x_host, Q_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Qx = Qx_host has failed\n");
+      cudaChk(cudaMalloc((void**)&Qi, Q_nnz * periods * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Qi on the graphic card\n");
+      cudaChk(cudaMemcpy(Qi, Qi_host, Q_nnz * periods * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Qi = Qi_host has failed\n");
+      cudaChk(cudaMalloc((void**)&Qj, (Size * periods + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Qj on the graphic card\n");
+      cudaChk(cudaMemcpy(Qj, Qj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Qj = Qj_host has failed\n");
+      mxFree(Qi_host);
+      mxFree(Qj_host);
+      mxFree(Q_x_host);
+      mxDestroyArray(Q);
+
+
+      mwIndex* Piw_host = mxGetIr(P);
+      mwIndex* Pjw_host = mxGetJc(P);
+      double*  Px_host = mxGetPr(P);
+      P_nnz = Pjw_host[Size];
+      int *Pi_host = (int*)mxMalloc(P_nnz * periods * sizeof(int));
+      double *P_x_host = (double*)mxMalloc(P_nnz * periods * sizeof(double));
+      int *Pj_host = (int*)mxMalloc((n + 1) * sizeof(int));
+      for (int t = 0; t < periods; t++)
+        {
+          for (int i = 0; i < P_nnz; i++)
+            {
+              Pi_host[i + t * P_nnz] = Piw_host[i] + t * Size;
+              P_x_host[i + t * P_nnz] = Px_host[i];
+            }
+          for (int i = 0; i < Size; i++)
+            Pj_host[i + t * Size] = Pjw_host[i] + t * P_nnz;
+        }
+      Pj_host[periods * Size] = periods * P_nnz;
+
+      /*mwIndex *Ptiw_host  = (mwIndex*) mxMalloc(P_nnz * periods * sizeof(mwIndex));
+      double *Pt_x_host = (double*)mxMalloc(P_nnz * periods * sizeof(double));
+      mwIndex *Ptjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex));
+      for (int i = 0; i < n + 1; i++)
+        Ptjw_host[i] = Pj_host[i];
+      for (int i = 0; i < P_nnz * periods; i++)
+        {
+          Ptiw_host[i] = Pi_host[i];
+          Pt_x_host[i] = P_x_host[i];
+        }
+      mxArray* Pt_m = mxCreateSparse(n,n,P_nnz * periods,mxREAL);
+      mxSetIr(Pt_m, Ptiw_host);
+      mxSetJc(Pt_m, Ptjw_host);
+      mxSetPr(Pt_m, Pt_x_host);
+      mexPrintf("Pt_m\n");
+      mexCallMATLAB(0, NULL, 1, &Pt_m, "disp_dense");
+      mxDestroyArray(Pt_m);*/
+
+
+      cudaChk(cudaMalloc((void**)&Px, P_nnz * periods * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n");
+      cudaChk(cudaMemcpy(Px, P_x_host, P_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n");
+      cudaChk(cudaMalloc((void**)&Pi, P_nnz * periods * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n");
+      cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * periods * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n");
+      cudaChk(cudaMalloc((void**)&Pj, (Size * periods + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n");
+      cudaChk(cudaMemcpy(Pj, Pj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n");
+      mxFree(Pi_host);
+      mxFree(Pj_host);
+      mxFree(P_x_host);
+      mxDestroyArray(P);
+
+      /*mwIndex* Piw_host = mxGetIr(P);
+      mwIndex* Pjw_host = mxGetJc(P);
+      double*  Px_host = mxGetPr(P);
+      P_nnz = Pjw_host[Size];
+      int *Pi_host = (int*)mxMalloc(P_nnz * sizeof(int));
+      int *Pj_host = (int*)mxMalloc((Size + 1) * sizeof(int));
+      for (int i = 0; i < P_nnz; i++)
+        Pi_host[i] = Piw_host[i];
+      for (int i = 0; i < Size + 1; i++)
+        Pj_host[i] = Pjw_host[i];
+
+      cudaChk(cudaMalloc((void**)&Px, P_nnz * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n");
+      cudaChk(cudaMemcpy(Px, Px_host, P_nnz * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n");
+      cudaChk(cudaMalloc((void**)&Pi, P_nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n");
+      cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n");
+      cudaChk(cudaMalloc((void**)&Pj, (Size + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n");
+      cudaChk(cudaMemcpy(Pj, Pj_host, (Size + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n");
+      mxFree(Pi_host);
+      mxFree(Pj_host);
+      mxDestroyArray(P);*/
+
+      /*mexPrintf("L\n");
+      mexCallMATLAB(0, NULL, 1, &L, "disp_dense");
+
+      mexPrintf("U\n");
+      mexCallMATLAB(0, NULL, 1, &U, "disp_dense");*/
+
+      mwIndex* Liw_host = mxGetIr(L);
+      mwIndex* Ljw_host = mxGetJc(L);
+      double*  Lx_host = mxGetPr(L);
+      int L_nnz = Ljw_host[Size];
+
+      mwIndex* Uiw_host = mxGetIr(U);
+      mwIndex* Ujw_host = mxGetJc(U);
+      double*  Ux_host = mxGetPr(U);
+      int U_nnz = Ujw_host[Size];
+
+      double *pW = (double*)mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(double));
+      int *Wi = (int*)mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(int));
+      int *Wj = (int*)mxMalloc((n + 1) * sizeof(int));
+      Wj[0] = 0;
+      W_nnz = 0;
+      for (int t = 0; t < periods; t++)
+        for (int i = 0; i < Size ; i++)
+          {
+            for (mwIndex l  = Ujw_host[i]; l < Ujw_host[i+1]; l++)
+              {
+                Wi[W_nnz] = Uiw_host[l] + t * Size;
+                pW[W_nnz] = Ux_host[l];
+                //mexPrintf("Wj[%d] = %d, Wi[%d] = Uiw_host[%d] + t * Size = %d, pW[%d]=%f\n", i + t * Size, Wj[i + t * Size], W_nnz, l, Uiw_host[l] + t * Size, W_nnz, Ux_host[l]);
+                W_nnz++;
+              }
+            for (mwIndex l  = Ljw_host[i]; l < Ljw_host[i+1]; l++)
+              {
+                if (Liw_host[l] > i)
+                  {
+                    Wi[W_nnz] = Liw_host[l] + t * Size;
+                    pW[W_nnz] = Lx_host[l];
+                    //mexPrintf("Wj[%d] = %d, Wi[%d] = Liw_host[%d] + t * Size = %d, pW[%d]=%f\n", i  + t * Size, Wj[i + t * Size], W_nnz, l, Liw_host[l] + t * Size, W_nnz, Lx_host[l]);
+                    W_nnz++;
+                  }
+              }
+            Wj[i + 1 + t * Size] = W_nnz;
+          }
+      //mexPrintf("Wj[%d] = %d, n=%d\n", periods * Size, Wj[periods * Size], n);
+      cudaChk(cudaMalloc((void**)&A_tild, W_nnz * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild, pW, W_nnz * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n");
+      cudaChk(cudaMalloc((void**)&A_tild_i, W_nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild_i, Wi, W_nnz * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Wi has failed\n");
+      cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild_p, Wj, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Wj has failed\n");
+
+      /*mwIndex *Wwi = (mwIndex*)mxMalloc(W_nnz * sizeof(mwIndex));
+      mwIndex *Wwj = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex));
+      for (int i = 0; i < W_nnz; i++)
+        Wwi[i] = Wi[i];
+      for (int i = 0; i < n + 1; i++)
+        Wwj[i] = Wj[i];
+      mxFree(Wi);
+      mxFree(Wj);
+      mxArray* Ao_tild = mxCreateSparse(n,n,W_nnz,mxREAL);
+      mxSetIr(Ao_tild, Wwi);
+      mxSetJc(Ao_tild, Wwj);
+      mxSetPr(Ao_tild, pW);
+      mexPrintf("Ao_tild\n");
+      mexCallMATLAB(0, NULL, 1, &Ao_tild, "disp_dense");
+      mxDestroyArray(Ao_tild);*/
+
+
+      /*ostringstream tmp;
+      tmp << "debugging";
+      mexWarnMsgTxt(tmp.str().c_str());
+      return 4;*/
+
+      /* /**Apply the permutation matrices (P and Q) to the b vector of system to solve :
+       b_tild = P-1 . b  = P' . b */
+      /*cudaChk(cudaMalloc((void**)&b_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n");
+      cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE,
+                                 n, n, nnz, &one, CUDA_descr,
+                                 Px, Pj, Pi,
+                                 b, &zeros,
+                                 b_tild),
+                  "  in Solve_Cuda_BiCGStab, b_tild = cusparseDcsrmv(P', b) has failed\n");
+
+      cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE,
+                                 n, n, nnz, &one, CUDA_descr,
+                                 Px, Pj, Pi,
+                                 b, &zeros,
+                                 b),
+                  "  in Solve_Cuda_BiCGStab, b = cusparseDcsrmv(P', b) has failed\n");
+      */
+      /*mexPrintf("Wt = lu(A_m)\n");
+      mexCallMATLAB(0, NULL, 1, &Wt, "disp_dense");*/
+      /*ostringstream tmp;
+      tmp << "debugging";
+      mexWarnMsgTxt(tmp.str().c_str());
+      return 4;*/
+      // To store the resulting matrix in a CSR format we have to transpose it
+      /*mwIndex* Wtj = mxGetJc(Wt);
+      nnz = Wtj[n];
+      mxArray* W;
+      mexCallMATLAB(1, &W, 1, &Wt, "transpose");
+      mxDestroyArray(Wt);
+      pW = mxGetPr(W);
+      Wwi = mxGetIr(W);
+      mwIndex* Wp = mxGetJc(W);
+      int *Wii = (int*)mxMalloc(nnz * sizeof(int));
+      int *Wip = (int*)mxMalloc((n + 1) * sizeof(int));
+      for (int i = 0; i < nnz; i++)
+        Wii[i] = Wi[i];
+      for (int i = 0; i < n + 1; i++)
+        Wip[i] = Wp[i];
+
+      //mxFree(A_tild_host);
+
+      cudaChk(cudaFree(Ai_tild), "  in Solve_Cuda_BiCGStab, cudaFree(Ai_tild) has failed\n");
+      cudaChk(cudaFree(Ap_tild), "  in Solve_Cuda_BiCGStab, cudaFree(Ap_tild) has failed\n");
+      cudaChk(cudaFree(A_tild), "  in Solve_Cuda_BiCGStab, cudaFree(A_tild) has failed\n");
+
+      cudaChk(cudaMalloc((void**)&A_tild, nnz * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n");
+      cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = A_tild_i_host has failed\n");
+      cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), "  in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n");
+      cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n");
+      mxDestroyArray(W);
+      mxFree(Wii);
+      mxFree(Wip);*/
+    }
+  if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3)
+    {
+      cusparseChk(cusparseCreateMatDescr(&descrL),
+                  "  in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrL\n");
+      cusparseChk(cusparseSetMatIndexBase(descrL, CUSPARSE_INDEX_BASE_ZERO),
+                  "  in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrL\n");
+      cusparseChk(cusparseSetMatType(descrL, CUSPARSE_MATRIX_TYPE_GENERAL),
+                  "  in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrL\n");
+      cusparseChk(cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER),
+                  "  in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrL\n");
+      cusparseChk(cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT),
+                  "  in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrL\n");
+
+      cusparseChk(cusparseCreateMatDescr(&descrU),
+                  "  in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrU\n");
+      cusparseChk(cusparseSetMatIndexBase(descrU, CUSPARSE_INDEX_BASE_ZERO),
+                  "  in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrU\n");
+      cusparseChk(cusparseSetMatType(descrU, CUSPARSE_MATRIX_TYPE_GENERAL),
+                  "  in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrU\n");
+      cusparseChk(cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER),
+                  "  in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrU\n");
+      cusparseChk(cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT),
+                  "  in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrU\n");
+
+      int host_nnz_tild;
+      if  (preconditioner == 3)
+        host_nnz_tild = W_nnz;
+      else
+        host_nnz_tild = nnz;
+
+      if (preconditioner == 1)
+        cusparseChk(cusparseDestroySolveAnalysisInfo(info),
+                    "  in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for info\n");
+
+      cusparseChk(cusparseCreateSolveAnalysisInfo(&infoL),
+                  "  in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoL\n");
+      cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                          n, host_nnz_tild, descrL,
+                                          A_tild, A_tild_p, A_tild_i,
+                                          infoL),
+                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoL has failed\n");
+
+      cusparseChk(cusparseCreateSolveAnalysisInfo(&infoU),
+                  "  in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoU\n");
+      cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                          n, host_nnz_tild, descrU,
+                                          A_tild, A_tild_p, A_tild_i,
+                                          infoU),
+                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoU has failed\n");
+    }
+
+  cudaChk(cudaMalloc((void**)&v, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate v on the graphic card\n");
+  cudaChk(cudaMalloc((void**)&p, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate p on the graphic card\n");
+  //cudaChk(cudaMemset(p, 0, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, cudaMemset p = 0 has failed\n");
+  cudaChk(cudaMalloc((void**)&s, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate s on the graphic card\n");
+  cudaChk(cudaMalloc((void**)&t, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate t on the graphic card\n");
+  cudaChk(cudaMalloc((void**)&y_, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate y_ on the graphic card\n");
+  cudaChk(cudaMalloc((void**)&z, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate z on the graphic card\n");
+
+  double rho = 1.0, alpha = 1.0, omega = 1.0;
+
+
+  //residual = P*B*Q - L*U;
+  //norm(Z,1) should be close to 0
+
+
+  while (iteration < max_iterations && !convergence) // same bound as the post-loop non-convergence test
+    {
+      double rho_prev = rho;
+      /**store in s previous value of r*/
+      cudaChk(cudaMemcpy(s, r, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy s = r has failed\n");
+
+      /**rho = r0 . r*/
+      cublasChk(cublasDdot(cublas_handle, n, // numerator
+                           r0, 1,
+                           r, 1,
+                           &rho),
+                "  in Solve_Cuda_BiCGStab, rho = cublasDdot(r0, r) has failed\n");
+
+      mexPrintf("rho=%f\n",rho);
+
+      double beta;
+
+      if (iteration == 0)
+        {
+          cudaChk(cudaMemcpy(p, r, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy p = r has failed\n");
+        }
+      else
+        {
+          /**beta = (rho / rho_prev) . (alpha / omega);*/
+          beta = rho / rho_prev * alpha / omega;
+
+          /**p = r + beta * (p - omega * v)*/
+          // tmp_ = p - omega * v
+          VecAdd<<<nblocks, n_threads>>>(tmp_, p, -omega, v, n);
+          //p = r + beta * tmp_
+          VecAdd<<<nblocks, n_threads>>>(p, r, beta, tmp_, n);
+        }
+
+      /**y_ solution of A_tild * y_ = p <=> L . U . y_ = p*/
+      //  L tmp_ = p => tmp_ = L^-1 p, with tmp_ = U . y_
+
+      if (preconditioner == 3)
+        {
+          double *p_tild;
+          mexPrintf("n=%d\n",n);
+
+          cudaChk(cudaMemcpy(tmp_vect_host, p, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p has failed\n");
+          /*mexPrintf("p\n");
+          for (int i = 0; i < n; i++)
+             mexPrintf("%f\n",tmp_vect_host[i]);*/
+
+          cudaChk(cudaMalloc((void**)&p_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate p_tild on the graphic card\n");
+          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                     n, n, P_nnz * periods, &one, CUDA_descr,
+                                     Px, Pj, Pi,
+                                     p, &zeros,
+                                     p_tild),
+                      "  in Solve_Cuda_BiCGStab, p_tild = cusparseDcsrmv(P', p) has failed\n");
+
+          /*mexPrintf("P\n");
+          printM(n, Px, Pj, Pi, CUDA_descr, cusparse_handle);*/
+
+          cudaChk(cudaMemcpy(tmp_vect_host, p_tild, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n");
+          /*mexPrintf("p_tild\n");
+          for (int i = 0; i < n; i++)
+             mexPrintf("%f\n",tmp_vect_host[i]);*/
+
+          cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           n, &one,
+                                           descrL,
+                                           A_tild, A_tild_p, A_tild_i,
+                                           infoL, p_tild,
+                                           tmp_),
+                      "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p_tild has failed\n");
+          cudaChk(cudaFree(p_tild), "  in Solve_Cuda_BiCGStab, can't free p_tild\n");
+
+          cudaChk(cudaMemcpy(tmp_vect_host, tmp_, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = tmp_ has failed\n");
+          /*mexPrintf("tmp_\n");
+          for (int i = 0; i < n; i++)
+             mexPrintf("%f\n",tmp_vect_host[i]);*/
+        }
+      else
+        cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                         n, &one,
+                                         descrL,
+                                         A_tild, A_tild_p, A_tild_i,
+                                         infoL, p,
+                                         tmp_),
+                    "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p has failed\n");
+
+      //  U . y_ = L^-1 p <=> U . y_ = tmp_ => y_ = U^-1 L^-1 p
+      cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                       n, &one,
+                                       descrU,
+                                       A_tild, A_tild_p, A_tild_i,
+                                       infoU, tmp_,
+                                       y_),
+                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . y_ = tmp_ has failed\n");
+
+      /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n");
+      mexPrintf("y_\n");
+      for (int i = 0; i < n; i++)
+        mexPrintf("%f\n",tmp_vect_host[i]);*/
+
+      if (preconditioner == 3)
+        {
+          double *y_tild;
+          cudaChk(cudaMalloc((void**)&y_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate y_tild on the graphic card\n");
+          cudaChk(cudaMemcpy(y_tild, y_, n  * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy y_tild = y_ has failed\n");
+          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                     n, n, Q_nnz * periods, &one, CUDA_descr,
+                                     Qx, Qj, Qi,
+                                     y_tild, &zeros,
+                                     y_),
+                      "  in Solve_Cuda_BiCGStab, y_ = cusparseDcsrmv(Q', y_tild) has failed\n");
+          cudaChk(cudaFree(y_tild), "  in Solve_Cuda_BiCGStab, can't free y_tild\n");
+        }
+      /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n");
+      mexPrintf("y_\n");
+      for (int i = 0; i < n; i++)
+        mexPrintf("%f\n",tmp_vect_host[i]);*/
+      /**v = A*y_*/
+      cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                 n, n, nnz, &one, CUDA_descr,
+                                 Ax, Ap, Ai,
+                                 y_, &zeros,
+                                 v),
+                  "  in Solve_Cuda_BiCGStab, v = cusparseDcsrmv(A, y_) has failed\n");
+      cudaChk(cudaMemcpy(tmp_vect_host, v, n*sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n");
+      /*mexPrintf("v\n");
+      for (int i = 0; i < n; i++)
+        mexPrintf("%f\n",tmp_vect_host[i]);*/
+
+
+
+      /**alpha = rho / (rr0 . v) with rr0 = r0*/
+      cublasChk(cublasDdot(cublas_handle, n, // numerator
+                           r0, 1,
+                           v, 1,
+                           &tmp1),
+                "  in Solve_Cuda_BiCGStab, cublasDdot(r0, v) has failed\n");
+
+      alpha = rho / tmp1;
+      mexPrintf("rho = %f, tmp1 = %f\n", rho, tmp1);
+      mexPrintf("alpha = %f\n", alpha);
+
+      if (alpha == 0 || isinf(alpha) || isnan(alpha))
+        {
+          Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner);
+          ostringstream tmp;
+          tmp << "one of the scalar quantities (alpha=" << alpha << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1;
+          mexWarnMsgTxt(tmp.str().c_str());
+          return 4;
+        }
+
+      /** Check for potential stagnation*/
+      cublasChk(cublasDnrm2(cublas_handle, n, // numerator
+                            y_, 1,
+                            &tmp1),
+                "  in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n");
+      cublasChk(cublasDnrm2(cublas_handle, n, // denominator
+                            x0, 1,
+                            &tmp2),
+                "  in Solve_Cuda_BiCGStab, cublasDnrm2(x0) has failed\n");
+      mexPrintf("abs(alpha)*tmp1  = %f, alpha = %f, tmp1 = %f, tmp2 = %f, eps = %f\n",abs(alpha)*tmp1 , alpha, tmp1, tmp2, eps);
+      if (abs(alpha)*tmp1  < eps * tmp2)
+        stagnation++;
+      else
+        stagnation = 0;
+
+      /**x = x + alpha * y_*/
+      VecInc<<<nblocks, n_threads>>>(x0, alpha, y_, n);
+
+      /**s = r_prev - alpha *v with r_prev = s*/
+      VecInc<<<nblocks, n_threads>>>(s, -alpha, v, n);
+
+      /**Has BiCGStab converged?*/
+      cublasChk(cublasDnrm2(cublas_handle, n, // numerator
+                            s, 1,
+                            &tmp1),
+                "  in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n");
+      conv_criteria = tmp1;
+      mexPrintf("conv_criteria = %f, tolb = %f\n", conv_criteria, tolb);
+      convergence = conv_criteria < tolb;
+
+      if (convergence || stagnation >= max_stagnation || refinement_needed)
+        {
+          /**s = b - A * x0*/
+          cudaChk(cudaMemcpy(s, b, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy s = b has failed\n");
+          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                     n, n, nnz, &m_one, CUDA_descr,
+                                     Ax, Ap, Ai,
+                                     x0, &one,
+                                     s),
+                      "  in Solve_Cuda_BiCGStab, s = b - cusparseDcsrmv(A, x0) has failed\n");
+          cublasChk(cublasDnrm2(cublas_handle, n, // numerator
+                                s, 1,
+                                &tmp1),
+                    "  in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n");
+          conv_criteria = tmp1;
+          convergence = conv_criteria < tolb;
+          if (convergence)
+            {
+              break;
+            }
+          else
+            {
+              if (stagnation >= max_stagnation && refinement_needed == 0)
+                stagnation = 0;
+              refinement_needed++;
+              if (refinement_needed > max_refinement)
+                {
+                  Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner);
+                  ostringstream tmp;
+                  tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1;
+                  mexWarnMsgTxt(tmp.str().c_str());
+                  return 3;
+                }
+            }
+        }
+
+      /**z solution of A_tild * z = s*/
+      //  L tmp_ = s => tmp_ = L^-1 s, with tmp_ = U . z
+      if (preconditioner == 3)
+        {
+          double *s_tild;
+          cudaChk(cudaMalloc((void**)&s_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate s_tild on the graphic card\n");
+          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                     n, n, P_nnz * periods, &one, CUDA_descr,
+                                     Px, Pj, Pi,
+                                     s, &zeros,
+                                     s_tild),
+                      "  in Solve_Cuda_BiCGStab, s_tild = cusparseDcsrmv(P', s) has failed\n");
+          cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                           n, &one,
+                                           descrL,
+                                           A_tild, A_tild_p, A_tild_i,
+                                           infoL, s_tild,
+                                           tmp_),
+                      "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s_tild has failed\n");
+          cudaChk(cudaFree(s_tild), "  in Solve_Cuda_BiCGStab, can't free s_tild\n");
+        }
+      else
+        cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                         n, &one,
+                                         descrL,
+                                         //Lx, Lp, Li,
+                                         A_tild, A_tild_p, A_tild_i,
+                                         infoL, s,
+                                         tmp_),
+                    "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s has failed\n");
+      //  U . z = L^-1 s <=> U . z = tmp_ => z = U^-1 L^-1 s
+      cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                       n, &one,
+                                       descrU,
+                                       //Ux, Up, Ui,
+                                       A_tild, A_tild_p, A_tild_i,
+                                       infoU, tmp_,
+                                       z),
+                  "  in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . z = tmp_ has failed\n");
+      if (preconditioner == 3)
+        {
+          double *z_tild;
+          cudaChk(cudaMalloc((void**)&z_tild, n * sizeof(double)), "  in Solve_Cuda_BiCGStab, can't allocate z_tild on the graphic card\n");
+          cudaChk(cudaMemcpy(z_tild, z, n  * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy z_tild = z has failed\n");
+          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                     n, n, Q_nnz * periods, &one, CUDA_descr,
+                                     Qx, Qj, Qi,
+                                     z_tild, &zeros,
+                                     z),
+                      "  in Solve_Cuda_BiCGStab, z = cusparseDcsrmv(Q, z_tild) has failed\n");
+          cudaChk(cudaFree(z_tild), "  in Solve_Cuda_BiCGStab, can't free z_tild\n");
+        }
+      /**t = A * z*/
+      cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                 n, n, nnz, &one, CUDA_descr,
+                                 Ax, Ap, Ai,
+                                 z, &zeros,
+                                 t),
+                  "  in Solve_Cuda_BiCGStab, t = cusparseDcsrmv(A, z) has failed\n");
+
+      /** omega = (t' s) / (t' t)*/
+      cublasChk(cublasDdot(cublas_handle, n, // numerator
+                           t, 1,
+                           s, 1,
+                           &tmp1),
+                "  in Solve_Cuda_BiCGStab, cublasDdot(t, s) has failed\n");
+
+      cublasChk(cublasDdot(cublas_handle, n, // numerator
+                           t, 1,
+                           t, 1,
+                           &tmp2),
+                "  in Solve_Cuda_BiCGStab, cublasDdot(t, t) has failed\n");
+
+      omega = tmp1 / tmp2;
+
+      if (omega == 0 || isinf(omega) || isnan(omega))
+        {
+          Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner);
+          ostringstream tmp;
+          mexEvalString("diary off;");
+          tmp << "one of the scalar quantities (omega=" << omega << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1;
+          mexWarnMsgTxt(tmp.str().c_str());
+          return 4;
+        }
+
+      /**x = x +  omega * z*/
+      VecInc<<<nblocks, n_threads>>>(x0, omega, z, n);
+
+      /**r = s - omega * t*/
+      VecAdd<<<nblocks, n_threads>>>(r, s, -omega, t, n);
+
+      /**Has BiCGStab converged?*/
+      cublasChk(cublasDnrm2(cublas_handle, n, // numerator
+                            r, 1,
+                            &tmp1),
+                "  in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n");
+      conv_criteria = tmp1;
+
+      convergence = conv_criteria < tolb;
+
+      if (convergence || stagnation >= max_stagnation || refinement_needed)
+        {
+          /**r = b - A * x0*/
+          cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), "  in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n");
+          cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                     n, n, nnz, &m_one, CUDA_descr,
+                                     Ax, Ap, Ai,
+                                     x0, &one,
+                                     r),
+                      "  in Solve_Cuda_BiCGStab, r = b - cusparseDcsrmv(A, x0) has failed\n");
+          cublasChk(cublasDnrm2(cublas_handle, n, // numerator
+                                r, 1,
+                                &tmp1),
+                    "  in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n");
+          conv_criteria = tmp1;
+          convergence = conv_criteria < tolb;
+          if (convergence)
+            {
+              mexPrintf("convergence achieved\n");
+              break;
+            }
+          else
+            {
+              if (stagnation >= max_stagnation && refinement_needed == 0)
+                stagnation = 0;
+              refinement_needed++;
+              if (refinement_needed > max_refinement)
+                {
+                  Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, /*Lx, Li, Lp, Ux, Ui, Up, device_n, */infoL, infoU, descrL, descrU, preconditioner);
+                  ostringstream tmp;
+                  mexEvalString("diary off;");
+                  tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1;
+                  mexWarnMsgTxt(tmp.str().c_str());
+                  return 3;
+                }
+            }
+        }
+
+      iteration++;
+    }
+  cudaChk(cudaMemcpy(tmp_vect_host, x0, n * sizeof(double), cudaMemcpyDeviceToHost), "  in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = x0 has failed\n");
+
+  if (is_two_boundaries)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+    for (int i = 0; i < n; i++)
+      {
+        int eq = index_vara[i+Size*y_kmin];
+        double yy = -(tmp_vect_host[i] + y[eq]);
+        direction[eq] = yy;
+        y[eq] += slowc * yy;
+      }
+  else
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+    for (int i = 0; i < n; i++)
+      {
+        int eq = index_vara[i];
+        double yy = -(tmp_vect_host[i] + y[eq+it_*y_size]);
+        direction[eq] = yy;
+        y[eq+it_*y_size] += slowc * yy;
+      }
+  Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner);
+
+  if (iteration >= max_iterations)
+    {
+      ostringstream tmp;
+      mexEvalString("diary off;");
+      tmp << "Error in bytecode: No convergence inside BiCGStab, in block " << block+1;
+      mexWarnMsgTxt(tmp.str().c_str());
+      return 1;
+    }
+  else
+    return 0;
+}
+#endif
+
+void
+dynSparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m)
 {
 #ifdef OCTAVE_MEX_FILE
   ostringstream tmp;
@@ -1953,7 +4693,7 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl
     tmp << " GMRES method is not implemented in Octave. You cannot use stack_solve_algo=2, change stack_solve_algo.\n";
   throw FatalExceptionHandling(tmp.str());
 #endif
-  int n = mxGetM(A_m);
+  size_t n = mxGetM(A_m);
   mxArray *lhs0[2];
   mxArray *rhs0[2];
   rhs0[0] = A_m;
@@ -1967,7 +4707,7 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl
   rhs[1] = b_m;
   rhs[2] = mxCreateDoubleScalar(Size);
   rhs[3] = mxCreateDoubleScalar(1e-6);
-  rhs[4] = mxCreateDoubleScalar(n);
+  rhs[4] = mxCreateDoubleScalar((double)n);
   rhs[5] = L1;
   rhs[6] = U1;
   rhs[7] = x0_m;
@@ -2006,6 +4746,9 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl
     {
       double *res = mxGetPr(z);
       if (is_two_boundaries)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
         for (int i = 0; i < n; i++)
           {
             int eq = index_vara[i+Size*y_kmin];
@@ -2014,6 +4757,9 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl
             y[eq] += slowc * yy;
           }
       else
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
         for (int i = 0; i < n; i++)
           {
             int eq = index_vara[i];
@@ -2029,17 +4775,62 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl
 }
 
 void
-SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, bool steady_state)
+dynSparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int preconditioner)
 {
-  unsigned int n = mxGetM(A_m);
-  /*[L1, U1]=luinc(g1a,luinc_tol);*/
-  mxArray *lhs0[2];
-  mxArray *rhs0[2];
-  rhs0[0] = A_m;
-  rhs0[1] = mxCreateDoubleScalar(lu_inc_tol);
-  mexCallMATLAB(2, lhs0, 2, rhs0, "luinc");
-  mxArray *L1 = lhs0[0];
-  mxArray *U1 = lhs0[1];
+  /* preconditioner = 0  => Jacobi (diagonal of A)
+     preconditioner = 1  => Incomplete LU decomposition*/
+  size_t n = mxGetM(A_m);
+  mxArray *L1, *U1, *Diag;
+
+  mxArray *rhs0[4];
+  if (preconditioner == 0)
+    {
+      mxArray *lhs0[1];
+      rhs0[0] = A_m;
+      rhs0[1] = mxCreateDoubleScalar(0);
+      mexCallMATLAB(1, lhs0, 2, rhs0, "spdiags");
+      mxArray* tmp = lhs0[0];
+      double* tmp_val = mxGetPr(tmp);
+      Diag = mxCreateSparse(n, n, n, mxREAL);
+      mwIndex *Diag_i = mxGetIr(Diag);
+      mwIndex *Diag_j = mxGetJc(Diag);
+      double *Diag_val = mxGetPr(Diag);
+      for (size_t i = 0; i < n; i++)
+        {
+          Diag_val[i] = tmp_val[i];
+          Diag_j[i] = i;
+          Diag_i[i] = i;
+        }
+      Diag_j[n] = n;
+    }
+  else if (preconditioner == 1)
+    {
+      /*[L1, U1] = ilu(g1a, Setup);*/
+      const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"};
+      const int type = 0;
+      const int droptol = 1;
+      const int milu = 2;
+      const int udiag = 3;
+      const int thresh = 4;
+      mwSize dims[1] = {(mwSize)1 };
+      mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names);
+      mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp"));
+      //mxSetFieldByNumber(Setup, 0, type, mxCreateString("nofill"));
+      mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol));
+      mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off"));
+      mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0));
+      mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0));
+      //mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1));
+      mxArray *lhs0[2];
+      mxArray *rhs0[2];
+      rhs0[0] = A_m;
+      rhs0[1] = Setup;
+      mexCallMATLAB(2, lhs0, 2, rhs0, "ilu");
+      L1 = lhs0[0];
+      U1 = lhs0[1];
+      mxDestroyArray(Setup);
+    }
+
   double flags = 2;
   mxArray *z;
   if (steady_state)  /*Octave BicStab algorihtm involves a 0 division in case of a preconditionner equal to the LU decomposition of A matrix*/
@@ -2047,7 +4838,10 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double
       mxArray *res = mult_SAT_B(Sparse_transpose(A_m), x0_m);
       double *resid = mxGetPr(res);
       double *b = mxGetPr(b_m);
-      for (unsigned int i = 0; i < n; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+      for (int i = 0; i < (int)n; i++)
         resid[i] = b[i] - resid[i];
       mxArray *rhs[2];
       mxArray *lhs[1];
@@ -2060,19 +4854,24 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double
       z = lhs[0];
       double *phat = mxGetPr(z);
       double *x0 = mxGetPr(x0_m);
-      for (unsigned int i = 0; i < n; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+      for (int i = 0; i < (int)n; i++)
         phat[i] = x0[i] + phat[i];
 
       /*Check the solution*/
       res = mult_SAT_B(Sparse_transpose(A_m), z);
       resid = mxGetPr(res);
       double cum_abs = 0;
-      for (unsigned int i = 0; i < n; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs)
+#endif
+      for (int i = 0; i < (int)n; i++)
         {
           resid[i] = b[i] - resid[i];
           cum_abs += fabs(resid[i]);
         }
-      //mexPrintf("cum_abs=%g\n", cum_abs);
       if (cum_abs > 1e-7)
         flags = 2;
       else
@@ -2080,32 +4879,55 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double
       mxDestroyArray(res);
     }
   //else
+
   if (flags == 2)
     {
-      /*[za,flag1] = bicgstab(g1a,b,1e-6,Blck_size*periods,L1,U1);*/
-      mxArray *rhs[7];
-      rhs[0] = A_m;
-      rhs[1] = b_m;
-      rhs[2] = mxCreateDoubleScalar(1e-6);
-      rhs[3] = mxCreateDoubleScalar(n);
-      rhs[4] = L1;
-      rhs[5] = U1;
-      rhs[6] = x0_m;
-      mxArray *lhs[2];
-      mexCallMATLAB(2, lhs, 7, rhs, "bicgstab");
-      z = lhs[0];
-      mxArray *flag = lhs[1];
-      double *flag1 = mxGetPr(flag);
-      flags = flag1[0];
-      mxDestroyArray(flag);
-      mxDestroyArray(rhs[2]);
-      mxDestroyArray(rhs[3]);
-      mxDestroyArray(rhs[4]);
-      mxDestroyArray(rhs[5]);
+      if (preconditioner == 0)
+        {
+          /*[za,flag1] = bicgstab(g1a,b,1e-6,Blck_size*periods,Diag);*/
+          mxArray *rhs[5];
+          rhs[0] = A_m;
+          rhs[1] = b_m;
+          rhs[2] = mxCreateDoubleScalar(1e-6);
+          rhs[3] = mxCreateDoubleScalar((double)n);
+          rhs[4] = Diag;
+          //rhs[5] = x0_m;
+          mxArray *lhs[2];
+          mexCallMATLAB(2, lhs, 5, rhs, "bicgstab");
+          z = lhs[0];
+          mxArray *flag = lhs[1];
+          double *flag1 = mxGetPr(flag);
+          flags = flag1[0];
+          mxDestroyArray(flag);
+          mxDestroyArray(rhs[2]);
+          mxDestroyArray(rhs[3]);
+          mxDestroyArray(rhs[4]);
+        }
+      else if (preconditioner == 1)
+        {
+          /*[za,flag1] = bicgstab(g1a,b,1e-6,Blck_size*periods,L1,U1);*/
+          mxArray *rhs[7];
+          rhs[0] = A_m;
+          rhs[1] = b_m;
+          rhs[2] = mxCreateDoubleScalar(1e-6);
+          rhs[3] = mxCreateDoubleScalar((double)n);
+          rhs[4] = L1;
+          rhs[5] = U1;
+          rhs[6] = x0_m;
+          mxArray *lhs[2];
+          mexCallMATLAB(2, lhs, 7, rhs, "bicgstab");
+          z = lhs[0];
+          mxArray *flag = lhs[1];
+          double *flag1 = mxGetPr(flag);
+          flags = flag1[0];
+          mxDestroyArray(flag);
+          mxDestroyArray(rhs[2]);
+          mxDestroyArray(rhs[3]);
+          mxDestroyArray(rhs[4]);
+          mxDestroyArray(rhs[5]);
+        }
     }
-  /*mexPrintf("z");
-    mexCallMATLAB(0, NULL, 1, &z, "disp");*/
-  mxDestroyArray(rhs0[1]);
+
 
   if (flags > 0)
     {
@@ -2131,7 +4953,10 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double
     {
       double *res = mxGetPr(z);
       if (is_two_boundaries)
-        for (unsigned int i = 0; i < n; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+        for (int i = 0; i < (int)n; i++)
           {
             int eq = index_vara[i+Size*y_kmin];
             double yy = -(res[i] + y[eq]);
@@ -2139,7 +4964,10 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double
             y[eq] += slowc * yy;
           }
       else
-        for (unsigned int i = 0; i < n; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+        for (int i = 0; i < (int)n; i++)
           {
             int eq = index_vara[i];
             double yy = -(res[i] + y[eq+it_*y_size]);
@@ -2153,7 +4981,7 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double
 }
 
 void
-SparseMatrix::Singular_display(int block, int Size, bool steady_state, it_code_type it_code)
+dynSparseMatrix::Singular_display(int block, int Size)
 {
   bool zero_solution;
   Simple_Init(Size, IM_i, zero_solution);
@@ -2162,6 +4990,9 @@ SparseMatrix::Singular_display(int block, int Size, bool steady_state, it_code_t
   rhs[0] = mxCreateDoubleMatrix(Size, Size, mxREAL);
   double *pind;
   pind = mxGetPr(rhs[0]);
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
   for (int j = 0; j < Size * Size; j++)
     pind[j] = 0.0;
   for (int ii = 0; ii < Size; ii++)
@@ -2186,46 +5017,45 @@ SparseMatrix::Singular_display(int block, int Size, bool steady_state, it_code_t
     {
       if (abs(SVD_ps[i * (1 + Size)]) < 1e-12)
         {
-            mexPrintf(" The following equations form a linear combination:\n    ");
-            double max_u = 0;
-            for (int j = 0; j < Size; j++)
-              if (abs(SVD_pu[j + i * Size]) > abs(max_u))
-                max_u = SVD_pu[j + i * Size];
-            vector<int> equ_list;
-            for (int j = 0; j < Size; j++)
-              {
-                double rr = SVD_pu[j + i * Size] / max_u;
-                if ( rr < -1e-10)
-                  {
-                    equ_list.push_back(j);
-                    if (rr != -1)
-                      mexPrintf(" - %3.2f*Dequ_%d_dy",abs(rr),j+1);
+          mexPrintf(" The following equations form a linear combination:\n    ");
+          double max_u = 0;
+          for (int j = 0; j < Size; j++)
+            if (abs(SVD_pu[j + i * Size]) > abs(max_u))
+              max_u = SVD_pu[j + i * Size];
+          vector<int> equ_list;
+          for (int j = 0; j < Size; j++)
+            {
+              double rr = SVD_pu[j + i * Size] / max_u;
+              if ( rr < -1e-10)
+                {
+                  equ_list.push_back(j);
+                  if (rr != -1)
+                    mexPrintf(" - %3.2f*Dequ_%d_dy",abs(rr),j+1);
+                  else
+                    mexPrintf(" - Dequ_%d_dy",j+1);
+                }
+              else if (rr > 1e-10)
+                {
+                  equ_list.push_back(j);
+                  if (j > 0)
+                    if (rr != 1)
+                      mexPrintf(" + %3.2f*Dequ_%d_dy",rr,j+1);
                     else
-                      mexPrintf(" - Dequ_%d_dy",j+1);
-                  }
-                else if (rr > 1e-10)
-                  {
-                    equ_list.push_back(j);
-                    if (j > 0)
-                      if (rr != 1)
-                        mexPrintf(" + %3.2f*Dequ_%d_dy",rr,j+1);
-                      else
-                        mexPrintf(" + Dequ_%d_dy",j+1);
-                    else
-                      if (rr != 1)
-                        mexPrintf(" %3.2f*Dequ_%d_dy",rr,j+1);
-                      else
-                        mexPrintf(" Dequ_%d_dy",j+1);
-                  }
-              }
-            mexPrintf(" = 0\n");
-            /*mexPrintf(" with:\n");
-            it_code = get_begin_block(block);
-            for (int j=0; j < Size; j++)
-              {
-                if (find(equ_list.begin(), equ_list.end(), j) != equ_list.end())
-                  mexPrintf("  equ_%d: %s\n",j, print_expression(it_code_expr, false, Size, block, steady_state, 0, 0, it_code, true).c_str());
-              }*/
+                      mexPrintf(" + Dequ_%d_dy",j+1);
+                  else if (rr != 1)
+                    mexPrintf(" %3.2f*Dequ_%d_dy",rr,j+1);
+                  else
+                    mexPrintf(" Dequ_%d_dy",j+1);
+                }
+            }
+          mexPrintf(" = 0\n");
+          /*mexPrintf(" with:\n");
+          it_code = get_begin_block(block);
+          for (int j=0; j < Size; j++)
+            {
+              if (find(equ_list.begin(), equ_list.end(), j) != equ_list.end())
+                mexPrintf("  equ_%d: %s\n",j, print_expression(it_code_expr, false, Size, block, steady_state, 0, 0, it_code, true).c_str());
+            }*/
         }
     }
   mxDestroyArray(lhs[0]);
@@ -2241,7 +5071,7 @@ SparseMatrix::Singular_display(int block, int Size, bool steady_state, it_code_t
 
 
 bool
-SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool steady_state, int it_)
+dynSparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, int it_)
 {
   bool one;
   int pivj = 0, pivk = 0;
@@ -2262,7 +5092,8 @@ SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool
       /*finding the max-pivot*/
       double piv = piv_abs = 0;
       int nb_eq = At_Col(i, &first);
-      l = 0; N_max = 0;
+      l = 0;
+      N_max = 0;
       one = false;
       piv_abs = 0;
       for (int j = 0; j < nb_eq; j++)
@@ -2409,7 +5240,7 @@ SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool
             bc[nb_eq_todo++] = first;
           first = first->NZE_C_N;
         }
-      //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+      //pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
       for (int j = 0; j < nb_eq_todo; j++)
         {
           first = bc[j];
@@ -2488,8 +5319,12 @@ SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool
         }
     }
   double slowc_lbx = slowc;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
   for (int i = 0; i < y_size; i++)
     ya[i+it_*y_size] = y[i+it_*y_size];
+
   slowc_save = slowc;
   simple_bksub(it_, Size, slowc_lbx);
   End_GE(Size);
@@ -2502,7 +5337,7 @@ SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool
 }
 
 void
-SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool symbolic, int Block_number)
+dynSparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool symbolic, int Block_number)
 {
   /*Triangularisation at each period of a block using a simple gaussian Elimination*/
   t_save_op_s *save_op_s;
@@ -2521,16 +5356,28 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool
   pivj_v = (int *) mxMalloc(Size*sizeof(int));
   pivk_v = (int *) mxMalloc(Size*sizeof(int));
   NR = (int *) mxMalloc(Size*sizeof(int));
+  //clock_t time00 = clock();
+  NonZeroElem **bc;
+  bc = (NonZeroElem **) mxMalloc(Size*sizeof(first));
 
   for (int t = 0; t < periods; t++)
     {
+      /*clock_t time11 = clock();
+      mexPrintf("t=%d, record = %d\n",t, record);*/
+#ifdef OCTAVE_MEX_FILE
+      OCTAVE_QUIT;
+#else
+    	if ( utIsInterruptPending() )
+		    throw UserExceptionHandling();
+#endif
+
       if (record && symbolic)
         {
-          if (save_op)
+          /*if (save_op)
             {
               mxFree(save_op);
               save_op = NULL;
-            }
+            }*/
           save_op = (int *) mxMalloc(nop*sizeof(int));
           nopa = nop;
         }
@@ -2663,220 +5510,128 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool
               piv_abs = fabs(piv);
             }
           line_done[pivj] = true;
-          if (symbolic)
-            {
-              if (record)
-                {
-                  if (nop+1 >= nopa)
-                    {
-                      nopa = long (mem_increasing_factor*(double) nopa);
-                      save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
-                    }
-                  save_op_s = (t_save_op_s *) (&(save_op[nop]));
-                  save_op_s->operat = IFLD;
-                  save_op_s->first = pivk;
-                  save_op_s->lag = 0;
-                }
-              nop += 2;
-            }
-          if (piv_abs < eps)
-            {
-              ostringstream tmp;
-              if (Block_number > 1)
-                tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system in block " << Block_number+1 << "\n";
-              else
-                tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system\n";
-              throw FatalExceptionHandling(tmp.str());
-            }
-          /*divide all the non zeros elements of the line pivj by the max_pivot*/
-          int nb_var = At_Row(pivj, &first);
-          NonZeroElem **bb;
-          bb = (NonZeroElem **) mxMalloc(nb_var*sizeof(first));
-          for (int j = 0; j < nb_var; j++)
-            {
-              bb[j] = first;
-              first = first->NZE_R_N;
-            }
 
-          for (int j = 0; j < nb_var; j++)
+          if (record && symbolic)
             {
-              first = bb[j];
-              u[first->u_index] /= piv;
-              if (symbolic)
+              if (nop+1 >= nopa)
                 {
-                  if (record)
-                    {
-                      if (nop+j*2+1 >= nopa)
-                        {
-                          nopa = long (mem_increasing_factor*(double) nopa);
-                          save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
-                        }
-                      save_op_s = (t_save_op_s *) (&(save_op[nop+j*2]));
-                      save_op_s->operat = IFDIV;
-                      save_op_s->first = first->u_index;
-                      save_op_s->lag = first->lag_index;
-                    }
+                  nopa = long (mem_increasing_factor*(double) nopa);
+                  save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                 }
-            }
-          mxFree(bb);
-          nop += nb_var*2;
-          u[b[pivj]] /= piv;
-          if (symbolic)
-            {
-              if (record)
+              save_op_s = (t_save_op_s *) (&(save_op[nop]));
+              save_op_s->operat = IFLD;
+              save_op_s->first = pivk;
+              save_op_s->lag = 0;
+              nop += 2;
+              if (piv_abs < eps)
                 {
-                  if (nop+1 >= nopa)
+                  ostringstream tmp;
+                  if (Block_number > 1)
+                    tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system in block " << Block_number+1 << "\n";
+                  else
+                    tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system\n";
+                  throw FatalExceptionHandling(tmp.str());
+                }
+              /*divide all the nonzero elements of row pivj by the max pivot*/
+              int nb_var = At_Row(pivj, &first);
+              for (int j = 0; j < nb_var; j++)
+                {
+                  u[first->u_index] /= piv;
+                  if (nop+j*2+1 >= nopa)
                     {
                       nopa = long (mem_increasing_factor*(double) nopa);
                       save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                     }
-                  save_op_s = (t_save_op_s *) (&(save_op[nop]));
+                  save_op_s = (t_save_op_s *) (&(save_op[nop+j*2]));
                   save_op_s->operat = IFDIV;
-                  save_op_s->first = b[pivj];
-                  save_op_s->lag = 0;
+                  save_op_s->first = first->u_index;
+                  save_op_s->lag = first->lag_index;
+                  first = first->NZE_R_N;
                 }
+              nop += nb_var*2;
+              u[b[pivj]] /= piv;
+              if (nop+1 >= nopa)
+                {
+                  nopa = long (mem_increasing_factor*(double) nopa);
+                  save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
+                }
+              save_op_s = (t_save_op_s *) (&(save_op[nop]));
+              save_op_s->operat = IFDIV;
+              save_op_s->first = b[pivj];
+              save_op_s->lag = 0;
               nop += 2;
-            }
-          /*substract the elements on the non treated lines*/
-          nb_eq = At_Col(i, &first);
-          NonZeroElem *first_piva;
-          int nb_var_piva = At_Row(pivj, &first_piva);
+              /*subtract the elements on the rows not yet treated*/
+              nb_eq = At_Col(i, &first);
+              NonZeroElem *first_piva;
+              int nb_var_piva = At_Row(pivj, &first_piva);
 
-          NonZeroElem **bc;
-          bc = (NonZeroElem **) mxMalloc(nb_eq*sizeof(first));
-          int nb_eq_todo = 0;
-          for (int j = 0; j < nb_eq && first; j++)
-            {
-              if (!line_done[first->r_index])
-                bc[nb_eq_todo++] = first;
-              first = first->NZE_C_N;
-            }
-          //#pragma omp parallel for num_threads(2) shared(nb_var_piva, first_piva, nopa, nop, save_op, record)
-          for (int j = 0; j < nb_eq_todo; j++)
-            {
-              t_save_op_s *save_op_s_l;
-              first = bc[j];
-              int row = first->r_index;
-              double first_elem = u[first->u_index];
-              if (symbolic)
+              int nb_eq_todo = 0;
+              for (int j = 0; j < nb_eq && first; j++)
                 {
-                  if (record)
-                    {
-                      if (nop+1 >= nopa)
-                        {
-                          nopa = long (mem_increasing_factor*(double) nopa);
-                          save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
-                        }
-                      save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
-                      save_op_s_l->operat = IFLD;
-                      save_op_s_l->first = first->u_index;
-                      save_op_s_l->lag = abs(first->lag_index);
-                    }
-                  nop += 2;
+                  if (!line_done[first->r_index])
+                    bc[nb_eq_todo++] = first;
+                  first = first->NZE_C_N;
                 }
-
-              int nb_var_piv = nb_var_piva;
-              NonZeroElem *first_piv = first_piva;
-              NonZeroElem *first_sub;
-              int nb_var_sub = At_Row(row, &first_sub);
-              int l_sub = 0;
-              int l_piv = 0;
-              int sub_c_index = first_sub->c_index;
-              int piv_c_index = first_piv->c_index;
-              int tmp_lag = first_sub->lag_index;
-              while (l_sub < nb_var_sub || l_piv < nb_var_piv)
+//#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop)
+              for (int j = 0; j < nb_eq_todo; j++)
                 {
-                  if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv))
+                  t_save_op_s *save_op_s_l;
+                  NonZeroElem *first = bc[j];
+                  int row = first->r_index;
+                  double first_elem = u[first->u_index];
+                  if (nop+1 >= nopa)
                     {
-                      //There is no nonzero element at row pivot for this column=> Nothing to do for the current element got to next column
-                      first_sub = first_sub->NZE_R_N;
-                      if (first_sub)
-                        sub_c_index = first_sub->c_index;
-                      else
-                        sub_c_index = Size*periods;
-                      l_sub++;
+                      nopa = long (mem_increasing_factor*(double) nopa);
+                      save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
                     }
-                  else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub)
+                  save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
+                  save_op_s_l->operat = IFLD;
+                  save_op_s_l->first = first->u_index;
+                  save_op_s_l->lag = abs(first->lag_index);
+                  nop += 2;
+
+                  int nb_var_piv = nb_var_piva;
+                  NonZeroElem *first_piv = first_piva;
+                  NonZeroElem *first_sub;
+                  int nb_var_sub = At_Row(row, &first_sub);
+                  int l_sub = 0;
+                  int l_piv = 0;
+                  int sub_c_index = first_sub->c_index;
+                  int piv_c_index = first_piv->c_index;
+                  int tmp_lag = first_sub->lag_index;
+                  while (l_sub < (nb_var_sub/*=NRow(row)*/) || l_piv < nb_var_piv)
                     {
-                      // There is an nonzero element at row pivot but not at the current row=> insert a negative element in the current row
-                      tmp_u_count = Get_u();
-                      lag = first_piv->c_index/Size-row/Size;
-                      //#pragma omp critical
-                      {
-                        Insert(row, first_piv->c_index, tmp_u_count, lag);
-                      }
-                      u[tmp_u_count] = -u[first_piv->u_index]*first_elem;
-                      if (symbolic)
+                      if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv))
                         {
-                          if (record)
-                            {
-                              if (nop+2 >= nopa)
-                                {
-                                  nopa = long (mem_increasing_factor*(double) nopa);
-                                  save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
-                                }
-                              save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
-                              save_op_s_l->operat = IFLESS;
-                              save_op_s_l->first = tmp_u_count;
-                              save_op_s_l->second = first_piv->u_index;
-                              save_op_s_l->lag = max(first_piv->lag_index, abs(tmp_lag));
-                            }
-                          nop += 3;
-                        }
-                      first_piv = first_piv->NZE_R_N;
-                      if (first_piv)
-                        piv_c_index = first_piv->c_index;
-                      else
-                        piv_c_index = Size*periods;
-                      l_piv++;
-                    }
-                  else /*first_sub->c_index==first_piv->c_index*/
-                    {
-                      if (i == sub_c_index)
-                        {
-                          NonZeroElem *firsta = first;
-                          NonZeroElem *first_suba = first_sub->NZE_R_N;
-                          Delete(first_sub->r_index, first_sub->c_index);
-                          first = firsta->NZE_C_N;
-                          first_sub = first_suba;
-                          if (first_sub)
-                            sub_c_index = first_sub->c_index;
-                          else
-                            sub_c_index = Size*periods;
-                          l_sub++;
-                          first_piv = first_piv->NZE_R_N;
-                          if (first_piv)
-                            piv_c_index = first_piv->c_index;
-                          else
-                            piv_c_index = Size*periods;
-                          l_piv++;
-                        }
-                      else
-                        {
-                          u[first_sub->u_index] -= u[first_piv->u_index]*first_elem;
-                          if (symbolic)
-                            {
-                              if (record)
-                                {
-                                  if (nop+3 >= nopa)
-                                    {
-                                      nopa = long (mem_increasing_factor*(double) nopa);
-                                      save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
-                                    }
-                                  save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
-                                  save_op_s_l->operat = IFSUB;
-                                  save_op_s_l->first = first_sub->u_index;
-                                  save_op_s_l->second = first_piv->u_index;
-                                  save_op_s_l->lag = max(abs(tmp_lag), first_piv->lag_index);
-                                }
-                              nop += 3;
-                            }
+                          //There is no nonzero element in the pivot row for this column => nothing to do for the current element, go to the next column
                           first_sub = first_sub->NZE_R_N;
                           if (first_sub)
                             sub_c_index = first_sub->c_index;
                           else
                             sub_c_index = Size*periods;
                           l_sub++;
+                        }
+                      else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub)
+                        {
+                          // There is a nonzero element in the pivot row but not in the current row => insert a negative element in the current row
+                          tmp_u_count = Get_u();
+                          lag = first_piv->c_index/Size-row/Size;
+                          //#pragma omp critical
+                            {
+                              Insert(row, first_piv->c_index, tmp_u_count, lag);
+                            }
+                          u[tmp_u_count] = -u[first_piv->u_index]*first_elem;
+                          if (nop+2 >= nopa)
+                            {
+                              nopa = long (mem_increasing_factor*(double) nopa);
+                              save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
+                            }
+                          save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
+                          save_op_s_l->operat = IFLESS;
+                          save_op_s_l->first = tmp_u_count;
+                          save_op_s_l->second = first_piv->u_index;
+                          save_op_s_l->lag = max(first_piv->lag_index, abs(tmp_lag));
+                          nop += 3;
                           first_piv = first_piv->NZE_R_N;
                           if (first_piv)
                             piv_c_index = first_piv->c_index;
@@ -2884,29 +5639,200 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool
                             piv_c_index = Size*periods;
                           l_piv++;
                         }
-                    }
-                }
-              u[b[row]] -= u[b[pivj]]*first_elem;
-
-              if (symbolic)
-                {
-                  if (record)
-                    {
-                      if (nop+3 >= nopa)
+                      else /*first_sub->c_index==first_piv->c_index*/
                         {
-                          nopa = long (mem_increasing_factor*(double) nopa);
-                          save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
+                          if (i == sub_c_index)
+                            {
+                              NonZeroElem *firsta = first;
+                              NonZeroElem *first_suba = first_sub->NZE_R_N;
+                              //#pragma omp critical
+                                {
+                                  Delete(first_sub->r_index, first_sub->c_index);
+                                }
+                              first = firsta->NZE_C_N;
+                              first_sub = first_suba;
+                              if (first_sub)
+                                sub_c_index = first_sub->c_index;
+                              else
+                                sub_c_index = Size*periods;
+                              l_sub++;
+                              first_piv = first_piv->NZE_R_N;
+                              if (first_piv)
+                                piv_c_index = first_piv->c_index;
+                              else
+                                piv_c_index = Size*periods;
+                              l_piv++;
+                            }
+                          else
+                            {
+                              u[first_sub->u_index] -= u[first_piv->u_index]*first_elem;
+                              if (nop+3 >= nopa)
+                                {
+                                  nopa = long (mem_increasing_factor*(double) nopa);
+                                  save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
+                                }
+                              save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
+                              save_op_s_l->operat = IFSUB;
+                              save_op_s_l->first = first_sub->u_index;
+                              save_op_s_l->second = first_piv->u_index;
+                              save_op_s_l->lag = max(abs(tmp_lag), first_piv->lag_index);
+                              nop += 3;
+                              first_sub = first_sub->NZE_R_N;
+                              if (first_sub)
+                                sub_c_index = first_sub->c_index;
+                              else
+                                sub_c_index = Size*periods;
+                              l_sub++;
+                              first_piv = first_piv->NZE_R_N;
+                              if (first_piv)
+                                piv_c_index = first_piv->c_index;
+                              else
+                                piv_c_index = Size*periods;
+                              l_piv++;
+                            }
                         }
-                      save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
-                      save_op_s_l->operat = IFSUB;
-                      save_op_s_l->first = b[row];
-                      save_op_s_l->second = b[pivj];
-                      save_op_s_l->lag = abs(tmp_lag);
                     }
+                  u[b[row]] -= u[b[pivj]]*first_elem;
+
+                  if (nop+3 >= nopa)
+                    {
+                      nopa = long (mem_increasing_factor*(double) nopa);
+                      save_op = (int *) mxRealloc(save_op, nopa*sizeof(int));
+                    }
+                  save_op_s_l = (t_save_op_s *) (&(save_op[nop]));
+                  save_op_s_l->operat = IFSUB;
+                  save_op_s_l->first = b[row];
+                  save_op_s_l->second = b[pivj];
+                  save_op_s_l->lag = abs(tmp_lag);
+                  nop += 3;
+                }
+            }
+          else if(symbolic)
+            {
+              nop += 2;
+              if (piv_abs < eps)
+                {
+                  ostringstream tmp;
+                  if (Block_number > 1)
+                    tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system in block " << Block_number+1 << "\n";
+                  else
+                    tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system\n";
+                  throw FatalExceptionHandling(tmp.str());
+                }
+              /*divide all the nonzero elements of row pivj by the max pivot*/
+              int nb_var = At_Row(pivj, &first);
+              for (int j = 0; j < nb_var; j++)
+                {
+                  u[first->u_index] /= piv;
+                  first = first->NZE_R_N;
+                }
+              nop += nb_var*2;
+              u[b[pivj]] /= piv;
+              nop += 2;
+              /*subtract the elements on the rows not yet treated*/
+              nb_eq = At_Col(i, &first);
+              NonZeroElem *first_piva;
+              int nb_var_piva = At_Row(pivj, &first_piva);
+
+              int nb_eq_todo = 0;
+              for (int j = 0; j < nb_eq && first; j++)
+                {
+                  if (!line_done[first->r_index])
+                    bc[nb_eq_todo++] = first;
+                  first = first->NZE_C_N;
+                }
+//#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop)
+              for (int j = 0; j < nb_eq_todo; j++)
+                {
+                  NonZeroElem *first = bc[j];
+                  int row = first->r_index;
+                  double first_elem = u[first->u_index];
+                  nop += 2;
+                  int nb_var_piv = nb_var_piva;
+                  NonZeroElem *first_piv = first_piva;
+                  NonZeroElem *first_sub;
+                  int nb_var_sub = At_Row(row, &first_sub);
+                  int l_sub = 0;
+                  int l_piv = 0;
+                  int sub_c_index = first_sub->c_index;
+                  int piv_c_index = first_piv->c_index;
+                  while (l_sub < (nb_var_sub /*= NRow(row)*/) || l_piv < nb_var_piv)
+                    {
+                      if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv))
+                        {
+                          //There is no nonzero element in the pivot row for this column => nothing to do for the current element, go to the next column
+                          first_sub = first_sub->NZE_R_N;
+                          if (first_sub)
+                            sub_c_index = first_sub->c_index;
+                          else
+                            sub_c_index = Size*periods;
+                          l_sub++;
+                        }
+                      else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub)
+                        {
+                          // There is a nonzero element in the pivot row but not in the current row => insert a negative element in the current row
+                          tmp_u_count = Get_u();
+                          lag = first_piv->c_index/Size-row/Size;
+                          //#pragma omp critical
+                           {
+                             Insert(row, first_piv->c_index, tmp_u_count, lag);
+                           }
+                          u[tmp_u_count] = -u[first_piv->u_index]*first_elem;
+                          nop += 3;
+                          first_piv = first_piv->NZE_R_N;
+                          if (first_piv)
+                            piv_c_index = first_piv->c_index;
+                          else
+                            piv_c_index = Size*periods;
+                          l_piv++;
+                        }
+                      else /*first_sub->c_index==first_piv->c_index*/
+                        {
+                          if (i == sub_c_index)
+                            {
+                              NonZeroElem *firsta = first;
+                              NonZeroElem *first_suba = first_sub->NZE_R_N;
+                              //#pragma omp critical
+                                {
+                                  Delete(first_sub->r_index, first_sub->c_index);
+                                }
+                              first = firsta->NZE_C_N;
+                              first_sub = first_suba;
+                              if (first_sub)
+                                sub_c_index = first_sub->c_index;
+                              else
+                                sub_c_index = Size*periods;
+                              l_sub++;
+                              first_piv = first_piv->NZE_R_N;
+                              if (first_piv)
+                                piv_c_index = first_piv->c_index;
+                              else
+                                piv_c_index = Size*periods;
+                              l_piv++;
+                            }
+                          else
+                            {
+                              u[first_sub->u_index] -= u[first_piv->u_index]*first_elem;
+                              nop += 3;
+                              first_sub = first_sub->NZE_R_N;
+                              if (first_sub)
+                                sub_c_index = first_sub->c_index;
+                              else
+                                sub_c_index = Size*periods;
+                              l_sub++;
+                              first_piv = first_piv->NZE_R_N;
+                              if (first_piv)
+                                piv_c_index = first_piv->c_index;
+                              else
+                                piv_c_index = Size*periods;
+                              l_piv++;
+                            }
+                        }
+                    }
+                  u[b[row]] -= u[b[pivj]]*first_elem;
                   nop += 3;
                 }
             }
-          mxFree(bc);
         }
       if (symbolic)
         {
@@ -2919,12 +5845,32 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool
             }
           else if (record && (nop == nop1))
             {
-              if (save_opa && save_opaa)
+              if (t > int(periods*0.35))
+                {
+                  symbolic = false;
+                  if (save_opaa)
+                    {
+                      mxFree(save_opaa);
+                      save_opaa = NULL;
+                    }
+                  if (save_opa)
+                    {
+                      mxFree(save_opa);
+                      save_opa = NULL;
+                    }
+                  if (save_op)
+                    {
+                      mxFree(save_op);
+                      save_op = NULL;
+                    }
+                }
+              else if (save_opa && save_opaa)
                 {
                   if (compare(save_op, save_opa, save_opaa, t, periods, nop, Size))
                     {
                       tbreak = t;
                       tbreak_g = tbreak;
+                      //mexPrintf("time=%f\n",(1000.0*(double (clock())-double (time11)))/double (CLOCKS_PER_SEC));
                       break;
                     }
                 }
@@ -2935,16 +5881,9 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool
                       mxFree(save_opaa);
                       save_opaa = NULL;
                     }
-                  save_opaa = (int *) mxMalloc(nop1*sizeof(int));
-                  memcpy(save_opaa, save_opa, nop1*sizeof(int));
+                  save_opaa = save_opa;
                 }
-              if (save_opa)
-                {
-                  mxFree(save_opa);
-                  save_opa = NULL;
-                }
-              save_opa = (int *) mxMalloc(nop*sizeof(int));
-              memcpy(save_opa, save_op, nop*sizeof(int));
+              save_opa = save_op;
             }
           else
             {
@@ -2968,11 +5907,16 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool
           nop2 = nop1;
           nop1 = nop;
         }
+      //mexPrintf("time=%f\n",(1000.0*(double (clock())-double (time11)))/double (CLOCKS_PER_SEC));
     }
+  mxFree(bc);
   mxFree(piv_v);
   mxFree(pivj_v);
   mxFree(pivk_v);
   mxFree(NR);
+  /*mexPrintf("tbreak=%d, periods=%d time required=%f\n",tbreak,periods, (1000.0*(double (clock())-double (time00)))/double (CLOCKS_PER_SEC));
+  mexEvalString("drawnow;");
+  time00 = clock();*/
   nop_all += nop;
   if (symbolic)
     {
@@ -2986,103 +5930,289 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool
 
   /*The backward substitution*/
   double slowc_lbx = slowc;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
   for (int i = 0; i < y_size*(periods+y_kmin); i++)
     ya[i] = y[i];
   slowc_save = slowc;
   bksub(tbreak, last_period, Size, slowc_lbx);
+  /*mexPrintf("remaining operations and bksub time required=%f\n",tbreak,periods, (1000.0*(double (clock())-double (time00)))/double (CLOCKS_PER_SEC));
+  mexEvalString("drawnow;");*/
   End_GE(Size);
 }
 
-bool
-SparseMatrix::Simulate_Newton_One_Boundary(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, bool print_it, bool cvg, int &iter, bool steady_state, int stack_solve_algo, int solve_algo)
+
+void // Fills vectr[0..n-1]; which of the two accumulation schemes below is used depends on solve_algo / stack_solve_algo.
+dynSparseMatrix::Grad_f_product(int n, mxArray *b_m, double* vectr, mxArray *A_m, SuiteSparse_long *Ap, SuiteSparse_long *Ai, double* Ax, double* b_)
 {
-  int i, j;
-  mxArray *b_m = NULL, *A_m = NULL, *x0_m = NULL;
-  Clear_u();
-  error_not_printed = true;
-  bool singular_system = false;
-  u_count_alloc_save = u_count_alloc;
-  if (isnan(res1) || isinf(res1) || (res2 > 12*g0 && iter > 0))
+  if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state)) // algorithm 5: use the linked NonZeroElem storage instead of the Ap/Ai/Ax arrays
     {
-      if (iter == 0 || fabs(slowc_save) < 1e-8)
+      NonZeroElem *first; // cursor into row i's element list; private per thread when USE_OMP is defined
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) private(first)
+#endif
+      for (int i = 0; i < n; i++) // NOTE(review): getenv() in the pragma above can return NULL if DYNARE_NUM_THREADS is unset — confirm it is always defined
         {
-          for (j = 0; j < y_size; j++)
-            {
-#ifdef DEBUG
-              bool select = false;
-#endif
-              for (int i = 0; i < Size; i++)
-                if (j == index_vara[i])
-                  {
-#ifdef DEBUG
-                    select = true;
-#endif
-                    break;
-                  }
-#ifdef DEBUG
-              if (select)
-                mexPrintf("-> variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]);
-              else
-                mexPrintf("   variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]);
-#endif
-            }
-          if (steady_state)
-            {
-              if (iter == 0)
-                mexPrintf(" the initial values of endogenous variables are too far from the solution.\nChange them!\n");
-              else
-                mexPrintf(" dynare cannot improve the simulation in block %d at time %d (variable %d)\n", blck+1, it_+1, index_vara[max_res_idx]+1);
-              mexEvalString("drawnow;");
-              return singular_system;
-            }
-          else
+          double sum = 0;
+          first = FNZE_R[i]; // presumably the first non-zero element of row i — verify against the NonZeroElem definition
+          if (first)
+            for (int k = 0; k < NbNZRow[i]; k++) // visits NbNZRow[i] elements, following the NZE_R_N links
+              {
+                sum += u[first->u_index] * u[b[first->c_index]]; // element value times the u entry selected through b[] for the element's c_index
+                first = first->NZE_R_N;
+              }
+          vectr[i] = sum;
+        }
+    }
+  else
+    {
+      if (!((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state))) // every algorithm other than the UMFPACK ones listed here
+        {
+          mwIndex *Ai = mxGetIr(A_m); // NOTE(review): this local shadows the Ai parameter and leaves scope before the product loop, which still reads the Ap/Ai/Ax arguments — TODO confirm intent
+          if (!Ai)
             {
               ostringstream tmp;
-              if (iter == 0)
-                tmp << " in Simulate_Newton_One_Boundary, The initial values of endogenous variables are too far from the solution.\nChange them!\n";
-              else
-                tmp << " in Simulate_Newton_One_Boundary, Dynare cannot improve the simulation in block " << blck+1 << " at time " << it_+1 << " (variable " << index_vara[max_res_idx]+1 << "%d)\n";
+              tmp << " in Init_Matlab_Sparse_Simple, can't allocate Ai index vector\n"; // NOTE(review): message names Init_Matlab_Sparse_Simple although it is raised from Grad_f_product (same for the three messages below)
+              throw FatalExceptionHandling(tmp.str());
+            }
+          mwIndex *Aj = mxGetJc(A_m); // only null-checked; not read afterwards in this function
+          if (!Aj)
+            {
+              ostringstream tmp;
+              tmp << " in Init_Matlab_Sparse_Simple, can't allocate Aj index vector\n";
+              throw FatalExceptionHandling(tmp.str());
+            }
+          double *A = mxGetPr(A_m); // only null-checked; not read afterwards in this function
+          if (!A)
+            {
+              ostringstream tmp;
+              tmp << " in Init_Matlab_Sparse_Simple, can't retrieve A matrix\n";
+              throw FatalExceptionHandling(tmp.str());
+            }
+          b_ = mxGetPr(b_m); // replaces the b_ argument by the data of b_m for the loop below
+          if (!b_)
+            {
+              ostringstream tmp;
+              tmp << " in Init_Matlab_Sparse_Simple, can't retrieve b matrix\n";
               throw FatalExceptionHandling(tmp.str());
             }
         }
-      if (!(isnan(res1) || isinf(res1)) && !(isnan(g0) || isinf(g0)))
-        {
-          if (try_at_iteration == 0)
-            {
-              prev_slowc_save = slowc_save;
-              slowc_save = max(-gp0 / (2 * (res2 - g0 - gp0)), 0.1);
-            }
-          else
-            {
-              double t1 = res2 - gp0 * slowc_save - g0;
-              double t2 = glambda2 - gp0 * prev_slowc_save - g0;
-              double a = (1/(slowc_save * slowc_save) * t1 - 1/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
-              double b = (-prev_slowc_save/(slowc_save * slowc_save) * t1 + slowc_save/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
-              prev_slowc_save = slowc_save;
-              slowc_save = max(min(-b + sqrt(b*b - 3 * a * gp0) / (3 * a), 0.5 * slowc_save), 0.1 * slowc_save);
-            }
-          glambda2 = res2;
-          try_at_iteration++;
-        }
-      else
+      memset(vectr, 0, n * sizeof(double)); // start from a zero vector; the loop below only accumulates
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) /*shared(vectr)*/
+#endif
+      for (int i = 0; i < n; i++) // NOTE(review): with USE_OMP, different i can target the same vectr[Ai[j]] without synchronisation — confirm this cannot race
+        for (SuiteSparse_long j = Ap[i]; j < Ap[i+1]; j++) // entries Ap[i] .. Ap[i+1]-1 belong to index i
+          vectr[Ai[j]] += Ax[j] * b_[i];
+    }
+}
+
+void // Sets y = ya + slowc_save*direction for the block's variables at period it_, first shrinking slowc_save when res1 is non-finite or res2 exceeds g0.
+dynSparseMatrix::Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old)
+{
+  double top = 1.0; // top and bottom are only read inside the disabled if (false) block further down
+  double bottom = 0.1;
+  //mexPrintf("res2=%f > g0=%f, res1=%f, iter=%d it_=%d\n", res2, g0, res1, iter, it_);
+  if (isnan(res1) || isinf(res1) || (res2 > g0 && iter > 0)) // res1 is not finite, or res2 is above g0 after the first iteration
+    {
+      while ((isnan(res1) || isinf(res1))) // phase 1: divide the step by 1.1 and re-evaluate until res1 is finite (no lower bound on slowc_save here)
         {
           prev_slowc_save = slowc_save;
           slowc_save /= 1.1;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+          for (int i = 0; i < size; i++)
+            {
+              int eq = index_vara[i]; // index of the i-th variable of this block within a period
+              y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
+            }
+          /*mexPrintf("reducing solwc_save = %e, it_=%d, y_size=%d, size=%d, y[%d]=%e, ya[%d]=%e,\n y[%d]=%e, ya[%d]=%e\n",slowc_save, it_, y_size, size-1, index_vara[0]+it_*y_size, y[index_vara[0]+it_*y_size], index_vara[0]+it_*y_size, ya[index_vara[0]+it_*y_size]
+                                                                                                       , index_vara[size-1]+it_*y_size, y[index_vara[size-1]+it_*y_size], index_vara[size-1]+it_*y_size, ya[index_vara[size-1]+it_*y_size]);*/
+           //mexPrintf("->slowc_save=%f\n",slowc_save);
+           compute_complete(true, res1, res2, max_res, max_res_idx); // re-evaluate with the shortened step; the loop condition re-reads res1
         }
-      if (print_it)
+
+      while (res2 > g0 && slowc_save > 1e-1) // phase 2: divide the step by 1.5 while res2 is still above g0 and the step exceeds 0.1
+        {
+          prev_slowc_save = slowc_save;
+          slowc_save /= 1.5;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+          for (int i = 0; i < size; i++)
+            {
+              int eq = index_vara[i];
+              y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
+            }
+          /*mexPrintf("reducing solwc_save = %e, it_=%d, y_size=%d, size=%d, y[%d]=%e, ya[%d]=%e,\n y[%d]=%e, ya[%d]=%e\n",slowc_save, it_, y_size, size-1, index_vara[0]+it_*y_size, y[index_vara[0]+it_*y_size], index_vara[0]+it_*y_size, ya[index_vara[0]+it_*y_size]                                                                                            , index_vara[size-1]+it_*y_size, y[index_vara[size-1]+it_*y_size], index_vara[size-1]+it_*y_size, ya[index_vara[size-1]+it_*y_size]);*/
+          //mexPrintf("->slowc_save=%f\n",slowc_save);
+          compute_complete(true, res1, res2, max_res, max_res_idx);
+        }
+      double ax = slowc_save-0.001, bx = slowc_save+0.001, cx = slowc_save, fa, fb, fc, xmin; // only consumed by the disabled mnbrak/golden search just below
+      if (false/*slowc_save > 2e-1*/) // disabled: never executes
+        if (mnbrak(&ax, &bx, &cx, &fa, &fb, &fc))
+          if (golden(ax, bx, cx, 1e-1, solve_tolf, &xmin))
+            slowc_save = xmin;
+      //mexPrintf("cx=%f\n", cx);
+      //mexPrintf("ax= %f, bx=%f, cx=%f, fa=%f, fb=%f, fc=%d\n", ax, bx, cx, fa, fb, fc);
+
+      //if (!(isnan(res1) || isinf(res1))/* && !(isnan(g0) || isinf(g0))*//*|| (res2 > g0 && iter > 1)*/)
+      if (false) // disabled line-search block: the only reader of top, bottom and crit_opt_old, and the only caller of Grad_f_product in this function
+        {
+
+          double *p = (double*)mxMalloc(size * sizeof(double));
+          Grad_f_product(size, b_m_save, p, A_m_save, Ap_save, Ai_save, Ax_save, b_save);
+          double slope=0.0;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:slope)
+#endif
+          for (int i = 1; i < size; i++) // NOTE(review): starts at 1, so element 0 never contributes to slope — TODO confirm
+            slope += - direction[i] * p[i];
+          /*if (slope > 0)
+            mexPrintf("Roundoff in lnsearch\n");
+          else*/
+            {
+              prev_slowc_save = 1;
+              double crit_opt = res2/2;
+              double max_try_iteration = 100;
+              double small_ = 1.0e-4;
+              bool try_at_cvg = false;
+              while ((try_at_iteration < max_try_iteration) && (!try_at_cvg) && (abs(prev_slowc_save - slowc_save) > 1e-10))
+                {
+                  crit_opt = res2 / 2;
+                  if (slowc_save < 1e-7)
+                    {
+                      try_at_cvg = true;
+                      continue;
+                    }
+                  else if ((crit_opt <= crit_opt_old + small_ * slowc_save * slope) && !(isnan(res1) || isinf(res1)))
+                    {
+                      try_at_cvg = true;
+                      continue;
+                    }
+                  else if (try_at_iteration == 0)
+                    {
+                      prev_slowc_save = slowc_save;
+                      //slowc_save = max(- top * slope / ( (crit_opt - crit_opt_old - slope)), bottom);
+                      slowc_save /= 1.2;
+                    }
+                  else
+                    {
+                      double t1 = crit_opt - slope * slowc_save - crit_opt_old;
+                      double t2 = glambda2 - slope * prev_slowc_save - crit_opt_old;
+                      double a = (1/(slowc_save * slowc_save) * t1 - 1/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
+                      double b = (-prev_slowc_save/(slowc_save * slowc_save) * t1 + slowc_save/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
+                      if (a == 0)
+                        slowc_save = max(min( - slope/(2 * b) , top * slowc_save), bottom * slowc_save);
+                      else
+                        {
+                          double delta = b*b - 3 * a * slope;
+                          if (delta <= 0)
+                            slowc_save = top * slowc_save;
+                          else if (b <= 0)
+                            slowc_save = max(min(-b + sqrt(delta) / (3 * a), top * slowc_save), bottom * slowc_save); // NOTE(review): the division applies to sqrt(delta) only, not to (-b + sqrt(delta)) — confirm the intended formula
+                          else
+                            slowc_save = max(min(- slope / (b + sqrt(delta)), top * slowc_save), bottom * slowc_save);
+                        }
+                    }
+                  if (abs(prev_slowc_save - slowc_save) < 1e-10)
+                    slowc_save /= 1.1;
+                  //mexPrintf("=>slowc_save=%f, prev_slowc_save=%f\n",slowc_save, prev_slowc_save);
+                  prev_slowc_save = slowc_save;
+                  glambda2 = crit_opt;
+                  try_at_iteration++;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+                  for (int i = 0; i < size; i++)
+                    {
+                      int eq = index_vara[i];
+                      y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
+                    }
+                  compute_complete(true, res1, res2, max_res, max_res_idx);
+                }
+            }
+          mxFree(p);
+        }
+      //if (print_it) -- the guard is commented out, so the message below is printed on every correction
         mexPrintf("Error: Simulation diverging, trying to correct it using slowc=%f\n", slowc_save);
-      for (i = 0; i < y_size; i++)
-        y[i+it_*y_size] = ya[i+it_*y_size] + slowc_save*direction[i+it_*y_size];
-      iter--;
-      return singular_system;
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+      for (int i = 0; i < size; i++)
+        {
+          int eq = index_vara[i];
+          y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
+        }
+      compute_complete(false, res1, res2, max_res, max_res_idx); // final evaluation with the retained step; first argument is false here, true in the loops above
     }
-  if (cvg)
+  else // no correction needed: apply the step with the current slowc_save
     {
-      return singular_system;
+      //mexPrintf("slowc_save=%f res1=%f\n",slowc_save, res1);
+      for (int i = 0; i < size; i++)
+        {
+          int eq = index_vara[i];
+          y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size];
+        }
+    }
+  slowc_save = slowc; // reset slowc_save to slowc before returning
+}
+
+bool
+dynSparseMatrix::Simulate_One_Boundary(int block_num, int y_size, int y_kmin, int y_kmax, int size, bool cvg)
+{
+  //int i;
+  mxArray *b_m = NULL, *A_m = NULL, *x0_m = NULL;
+  SuiteSparse_long *Ap = NULL, *Ai = NULL;
+  double *Ax = NULL, *b = NULL;
+
+
+  try_at_iteration = 0;
+  Clear_u();
+  bool singular_system = false;
+  u_count_alloc_save = u_count_alloc;
+
+  if (isnan(res1) || isinf(res1))
+    {
+#ifdef DEBUG
+      for (int j = 0; j < y_size; j++)
+        {
+          bool select = false;
+          for (int i = 0; i < size; i++)
+            if (j == index_vara[i])
+              {
+                select = true;
+                break;
+              }
+          if (select)
+            mexPrintf("-> variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]);
+          else
+            mexPrintf("   variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]);
+        }
+#endif
+      if (steady_state)
+        {
+          if (iter == 0)
+            mexPrintf(" the initial values of endogenous variables are too far from the solution.\nChange them!\n");
+          else
+            mexPrintf(" dynare cannot improve the simulation in block %d at time %d (variable %d)\n", block_num+1, it_+1, index_vara[max_res_idx]+1);
+          mexEvalString("drawnow;");
+          //return singular_system;
+        }
+      else
+        {
+          ostringstream tmp;
+          if (iter == 0)
+            tmp << " in Simulate_One_Boundary, The initial values of endogenous variables are too far from the solution.\nChange them!\n";
+          else
+            tmp << " in Simulate_One_Boundary, Dynare cannot improve the simulation in block " << block_num+1 << " at time " << it_+1 << " (variable " << index_vara[max_res_idx]+1 << "%d)\n";
+          throw FatalExceptionHandling(tmp.str());
+        }
     }
   if (print_it)
     {
-      //mexPrintf("solwc=%f g0=%f res2=%f glambda2=%f\n",slowc_save,g0, res2, glambda2);
       if (steady_state)
         {
           switch (solve_algo)
@@ -3125,36 +6255,60 @@ SparseMatrix::Simulate_Newton_One_Boundary(int blck, int y_size, int it_, int y_
       mexPrintf("-----------------------------------\n");
     }
   bool zero_solution;
+
   if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state))
-    Simple_Init(Size, IM_i, zero_solution);
+    Simple_Init(size, IM_i, zero_solution);
   else
     {
-      b_m = mxCreateDoubleMatrix(Size, 1, mxREAL);
+      b_m = mxCreateDoubleMatrix(size, 1, mxREAL);
       if (!b_m)
         {
           ostringstream tmp;
-          tmp << " in Simulate_Newton_One_Boundary, can't allocate b_m vector\n";
+          tmp << " in Simulate_One_Boundary, can't allocate b_m vector\n";
           throw FatalExceptionHandling(tmp.str());
         }
-      A_m = mxCreateSparse(Size, Size, min(int (IM_i.size()*2), Size*Size), mxREAL);
+      A_m = mxCreateSparse(size, size, min(int (IM_i.size()*2), size * size), mxREAL);
       if (!A_m)
         {
           ostringstream tmp;
-          tmp << " in Simulate_Newton_One_Boundary, can't allocate A_m matrix\n";
+          tmp << " in Simulate_One_Boundary, can't allocate A_m matrix\n";
           throw FatalExceptionHandling(tmp.str());
         }
-      x0_m = mxCreateDoubleMatrix(Size, 1, mxREAL);
+      x0_m = mxCreateDoubleMatrix(size, 1, mxREAL);
       if (!x0_m)
         {
           ostringstream tmp;
-          tmp << " in Simulate_Newton_One_Boundary, can't allocate x0_m vector\n";
+          tmp << " in Simulate_One_Boundary, can't allocate x0_m vector\n";
           throw FatalExceptionHandling(tmp.str());
         }
-      Init_Matlab_Sparse_Simple(Size, IM_i, A_m, b_m, zero_solution, x0_m);
+      if (!((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 4) && !steady_state)))
+        {
+          Init_Matlab_Sparse_Simple(size, IM_i, A_m, b_m, zero_solution, x0_m);
+          A_m_save = mxDuplicateArray(A_m);
+          b_m_save = mxDuplicateArray(b_m);
+        }
+      else
+        {
+          Init_UMFPACK_Sparse_Simple(size, IM_i, &Ap, &Ai, &Ax, &b, zero_solution, x0_m);
+          if (Ap_save[size] != Ap[size])
+            {
+              mxFree(Ai_save);
+              mxFree(Ax_save);
+              Ai_save = (SuiteSparse_long*)mxMalloc(Ap[size] * sizeof(SuiteSparse_long));
+              Ax_save = (double*)mxMalloc(Ap[size] * sizeof(double));
+            }
+          memcpy(Ap_save, Ap, (size + 1) * sizeof(SuiteSparse_long));
+          memcpy(Ai_save, Ai, Ap[size] * sizeof(SuiteSparse_long));
+          memcpy(Ax_save, Ax, Ap[size] * sizeof(double));
+          memcpy(b_save, b, size * sizeof(double));
+        }
     }
   if (zero_solution)
     {
-      for (int i = 0; i < Size; i++)
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
+      for (int i = 0; i < size; i++)
         {
           int eq = index_vara[i];
           double yy = -(y[eq+it_*y_size]);
@@ -3165,27 +6319,173 @@ SparseMatrix::Simulate_Newton_One_Boundary(int blck, int y_size, int it_, int y_
   else
     {
       if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state))
-        singular_system = Solve_ByteCode_Sparse_GaussianElimination(Size, blck, steady_state, it_);
+        singular_system = Solve_ByteCode_Sparse_GaussianElimination(size, block_num, it_);
       else if ((solve_algo == 7 && steady_state) || (stack_solve_algo == 2 && !steady_state))
-        Solve_Matlab_GMRES(A_m, b_m, Size, slowc, blck, false, it_, steady_state, x0_m);
+        Solve_Matlab_GMRES(A_m, b_m, size, slowc, block_num, false, it_, x0_m);
       else if ((solve_algo == 8 && steady_state) || (stack_solve_algo == 3 && !steady_state))
-        Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, false, it_, x0_m, steady_state);
-      else if ((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1) && !steady_state))
-        Solve_Matlab_LU_UMFPack(A_m, b_m, Size, slowc, false, it_);
+        Solve_Matlab_BiCGStab(A_m, b_m, size, slowc, block_num, false, it_, x0_m, 1);
+      else if ((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state))
+        Solve_LU_UMFPack(Ap, Ai, Ax, b, size, size, slowc, true, 0);
     }
   return singular_system;
 }
 
-void
-SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, int periods, bool print_it, bool cvg, int &iter, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names)
+
+
+
+bool
+dynSparseMatrix::solve_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size, const int iter)
+{
+  // One residual evaluation plus, unless already converged, one linear solve. Returns the convergence flag taken before that solve.
+  const double previous_half_res2 = res2/2; // res2 as it stands before compute_complete is called
+  compute_complete(false, res1, res2, max_res, max_res_idx);
+  const bool cvg = (max_res < solve_tolf);
+  if (!cvg || isnan(res1) || isinf(res1))
+    {
+      // From the second iteration on, the previous step is checked (and possibly shortened) before solving again.
+      if (iter != 0)
+        Check_and_Correct_Previous_Iteration(block_num, y_size, size, previous_half_res2);
+      if (Simulate_One_Boundary(block_num, y_size, y_kmin, y_kmax, size, cvg))
+        Singular_display(block_num, size);
+    }
+  return cvg;
+}
+
+void
+dynSparseMatrix::solve_non_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size)
+
+{
+  // Newton loop for one block at the current it_: call solve_linear until it reports
+  // convergence or maxit_ iterations have been spent; throws when convergence is not reached.
+  max_res_idx = 0;
+  glambda2 = g0 = very_big;
+  bool converged = false;
+  for (iter = 0; !converged && iter < maxit_; iter++)
+    {
+      converged = solve_linear(block_num, y_size, y_kmin, y_kmax, size, iter);
+      g0 = res2; // reference value that Check_and_Correct_Previous_Iteration compares res2 against
+    }
+  if (converged)
+    return;
+
+  // Not converged: report the block, plus the period when not solving the steady state.
+  ostringstream msg;
+  msg << " in Solve Forward complete, convergence not achieved in block " << block_num+1;
+  if (!steady_state)
+    msg << ", at time " << it_;
+  msg << ", after " << iter << " iterations\n";
+  throw FatalExceptionHandling(msg.str());
+}
+
+void
+dynSparseMatrix::Simulate_Newton_One_Boundary(const bool forward)
+{
+  // Solves the current block one period at a time: once at it_ = 0 in steady state, otherwise for
+  // it_ in [y_kmin, periods+y_kmin), ascending when forward is true and descending when it is false.
+  // Each period uses solve_non_linear, or a single solve_linear call when is_linear is set.
+  g1 = (double *) mxMalloc(size*size*sizeof(double));
+  r = (double *) mxMalloc(size*sizeof(double));
+  //mexPrintf("Simulate_Newton_One_Boundary, block_num=%d, size=%d, steady=%d, forward=%d, iter=%d, is_linear=%d\n", block_num, size, steady_state, forward, iter, is_linear);
+  iter = 0;
+  // The *_save arrays keep a copy of the UMFPACK-format system; Simulate_One_Boundary fills them.
+  const bool use_umfpack_save = (solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state);
+  if (use_umfpack_save)
+    {
+      Ap_save = (SuiteSparse_long*)mxMalloc((size + 1) * sizeof(SuiteSparse_long));
+      Ap_save[size] = 0; // no entries recorded yet; Simulate_One_Boundary reallocates Ai_save/Ax_save when Ap[size] differs
+      Ai_save = (SuiteSparse_long*)mxMalloc(1 * sizeof(SuiteSparse_long));
+      Ax_save = (double*)mxMalloc(1 * sizeof(double));
+      b_save = (double*)mxMalloc(size * sizeof(double)); // holds size doubles: size*sizeof(double) bytes are copied into it later
+    }
+  // Steady state: a single solve at period 0.
+  if (steady_state)
+    {
+      it_ = 0;
+      if (!is_linear)
+        solve_non_linear(block_num, y_size, 0, 0, size);
+      else
+        solve_linear(block_num, y_size, 0, 0, size, 0);
+    }
+  else if (forward)
+    {
+      // Every loop drives the member it_, which the solve_* routines read; a loop-local it_ would leave it stale.
+      if (!is_linear)
+        for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
+          solve_non_linear(block_num, y_size, y_kmin, y_kmax, size);
+      else
+        for (it_ = y_kmin; it_ < periods+y_kmin; it_++)
+          solve_linear(block_num, y_size, y_kmin, y_kmax, size, 0);
+    }
+  else
+    {
+      // Backward: from the last simulated period down to y_kmin.
+      if (!is_linear)
+        for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
+          solve_non_linear(block_num, y_size, y_kmin, y_kmax, size);
+      else
+        for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--)
+          solve_linear(block_num, y_size, y_kmin, y_kmax, size, 0);
+    }
+  if (use_umfpack_save)
+    {
+      mxFree(Ap_save);
+      mxFree(Ai_save);
+      mxFree(Ax_save);
+      mxFree(b_save);
+    }
+  mxFree(g1);
+  mxFree(r);
+}
+
+string
+dynSparseMatrix::preconditioner_print_out(string s, int preconditioner)
+{
+  // Returns s with ", preconditioner=<name>" inserted two characters before its end (before a trailing ")\n"); appended if s is shorter than 2.
+  string tmp = ", preconditioner=";
+  switch(preconditioner)
+    {
+    case 0:
+      tmp.append("Jacobi on dynamic jacobian");
+      break;
+    case 1:
+      tmp.append("incomplet lu0 on dynamic jacobian");
+      break;
+    case 2:
+      tmp.append("incomplet lut on dynamic jacobian");
+      break;
+    case 3:
+      tmp.append("lu on static jacobian");
+      break;
+    }
+  s.insert(s.length() < 2 ? s.length() : s.length() - 2, tmp); // guard: length()-2 would wrap around and make insert() throw std::out_of_range
+  return s;
+}
+
+void
+dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names)
+{
+  double top = 0.5;
+  double bottom = 0.1;
+#ifdef CUDA
+  int nnz, nnz_tild;
+  int *Ap_i, *Ai_i;
+  int *Ap_i_tild, *Ai_i_tild;
+  double *x0, *A_tild;
+
+#endif
+  int preconditioner = 2;
   if (start_compare == 0)
     start_compare = y_kmin;
   u_count_alloc_save = u_count_alloc;
   clock_t t1 = clock();
   nop1 = 0;
-  error_not_printed = true;
   mxArray *b_m = NULL, *A_m = NULL, *x0_m = NULL;
+  double *Ax = NULL, *b;
+  SuiteSparse_long *Ap = NULL, *Ai = NULL;
+
+
+
+
   if (iter > 0)
     {
       if (print_it)
@@ -3222,7 +6522,6 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
             Error << " in Simulate_Newton_Two_Boundaries, the initial values of endogenous variables are too far from the solution.\nChange them!\n";
           else
             Error << " in Simulate_Newton_Two_Boundaries, dynare cannot improve the simulation in block " << blck+1 << " at time " << it_+1 << " (variable " << index_vara[max_res_idx]+1 << ")\n";
-          //Error << filename << " stopped";
           throw FatalExceptionHandling(Error.str());
         }
       if (!(isnan(res1) || isinf(res1)) && !(isnan(g0) || isinf(g0)) && (stack_solve_algo == 4 || stack_solve_algo == 5))
@@ -3230,7 +6529,7 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
           if (try_at_iteration == 0)
             {
               prev_slowc_save = slowc_save;
-              slowc_save = max(-gp0 / (2 * (res2 - g0 - gp0)), 0.1);
+              slowc_save = max(-gp0 / (2 * (res2 - g0 - gp0)), bottom);
             }
           else
             {
@@ -3239,12 +6538,15 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
               double a = (1/(slowc_save * slowc_save) * t1 - 1/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
               double b = (-prev_slowc_save/(slowc_save * slowc_save) * t1 + slowc_save/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save);
               prev_slowc_save = slowc_save;
-              slowc_save = max(min(-b + sqrt(b*b - 3 * a * gp0) / (3 * a), 0.5 * slowc_save), 0.1 * slowc_save);
+              slowc_save = max(min(-b + sqrt(b*b - 3 * a * gp0) / (3 * a), top * slowc_save), bottom * slowc_save);
             }
           glambda2 = res2;
           try_at_iteration++;
-          if (slowc_save <= 0.1)
+          if (slowc_save <= bottom)
             {
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
               for (int i = 0; i < y_size*(periods+y_kmin); i++)
                 y[i] = ya[i]+direction[i];
               g0 = res2;
@@ -3266,13 +6568,14 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
           else
             mexPrintf("Simulation diverging, trying to correct it using slowc=%f\n", slowc_save);
         }
-
+#ifdef USE_OMP
+#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+#endif
       for (int i = 0; i < y_size*(periods+y_kmin); i++)
         y[i] = ya[i]+slowc_save*direction[i];
       iter--;
       return;
     }
-
   u_count += u_count_init;
   if (stack_solve_algo == 5)
     {
@@ -3322,10 +6625,10 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
               mexPrintf("MODEL SIMULATION: (method=Relaxation)\n");
               break;
             case 2:
-              mexPrintf("MODEL SIMULATION: (method=GMRES)\n");
+              mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=GMRES)\n", preconditioner).c_str());
               break;
             case 3:
-              mexPrintf("MODEL SIMULATION: (method=BiCGStab)\n");
+              mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=BiCGStab)\n", preconditioner).c_str());
               break;
             case 4:
               mexPrintf("MODEL SIMULATION: (method=Sparse LU & optimal path length)\n");
@@ -3333,6 +6636,9 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
             case 5:
               mexPrintf("MODEL SIMULATION: (method=ByteCode own solver)\n");
               break;
+            case 7:
+              mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=GPU BiCGStab)\n", preconditioner).c_str());
+              break;
             default:
               mexPrintf("MODEL SIMULATION: (method=Unknown - %d - )\n", stack_solve_algo);
             }
@@ -3369,26 +6675,44 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
               tmp << " in Simulate_Newton_Two_Boundaries, can't allocate x0_m vector\n";
               throw FatalExceptionHandling(tmp.str());
             }
-          A_m = mxCreateSparse(periods*Size, periods*Size, IM_i.size()* periods*2, mxREAL);
-          if (!A_m)
+          if (stack_solve_algo != 0 && stack_solve_algo != 4 && stack_solve_algo != 7)
             {
-              ostringstream tmp;
-              tmp << " in Simulate_Newton_Two_Boundaries, can't allocate A_m matrix\n";
-              throw FatalExceptionHandling(tmp.str());
+              A_m = mxCreateSparse(periods*Size, periods*Size, IM_i.size()* periods*2, mxREAL);
+              if (!A_m)
+                {
+                  ostringstream tmp;
+                  tmp << " in Simulate_Newton_Two_Boundaries, can't allocate A_m matrix\n";
+                  throw FatalExceptionHandling(tmp.str());
+                }
             }
-          Init_Matlab_Sparse(periods, y_kmin, y_kmax, Size, IM_i, A_m, b_m, x0_m);
+          if (stack_solve_algo == 0 || stack_solve_algo == 4)
+            Init_UMFPACK_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap, &Ai, &Ax, &b, x0_m);
+#ifdef CUDA
+          else if (stack_solve_algo == 7)
+            Init_CUDA_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap_i, &Ai_i, &Ax, &Ap_i_tild, &Ai_i_tild, &A_tild, &b, &x0, x0_m, &nnz, &nnz_tild, preconditioner);
+#endif
+          else
+            Init_Matlab_Sparse(periods, y_kmin, y_kmax, Size, IM_i, A_m, b_m, x0_m);
+
         }
+      //if (iter > 0)
+      /*mexPrintf("--> stack_solve_algo=%d\n", stack_solve_algo);
+      mexEvalString("drawnow;");*/
 
       if (stack_solve_algo == 0 || stack_solve_algo == 4)
-        Solve_Matlab_LU_UMFPack(A_m, b_m, Size, slowc, true, 0);
+        Solve_LU_UMFPack(Ap, Ai, Ax, b, Size * periods, Size, slowc, true, 0);
       else if (stack_solve_algo == 1)
         Solve_Matlab_Relaxation(A_m, b_m, Size, slowc, true, 0);
       else if (stack_solve_algo == 2)
-        Solve_Matlab_GMRES(A_m, b_m, Size, slowc, blck, true, 0, false, x0_m);
+        Solve_Matlab_GMRES(A_m, b_m, Size, slowc, blck, true, 0, x0_m);
       else if (stack_solve_algo == 3)
-        Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, true, 0, x0_m, false);
+        Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, true, 0, x0_m, 1);
       else if (stack_solve_algo == 5)
         Solve_ByteCode_Symbolic_Sparse_GaussianElimination(Size, symbolic, blck);
+#ifdef CUDA
+      else if (stack_solve_algo == 7)
+        Solve_CUDA_BiCGStab(Ap_i, Ai_i, Ax, Ap_i_tild, Ai_i_tild, A_tild, b, x0, Size * periods, Size, slowc, true, 0, nnz, nnz_tild, preconditioner, Size * periods, blck);
+#endif
     }
   if (print_it)
     {
@@ -3396,7 +6720,21 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
       mexPrintf("(** %f milliseconds **)\n", 1000.0*(double (t2) - double (t1))/double (CLOCKS_PER_SEC));
       mexEvalString("drawnow;");
     }
+  if ((!steady_state && (stack_solve_algo == 4 /*|| stack_solve_algo == 0*/))/* || steady_state*/)
+    {
+      clock_t t2 = clock();
+      double ax = -0.1, bx = 1.1, cx = 0.5, fa, fb, fc, xmin;
 
+      if (!mnbrak(&ax, &bx, &cx, &fa, &fb, &fc))
+        return;
+      //mexPrintf("ax= %f, bx=%f, cx=%f, fa=%f, fb=%f, fc=%d\n", ax, bx, cx, fa, fb, fc);
+      if (!golden(ax, bx, cx, 1e-1, solve_tolf, &xmin))
+        return;
+      slowc = xmin;
+      clock_t t3 = clock();
+      mexPrintf("(** %f milliseconds **)\n", 1000.0*(double (t3) - double (t2))/double (CLOCKS_PER_SEC));
+      mexEvalString("drawnow;");
+    }
   time00 = clock();
   if (tbreak_g == 0)
     tbreak_g = periods;
@@ -3404,7 +6742,7 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int
 }
 
 void
-SparseMatrix::fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1)
+dynSparseMatrix::fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1)
 {
   u_count = u_count_int * periods;
   u_count_alloc = 2*u_count;
diff --git a/mex/sources/bytecode/SparseMatrix.hh b/mex/sources/bytecode/SparseMatrix.hh
index 5303b186f..4f2fb2117 100644
--- a/mex/sources/bytecode/SparseMatrix.hh
+++ b/mex/sources/bytecode/SparseMatrix.hh
@@ -19,21 +19,62 @@
 
 #ifndef SPARSEMATRIX_HH_INCLUDED
 #define SPARSEMATRIX_HH_INCLUDED
+#define PRINTF_ printf
 
-#include <fstream>
 #include <stack>
 #include <cmath>
 #include <map>
 #include <ctime>
+#include "dynblas.h"
+#if !(defined _MSC_VER)
+#include "dynumfpack.h"
+#endif
 
-#ifdef OCTAVE_MEX_FILE
-# define CHAR_LENGTH 1
-#else
-# define CHAR_LENGTH 2
+#ifdef CUDA
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "cublas_v2.h"
+#include "cusparse_v2.h"
 #endif
 
 #include "Mem_Mngr.hh"
 #include "ErrorHandling.hh"
+//#include "Interpreter.hh"
+#include "Evaluate.hh"
+
+#define cudaChk(x, y) /* Evaluate the CUDA runtime call x once; if the result is not cudaSuccess, throw FatalExceptionHandling carrying message y. */ \
+        { \
+          cudaError_t cuda_error = x; \
+          if (cuda_error != cudaSuccess) \
+            { \
+              ostringstream tmp; \
+              tmp << y; /* y is pasted unparenthesised, so it may itself be a chain of << operands */ \
+              throw FatalExceptionHandling(tmp.str()); \
+            } \
+        }; // NOTE: this ';' is part of the expansion, so an unbraced if (c) cudaChk(a, b); else ... does not compile.
+
+#define cusparseChk(x, y) /* Evaluate the cuSPARSE call x once; if the result is not CUSPARSE_STATUS_SUCCESS, throw FatalExceptionHandling carrying message y. */ \
+        { \
+          cusparseStatus_t cusparse_status = x; \
+          if (cusparse_status != CUSPARSE_STATUS_SUCCESS) \
+            { \
+              ostringstream tmp; \
+              tmp << y; /* y is pasted unparenthesised, so it may itself be a chain of << operands */ \
+              throw FatalExceptionHandling(tmp.str()); \
+            } \
+        }; // NOTE: this ';' is part of the expansion, so an unbraced if (c) cusparseChk(a, b); else ... does not compile.
+
+#define cublasChk(x, y) /* Evaluate the cuBLAS call x once; if the result is not CUBLAS_STATUS_SUCCESS, throw FatalExceptionHandling carrying message y. */ \
+        { \
+          cublasStatus_t cublas_status = x; \
+          if (cublas_status != CUBLAS_STATUS_SUCCESS) \
+            { \
+              ostringstream tmp; \
+              tmp << y; /* y is pasted unparenthesised, so it may itself be a chain of << operands */ \
+              throw FatalExceptionHandling(tmp.str()); \
+            } \
+        }; // NOTE: this ';' is part of the expansion, so an unbraced if (c) cublasChk(a, b); else ... does not compile.
+
 #define NEW_ALLOC
 #define MARKOVITZ
 
@@ -53,41 +94,76 @@ const int IFLDZ = 4;
 const int IFMUL = 5;
 const int IFSTP = 6;
 const int IFADD = 7;
-const double eps = 1e-10;
+const double eps = 1e-15;
 const double very_big = 1e24;
 const int alt_symbolic_count_max = 1;
 const double mem_increasing_factor = 1.1;
 
-class SparseMatrix : public ErrorMsg
+
+
+class dynSparseMatrix : public Evaluate
 {
 public:
-  SparseMatrix();
-  void Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, int periods, bool print_it, bool cvg, int &iter, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names) /*throw(ErrorHandlingException)*/;
-  bool Simulate_Newton_One_Boundary(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, bool print_it, bool cvg, int &iter, bool steady_state, int stack_solve_algo, int solve_algo);
-  void Direct_Simulate(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, int periods, bool print_it, int iter);
+  #if (defined _MSC_VER)
+  typedef int64_t SuiteSparse_long;
+  #endif
+  dynSparseMatrix();
+  dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg
+#ifdef CUDA
+               ,const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
+#endif
+               );
+  void Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names);
+  void Simulate_Newton_One_Boundary(bool forward);
   void fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1);
-  void Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool steady_state, bool two_boundaries, int stack_solve_algo, int solve_algo);
+  void Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool two_boundaries, int stack_solve_algo, int solve_algo);
   void Read_file(string file_name, int periods, int u_size1, int y_size, int y_kmin, int y_kmax, int &nb_endo, int &u_count, int &u_count_init, double *u);
-  void Singular_display(int block, int Size, bool steady_state, it_code_type it_code);
-  double g0, gp0, glambda2, try_at_iteration;
+  void Singular_display(int block, int Size);
+  void End_Solver();
+  double g0, gp0, glambda2;
+  int try_at_iteration;
 
 private:
   void Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM);
   void Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m);
+  void Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m);
+#ifdef CUDA
+  void Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner);
+#endif
   void Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m);
+  void Init_UMFPACK_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m);
+  void Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m);
   void Simple_Init(int Size, std::map<std::pair<std::pair<int, int>, int>, int> &IM, bool &zero_solution);
   void End_GE(int Size);
+  bool mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc);
+  bool golden(double ax, double bx, double cx, double tol, double solve_tolf, double *xmin);
   void Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool symbolic, int Block_number);
-  bool Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool steady_state, int it_);
+  bool Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, int it_);
   void Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int Size, double slowc_l, bool is_two_boundaries, int  it_);
   void Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int it_);
-  void Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, bool steady_state, mxArray *x0_m);
-  void Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, bool steady_state);
+  void Solve_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int  it_);
+  void Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int  it_);
+  void End_Matlab_LU_UMFPack();
+#ifdef CUDA
+  void Solve_CUDA_BiCGStab_Free(double* tmp_vect_host, double* p, double* r, double* v, double* s, double* t, double* y_, double* z, double* tmp_,
+                                       int* Ai, double* Ax, int* Ap, double* x0, double* b, double* A_tild, int* A_tild_i, int* A_tild_p,
+                                       cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU,
+                                       cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner);
+  int Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, int  it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block);
+#endif
+  void Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m);
+  void Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int precond);
+  void Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old);
+  bool Simulate_One_Boundary(int blck, int y_size, int y_kmin, int y_kmax, int Size, bool cvg);
+  bool solve_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size, const int iter);
+  void solve_non_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size);
+  string preconditioner_print_out(string s, int preconditioner);
   bool compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, int periods, long int nop4,  int Size
 #ifdef PROFILER
                , long int *ndiv, long int *nsub
 #endif
                );
+  void Grad_f_product(int n, mxArray *b_m, double* vectr, mxArray *A_m, SuiteSparse_long *Ap, SuiteSparse_long *Ai, double* Ax, double *b);
   void Insert(const int r, const int c, const int u_index, const int lag_index);
   void Delete(const int r, const int c);
   int At_Row(int r, NonZeroElem **first);
@@ -102,7 +178,8 @@ private:
   void Delete_u(int pos);
   void Clear_u();
   void Print_u();
-  void CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods, int iter);
+  void *Symbolic, *Numeric ;
+  void CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods);
   void Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size, double *u, int *pivot, int *b);
   int complete(int beg_t, int Size, int periods, int *b);
   void bksub(int tbreak, int last_period, int Size, double slowc_l
@@ -118,12 +195,18 @@ private:
   mxArray *Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m);
   mxArray *Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m);
   mxArray *substract_A_B(mxArray *A_m, mxArray *B_m);
-
+#ifdef CUDA
+  int CUDA_device;
+  cublasHandle_t cublas_handle;
+  cusparseHandle_t cusparse_handle;
+  cusparseMatDescr_t CUDA_descr;
+#endif
+protected:
   stack<double> Stack;
   int nb_prologue_table_u, nb_first_table_u, nb_middle_table_u, nb_last_table_u;
   int nb_prologue_table_y, nb_first_table_y, nb_middle_table_y, nb_last_table_y;
   int middle_count_loop;
-  char type;
+  //char type;
   fstream SaveCode;
   string filename;
   int max_u, min_u;
@@ -154,19 +237,19 @@ protected:
   int u_count_alloc, u_count_alloc_save;
   vector<double *> jac;
   double *jcb;
-  double res1, res2, max_res;
-  int max_res_idx;
   double slowc, slowc_save, prev_slowc_save, markowitz_c;
   int y_decal;
-  int  *index_vara, *index_equa;
+  int *index_equa;
   int u_count, tbreak_g;
   int iter;
-  double *direction;
   int start_compare;
   int restart;
-  bool error_not_printed;
   double g_lambda1, g_lambda2, gp_0;
   double lu_inc_tol;
+//private:
+  SuiteSparse_long *Ap_save, *Ai_save;
+  double *Ax_save, *b_save;
+  mxArray *A_m_save, *b_m_save;
 };
 
 #endif
diff --git a/mex/sources/bytecode/bytecode.cc b/mex/sources/bytecode/bytecode.cc
index 9d3e9321e..3dde9e031 100644
--- a/mex/sources/bytecode/bytecode.cc
+++ b/mex/sources/bytecode/bytecode.cc
@@ -18,12 +18,18 @@
  */
 #include <cstring>
 #include "Interpreter.hh"
+#include "ErrorHandling.hh"
+#include <ctime>
+#include <math.h>
 
 #ifdef DEBUG_EX
 
 using namespace std;
 # include <sstream>
 
+
+
+
 string
 Get_Argument(const char *argv)
 {
@@ -33,14 +39,17 @@ Get_Argument(const char *argv)
 
 #else
 
+void (*prev_fn)(int);
+
+
 string
 Get_Argument(const mxArray *prhs)
 {
   const mxArray *mxa = prhs;
-  int buflen = mxGetM(mxa) * mxGetN(mxa) + 1;
+  mwSize buflen = mwSize(mxGetM(mxa) * mxGetN(mxa) + 1);
   char *first_argument;
   first_argument = (char *) mxCalloc(buflen, sizeof(char));
-  int status = mxGetString(mxa, first_argument, buflen);
+  size_t status = mxGetString(mxa, first_argument, buflen);
   if (status != 0)
     mexWarnMsgTxt("Not enough space. The first argument is truncated.");
   string f(first_argument);
@@ -49,6 +58,178 @@ Get_Argument(const mxArray *prhs)
 }
 #endif
 
+
+//#include <windows.h> 
+#include <stdio.h> 
+ 
+
+#ifdef CUDA
+// Lists every CUDA device, selects the one with the highest compute capability and creates the
+// cuBLAS handle, cuSPARSE handle and general zero-based matrix descriptor returned through the
+// pointer arguments. Returns the selected device index; throws FatalExceptionHandling on failure.
+int
+GPU_Test_and_Info(cublasHandle_t *cublas_handle, cusparseHandle_t *cusparse_handle, cusparseMatDescr_t *descr)
+{
+  cudaDeviceProp deviceProp;
+  int device_count = 0, device = 0, version, version_max = 0;
+  cudaError_t cuda_error;
+  *descr=0;
+  /* ask cuda how many devices it can find; a failed query is handled like "no device" */
+  cuda_error = cudaGetDeviceCount(&device_count);
+  if (cuda_error != cudaSuccess || device_count < 1)
+    {
+      /* if it couldn't find any fail out */
+      ostringstream tmp;
+      tmp << " Unable to find a CUDA device. Unable to implement CUDA solvers\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  else
+    {
+      mexPrintf("-----------------------------------------\n");
+      for (int i = 0; i < device_count; i++)
+        {
+          cudaSetDevice(i);
+          // Statistics about the GPU device
+          cuda_error = cudaGetDeviceProperties(&deviceProp, i);
+          if (cuda_error != cudaSuccess)
+            {
+               ostringstream tmp;
+               tmp << "  bytecode cudaGetDeviceProperties failed\n";
+               throw FatalExceptionHandling(tmp.str());
+            }
+          mexPrintf("> GPU device %d: \"%s\" has:\n   - %d Multi-Processors,\n   - %d threads per multiprocessor,\n", i, deviceProp.name, deviceProp.multiProcessorCount, deviceProp.maxThreadsPerMultiProcessor);
+          mexEvalString("drawnow;");
+          version = (deviceProp.major * 0x10 + deviceProp.minor); // e.g. 0x21 for compute capability 2.1
+          if (version >= version_max) // ">=" means that ties go to the last device listed
+            {
+              device = i;
+              version_max = version;
+            }
+          mexPrintf("   - %4.2fMhz clock rate,\n   - %2.0fMb of memory,\n   - %d.%d compute capabilities.\n", double(deviceProp.clockRate) / (1024 * 1024), double(deviceProp.totalGlobalMem) / (1024 * 1024), deviceProp.major, deviceProp.minor);
+          mexEvalString("drawnow;");
+        }
+    }
+  mexPrintf("> Device %d selected\n", device);
+  mexEvalString("drawnow;");
+
+  cuda_error = cudaSetDevice(device);
+  if (cuda_error != cudaSuccess)
+    {
+       ostringstream tmp;
+       tmp << "  bytecode cudaSetDevice failed\n";
+       throw FatalExceptionHandling(tmp.str());
+    }
+
+  if(version_max < 0x11)
+    {
+       ostringstream tmp;
+       tmp << "  bytecode requires a minimum CUDA compute 1.1 capability\n";
+       cudaDeviceReset();
+       throw FatalExceptionHandling(tmp.str());
+    }
+
+  // Initialize CuBlas library
+  cublasStatus_t cublas_status = cublasCreate(cublas_handle);
+  if (cublas_status != CUBLAS_STATUS_SUCCESS)
+    {
+      ostringstream tmp;
+      switch(cublas_status)
+        {
+          case CUBLAS_STATUS_NOT_INITIALIZED:
+            tmp << " the CUBLAS initialization failed.\n";
+            break;
+          case CUBLAS_STATUS_ALLOC_FAILED:
+            tmp << " the resources could not be allocated.\n";
+            break;
+          default:
+            tmp << " unknown error during the initialization of cublas library.\n";
+        }
+      throw FatalExceptionHandling(tmp.str());
+    }
+
+  // Initialize the CuSparse library
+  cusparseStatus_t cusparse_status = cusparseCreate(cusparse_handle);
+  if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
+    {
+      ostringstream tmp;
+      switch(cusparse_status)
+        {
+          case CUSPARSE_STATUS_NOT_INITIALIZED:
+            tmp << " the CUDA Runtime initialization failed.\n";
+            break;
+          case CUSPARSE_STATUS_ALLOC_FAILED:
+            tmp <<  " the resources could not be allocated.\n";
+            break;
+          case CUSPARSE_STATUS_ARCH_MISMATCH:
+            tmp <<  " the device compute capability (CC) is less than 1.1. The CC of at least 1.1 is required.\n";
+            break;
+          default:
+            tmp << " unknown error during the initialization of cusparse library.\n";
+        }
+      throw FatalExceptionHandling(tmp.str());
+    }
+
+  // Create and setup matrix descriptor
+  cusparse_status = cusparseCreateMatDescr(descr);
+  if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
+    {
+      ostringstream tmp;
+      tmp << " Matrix descriptor initialization failed\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  cusparseSetMatType(*descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+  cusparseSetMatIndexBase(*descr, CUSPARSE_INDEX_BASE_ZERO);
+
+  mexPrintf("> Driver version:\n");
+  int cuda_version;
+  cuda_error = cudaDriverGetVersion(&cuda_version);
+  if (cuda_error  != cudaSuccess)
+    {
+      ostringstream tmp;
+      tmp << " cudaDriverGetVersion has failed\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  mexPrintf("   - CUDA version %5.3f\n", double(cuda_version) / 1000);
+  int cublas_version;
+  cublas_status = cublasGetVersion(*cublas_handle, &cublas_version);
+  if (cublas_status != CUBLAS_STATUS_SUCCESS)
+    {
+      ostringstream tmp;
+      tmp << " cublasGetVersion has failed\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  mexPrintf("   - CUBLAS version %5.3f\n", double(cublas_version) / 1000);
+  int cusparse_version;
+  cusparse_status = cusparseGetVersion(*cusparse_handle, &cusparse_version);
+  if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
+    {
+      ostringstream tmp;
+      tmp << " cusparseGetVersion has failed\n";
+      throw FatalExceptionHandling(tmp.str());
+    }
+  mexPrintf("   - CUSPARSE version %5.3f\n", double(cusparse_version) / 1000);
+  mexPrintf("-----------------------------------------\n");
+  return device;
+}
+
+void
+GPU_close(cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle, cusparseMatDescr_t descr)
+{ // Releases, in this order, the cuBLAS handle, the cuSPARSE matrix descriptor and the cuSPARSE handle.
+  cublasChk(cublasDestroy(cublas_handle),"in bytecode cublasDestroy failed\n"); // the *Chk macros (SparseMatrix.hh) throw FatalExceptionHandling on a non-success status, so a failure here skips the releases below
+  cusparseChk(cusparseDestroyMatDescr(descr), "in bytecode cusparseDestroyMatDescr failed\n");
+  cusparseChk(cusparseDestroy(cusparse_handle),"in bytecode cusparseDestroy failed\n");
+}
+
+#endif
+string
+deblank(string x)
+{ // Returns a copy of x with every ' ' removed (interior spaces too, not only leading/trailing ones).
+  string::size_type pos = 0;
+  while ((pos = x.find(' ', pos)) != string::npos)
+    x.erase(pos, 1);
+  return x;
+}
+
 void
 Get_Arguments_and_global_variables(int nrhs,
 #ifndef DEBUG_EX
@@ -57,10 +238,10 @@ Get_Arguments_and_global_variables(int nrhs,
                                    const char *prhs[],
 #endif
                                    int &count_array_argument,
-                                   double *yd[], unsigned int &row_y, unsigned int &col_y,
-                                   double *xd[], unsigned int &row_x, unsigned int &col_x,
-                                   double *params[], 
-                                   double *steady_yd[], unsigned int &steady_row_y, unsigned int &steady_col_y,
+                                   double *yd[], size_t &row_y, size_t &col_y,
+                                   double *xd[], size_t &row_x, size_t &col_x,
+                                   double *params[],
+                                   double *steady_yd[], size_t &steady_row_y, size_t &steady_col_y,
                                    unsigned int &periods,
 #ifndef DEBUG_EX
                                    mxArray *block_structur[],
@@ -69,8 +250,10 @@ Get_Arguments_and_global_variables(int nrhs,
                                    mxArray *M_[], mxArray *oo_[], mxArray *options_[], bool &global_temporary_terms,
                                    bool &print,
                                    bool &print_error,
-                                   mxArray *GlobalTemporaryTerms[])
+                                   mxArray *GlobalTemporaryTerms[],
+                                   string *plan_struct_name, string *pfplan_struct_name)
 {
+  size_t pos;
 #ifdef DEBUG_EX
   for (int i = 2; i < nrhs; i++)
 #else
@@ -101,7 +284,7 @@ Get_Arguments_and_global_variables(int nrhs,
                 steady_col_y = mxGetN(prhs[i]);
                 break;
               case 4:
-                periods = mxGetScalar(prhs[i]);
+                periods = int(mxGetScalar(prhs[i]));
                 break;
               case 5:
                 *block_structur = mxDuplicateArray(prhs[i]);
@@ -111,7 +294,7 @@ Get_Arguments_and_global_variables(int nrhs,
                 *GlobalTemporaryTerms = mxDuplicateArray(prhs[i]);
                 break;
               default:
-                //mexPrintf("Unknown argument count_array_argument=%d\n",count_array_argument);
+                mexPrintf("Unknown argument count_array_argument=%d\n",count_array_argument);
                 break;
               }
             count_array_argument++;
@@ -132,16 +315,34 @@ Get_Arguments_and_global_variables(int nrhs,
             print_error = false;
           else
             {
-              int pos = Get_Argument(prhs[i]).find("block");
-              if (pos != (int) string::npos)
+              ;
+              if ((pos = Get_Argument(prhs[i]).find("block")) != (int) string::npos)
                 {
-                  int pos1 = Get_Argument(prhs[i]).find("=", pos+5);
+                  size_t pos1 = Get_Argument(prhs[i]).find("=", pos+5);
                   if (pos1 != (int) string::npos)
                     pos = pos1 + 1;
                   else
                     pos += 5;
                   block =  atoi(Get_Argument(prhs[i]).substr(pos, string::npos).c_str())-1;
                 }
+              else if ((pos = Get_Argument(prhs[i]).find("pfplan")) != (int) string::npos)
+                {
+                  size_t pos1 = Get_Argument(prhs[i]).find("=", pos+6);
+                  if (pos1 != (int) string::npos)
+                    pos = pos1 + 1;
+                  else
+                    pos += 6;
+                  *pfplan_struct_name =  deblank(Get_Argument(prhs[i]).substr(pos, string::npos));
+                }
+              else if ((pos = Get_Argument(prhs[i]).find("plan")) != (int) string::npos)
+                {
+                  size_t pos1 = Get_Argument(prhs[i]).find("=", pos+4);
+                  if (pos1 != (int) string::npos)
+                    pos = pos1 + 1;
+                  else
+                    pos += 4;
+                  *plan_struct_name =  deblank(Get_Argument(prhs[i]).substr(pos, string::npos));
+                }
               else
                 {
                   ostringstream tmp;
@@ -185,6 +386,7 @@ Get_Arguments_and_global_variables(int nrhs,
     }
 }
 
+
 #ifdef DEBUG_EX
 int
 main(int nrhs, const char *prhs[])
@@ -203,9 +405,9 @@ main(int nrhs, const char *prhs[])
   char *plhs[1];
   load_global((char *) prhs[1]);
 #endif
-  //ErrorHandlingException error_handling;
-  unsigned int i, row_y = 0, col_y = 0, row_x = 0, col_x = 0, nb_row_xd = 0;
-  unsigned int steady_row_y, steady_col_y;
+  mxArray *plan_struct = NULL, *pfplan_struct = NULL;
+  size_t i, row_y = 0, col_y = 0, row_x = 0, col_x = 0, nb_row_xd = 0;
+  size_t steady_row_y, steady_col_y;
   int y_kmin = 0, y_kmax = 0, y_decal = 0;
   unsigned int periods = 1;
   double *direction;
@@ -218,13 +420,22 @@ main(int nrhs, const char *prhs[])
   bool global_temporary_terms = false;
   bool print = false, print_error = true, print_it = false;
   double *steady_yd = NULL, *steady_xd = NULL;
-  
+  string plan, pfplan;
+
+  vector<s_plan> splan, spfplan;
+
+#ifdef CUDA
+  int CUDA_device = -1;
+  cublasHandle_t cublas_handle;
+  cusparseHandle_t cusparse_handle;
+  cusparseMatDescr_t descr;
+#endif
   try
     {
       Get_Arguments_and_global_variables(nrhs, prhs, count_array_argument,
                                          &yd, row_y, col_y,
                                          &xd, row_x, col_x,
-                                         &params, 
+                                         &params,
                                          &steady_yd, steady_row_y, steady_col_y,
                                          periods,
 #ifndef DEBUG_EX
@@ -232,123 +443,401 @@ main(int nrhs, const char *prhs[])
 #endif
                                          steady_state, evaluate, block,
                                          &M_, &oo_, &options_, global_temporary_terms,
-                                         print, print_error, &GlobalTemporaryTerms);
+                                         print, print_error, &GlobalTemporaryTerms,
+                                         &plan, &pfplan);
     }
   catch (GeneralExceptionHandling &feh)
     {
       DYN_MEX_FUNC_ERR_MSG_TXT(feh.GetErrorMsg().c_str());
     }
-
   if (!count_array_argument)
-    params = mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "params")));
+    {
+      int field = mxGetFieldNumber(M_, "params");
+      if (field < 0)
+        DYN_MEX_FUNC_ERR_MSG_TXT("params is not a field of M_");
+      params = mxGetPr(mxGetFieldByNumber(M_, 0, field));
+    }
 
+  ErrorMsg emsg;
   
+
+
+  // Optional plan argument: load the struct array named by plan from the base workspace into splan
+  // (one s_plan per column of that array) and print the resulting table.
+  if (plan.length()>0)
+    {
+      plan_struct = mexGetVariable("base", plan.c_str()); // assign main's plan_struct rather than shadowing it with a block-local pointer
+      if (plan_struct == NULL)
+        {
+          string tmp = plan;
+          tmp.insert(0,"Can't find the plan: ");
+          DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str());
+        }
+      size_t n_plan = mxGetN(plan_struct);
+      splan.resize(n_plan);
+      for (size_t i = 0; i < n_plan; i++)
+        {
+          splan[i].var = "";
+          splan[i].exo = "";
+          mxArray* tmp = mxGetField(plan_struct, i, "exo"); // NOTE(review): field "exo" feeds .var and field "var" feeds .exo, the reverse of the pfplan handling below - confirm which mapping is intended
+          if (tmp)
+            {
+              char name[100] = ""; // stays an empty string if mxGetString cannot fill it
+              mxGetString(tmp, name, 100);
+              splan[i].var = name;
+              SymbolType variable_type;
+              int exo_num = emsg.get_ID(name, &variable_type);
+              if (variable_type == eExogenous || variable_type == eExogenousDet)
+                splan[i].var_num = exo_num;
+              else
+                {
+                  string tmp = name;
+                  tmp.insert(0,"the variable '");
+                  tmp.append("'  defined as var in plan is not an exogenous or a deterministic exogenous\n");
+                  DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str());
+                }
+            }
+          tmp = mxGetField(plan_struct, i, "var");
+          if (tmp)
+            {
+              char name[100] = ""; // stays an empty string if mxGetString cannot fill it
+              mxGetString(tmp, name, 100);
+              splan[i].exo = name;
+              SymbolType variable_type;
+              int exo_num = emsg.get_ID(name, &variable_type);
+              if (variable_type == eEndogenous)
+                splan[i].exo_num = exo_num;
+              else
+                {
+                  string tmp = name;
+                  tmp.insert(0,"the variable '");
+                  tmp.append("'  defined as exo in plan is not an endogenous variable\n");
+                  DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str());
+                }
+            }
+          tmp = mxGetField(plan_struct, i, "per_value");
+          if (tmp)
+            {
+              // per_value is read column-major: the first num_shocks entries are periods, the next num_shocks are values
+              size_t num_shocks = mxGetM(tmp);
+              (splan[i]).per_value.resize(num_shocks);
+              double * per_value = mxGetPr(tmp);
+              for (size_t j = 0; j < num_shocks; j++)
+                (splan[i]).per_value[j] = make_pair(ceil(per_value[j]), per_value[j + num_shocks]);
+            }
+        }
+      int i = 0; // 0-based display counter; it was previously read uninitialised
+      for (vector<s_plan>::iterator it = splan.begin(); it != splan.end(); it++)
+        {
+          mexPrintf("----------------------------------------------------------------------------------------------------\n");
+          mexPrintf("surprise no. %d\n", i+1);
+          if (it->exo.length())
+            mexPrintf(" plan fliping var=%s (%d) exo=%s (%d) for the following periods and with the following values:\n", it->var.c_str(), it->var_num, it->exo.c_str(), it->exo_num);
+          else
+            mexPrintf(" plan shocks on var=%s for the following periods and with the following values:\n", it->var.c_str());
+          for (vector<pair<int, double> >::iterator it1 = it->per_value.begin(); it1 != it->per_value.end(); it1++)
+            mexPrintf("  %3d %10.5f\n",it1->first, it1->second);
+          i++;
+        }
+    }
+
+  // Optional pfplan argument: load the struct array named by pfplan from the base workspace into
+  // spfplan (one s_plan per column of that array) and print the resulting table.
+  if (pfplan.length()>0)
+    {
+      pfplan_struct = mexGetVariable("base", pfplan.c_str());
+      if (!pfplan_struct)
+        {
+          string tmp = pfplan;
+          tmp.insert(0,"Can't find the pfplan: ");
+          DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str());
+        }
+      size_t n_plan = mxGetN(pfplan_struct);
+      spfplan.resize(n_plan);
+      for (size_t i = 0; i < n_plan; i++)
+        {
+          spfplan[i].var = "";
+          spfplan[i].exo = "";
+          mxArray* tmp = mxGetField(pfplan_struct, i, "var");
+          if (tmp)
+            {
+              char name[100] = ""; // stays an empty string if mxGetString cannot fill it
+              mxGetString(tmp, name, 100);
+              spfplan[i].var = name;
+              SymbolType variable_type;
+              int exo_num = emsg.get_ID(name, &variable_type);
+              if (variable_type == eExogenous || variable_type == eExogenousDet)
+                spfplan[i].var_num = exo_num; // was splan[i]: wrong vector, and out of bounds when no plan was given
+              else
+                {
+                  string tmp = name;
+                  tmp.insert(0,"the variable '");
+                  tmp.append("' defined as var in pfplan is not an exogenous or a deterministic exogenous\n");
+                  DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str());
+                }
+            }
+          tmp = mxGetField(pfplan_struct, i, "exo");
+          if (tmp)
+            {
+              char name[100] = ""; // stays an empty string if mxGetString cannot fill it
+              mxGetString(tmp, name, 100);
+              spfplan[i].exo = name;
+              SymbolType variable_type;
+              int exo_num = emsg.get_ID(name, &variable_type);
+              if (variable_type == eEndogenous)
+                spfplan[i].exo_num = exo_num;
+              else
+                {
+                  string tmp = name;
+                  tmp.insert(0,"the variable '");
+                  tmp.append("' defined as exo in pfplan  is not an endogenous variable\n");
+                  DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str());
+                }
+            }
+          tmp = mxGetField(pfplan_struct, i, "per_value");
+          if (tmp)
+            {
+              // per_value is read column-major: the first num_shocks entries are periods, the next num_shocks are values
+              size_t num_shocks = mxGetM(tmp);
+              double * per_value = mxGetPr(tmp);
+              (spfplan[i]).per_value.resize(num_shocks);
+              for (size_t j = 0; j < num_shocks; j++)
+                spfplan[i].per_value[j] = make_pair(ceil(per_value[j]), per_value[j+ num_shocks]);
+            }
+        }
+      int i = 0; // 0-based display counter; it was previously read uninitialised
+      for (vector<s_plan>::iterator it = spfplan.begin(); it != spfplan.end(); it++)
+        {
+          mexPrintf("----------------------------------------------------------------------------------------------------\n");
+          mexPrintf("perfect foresight no. %d\n", i+1);
+          if (it->exo.length())
+            mexPrintf(" plan flipping var=%s (%d) exo=%s (%d) for the following periods and with the following values:\n", it->var.c_str(), it->var_num, it->exo.c_str(), it->exo_num);
+          else
+            mexPrintf(" plan shocks on var=%s (%d) for the following periods and with the following values:\n", it->var.c_str(), it->var_num);
+          for (vector<pair<int, double> >::iterator it1 = it->per_value.begin(); it1 != it->per_value.end(); it1++)
+            mexPrintf("  %3d %10.5f\n",it1->first, it1->second);
+          i++;
+        }
+    }
+
+
+
+  int field_steady_state = mxGetFieldNumber(oo_, "steady_state");
+  if (field_steady_state < 0)
+    DYN_MEX_FUNC_ERR_MSG_TXT("steady_state is not a field of oo_");
+  int field_exo_steady_state = mxGetFieldNumber(oo_, "exo_steady_state");
+  if (field_exo_steady_state < 0)
+    DYN_MEX_FUNC_ERR_MSG_TXT("exo_steady_state is not a field of oo_");
+
   if (!steady_state)
     {
+      int field_endo_simul = mxGetFieldNumber(oo_, "endo_simul");
+      if (field_endo_simul < 0)
+        DYN_MEX_FUNC_ERR_MSG_TXT("endo_simul is not a field of oo_");
+
+      int field_exo_simul = mxGetFieldNumber(oo_, "exo_simul");
+      if (field_exo_simul < 0)
+        DYN_MEX_FUNC_ERR_MSG_TXT("exo_simul is not a field of oo_");
+
       if (!count_array_argument)
         {
-          yd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "endo_simul")));
-          row_y = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "endo_simul")));
-          col_y = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "endo_simul")));
-          xd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_simul")));
-          row_x = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_simul")));
-          col_x = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_simul")));
+          mxArray* endo_sim_arr = mxGetFieldByNumber(oo_, 0, field_endo_simul);
+          yd = mxGetPr(endo_sim_arr);
+          row_y = mxGetM(endo_sim_arr);
+          col_y = mxGetN(endo_sim_arr);
+          mxArray* exo_sim_arr = mxGetFieldByNumber(oo_, 0, field_exo_simul);
+          xd = mxGetPr(exo_sim_arr);
+          row_x = mxGetM(exo_sim_arr);
+          col_x = mxGetN(exo_sim_arr);
           nb_row_xd = row_x;
         }
+      int field = mxGetFieldNumber(M_, "maximum_lag");
+      if (field >= 0)
+        y_kmin = int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, field)))));
+      else
+        DYN_MEX_FUNC_ERR_MSG_TXT("maximum_lag is not a field of M_");
+      field = mxGetFieldNumber(M_, "maximum_lead");
+      if (field >= 0)
+        y_kmax = int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, field)))));
+      else
+        DYN_MEX_FUNC_ERR_MSG_TXT("maximum_lead is not a field of M_");
+      field = mxGetFieldNumber(M_, "maximum_endo_lag");
+      if (field >= 0)
+        y_decal = max(0, y_kmin-int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, field))))));
+      else
+        DYN_MEX_FUNC_ERR_MSG_TXT("maximum_endo_lag is not a field of M_");
 
-      y_kmin = int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "maximum_lag"))))));
-      y_kmax = int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "maximum_lead"))))));
-      y_decal = max(0, y_kmin-int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "maximum_endo_lag")))))));
       if (!count_array_argument)
-        periods = int (floor(*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "periods"))))));
+        {
+          int field = mxGetFieldNumber(options_, "periods"); // negative when options_ has no such field
+          if (field >= 0)
+            periods = int (floor(*(mxGetPr(mxGetFieldByNumber(options_, 0, field)))));
+          else
+            DYN_MEX_FUNC_ERR_MSG_TXT("periods is not a field of options_");
+        }
+
       if (!steady_yd )
         {
-          steady_yd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state")));
-          steady_row_y = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state")));
-          steady_col_y = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state")));;
+          mxArray* steady_state_arr = mxGetFieldByNumber(oo_, 0, field_steady_state);
+          steady_yd = mxGetPr(steady_state_arr);
+          steady_row_y = mxGetM(steady_state_arr);
+          steady_col_y = mxGetN(steady_state_arr);
         }
-      steady_xd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_steady_state")));
+      steady_xd = mxGetPr(mxGetFieldByNumber(oo_, 0, field_exo_steady_state));
     }
   else
     {
       if (!count_array_argument)
         {
-          yd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state")));
-          row_y = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state")));
-          col_y = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state")));;
+          mxArray* steady_state_arr = mxGetFieldByNumber(oo_, 0, field_steady_state);
+          yd = mxGetPr(steady_state_arr);
+          row_y = mxGetM(steady_state_arr);
+          col_y = mxGetN(steady_state_arr);
 
-          xd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_steady_state")));
-          row_x = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_steady_state")));
-          col_x = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_steady_state")));
+          mxArray* exo_steady_state_arr = mxGetFieldByNumber(oo_, 0, field_exo_steady_state);
+          xd = mxGetPr(exo_steady_state_arr);
+          row_x = mxGetM(exo_steady_state_arr);
+          col_x = mxGetN(exo_steady_state_arr);
           nb_row_xd = row_x;
         }
     }
-  int verbose= int(*mxGetPr((mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "verbosity")))));
+  int field = mxGetFieldNumber(options_, "verbosity");
+  int verbose = 0;
+  if (field >= 0)
+    verbose = int(*mxGetPr((mxGetFieldByNumber(options_, 0, field))));
+  else
+    DYN_MEX_FUNC_ERR_MSG_TXT("verbosity is not a field of options_");
   if (verbose)
     print_it = true;
-  int maxit_ = int (floor(*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "maxit_"))))));
-  double slowc = double (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "slowc")))));
-  double markowitz_c = double (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "markowitz")))));
-  int minimal_solving_periods = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "minimal_solving_periods")))));
-  int stack_solve_algo = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "stack_solve_algo")))));
+  field = mxGetFieldNumber(options_, "maxit_");
+  if (field < 0)
+    DYN_MEX_FUNC_ERR_MSG_TXT("maxit_ is not a field of options_");
+  int maxit_ = int (floor(*(mxGetPr(mxGetFieldByNumber(options_, 0, field)))));
+  field = mxGetFieldNumber(options_, "slowc"); // negative when options_ has no such field
+  if (field < 0)
+    DYN_MEX_FUNC_ERR_MSG_TXT("slowc is not a field of options_");
+  double slowc = double (*(mxGetPr(mxGetFieldByNumber(options_, 0, field))));
+  field = mxGetFieldNumber(options_, "markowitz");
+  if (field < 0)
+    DYN_MEX_FUNC_ERR_MSG_TXT("markowitz is not a field of options_");
+  double markowitz_c = double (*(mxGetPr(mxGetFieldByNumber(options_, 0, field))));
+  field = mxGetFieldNumber(options_, "minimal_solving_periods");
+  if (field < 0)
+    DYN_MEX_FUNC_ERR_MSG_TXT("minimal_solving_periods is not a field of options_");
+  int minimal_solving_periods = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, field))));
+  field = mxGetFieldNumber(options_, "stack_solve_algo");
+  if (field < 0)
+    DYN_MEX_FUNC_ERR_MSG_TXT("stack_solve_algo is not a field of options_");
+  int stack_solve_algo = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, field))));
   int solve_algo;
   double solve_tolf;
+
   if (steady_state)
     {
-      solve_algo = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "solve_algo")))));
-      solve_tolf = *(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "solve_tolf"))));
+      int field = mxGetFieldNumber(options_, "solve_algo");
+      if (field >= 0)
+        solve_algo = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, field))));
+      else
+        DYN_MEX_FUNC_ERR_MSG_TXT("solve_algo is not a field of options_");
+      field = mxGetFieldNumber(options_, "solve_tolf");
+      if (field >= 0)
+        solve_tolf = *(mxGetPr(mxGetFieldByNumber(options_, 0, field)));
+      else
+        DYN_MEX_FUNC_ERR_MSG_TXT("solve_tolf is not a field of options_");
     }
   else
     {
       solve_algo = stack_solve_algo;
-      mxArray *dynatol = mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "dynatol"));
-      solve_tolf= *mxGetPr((mxGetFieldByNumber(dynatol, 0, mxGetFieldNumber(dynatol, "f"))));
+      int field = mxGetFieldNumber(options_, "dynatol");
+      mxArray *dynatol;
+      if (field >= 0)
+        dynatol = mxGetFieldByNumber(options_, 0, field);
+      else
+        DYN_MEX_FUNC_ERR_MSG_TXT("dynatol is not a field of options_");
+      field = mxGetFieldNumber(dynatol, "f");
+      if (field >= 0)
+        solve_tolf= *mxGetPr((mxGetFieldByNumber(dynatol, 0, field)));
+      else
+        DYN_MEX_FUNC_ERR_MSG_TXT("f is not a field of options_.dynatol");
     }
-
-  mxArray *mxa = mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "fname"));
-  int buflen = mxGetM(mxa) * mxGetN(mxa) + 1;
+  field = mxGetFieldNumber(M_, "fname");
+  mxArray *mxa;
+  if (field >= 0)
+    mxa = mxGetFieldByNumber(M_, 0, field);
+  else
+    DYN_MEX_FUNC_ERR_MSG_TXT("fname is not a field of M_");
+  size_t buflen = mxGetM(mxa) * mxGetN(mxa) + 1;
   char *fname;
   fname = (char *) mxCalloc(buflen+1, sizeof(char));
-  int status = mxGetString(mxa, fname, buflen);
+  size_t status = mxGetString(mxa, fname, int(buflen));
   fname[buflen] = ' ';
   if (status != 0)
     mexWarnMsgTxt("Not enough space. Filename is truncated.");
   string file_name = fname;
 
-  int size_of_direction = col_y*row_y*sizeof(double);
-  double *y = (double *) mxMalloc(size_of_direction);
-  double *ya = (double *) mxMalloc(size_of_direction);
-  direction = (double *) mxMalloc(size_of_direction);
-  memset(direction, 0, size_of_direction);
-
-  double *x = (double *) mxMalloc(col_x*row_x*sizeof(double));
-  for (i = 0; i < row_x*col_x; i++)
-    x[i] = double (xd[i]);
-  for (i = 0; i < row_y*col_y; i++)
-    {
-      y[i]  = double (yd[i]);
-      ya[i] = double (yd[i]);
-    }
-  int y_size = row_y;
-  int nb_row_x = row_x;
-  clock_t t0 = clock();
-
-  Interpreter interprete(params, y, ya, x, steady_yd, steady_xd, direction, y_size, nb_row_x, nb_row_xd, periods, y_kmin, y_kmax, maxit_, solve_tolf, size_of_direction, slowc, y_decal, markowitz_c, file_name, minimal_solving_periods, stack_solve_algo, solve_algo, global_temporary_terms, print, print_error, GlobalTemporaryTerms);
-
-  string f(fname);
-  mxFree(fname);
-  int nb_blocks = 0;
-  double *pind;
-  bool no_error = true;
-
+#ifdef CUDA
   try
     {
-      interprete.compute_blocks(f, f, steady_state, evaluate, block, nb_blocks,print_it);
+      if (stack_solve_algo == 7 && !steady_state)
+        CUDA_device = GPU_Test_and_Info(&cublas_handle, &cusparse_handle, &descr);
     }
   catch (GeneralExceptionHandling &feh)
     {
       DYN_MEX_FUNC_ERR_MSG_TXT(feh.GetErrorMsg().c_str());
     }
+#else
+  if (stack_solve_algo == 7 && !steady_state)
+    DYN_MEX_FUNC_ERR_MSG_TXT("bytecode has not been compiled with CUDA option. Bytecode Can't use options_.stack_solve_algo=7\n");
+#endif
+
+  size_t size_of_direction = col_y*row_y*sizeof(double);
+  double *y = (double *) mxMalloc(size_of_direction);
+  double *ya = (double *) mxMalloc(size_of_direction);
+  direction = (double *) mxMalloc(size_of_direction);
+  memset(direction, 0, size_of_direction);
+  double *x = (double *) mxMalloc(col_x*row_x*sizeof(double));
+  #ifdef USE_OMP
+  #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+  #endif
+  for (i = 0; i < row_x*col_x; i++)
+    {
+      x[i] = double (xd[i]);
+    }
+
+  #ifdef USE_OMP
+  #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
+  #endif
+  for (i = 0; i < row_y*col_y; i++)
+    {
+      y[i]  = double (yd[i]);
+      ya[i] = double (yd[i]);
+    }
+  size_t y_size = row_y;
+  size_t nb_row_x = row_x;
+  clock_t t0 = clock();
+  Interpreter interprete(params, y, ya, x, steady_yd, steady_xd, direction, y_size, nb_row_x, nb_row_xd, periods, y_kmin, y_kmax, maxit_, solve_tolf, size_of_direction, slowc, y_decal,
+                         markowitz_c, file_name, minimal_solving_periods, stack_solve_algo, solve_algo, global_temporary_terms, print, print_error, GlobalTemporaryTerms, steady_state,
+                         print_it
+#ifdef CUDA
+                         , CUDA_device, cublas_handle, cusparse_handle, descr
+#endif
+                         );
+  string f(fname);
+  mxFree(fname);
+  int nb_blocks = 0;
+  double *pind;
+  bool no_error = true;
+  try
+    {
+      interprete.compute_blocks(f, f, evaluate, block, nb_blocks);
+    }
+  catch (GeneralExceptionHandling &feh)
+    {
+      DYN_MEX_FUNC_ERR_MSG_TXT(feh.GetErrorMsg().c_str());
+    }
+
+#ifdef CUDA
+  if (stack_solve_algo == 7 && !steady_state)
+    GPU_close(cublas_handle, cusparse_handle, descr);
+#endif
 
   clock_t t1 = clock();
   if (!steady_state && !evaluate && no_error && print)
@@ -370,14 +859,14 @@ main(int nrhs, const char *prhs[])
               if (evaluate)
                 {
                   vector<double> residual = interprete.get_residual();
-                  plhs[1] = mxCreateDoubleMatrix(residual.size()/col_y, col_y, mxREAL);
+                  plhs[1] = mxCreateDoubleMatrix(int(residual.size()/double(col_y)), int(col_y), mxREAL);
                   pind = mxGetPr(plhs[1]);
                   for (i = 0; i < residual.size(); i++)
                     pind[i] = residual[i];
                 }
               else
                 {
-                  plhs[1] = mxCreateDoubleMatrix(row_y, col_y, mxREAL);
+                  plhs[1] = mxCreateDoubleMatrix(int(row_y), int(col_y), mxREAL);
                   pind = mxGetPr(plhs[1]);
                   for (i = 0; i < row_y*col_y; i++)
                     pind[i] = y[i];
@@ -385,7 +874,7 @@ main(int nrhs, const char *prhs[])
             }
           else
             {
-              plhs[1] = mxCreateDoubleMatrix(row_y, col_y, mxREAL);
+              plhs[1] = mxCreateDoubleMatrix(int(row_y), int(col_y), mxREAL);
               pind = mxGetPr(plhs[1]);
               if (evaluate)
                 {
@@ -409,7 +898,7 @@ main(int nrhs, const char *prhs[])
                       jacob_exo_field_number = 1;
                       jacob_exo_det_field_number = 2;
                       jacob_other_endo_field_number = 3;
-                      mwSize dims[1] = {nb_blocks };
+                      mwSize dims[1] = {(mwSize)nb_blocks };
                       plhs[2] = mxCreateStructArray(1, dims, 4, field_names);
                     }
                   else if (!mxIsStruct(block_structur))
@@ -456,15 +945,15 @@ main(int nrhs, const char *prhs[])
                 }
               if (nlhs > 3)
                 {
-                  plhs[3] = mxCreateDoubleMatrix(row_y, col_y, mxREAL);
+                  plhs[3] = mxCreateDoubleMatrix(int(row_y), int(col_y), mxREAL);
                   pind = mxGetPr(plhs[3]);
                   for (i = 0; i < row_y*col_y; i++)
                     pind[i] = y[i];
                   if (nlhs > 4)
                     {
                       mxArray *GlobalTemporaryTerms = interprete.get_Temporary_Terms();
-                      unsigned int nb_temp_terms = mxGetM(GlobalTemporaryTerms);
-                      plhs[4] = mxCreateDoubleMatrix(nb_temp_terms, 1, mxREAL);
+                      size_t nb_temp_terms = mxGetM(GlobalTemporaryTerms);
+                      plhs[4] = mxCreateDoubleMatrix(int(nb_temp_terms), 1, mxREAL);
                       pind = mxGetPr(plhs[4]);
                       double *tt = mxGetPr(GlobalTemporaryTerms);
                       for (i = 0; i < nb_temp_terms; i++)
@@ -486,4 +975,8 @@ main(int nrhs, const char *prhs[])
     mxFree(ya);
   if (direction)
     mxFree(direction);
+#ifdef _MSC_VER_
+  /*fFreeResult =*/ FreeLibrary(hinstLib);
+#endif
+  return;
 }
diff --git a/mex/sources/dynblas.h b/mex/sources/dynblas.h
index 415ca224f..ba159ae12 100644
--- a/mex/sources/dynblas.h
+++ b/mex/sources/dynblas.h
@@ -41,7 +41,7 @@ typedef ptrdiff_t blas_int;
 typedef int blas_int;
 #endif
 
-#if defined(MATLAB_MEX_FILE) && defined(_WIN32)
+#if defined(MATLAB_MEX_FILE) && defined(_WIN32) && !defined(_MSC_VER)
 # define FORTRAN_WRAPPER(x) x
 #else
 # define FORTRAN_WRAPPER(x) x ## _