From 03e487a09293cc8bd5b5dff2e53e346729257509 Mon Sep 17 00:00:00 2001 From: Ferhat Mihoubi Date: Fri, 22 Mar 2013 15:44:34 +0100 Subject: [PATCH] Major update of bytecode: - Iterative linear solvers using CUDA - interpreter.cc decomposed --- mex/build/bytecode.am | 4 +- mex/sources/bytecode/ErrorHandling.hh | 253 +- mex/sources/bytecode/Interpreter.cc | 2436 ++------------ mex/sources/bytecode/Interpreter.hh | 54 +- mex/sources/bytecode/SparseMatrix.cc | 4474 +++++++++++++++++++++---- mex/sources/bytecode/SparseMatrix.hh | 133 +- mex/sources/bytecode/bytecode.cc | 669 +++- mex/sources/dynblas.h | 2 +- 8 files changed, 5104 insertions(+), 2921 deletions(-) diff --git a/mex/build/bytecode.am b/mex/build/bytecode.am index 66a29a3e9..f0563e4f8 100644 --- a/mex/build/bytecode.am +++ b/mex/build/bytecode.am @@ -1,6 +1,6 @@ noinst_PROGRAMS = bytecode -bytecode_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/../../sources/bytecode -I$(top_srcdir)/../../../preprocessor +bytecode_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/../../sources -I$(top_srcdir)/../../sources/bytecode -I$(top_srcdir)/../../../preprocessor TOPDIR = $(top_srcdir)/../../sources/bytecode @@ -9,8 +9,10 @@ nodist_bytecode_SOURCES = \ $(TOPDIR)/Interpreter.cc \ $(TOPDIR)/Mem_Mngr.cc \ $(TOPDIR)/SparseMatrix.cc \ + $(TOPDIR)/Evaluate.cc \ $(TOPDIR)/Interpreter.hh \ $(TOPDIR)/Mem_Mngr.hh \ $(TOPDIR)/SparseMatrix.hh \ + $(TOPDIR)/Evaluate.hh \ $(TOPDIR)/ErrorHandling.hh diff --git a/mex/sources/bytecode/ErrorHandling.hh b/mex/sources/bytecode/ErrorHandling.hh index 9c6717d0e..b09c62deb 100644 --- a/mex/sources/bytecode/ErrorHandling.hh +++ b/mex/sources/bytecode/ErrorHandling.hh @@ -23,24 +23,138 @@ #include #include #include +#include +#define BYTE_CODE #include "CodeInterpreter.hh" #ifdef DEBUG_EX -# include +# include # include "mex_interface.hh" #endif + +#ifdef OCTAVE_MEX_FILE +# define CHAR_LENGTH 1 +#else +# define CHAR_LENGTH 2 +#endif + +#ifdef _MSC_VER +#include +#define M_E 2.71828182845904523536 +#define M_LOG2E 1.44269504088896340736 +#define M_LOG10E 0.434294481903251827651 +#define M_LN2 0.693147180559945309417 +#define M_LN10 2.30258509299404568402 +#define M_PI 3.14159265358979323846 +#define M_PI_2 1.57079632679489661923 +#define M_PI_4 0.785398163397448309616 +#define M_1_PI 0.318309886183790671538 +#define M_2_PI 0.636619772367581343076 +#define M_1_SQRTPI 0.564189583547756286948 +#define M_2_SQRTPI 1.12837916709551257390 +#define M_SQRT2 1.41421356237309504880 +#define M_SQRT_2 0.707106781186547524401 +#define NAN numeric_limits::quiet_NaN() + +#define isnan(x) _isnan(x) +#define isinf(x) (!_finite(x)) +#define fpu_error(x) (isinf(x) || isnan(x)) + + +class MSVCpp_missings +{ + public: + inline double + asinh(double x) const + { + if(x==0.0) + return 0.0; + double ax = abs(x); + return log(x+ax*sqrt(1.+1./(ax*ax))); + } + + inline double + acosh(double x) const + { + if(x==0.0) + return 0.0; + double ax = abs(x); + return log(x+ax*sqrt(1.-1./(ax*ax))); + } + + inline double + atanh(double x) const + { + return log((1+x)/(1-x))/2; + } + + inline double + erf(double x) const + { + const double a1 = -1.26551223, a2 = 1.00002368, + a3 = 0.37409196, a4 = 0.09678418, + a5 = -0.18628806, a6 = 0.27886807, + a7 = -1.13520398, a8 = 1.48851587, + a9 = -0.82215223, a10 = 0.17087277; + double v = 1; + double z = abs(x); + if (z <= 0) + return v; + double t = 1 / (1 + 0.5 * z); + v = t*exp((-z*z) +a1+t*(a2+t*(a3+t*(a4+t*(a5+t*(a6+t*(a7+t*(a8+t*(a9+t*a10))))))))); + if (x < 0) + v = 2 - v; + return 1 - v; + } + + inline double + nearbyint(double x) const + { + return floor(x + 0.5); + } + + inline double + fmax(double x, double y) const + { + if (x > y) + return x; + else + return y; + } + + inline double + fmin(double x, double y) const + { + if (x < y) + return x; + else + return y; + } + +}; +#endif + + //#define DEBUG using namespace std; const int NO_ERROR_ON_EXIT = 0; const int ERROR_ON_EXIT = 1; + typedef vector > code_liste_type; typedef code_liste_type::const_iterator it_code_type; + class GeneralExceptionHandling { string ErrorMsg; public: +#ifdef _MSC_VER_ + ~GeneralExceptionHandling() + { + FreeLibrary(hinstLib); + }; +#endif GeneralExceptionHandling(string ErrorMsg_arg) : ErrorMsg(ErrorMsg_arg) { }; @@ -121,6 +235,16 @@ public: }; }; +class UserExceptionHandling : public GeneralExceptionHandling +{ + double value; +public: + UserExceptionHandling() : GeneralExceptionHandling("Fatal error in bytecode:") + { + completeErrorMsg(" User break\n"); + }; +}; + class FatalExceptionHandling : public GeneralExceptionHandling { public: @@ -133,16 +257,40 @@ public: }; }; -class ErrorMsg +struct s_plan { + string var, exo; + int var_num, exo_num; + vector > per_value; +}; + +#ifdef MATLAB_MEX_FILE +extern "C" bool utIsInterruptPending(); +#else +#include +#include +#endif + +#ifdef _MSC_VER +class ErrorMsg : public MSVCpp_missings +#else +class ErrorMsg +#endif +{ +private: + bool is_load_variable_list; + public: + double *y, *ya; + int y_size; double *T; - int nb_row_xd, nb_row_x, y_size; + int nb_row_xd, nb_row_x; int y_kmin, y_kmax, periods; double *x, *params; - double *u, *y, *ya; + double *u; double *steady_y, *steady_x; - double *g2, *g1, *r; + double *g2, *g1, *r, *res; + vector splan, spfplan; vector jacobian_block, jacobian_other_endo_block, jacobian_exo_block, jacobian_det_exo_block; map TEF; map, double > TEFD; @@ -150,11 +298,12 @@ public: ExpressionType EQN_type; it_code_type it_code_expr; - unsigned int nb_endo, nb_exo, nb_param; + /*unsigned int*/size_t nb_endo, nb_exo, nb_param; char *P_endo_names, *P_exo_names, *P_param_names; - unsigned int endo_name_length, exo_name_length, param_name_length; + size_t/*unsigned int*/ endo_name_length, exo_name_length, param_name_length; unsigned int EQN_equation, EQN_block, EQN_block_number; unsigned int EQN_dvar1, EQN_dvar2, EQN_dvar3; + vector > > Variable_list; inline ErrorMsg() @@ -169,6 +318,7 @@ public: nb_param = mxGetM(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "param_names"))); param_name_length = mxGetN(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "param_names"))); P_param_names = (char *) mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "param_names"))); + is_load_variable_list = false; } inline string @@ -184,9 +334,9 @@ public: else { if (str[i] == '$') - pos1 = temp.length(); + pos1 = int(temp.length()); else - pos2 = temp.length(); + pos2 = int(temp.length()); if (pos1 >= 0 && pos2 >= 0) { tmp_n.erase(pos1, pos2-pos1+1); @@ -199,6 +349,50 @@ public: return temp; } + inline void + load_variable_list() + { + ostringstream res; + for (unsigned int variable_num = 0; variable_num < (unsigned int)nb_endo; variable_num++) + { + for (unsigned int i = 0; i < endo_name_length; i++) + if (P_endo_names[CHAR_LENGTH*(variable_num+i*nb_endo)] != ' ') + res << P_endo_names[CHAR_LENGTH*(variable_num+i*nb_endo)]; + Variable_list.push_back(make_pair(res.str(), make_pair(eEndogenous, variable_num))); + } + for (unsigned int variable_num = 0; variable_num < (unsigned int)nb_exo; variable_num++) + { + for (unsigned int i = 0; i < exo_name_length; i++) + if (P_exo_names[CHAR_LENGTH*(variable_num+i*nb_exo)] != ' ') + res << P_exo_names[CHAR_LENGTH*(variable_num+i*nb_exo)]; + Variable_list.push_back(make_pair(res.str(), make_pair(eExogenous, variable_num))); + } + } + + inline int + get_ID(const string variable_name, SymbolType *variable_type) + { + if (!is_load_variable_list) + { + load_variable_list(); + is_load_variable_list = true; + } + size_t n = Variable_list.size(); + int i = 0; + bool notfound = true; + while (notfound && i < n) + { + if (variable_name == Variable_list[i].first) + { + notfound = false; + *variable_type = Variable_list[i].second.first; + return Variable_list[i].second.second; + } + i++; + } + return(-1); + } + inline string get_variable(const SymbolType variable_type, const unsigned int variable_num) const { @@ -293,7 +487,6 @@ public: break; default: return ("???"); - break; } else switch (EQN_type) @@ -342,7 +535,6 @@ public: break; default: return ("???"); - break; } it_code_type it_code_ret; Error_loc << endl << add_underscore_to_fpe(" " + print_expression(it_code_expr, evaluate, size, block_num, steady_state, Per_u_, it_, it_code_ret, true)); @@ -378,6 +570,12 @@ public: while (go_on) { +#ifdef OCTAVE_MEX_FILE + OCTAVE_QUIT; +#else + if ( utIsInterruptPending() ) + throw UserExceptionHandling(); +#endif switch (it_code->first) { case FNUMEXPR: @@ -441,7 +639,9 @@ public: case eParameter: var = ((FLDV_ *) it_code->second)->get_pos(); #ifdef DEBUG - mexPrintf("FLDV_ Param var=%d", var); + mexPrintf("FLDV_ Param var=%d\n", var); + mexPrintf("get_variable(eParameter, var)=%s\n",get_variable(eParameter, var).c_str()); + mexEvalString("drawnow;"); #endif Stack.push(get_variable(eParameter, var)); if (compute) @@ -451,7 +651,10 @@ public: var = ((FLDV_ *) it_code->second)->get_pos(); lag = ((FLDV_ *) it_code->second)->get_lead_lag(); #ifdef DEBUG - mexPrintf("FLDV_ endo var=%d, lag=%d", var, lag); + mexPrintf("FLDV_ endo var=%d, lag=%d\n", var, lag); + mexPrintf("get_variable(eEndogenous, var)=%s, compute=%d\n",get_variable(eEndogenous, var).c_str(), compute); + mexPrintf("it_=%d, lag=%d, y_size=%d, var=%d, y=%x\n", it_, lag, y_size, var, y); + mexEvalString("drawnow;"); #endif tmp_out.str(""); if (lag > 0) @@ -1250,7 +1453,7 @@ public: Stack.pop(); if (compute) { - int derivOrder = nearbyint(Stackf.top()); + int derivOrder = int(nearbyint(Stackf.top())); Stackf.pop(); if (fabs(v1f) < NEAR_ZERO && v2f > 0 && derivOrder > v2f @@ -1570,7 +1773,11 @@ public: } tmp_out.str(""); tmp_out << function_name << "("; +#ifndef _MSC_VER string ss[nb_input_arguments]; +#else + vector ss(nb_input_arguments); +#endif for (unsigned int i = 0; i < nb_input_arguments; i++) { ss[nb_input_arguments-i-1] = Stack.top(); @@ -1624,7 +1831,11 @@ public: tmp_out.str(""); tmp_out << function_name << "("; tmp_out << arg_func_name.c_str() << ", " << fc->get_row() << ", {"; +#ifndef _MSC_VER string ss[nb_add_input_arguments]; +#else + vector ss(nb_input_arguments); +#endif for (unsigned int i = 0; i < nb_add_input_arguments; i++) { ss[nb_add_input_arguments-i-1] = Stack.top(); @@ -1655,7 +1866,11 @@ public: } tmp_out.str(""); tmp_out << function_name << "("; +#ifndef _MSC_VER string ss[nb_input_arguments]; +#else + vector ss(nb_input_arguments); +#endif for (unsigned int i = 0; i < nb_input_arguments; i++) { ss[nb_input_arguments-i-1] = Stack.top(); @@ -1708,7 +1923,11 @@ public: tmp_out.str(""); tmp_out << function_name << "("; tmp_out << arg_func_name.c_str() << ", " << fc->get_row() << ", " << fc->get_col() << ", {"; +#ifndef _MSC_VER string ss[nb_add_input_arguments]; +#else + vector ss(nb_input_arguments); +#endif for (unsigned int i = 0; i < nb_add_input_arguments; i++) { ss[nb_add_input_arguments-i-1] = Stack.top(); @@ -1739,7 +1958,11 @@ public: } tmp_out.str(""); tmp_out << function_name << "("; +#ifndef _MSC_VER string ss[nb_input_arguments]; +#else + vector ss(nb_input_arguments); +#endif for (unsigned int i = 0; i < nb_input_arguments; i++) { ss[nb_input_arguments-i-1] = Stack.top(); @@ -1965,7 +2188,7 @@ public: it_code++; } #ifdef DEBUG - mexPrintf("print_expression end\n"); mexEvalString("drawnow;"); + mexPrintf("print_expression end tmp_out.str().c_str()=%s\n", tmp_out.str().c_str()); mexEvalString("drawnow;"); #endif it_code_ret = it_code; return (tmp_out.str()); diff --git a/mex/sources/bytecode/Interpreter.cc b/mex/sources/bytecode/Interpreter.cc index 1587e2d45..c4b677c82 100644 --- a/mex/sources/bytecode/Interpreter.cc +++ b/mex/sources/bytecode/Interpreter.cc @@ -22,18 +22,28 @@ #include "Interpreter.hh" #define BIG 1.0e+8; #define SMALL 1.0e-5; -//#define DEBUG +///#define DEBUG Interpreter::~Interpreter() { } Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, double *x_arg, double *steady_y_arg, double *steady_x_arg, - double *direction_arg, int y_size_arg, - int nb_row_x_arg, int nb_row_xd_arg, int periods_arg, int y_kmin_arg, int y_kmax_arg, - int maxit_arg_, double solve_tolf_arg, int size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg, + double *direction_arg, size_t y_size_arg, + size_t nb_row_x_arg, size_t nb_row_xd_arg, int periods_arg, int y_kmin_arg, int y_kmax_arg, + int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg, string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg, - bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg) + bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg, + bool steady_state_arg, bool print_it_arg +#ifdef CUDA + , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg +#endif + ) + : dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg +#ifdef CUDA + , CUDA_device_arg, cublas_handle_arg, cusparse_handle_arg, descr_arg +#endif + ) { params = params_arg; y = y_arg; @@ -42,12 +52,12 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub steady_y = steady_y_arg; steady_x = steady_x_arg; direction = direction_arg; - y_size = y_size_arg; + //y_size = y_size_arg; nb_row_x = nb_row_x_arg; nb_row_xd = nb_row_xd_arg; periods = periods_arg; - y_kmax = y_kmax_arg; - y_kmin = y_kmin_arg; + //y_kmax = y_kmax_arg; + //y_kmin = y_kmin_arg; maxit_ = maxit_arg_; solve_tolf = solve_tolf_arg; size_of_direction = size_of_direction_arg; @@ -57,7 +67,6 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub markowitz_c = markowitz_c_arg; filename = filename_arg; T = NULL; - error_not_printed = true; minimal_solving_periods = minimal_solving_periods_arg; stack_solve_algo = stack_solve_algo_arg; solve_algo = solve_algo_arg; @@ -65,1393 +74,15 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub print = print_arg; GlobalTemporaryTerms = GlobalTemporaryTerms_arg; print_error = print_error_arg; -} + //steady_state = steady_state_arg; + //print_it = print_it_arg; -double -Interpreter::pow1(double a, double b) -{ - double r = pow_(a, b); - if (isnan(r) || isinf(r)) - { - res1 = NAN; - r = 0.0000000000000000000000001; - if (print_error) - throw PowExceptionHandling(a, b); - } - return r; -} - -double -Interpreter::divide(double a, double b) -{ - double r = a / b; - if (isnan(r) || isinf(r)) - { - res1 = NAN; - r = 1e70; - if (print_error) - throw DivideExceptionHandling(a, b); - } - return r; -} - -double -Interpreter::log1(double a) -{ - double r = log(a); - if (isnan(r) || isinf(r)) - { - res1 = NAN; - r = -1e70; - if (print_error) - throw LogExceptionHandling(a); - } - return r; -} - -double -Interpreter::log10_1(double a) -{ - double r = log(a); - if (isnan(r) || isinf(r)) - { - res1 = NAN; - r = -1e70; - if (print_error) - throw Log10ExceptionHandling(a); - } - return r; } void -Interpreter::compute_block_time(int Per_u_, bool evaluate, int block_num, int size, bool steady_state) -{ - int var = 0, lag = 0, op; - unsigned int eq, pos_col; - ostringstream tmp_out; - double v1, v2, v3; - bool go_on = true; - double ll; - double rr; - double *jacob = NULL, *jacob_other_endo = NULL, *jacob_exo = NULL, *jacob_exo_det = NULL; - EQN_block = block_num; - stack Stack; - external_function_type function_type = ExternalFunctionWithoutDerivative; - -#ifdef DEBUG - mexPrintf("compute_block_time\n"); -#endif - if (evaluate /*&& !steady_state*/) - { - jacob = mxGetPr(jacobian_block[block_num]); - if (!steady_state) - { - jacob_other_endo = mxGetPr(jacobian_other_endo_block[block_num]); - jacob_exo = mxGetPr(jacobian_exo_block[block_num]); - jacob_exo_det = mxGetPr(jacobian_det_exo_block[block_num]); - } - } - - while (go_on) - { - switch (it_code->first) - { - case FNUMEXPR: -#ifdef DEBUG - mexPrintf("FNUMEXPR\n"); -#endif - it_code_expr = it_code; - switch (((FNUMEXPR_ *) it_code->second)->get_expression_type()) - { - case TemporaryTerm: -#ifdef DEBUG - mexPrintf("TemporaryTerm\n"); -#endif - EQN_type = TemporaryTerm; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); -#ifdef DEBUG - mexPrintf("EQN_equation=%d\n", EQN_equation); mexEvalString("drawnow;"); -#endif - break; - case ModelEquation: -#ifdef DEBUG - mexPrintf("ModelEquation\n"); -#endif - EQN_type = ModelEquation; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - break; - case FirstEndoDerivative: -#ifdef DEBUG - mexPrintf("FirstEndoDerivative\n"); -#endif - EQN_type = FirstEndoDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - break; - case FirstOtherEndoDerivative: -#ifdef DEBUG - mexPrintf("FirstOtherEndoDerivative\n"); -#endif - EQN_type = FirstOtherEndoDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - break; - case FirstExoDerivative: -#ifdef DEBUG - mexPrintf("FirstExoDerivative\n"); -#endif - EQN_type = FirstExoDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - break; - case FirstExodetDerivative: -#ifdef DEBUG - mexPrintf("FirstExodetDerivative\n"); -#endif - EQN_type = FirstExodetDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - break; - case FirstParamDerivative: -#ifdef DEBUG - mexPrintf("FirstParamDerivative\n"); -#endif - EQN_type = FirstParamDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - break; - case SecondEndoDerivative: -#ifdef DEBUG - mexPrintf("SecondEndoDerivative\n"); -#endif - EQN_type = SecondEndoDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2(); - EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2(); - break; - case SecondExoDerivative: -#ifdef DEBUG - mexPrintf("SecondExoDerivative\n"); -#endif - EQN_type = SecondExoDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2(); - EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2(); - break; - case SecondExodetDerivative: -#ifdef DEBUG - mexPrintf("SecondExodetDerivative\n"); -#endif - EQN_type = SecondExodetDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2(); - EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2(); - break; - case SecondParamDerivative: -#ifdef DEBUG - mexPrintf("SecondParamDerivative\n"); -#endif - EQN_type = SecondParamDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2(); - break; - case ThirdEndoDerivative: -#ifdef DEBUG - mexPrintf("ThirdEndoDerivative\n"); -#endif - EQN_type = ThirdEndoDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2(); - EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2(); - EQN_dvar3 = ((FNUMEXPR_ *) it_code->second)->get_dvariable3(); - EQN_lag3 = ((FNUMEXPR_ *) it_code->second)->get_lag3(); - break; - case ThirdExoDerivative: -#ifdef DEBUG - mexPrintf("ThirdExoDerivative\n"); -#endif - EQN_type = ThirdExoDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2(); - EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2(); - EQN_dvar3 = ((FNUMEXPR_ *) it_code->second)->get_dvariable3(); - EQN_lag3 = ((FNUMEXPR_ *) it_code->second)->get_lag3(); - break; - case ThirdExodetDerivative: -#ifdef DEBUG - mexPrintf("ThirdExodetDerivative\n"); -#endif - EQN_type = ThirdExodetDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_lag1 = ((FNUMEXPR_ *) it_code->second)->get_lag1(); - EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2(); - EQN_lag2 = ((FNUMEXPR_ *) it_code->second)->get_lag2(); - EQN_dvar3 = ((FNUMEXPR_ *) it_code->second)->get_dvariable3(); - EQN_lag3 = ((FNUMEXPR_ *) it_code->second)->get_lag3(); - break; - case ThirdParamDerivative: -#ifdef DEBUG - mexPrintf("ThirdParamDerivative\n"); -#endif - EQN_type = ThirdParamDerivative; - EQN_equation = ((FNUMEXPR_ *) it_code->second)->get_equation(); - EQN_dvar1 = ((FNUMEXPR_ *) it_code->second)->get_dvariable1(); - EQN_dvar2 = ((FNUMEXPR_ *) it_code->second)->get_dvariable2(); - EQN_dvar3 = ((FNUMEXPR_ *) it_code->second)->get_dvariable3(); - break; - } - break; - case FLDV: - //load a variable in the processor - switch (((FLDV_ *) it_code->second)->get_type()) - { - case eParameter: - var = ((FLDV_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDV Param[var=%d]\n", var); - tmp_out << " params[" << var << "](" << params[var] << ")"; -#endif - Stack.push(params[var]); - break; - case eEndogenous: - var = ((FLDV_ *) it_code->second)->get_pos(); - lag = ((FLDV_ *) it_code->second)->get_lead_lag(); -#ifdef DEBUG - mexPrintf("FLDV y[var=%d, lag=%d, it_=%d], y_size=%d evaluate=%d\n", var, lag, it_, y_size, evaluate); -#endif - if (evaluate) - Stack.push(ya[(it_+lag)*y_size+var]); - else - Stack.push(y[(it_+lag)*y_size+var]); -#ifdef DEBUG - tmp_out << " y[" << it_+lag << ", " << var << "](" << y[(it_+lag)*y_size+var] << ")"; -#endif - break; - case eExogenous: - var = ((FLDV_ *) it_code->second)->get_pos(); - lag = ((FLDV_ *) it_code->second)->get_lead_lag(); -#ifdef DEBUG - mexPrintf("FLDV x[var=%d, lag=%d, it_=%d], nb_row_x=%d evaluate=%d\n", var, lag, it_, nb_row_x, evaluate); - tmp_out << " x[" << it_+lag << ", " << var << "](" << x[it_+lag+var*nb_row_x] << ")"; -#endif - Stack.push(x[it_+lag+var*nb_row_x]); - break; - case eExogenousDet: - var = ((FLDV_ *) it_code->second)->get_pos(); - lag = ((FLDV_ *) it_code->second)->get_lead_lag(); - Stack.push(x[it_+lag+var*nb_row_xd]); - break; - case eModelLocalVariable: -#ifdef DEBUG - mexPrintf("FLDV a local variable in Block %d Stack.size()=%d", block_num, Stack.size()); - mexPrintf(" value=%f\n", Stack.top()); -#endif - break; - default: - mexPrintf("FLDV: Unknown variable type\n"); - } - break; - case FLDSV: - //load a variable in the processor - switch (((FLDSV_ *) it_code->second)->get_type()) - { - case eParameter: - var = ((FLDSV_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDSV Param[var=%d]=%f\n", var, params[var]); - tmp_out << " params[" << var << "](" << params[var] << ")"; -#endif - Stack.push(params[var]); - break; - case eEndogenous: - var = ((FLDSV_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDSV y[var=%d]=%f\n", var, ya[var]); - tmp_out << " y[" << var << "](" << y[var] << ")"; -#endif - if (evaluate) - Stack.push(ya[var]); - else - Stack.push(y[var]); - break; - case eExogenous: - var = ((FLDSV_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDSV x[var=%d]\n", var); - tmp_out << " x[" << var << "](" << x[var] << ")"; -#endif - Stack.push(x[var]); - break; - case eExogenousDet: - var = ((FLDSV_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDSV xd[var=%d]\n", var); -#endif - Stack.push(x[var]); - break; - case eModelLocalVariable: -#ifdef DEBUG - mexPrintf("FLDSV a local variable in Block %d Stack.size()=%d", block_num, Stack.size()); - mexPrintf(" value=%f\n", Stack.top()); -#endif - break; - default: - mexPrintf("FLDSV: Unknown variable type\n"); - } - break; - case FLDVS: - //load a variable in the processor - switch (((FLDVS_ *) it_code->second)->get_type()) - { - case eParameter: - var = ((FLDVS_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("params[%d]\n", var); -#endif - Stack.push(params[var]); - break; - case eEndogenous: - var = ((FLDVS_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDVS steady_y[%d]\n", var); -#endif - Stack.push(steady_y[var]); - break; - case eExogenous: - var = ((FLDVS_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDVS x[%d] \n", var); -#endif - Stack.push(x[var]); - break; - case eExogenousDet: - var = ((FLDVS_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDVS xd[%d]\n", var); -#endif - Stack.push(x[var]); - break; - case eModelLocalVariable: -#ifdef DEBUG - mexPrintf("FLDVS a local variable in Block %d Stack.size()=%d", block_num, Stack.size()); - mexPrintf(" value=%f\n", Stack.top()); -#endif - break; - default: - mexPrintf("FLDVS: Unknown variable type\n"); - } - break; - case FLDT: - //load a temporary variable in the processor - var = ((FLDT_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("T[it_=%d var=%d, y_kmin=%d, y_kmax=%d == %d]=>%f\n", it_, var, y_kmin, y_kmax, var*(periods+y_kmin+y_kmax)+it_, var); - tmp_out << " T[" << it_ << ", " << var << "](" << T[var*(periods+y_kmin+y_kmax)+it_] << ")"; -#endif - Stack.push(T[var*(periods+y_kmin+y_kmax)+it_]); - break; - case FLDST: - //load a temporary variable in the processor - var = ((FLDST_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDST T[%d]", var); -#endif - Stack.push(T[var]); -#ifdef DEBUG - mexPrintf("=%f\n", T[var]); - tmp_out << " T[" << var << "](" << T[var] << ")"; -#endif - break; - case FLDU: - //load u variable in the processor - var = ((FLDU_ *) it_code->second)->get_pos(); - var += Per_u_; -#ifdef DEBUG - mexPrintf("FLDU u[%d]\n", var); - tmp_out << " u[" << var << "](" << u[var] << ")"; -#endif - Stack.push(u[var]); - break; - case FLDSU: - //load u variable in the processor - var = ((FLDSU_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDSU u[%d]\n", var); - tmp_out << " u[" << var << "](" << u[var] << ")"; -#endif - Stack.push(u[var]); - break; - case FLDR: - //load u variable in the processor - var = ((FLDR_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FLDR r[%d]\n", var); -#endif - Stack.push(r[var]); - break; - case FLDZ: - //load 0 in the processor -#ifdef DEBUG - mexPrintf("FLDZ\n"); -#endif - Stack.push(0.0); -#ifdef DEBUG - tmp_out << " 0"; -#endif - break; - case FLDC: - //load a numerical constant in the processor - ll = ((FLDC_ *) it_code->second)->get_value(); -#ifdef DEBUG - mexPrintf("FLDC = %f\n", ll); - tmp_out << " " << ll; -#endif - - Stack.push(ll); - break; - case FSTPV: - //load a variable in the processor - switch (((FSTPV_ *) it_code->second)->get_type()) - { - case eParameter: - var = ((FSTPV_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("FSTPV params[%d]\n", var); -#endif - params[var] = Stack.top(); - Stack.pop(); - break; - case eEndogenous: - var = ((FSTPV_ *) it_code->second)->get_pos(); - lag = ((FSTPV_ *) it_code->second)->get_lead_lag(); - y[(it_+lag)*y_size+var] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" y[%d, %d](%f)=%s\n", it_+lag, var, y[(it_+lag)*y_size+var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - case eExogenous: - var = ((FSTPV_ *) it_code->second)->get_pos(); - lag = ((FSTPV_ *) it_code->second)->get_lead_lag(); - x[it_+lag+var*nb_row_x] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" x[%d, %d](%f)=%s\n", it_+lag, var, x[it_+lag+var*nb_row_x], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - - Stack.pop(); - break; - case eExogenousDet: - var = ((FSTPV_ *) it_code->second)->get_pos(); - lag = ((FSTPV_ *) it_code->second)->get_lead_lag(); - x[it_+lag+var*nb_row_xd] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" x[%d, %d](%f)=%s\n", it_+lag, var, x[it_+lag+var*nb_row_xd], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - default: - mexPrintf("FSTPV: Unknown variable type\n"); - } - break; - case FSTPSV: - //load a variable in the processor - switch (((FSTPSV_ *) it_code->second)->get_type()) - { - case eParameter: - var = ((FSTPSV_ *) it_code->second)->get_pos(); - params[var] = Stack.top(); - Stack.pop(); - break; - case eEndogenous: - var = ((FSTPSV_ *) it_code->second)->get_pos(); - y[var] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" y[%d](%f)=%s\n", var, y[var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - case eExogenous: - case eExogenousDet: - var = ((FSTPSV_ *) it_code->second)->get_pos(); - x[var] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" x[%d, %d](%f)=%s\n", it_+lag, var, x[var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - default: - mexPrintf("FSTPSV: Unknown variable type\n"); - } - break; - case FSTPT: - //store in a temporary variable from the processor -#ifdef DEBUG - mexPrintf("FSTPT\n"); -#endif - var = ((FSTPT_ *) it_code->second)->get_pos(); - T[var*(periods+y_kmin+y_kmax)+it_] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" T[%d, %d](%f)=%s\n", it_, var, T[var*(periods+y_kmin+y_kmax)+it_], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - - Stack.pop(); - break; - case FSTPST: - //store in a temporary variable from the processor -#ifdef DEBUG - mexPrintf("FSTPST\n"); -#endif - var = ((FSTPST_ *) it_code->second)->get_pos(); -#ifdef DEBUG - mexPrintf("var=%d\n", var); -#endif - T[var] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" T[%d](%f)=%s\n", var, T[var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - case FSTPU: - //store in u variable from the processor - var = ((FSTPU_ *) it_code->second)->get_pos(); - var += Per_u_; -#ifdef DEBUG - mexPrintf("FSTPU\n"); - mexPrintf("var=%d\n", var); -#endif - u[var] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" u[%d](%f)=%s\n", var, u[var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - case FSTPSU: - //store in u variable from the processor - var = ((FSTPSU_ *) it_code->second)->get_pos(); -#ifdef DEBUG - if (var >= u_count_alloc || var < 0) - mexPrintf("Erreur var=%d\n", var); -#endif - u[var] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" u[%d](%f)=%s\n", var, u[var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - case FSTPR: - //store in residual variable from the processor - var = ((FSTPR_ *) it_code->second)->get_pos(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf("FSTPR r[%d]", var); - tmp_out.str(""); -#endif - r[var] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf("(%f)=%s\n", r[var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - case FSTPG: - //store in derivative (g) variable from the processor -#ifdef DEBUG - mexPrintf("FSTPG\n"); - mexEvalString("drawnow;"); -#endif - var = ((FSTPG_ *) it_code->second)->get_pos(); - g1[var] = Stack.top(); -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" g1[%d](%f)=%s\n", var, g1[var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - - case FSTPG2: - //store in the jacobian matrix - rr = Stack.top(); - if (EQN_type != FirstEndoDerivative) - { - ostringstream tmp; - tmp << " in compute_block_time, impossible case " << EQN_type << " not implement in static jacobian\n"; - throw FatalExceptionHandling(tmp.str()); - } - eq = ((FSTPG2_ *) it_code->second)->get_row(); - var = ((FSTPG2_ *) it_code->second)->get_col(); -#ifdef DEBUG - mexPrintf("FSTPG2 eq=%d, var=%d\n", eq, var); - mexEvalString("drawnow;"); -#endif - jacob[eq + size*var] = rr; - break; - case FSTPG3: - //store in derivative (g) variable from the processor -#ifdef DEBUG - mexPrintf("FSTPG3\n"); - mexEvalString("drawnow;"); -#endif - rr = Stack.top(); - switch (EQN_type) - { - case FirstEndoDerivative: - eq = ((FSTPG3_ *) it_code->second)->get_row(); - var = ((FSTPG3_ *) it_code->second)->get_col(); - lag = ((FSTPG3_ *) it_code->second)->get_lag(); - pos_col = ((FSTPG3_ *) it_code->second)->get_col_pos(); -#ifdef DEBUG - mexPrintf("Endo eq=%d, pos_col=%d, size=%d\n", eq, pos_col, size); -#endif - jacob[eq + size*pos_col] = rr; - break; - case FirstOtherEndoDerivative: - //eq = ((FSTPG3_ *) it_code->second)->get_row(); - eq = EQN_equation; - var = ((FSTPG3_ *) it_code->second)->get_col(); - lag = ((FSTPG3_ *) it_code->second)->get_lag(); - pos_col = ((FSTPG3_ *) it_code->second)->get_col_pos(); - jacob_other_endo[eq + size*pos_col] = rr; - break; - case FirstExoDerivative: - //eq = ((FSTPG3_ *) it_code->second)->get_row(); - eq = EQN_equation; - var = ((FSTPG3_ *) it_code->second)->get_col(); - lag = ((FSTPG3_ *) it_code->second)->get_lag(); - pos_col = ((FSTPG3_ *) it_code->second)->get_col_pos(); -#ifdef DEBUG - mexPrintf("Exo eq=%d, pos_col=%d, size=%d\n", eq, pos_col, size); -#endif - jacob_exo[eq + size*pos_col] = rr; - break; - case FirstExodetDerivative: - //eq = ((FSTPG3_ *) it_code->second)->get_row(); - eq = EQN_equation; - var = ((FSTPG3_ *) it_code->second)->get_col(); - lag = ((FSTPG3_ *) it_code->second)->get_lag(); - pos_col = ((FSTPG3_ *) it_code->second)->get_col_pos(); - jacob_exo_det[eq + size*pos_col] = rr; - break; - default: - ostringstream tmp; - tmp << " in compute_block_time, variable " << EQN_type << " not used yet\n"; - throw FatalExceptionHandling(tmp.str()); - } -#ifdef DEBUG - tmp_out << "=>"; - mexPrintf(" g1[%d](%f)=%s\n", var, g1[var], tmp_out.str().c_str()); - tmp_out.str(""); -#endif - Stack.pop(); - break; - - case FBINARY: - op = ((FBINARY_ *) it_code->second)->get_op_type(); -#ifdef DEBUG - mexPrintf("FBINARY, op=%d\n", op); -#endif - v2 = Stack.top(); - Stack.pop(); - v1 = Stack.top(); - Stack.pop(); - switch (op) - { - case oPlus: - Stack.push(v1 + v2); -#ifdef DEBUG - tmp_out << " |" << v1 << "+" << v2 << "|"; -#endif - break; - case oMinus: - Stack.push(v1 - v2); -#ifdef DEBUG - tmp_out << " |" << v1 << "-" << v2 << "|"; -#endif - break; - case oTimes: - Stack.push(v1 * v2); -#ifdef DEBUG - tmp_out << " |" << v1 << "*" << v2 << "|"; -#endif - break; - case oDivide: - double tmp; -#ifdef DEBUG - mexPrintf("v1=%f / v2=%f\n", v1, v2); -#endif - try - { - tmp = divide(v1, v2); - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str()); - go_on = false; - } - Stack.push(tmp); -#ifdef DEBUG - tmp_out << " |" << v1 << "/" << v2 << "|"; -#endif - break; - case oLess: - Stack.push(double (v1 < v2)); -#ifdef DEBUG - tmp_out << " |" << v1 << "<" << v2 << "|"; -#endif - break; - case oGreater: - Stack.push(double (v1 > v2)); -#ifdef DEBUG - tmp_out << " |" << v1 << ">" << v2 << "|"; -#endif - break; - case oLessEqual: - Stack.push(double (v1 <= v2)); -#ifdef DEBUG - tmp_out << " |" << v1 << "<=" << v2 << "|"; -#endif - break; - case oGreaterEqual: - Stack.push(double (v1 >= v2)); -#ifdef DEBUG - tmp_out << " |" << v1 << ">=" << v2 << "|"; -#endif - break; - case oEqualEqual: - Stack.push(double (v1 == v2)); -#ifdef DEBUG - tmp_out << " |" << v1 << "==" << v2 << "|"; -#endif - break; - case oDifferent: - Stack.push(double (v1 != v2)); -#ifdef DEBUG - tmp_out << " |" << v1 << "!=" << v2 << "|"; -#endif - break; - case oPower: -#ifdef DEBUG - mexPrintf("pow\n"); -#endif - try - { - tmp = pow1(v1, v2); - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str()); - go_on = false; - } - Stack.push(tmp); - -#ifdef DEBUG - tmp_out << " |" << v1 << "^" << v2 << "|"; -#endif - break; - case oPowerDeriv: - { - int derivOrder = nearbyint(Stack.top()); - Stack.pop(); - try - { - if (fabs(v1) < NEAR_ZERO && v2 > 0 - && derivOrder > v2 - && fabs(v2-nearbyint(v2)) < NEAR_ZERO) - Stack.push(0.0); - else - { - double dxp = pow1(v1, v2-derivOrder); - for (int i = 0; i < derivOrder; i++) - dxp *= v2--; - Stack.push(dxp); - } - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str()); - go_on = false; - } - } - -#ifdef DEBUG - tmp_out << " |PowerDeriv(" << v1 << ", " << v2 << ")|"; -#endif - break; - case oMax: - Stack.push(max(v1, v2)); -#ifdef DEBUG - tmp_out << " |max(" << v1 << "," << v2 << ")|"; -#endif - break; - case oMin: - Stack.push(min(v1, v2)); -#ifdef DEBUG - tmp_out << " |min(" << v1 << "," << v2 << ")|"; -#endif - break; - case oEqual: - // Nothing to do - break; - default: - { - mexPrintf("Error\n"); - ostringstream tmp; - tmp << " in compute_block_time, unknown binary operator " << op << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - break; - case FUNARY: - op = ((FUNARY_ *) it_code->second)->get_op_type(); - v1 = Stack.top(); - Stack.pop(); -#ifdef DEBUG - mexPrintf("FUNARY, op=%d\n", op); -#endif - switch (op) - { - case oUminus: - Stack.push(-v1); -#ifdef DEBUG - tmp_out << " |-(" << v1 << ")|"; -#endif - - break; - case oExp: - Stack.push(exp(v1)); -#ifdef DEBUG - tmp_out << " |exp(" << v1 << ")|"; -#endif - break; - case oLog: - double tmp; - try - { - tmp = log1(v1); - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str()); - go_on = false; - } - Stack.push(tmp); - //if (isnan(res1)) - -#ifdef DEBUG - tmp_out << " |log(" << v1 << ")|"; -#endif - break; - case oLog10: - try - { - tmp = log10_1(v1); - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s %s\n", fpeh.GetErrorMsg().c_str(), error_location(evaluate, steady_state, size, block_num, it_, Per_u_).c_str()); - go_on = false; - } - Stack.push(tmp); -#ifdef DEBUG - tmp_out << " |log10(" << v1 << ")|"; -#endif - break; - case oCos: - Stack.push(cos(v1)); -#ifdef DEBUG - tmp_out << " |cos(" << v1 << ")|"; -#endif - break; - case oSin: - Stack.push(sin(v1)); -#ifdef DEBUG - tmp_out << " |sin(" << v1 << ")|"; -#endif - break; - case oTan: - Stack.push(tan(v1)); -#ifdef DEBUG - tmp_out << " |tan(" << v1 << ")|"; -#endif - break; - case oAcos: - Stack.push(acos(v1)); -#ifdef DEBUG - tmp_out << " |acos(" << v1 << ")|"; -#endif - break; - case oAsin: - Stack.push(asin(v1)); -#ifdef DEBUG - tmp_out << " |asin(" << v1 << ")|"; -#endif - break; - case oAtan: - Stack.push(atan(v1)); -#ifdef DEBUG - tmp_out << " |atan(" << v1 << ")|"; -#endif - break; - case oCosh: - Stack.push(cosh(v1)); -#ifdef DEBUG - tmp_out << " |cosh(" << v1 << ")|"; -#endif - break; - case oSinh: - Stack.push(sinh(v1)); -#ifdef DEBUG - tmp_out << " |sinh(" << v1 << ")|"; -#endif - break; - case oTanh: - Stack.push(tanh(v1)); -#ifdef DEBUG - tmp_out << " |tanh(" << v1 << ")|"; -#endif - break; - case oAcosh: - Stack.push(acosh(v1)); -#ifdef DEBUG - tmp_out << " |acosh(" << v1 << ")|"; -#endif - break; - case oAsinh: - Stack.push(asinh(v1)); -#ifdef DEBUG - tmp_out << " |asinh(" << v1 << ")|"; -#endif - break; - case oAtanh: - Stack.push(atanh(v1)); -#ifdef DEBUG - tmp_out << " |atanh(" << v1 << ")|"; -#endif - break; - case oSqrt: - Stack.push(sqrt(v1)); -#ifdef DEBUG - tmp_out << " |sqrt(" << v1 << ")|"; -#endif - break; - case oErf: - Stack.push(erf(v1)); -#ifdef DEBUG - tmp_out << " |erf(" << v1 << ")|"; - -#endif - break; - default: - { - mexPrintf("Error\n"); - ostringstream tmp; - tmp << " in compute_block_time, unknown unary operator " << op << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - break; - case FTRINARY: - op = ((FTRINARY_ *) it_code->second)->get_op_type(); - v3 = Stack.top(); - Stack.pop(); - v2 = Stack.top(); - Stack.pop(); - v1 = Stack.top(); - Stack.pop(); - switch (op) - { - case oNormcdf: - Stack.push(0.5*(1+erf((v1-v2)/v3/M_SQRT2))); -#ifdef DEBUG - tmp_out << " |normcdf(" << v1 << ", " << v2 << ", " << v3 << ")|"; -#endif - break; - case oNormpdf: - Stack.push(1/(v3*sqrt(2*M_PI)*exp(pow((v1-v2)/v3, 2)/2))); -#ifdef DEBUG - tmp_out << " |normpdf(" << v1 << ", " << v2 << ", " << v3 << ")|"; -#endif - break; - default: - { - mexPrintf("Error\n"); - ostringstream tmp; - tmp << " in compute_block_time, unknown trinary operator " << op << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - break; - - case FPUSH: - break; - - case FCALL: - { -#ifdef DEBUG - mexPrintf("------------------------------\n"); - mexPrintf("CALL "); mexEvalString("drawnow;"); -#endif - FCALL_ *fc = (FCALL_ *) it_code->second; - string function_name = fc->get_function_name(); -#ifdef DEBUG - mexPrintf("function_name=%s ", function_name.c_str()); mexEvalString("drawnow;"); -#endif - unsigned int nb_input_arguments = fc->get_nb_input_arguments(); -#ifdef DEBUG - mexPrintf("nb_input_arguments=%d ", nb_input_arguments); mexEvalString("drawnow;"); -#endif - unsigned int nb_output_arguments = fc->get_nb_output_arguments(); -#ifdef DEBUG - mexPrintf("nb_output_arguments=%d\n", nb_output_arguments); mexEvalString("drawnow;"); -#endif - - mxArray *output_arguments[3]; - string arg_func_name = fc->get_arg_func_name(); -#ifdef DEBUG - mexPrintf("arg_func_name.length() = %d\n", arg_func_name.length()); - mexPrintf("arg_func_name.c_str() = %s\n", arg_func_name.c_str()); -#endif - unsigned int nb_add_input_arguments = fc->get_nb_add_input_arguments(); - function_type = fc->get_function_type(); -#ifdef DEBUG - mexPrintf("function_type=%d ExternalFunctionWithoutDerivative=%d\n", function_type, ExternalFunctionWithoutDerivative); - mexEvalString("drawnow;"); -#endif - mxArray **input_arguments; - switch (function_type) - { - case ExternalFunctionWithoutDerivative: - case ExternalFunctionWithFirstDerivative: - case ExternalFunctionWithFirstandSecondDerivative: - { - input_arguments = (mxArray **) mxMalloc(nb_input_arguments * sizeof(mxArray *)); -#ifdef DEBUG - mexPrintf("Stack.size()=%d\n", Stack.size()); - mexEvalString("drawnow;"); -#endif - for (unsigned int i = 0; i < nb_input_arguments; i++) - { - mxArray *vv = mxCreateDoubleScalar(Stack.top()); - input_arguments[nb_input_arguments - i - 1] = vv; - Stack.pop(); - } - mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str()); - double *rr = mxGetPr(output_arguments[0]); - Stack.push(*rr); - if (function_type == ExternalFunctionWithFirstDerivative || function_type == ExternalFunctionWithFirstandSecondDerivative) - { - unsigned int indx = fc->get_indx(); - double *FD1 = mxGetPr(output_arguments[1]); - unsigned int rows = mxGetN(output_arguments[1]); - for (unsigned int i = 0; i < rows; i++) - TEFD[make_pair(indx, i)] = FD1[i]; - } - if (function_type == ExternalFunctionWithFirstandSecondDerivative) - { - unsigned int indx = fc->get_indx(); - double *FD2 = mxGetPr(output_arguments[2]); - unsigned int rows = mxGetM(output_arguments[2]); - unsigned int cols = mxGetN(output_arguments[2]); - unsigned int k = 0; - for (unsigned int j = 0; j < cols; j++) - for (unsigned int i = 0; i < rows; i++) - TEFDD[make_pair(indx, make_pair(i, j))] = FD2[k++]; - } - } - break; - case ExternalFunctionNumericalFirstDerivative: - { - input_arguments = (mxArray **) mxMalloc((nb_input_arguments+1+nb_add_input_arguments) * sizeof(mxArray *)); - mxArray *vv = mxCreateString(arg_func_name.c_str()); - input_arguments[0] = vv; - vv = mxCreateDoubleScalar(fc->get_row()); - input_arguments[1] = vv; - vv = mxCreateCellMatrix(1, nb_add_input_arguments); - for (unsigned int i = 0; i < nb_add_input_arguments; i++) - { - double rr = Stack.top(); -#ifdef DEBUG - mexPrintf("i=%d rr = %f Stack.size()=%d\n", i, rr, Stack.size()); -#endif - mxSetCell(vv, nb_add_input_arguments - (i+1), mxCreateDoubleScalar(rr)); - Stack.pop(); - } - input_arguments[nb_input_arguments+nb_add_input_arguments] = vv; -#ifdef DEBUG - mexCallMATLAB(0, NULL, 1, &input_arguments[0], "disp"); - mexCallMATLAB(0, NULL, 1, &input_arguments[1], "disp"); - mexCallMATLAB(0, NULL, 1, &input_arguments[2], "celldisp"); - mexPrintf("OK\n"); - mexEvalString("drawnow;"); -#endif - nb_input_arguments = 3; - mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str()); - double *rr = mxGetPr(output_arguments[0]); -#ifdef DEBUG - mexPrintf("*rr=%f\n", *rr); -#endif - Stack.push(*rr); - } - break; - case ExternalFunctionFirstDerivative: - { - input_arguments = (mxArray **) mxMalloc(nb_input_arguments * sizeof(mxArray *)); - for (unsigned int i = 0; i < nb_input_arguments; i++) - { - mxArray *vv = mxCreateDoubleScalar(Stack.top()); - input_arguments[(nb_input_arguments - 1) - i] = vv; - Stack.pop(); - } - mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str()); - unsigned int indx = fc->get_indx(); - double *FD1 = mxGetPr(output_arguments[0]); - //mexPrint - unsigned int rows = mxGetN(output_arguments[0]); - for (unsigned int i = 0; i < rows; i++) - TEFD[make_pair(indx, i)] = FD1[i]; - } - break; - case ExternalFunctionNumericalSecondDerivative: - { - input_arguments = (mxArray **) mxMalloc((nb_input_arguments+1+nb_add_input_arguments) * sizeof(mxArray *)); - mxArray *vv = mxCreateString(arg_func_name.c_str()); - input_arguments[0] = vv; - vv = mxCreateDoubleScalar(fc->get_row()); - input_arguments[1] = vv; - vv = mxCreateDoubleScalar(fc->get_col()); - input_arguments[2] = vv; - vv = mxCreateCellMatrix(1, nb_add_input_arguments); - for (unsigned int i = 0; i < nb_add_input_arguments; i++) - { - double rr = Stack.top(); -#ifdef DEBUG - mexPrintf("i=%d rr = %f\n", i, rr); -#endif - mxSetCell(vv, (nb_add_input_arguments - 1) - i, mxCreateDoubleScalar(rr)); - Stack.pop(); - } - input_arguments[nb_input_arguments+nb_add_input_arguments] = vv; -#ifdef DEBUG - mexCallMATLAB(0, NULL, 1, &input_arguments[0], "disp"); - mexCallMATLAB(0, NULL, 1, &input_arguments[1], "disp"); - mexCallMATLAB(0, NULL, 1, &input_arguments[2], "celldisp"); - mexPrintf("OK\n"); - mexEvalString("drawnow;"); -#endif - nb_input_arguments = 3; - mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str()); - double *rr = mxGetPr(output_arguments[0]); - Stack.push(*rr); - } - break; - case ExternalFunctionSecondDerivative: - { - input_arguments = (mxArray **) mxMalloc(nb_input_arguments * sizeof(mxArray *)); - for (unsigned int i = 0; i < nb_input_arguments; i++) - { - mxArray *vv = mxCreateDoubleScalar(Stack.top()); - input_arguments[i] = vv; - Stack.pop(); - } - mexCallMATLAB(nb_output_arguments, output_arguments, nb_input_arguments, input_arguments, function_name.c_str()); - unsigned int indx = fc->get_indx(); - double *FD2 = mxGetPr(output_arguments[2]); - unsigned int rows = mxGetM(output_arguments[0]); - unsigned int cols = mxGetN(output_arguments[0]); - unsigned int k = 0; - for (unsigned int j = 0; j < cols; j++) - for (unsigned int i = 0; i < rows; i++) - TEFDD[make_pair(indx, make_pair(i, j))] = FD2[k++]; - } - break; - } - } - break; - case FSTPTEF: - var = ((FSTPTEF_ *) it_code->second)->get_number(); -#ifdef DEBUG - mexPrintf("FSTPTEF\n"); - mexPrintf("var=%d Stack.size()=%d\n", var, Stack.size()); -#endif - TEF[var-1] = Stack.top(); -#ifdef DEBUG - mexPrintf("FSTP TEF[var-1]=%f done\n", TEF[var-1]); - mexEvalString("drawnow;"); -#endif - Stack.pop(); - break; - case FLDTEF: - var = ((FLDTEF_ *) it_code->second)->get_number(); -#ifdef DEBUG - mexPrintf("FLDTEF\n"); - mexPrintf("var=%d Stack.size()=%d\n", var, Stack.size()); - mexPrintf("FLD TEF[var-1]=%f done\n", TEF[var-1]); - mexEvalString("drawnow;"); -#endif - Stack.push(TEF[var-1]); - break; - case FSTPTEFD: - { - unsigned int indx = ((FSTPTEFD_ *) it_code->second)->get_indx(); - unsigned int row = ((FSTPTEFD_ *) it_code->second)->get_row(); -#ifdef DEBUG - mexPrintf("FSTPTEFD\n"); - mexPrintf("indx=%d Stack.size()=%d\n", indx, Stack.size()); -#endif - if (function_type == ExternalFunctionNumericalFirstDerivative) - { - TEFD[make_pair(indx, row-1)] = Stack.top(); -#ifdef DEBUG - mexPrintf("FSTP TEFD[make_pair(indx, row)]=%f done\n", TEFD[make_pair(indx, row-1)]); - mexEvalString("drawnow;"); -#endif - Stack.pop(); - } - } - - break; - case FLDTEFD: - { - unsigned int indx = ((FLDTEFD_ *) it_code->second)->get_indx(); - unsigned int row = ((FLDTEFD_ *) it_code->second)->get_row(); -#ifdef DEBUG - mexPrintf("FLDTEFD\n"); - mexPrintf("indx=%d row=%d Stack.size()=%d\n", indx, row, Stack.size()); - mexPrintf("FLD TEFD[make_pair(indx, row)]=%f done\n", TEFD[make_pair(indx, row-1)]); - mexEvalString("drawnow;"); -#endif - Stack.push(TEFD[make_pair(indx, row-1)]); - } - break; - case FSTPTEFDD: - { - unsigned int indx = ((FSTPTEFDD_ *) it_code->second)->get_indx(); - unsigned int row = ((FSTPTEFDD_ *) it_code->second)->get_row(); - unsigned int col = ((FSTPTEFDD_ *) it_code->second)->get_col(); -#ifdef DEBUG - mexPrintf("FSTPTEFD\n"); - mexPrintf("indx=%d Stack.size()=%d\n", indx, Stack.size()); -#endif - if (function_type == ExternalFunctionNumericalSecondDerivative) - { - TEFDD[make_pair(indx, make_pair(row-1, col-1))] = Stack.top(); -#ifdef DEBUG - mexPrintf("FSTP TEFDD[make_pair(indx, make_pair(row, col))]=%f done\n", TEFDD[make_pair(indx, make_pair(row, col))]); - mexEvalString("drawnow;"); -#endif - Stack.pop(); - } - } - - break; - case FLDTEFDD: - { - unsigned int indx = ((FLDTEFDD_ *) it_code->second)->get_indx(); - unsigned int row = ((FLDTEFDD_ *) it_code->second)->get_row(); - unsigned int col = ((FSTPTEFDD_ *) it_code->second)->get_col(); -#ifdef DEBUG - mexPrintf("FLDTEFD\n"); - mexPrintf("indx=%d Stack.size()=%d\n", indx, Stack.size()); - mexPrintf("FLD TEFD[make_pair(indx, make_pair(row, col))]=%f done\n", TEFDD[make_pair(indx, make_pair(row, col))]); - mexEvalString("drawnow;"); -#endif - Stack.push(TEFDD[make_pair(indx, make_pair(row-1, col-1))]); - } - break; - case FCUML: - v1 = Stack.top(); - Stack.pop(); - v2 = Stack.top(); - Stack.pop(); - Stack.push(v1+v2); - break; - case FENDBLOCK: - //it's the block end -#ifdef DEBUG - mexPrintf("FENDBLOCK\n"); -#endif - go_on = false; - break; - case FENDEQU: - break; - case FJMPIFEVAL: - if (evaluate) - { -#ifdef DEBUG - mexPrintf("FJMPIFEVAL length=%d\n", ((FJMPIFEVAL_ *) it_code->second)->get_pos()); - mexEvalString("drawnow;"); -#endif - it_code += ((FJMPIFEVAL_ *) it_code->second)->get_pos() /* - 1*/; - } - break; - case FJMP: -#ifdef DEBUG - mexPrintf("FJMP length=%d\n", ((FJMP_ *) it_code->second)->get_pos()); - mexEvalString("drawnow;"); -#endif - it_code += ((FJMP_ *) it_code->second)->get_pos() /*- 1 */; - break; - case FOK: - op = ((FOK_ *) it_code->second)->get_arg(); - if (Stack.size() > 0) - { - ostringstream tmp; - tmp << " in compute_block_time, stack not empty\n"; - throw FatalExceptionHandling(tmp.str()); - } - break; - default: - ostringstream tmp; - tmp << " in compute_block_time, unknown opcode " << it_code->first << "\n"; - throw FatalExceptionHandling(tmp.str()); - } - it_code++; - } -#ifdef DEBUG - mexPrintf("==> end of compute_block_time Block = %d\n", block_num); - mexEvalString("drawnow;"); -#endif -} - -void -Interpreter::evaluate_a_block(const int size, const int type, string bin_basename, bool steady_state, int block_num, +Interpreter::evaluate_a_block(/*const int size, const int type, string bin_basename, int block_num, const bool is_linear, const int symbol_table_endo_nbr, const int Block_List_Max_Lag, - const int Block_List_Max_Lead, const int u_count_int, int block) + const int Block_List_Max_Lead, const int u_count_int, int block*/) { it_code_type begining; @@ -1460,16 +91,19 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam case EVALUATE_FORWARD: if (steady_state) { - compute_block_time(0, true, block_num, size, steady_state); + compute_block_time(0, true, /*block_num, size, steady_state, */false); if (block >= 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[j] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) - { - //mexPrintf("=>residual[Block_Contain[%d].Equation = %d]=%g (y = %g, ya = %g)\n", j, Block_Contain[j].Equation, y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable], y[Block_Contain[j].Variable], ya[Block_Contain[j].Variable]); - residual[Block_Contain[j].Equation] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable]; - } + residual[Block_Contain[j].Equation] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable]; } else { @@ -1478,11 +112,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam { it_code = begining; Per_y_ = it_*y_size; - compute_block_time(0, true, block_num, size, steady_state); + compute_block_time(0, true, /*block_num, size, steady_state, */false); if (block >= 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[it_*size+j] = y[it_*y_size+Block_Contain[j].Variable] - ya[it_*y_size+Block_Contain[j].Variable]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[it_*size+Block_Contain[j].Equation] = y[it_*y_size+Block_Contain[j].Variable] - ya[it_*y_size+Block_Contain[j].Variable]; } @@ -1493,64 +133,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam r = (double *) mxMalloc(size*sizeof(double)); if (steady_state) { - compute_block_time(0, true, block_num, size, steady_state); - if (block < 0) - { - for (int j = 0; j < size; j++) - { - //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, r[j]); - residual[Block_Contain[j].Equation] = r[j]; - } - } - else - { - for (int j = 0; j < size; j++) - residual[j] = r[j]; - } - } - else - { - begining = it_code; - for (it_ = y_kmin; it_ < periods+y_kmin; it_++) - { - it_code = begining; - Per_y_ = it_*y_size; - compute_block_time(0, true, block_num, size, steady_state); - if (block < 0) - { - for (int j = 0; j < size; j++) - { - //mexPrintf("residual[Per_y + Block_Contain[%d].Equation = %d]=%g\n", j, Per_y_ + Block_Contain[j].Equation, r[j]); - residual[Per_y_+Block_Contain[j].Equation] = r[j]; - } - } - else - { - for (int j = 0; j < size; j++) - residual[it_*size+j] = r[j]; - } - } - } - mxFree(g1); - mxFree(r); - break; - case SOLVE_FORWARD_COMPLETE: - fixe_u(&u, u_count_int, u_count_int); - Read_SparseMatrix(bin_basename, size, 1, 0, 0, steady_state, false, stack_solve_algo, solve_algo); -#ifdef DEBUG - mexPrintf("in SOLVE_FORWARD_COMPLETE r = mxMalloc(%d*sizeof(double))\n", size); -#endif - r = (double *) mxMalloc(size*sizeof(double)); - if (steady_state) - { - compute_block_time(0, true, block_num, size, steady_state); + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) - { - //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, r[j]); - residual[Block_Contain[j].Equation] = r[j]; - } + residual[Block_Contain[j].Equation] = r[j]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[j] = r[j]; } @@ -1561,14 +154,65 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam { it_code = begining; Per_y_ = it_*y_size; - compute_block_time(0, true, block_num, size, steady_state); + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) - { - //mexPrintf("residual[it_*y_size+Block_Contain[%d].Equation = %d]=%g\n", j, it_*y_size+Block_Contain[j].Equation, r[j]); - residual[it_*y_size+Block_Contain[j].Equation] = r[j]; - } + residual[Per_y_+Block_Contain[j].Equation] = r[j]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (int j = 0; j < size; j++) + residual[it_*size+j] = r[j]; + } + } + mxFree(g1); + mxFree(r); + break; + case SOLVE_FORWARD_COMPLETE: + fixe_u(&u, u_count_int, u_count_int); + Read_SparseMatrix(bin_base_name, size, 1, 0, 0, false, stack_solve_algo, solve_algo); +#ifdef DEBUG + mexPrintf("in SOLVE_FORWARD_COMPLETE r = mxMalloc(%d*sizeof(double))\n", size); +#endif + r = (double *) mxMalloc(size*sizeof(double)); + if (steady_state) + { + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); + if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (int j = 0; j < size; j++) + residual[Block_Contain[j].Equation] = r[j]; + else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (int j = 0; j < size; j++) + residual[j] = r[j]; + } + else + { + begining = it_code; + for (it_ = y_kmin; it_ < periods+y_kmin; it_++) + { + it_code = begining; + Per_y_ = it_*y_size; + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); + if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (int j = 0; j < size; j++) + residual[it_*y_size+Block_Contain[j].Equation] = r[j]; + else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[it_*size+j] = r[j]; } @@ -1578,16 +222,19 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam case EVALUATE_BACKWARD: if (steady_state) { - compute_block_time(0, true, block_num, size, steady_state); + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); if (block >= 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[j] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) - { - //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable]); - residual[Block_Contain[j].Equation] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable]; - } + residual[Block_Contain[j].Equation] = y[Block_Contain[j].Variable] - ya[Block_Contain[j].Variable]; } else { @@ -1596,11 +243,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam { it_code = begining; Per_y_ = it_*y_size; - compute_block_time(0, true, block_num, size, steady_state); + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); if (block >= 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[it_*size+j] = y[it_*y_size+Block_Contain[j].Variable] - ya[it_*y_size+Block_Contain[j].Variable]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[it_*size+Block_Contain[j].Equation] = y[it_*y_size+Block_Contain[j].Variable] - ya[it_*y_size+Block_Contain[j].Variable]; } @@ -1611,61 +264,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam r = (double *) mxMalloc(size*sizeof(double)); if (steady_state) { - compute_block_time(0, true, block_num, size, steady_state); - if (block < 0) - { - for (int j = 0; j < size; j++) - { - //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, r[j]); - residual[Block_Contain[j].Equation] = r[j]; - } - } - else - { - for (int j = 0; j < size; j++) - residual[j] = r[j]; - } - } - else - { - begining = it_code; - for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--) - { - it_code = begining; - Per_y_ = it_*y_size; - compute_block_time(0, true, block_num, size, steady_state); - if (block < 0) - { - for (int j = 0; j < size; j++) - { - //mexPrintf("residual[Per_y_+Block_Contain[%d].Equation = %d]=%g\n", j, Per_y_+Block_Contain[j].Equation, r[j]); - residual[Per_y_+Block_Contain[j].Equation] = r[j]; - } - } - else - { - for (int j = 0; j < size; j++) - residual[it_*size+j] = r[j]; - } - } - } - mxFree(g1); - mxFree(r); - break; - case SOLVE_BACKWARD_COMPLETE: - fixe_u(&u, u_count_int, u_count_int); - Read_SparseMatrix(bin_basename, size, 1, 0, 0, steady_state, false, stack_solve_algo, solve_algo); - r = (double *) mxMalloc(size*sizeof(double)); - if (steady_state) - { - compute_block_time(0, true, block_num, size, steady_state); + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) - { - //mexPrintf("residual[Block_Contain[%d].Equation = %d]=%g\n", j, Block_Contain[j].Equation, r[j]); - residual[Block_Contain[j].Equation] = r[j]; - } + residual[Block_Contain[j].Equation] = r[j]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[j] = r[j]; } @@ -1676,14 +285,62 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam { it_code = begining; Per_y_ = it_*y_size; - compute_block_time(0, true, block_num, size, steady_state); + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) - { - //mexPrintf("residual[Per_y_+Block_Contain[%d].Equation = %d]=%g\n", j, Per_y_+Block_Contain[j].Equation, r[j]); - residual[Per_y_+Block_Contain[j].Equation] = r[j]; - } + residual[Per_y_+Block_Contain[j].Equation] = r[j]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (int j = 0; j < size; j++) + residual[it_*size+j] = r[j]; + } + } + mxFree(g1); + mxFree(r); + break; + case SOLVE_BACKWARD_COMPLETE: + fixe_u(&u, u_count_int, u_count_int); + Read_SparseMatrix(bin_base_name, size, 1, 0, 0, false, stack_solve_algo, solve_algo); + r = (double *) mxMalloc(size*sizeof(double)); + if (steady_state) + { + compute_block_time(0, true, /*block_num, size, steady_state,*/ false); + if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (int j = 0; j < size; j++) + residual[Block_Contain[j].Equation] = r[j]; + else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (int j = 0; j < size; j++) + residual[j] = r[j]; + } + else + { + begining = it_code; + for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--) + { + it_code = begining; + Per_y_ = it_*y_size; + compute_block_time(0, true, /*block_num, size, steady_state, */false); + if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (int j = 0; j < size; j++) + residual[Per_y_+Block_Contain[j].Equation] = r[j]; + else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[it_*size+j] = r[j]; } @@ -1693,7 +350,7 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam case SOLVE_TWO_BOUNDARIES_SIMPLE: case SOLVE_TWO_BOUNDARIES_COMPLETE: fixe_u(&u, u_count_int, u_count_int); - Read_SparseMatrix(bin_basename, size, periods, y_kmin, y_kmax, steady_state, true, stack_solve_algo, solve_algo); + Read_SparseMatrix(bin_base_name, size, periods, y_kmin, y_kmax, true, stack_solve_algo, solve_algo); u_count = u_count_int*(periods+y_kmax+y_kmin); r = (double *) mxMalloc(size*sizeof(double)); begining = it_code; @@ -1702,14 +359,17 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam Per_u_ = (it_-y_kmin)*u_count_int; Per_y_ = it_*y_size; it_code = begining; - compute_block_time(Per_u_, true, block_num, size, steady_state); + compute_block_time(Per_u_, true, /*block_num, size, steady_state,*/ false); if (block < 0) + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) - { - //mexPrintf("residual[it_*y_size+Block_Contain[%d].Equation = %d]=%g\n", j, it_*y_size+Block_Contain[j].Equation, r[j]); - residual[it_*y_size+Block_Contain[j].Equation] = r[j]; - } + residual[it_*y_size+Block_Contain[j].Equation] = r[j]; else + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif for (int j = 0; j < size; j++) residual[it_*size+j] = r[j]; } @@ -1719,650 +379,88 @@ Interpreter::evaluate_a_block(const int size, const int type, string bin_basenam } -void -Interpreter::SingularDisplay(int Per_u_, bool evaluate, int Block_Count, int size, bool steady_state, it_code_type begining) -{ - it_code = begining; - compute_block_time(Per_u_, evaluate, Block_Count, size, steady_state); - Singular_display(Block_Count, size, steady_state, begining); -} - int -Interpreter::simulate_a_block(const int size, const int type, string file_name, string bin_basename, bool Gaussian_Elimination, bool steady_state, bool print_it, int block_num, - const bool is_linear, const int symbol_table_endo_nbr, const int Block_List_Max_Lag, const int Block_List_Max_Lead, const int u_count_int) +Interpreter::simulate_a_block() { it_code_type begining; - int i; bool cvg; - bool result = true; - bool singular_system; double *y_save; - res1 = 0; #ifdef DEBUG - mexPrintf("simulate_a_block\n"); + mexPrintf("simulate_a_block type = %d, periods=%d, y_kmin=%d, y_kmax=%d\n", type, periods, y_kmin, y_kmax); + mexEvalString("drawnow;"); #endif switch (type) { case EVALUATE_FORWARD: #ifdef DEBUG mexPrintf("EVALUATE_FORWARD\n"); + mexEvalString("drawnow;"); #endif - if (steady_state) - compute_block_time(0, false, block_num, size, steady_state); - else - { - begining = it_code; - for (it_ = y_kmin; it_ < periods+y_kmin; it_++) - { - it_code = begining; - Per_y_ = it_*y_size; - compute_block_time(0, false, block_num, size, steady_state); - } - } + evaluate_over_periods(true); break; case EVALUATE_BACKWARD: #ifdef DEBUG mexPrintf("EVALUATE_BACKWARD\n"); + mexEvalString("drawnow;"); #endif - if (steady_state) - compute_block_time(0, false, block_num, size, steady_state); - else - { - begining = it_code; - for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--) - { - it_code = begining; - Per_y_ = it_*y_size; - compute_block_time(0, false, block_num, size, steady_state); - } - } + evaluate_over_periods(false); break; case SOLVE_FORWARD_SIMPLE: #ifdef DEBUG mexPrintf("SOLVE_FORWARD_SIMPLE size=%d\n", size); + mexEvalString("drawnow;"); #endif - g1 = (double *) mxMalloc(size*size*sizeof(double)); - r = (double *) mxMalloc(size*sizeof(double)); - begining = it_code; - if (steady_state) - { - cvg = false; - iter = 0; - while (!(cvg || (iter > maxit_))) - { - it_code = begining; - Per_y_ = it_*y_size; - compute_block_time(0, false, block_num, size, steady_state); - double rr; - rr = r[0]; - cvg = (fabs(rr) < solve_tolf); - if (cvg) - continue; - - try - { - y[Block_Contain[0].Variable] += -divide(r[0], g1[0]); - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s \n", fpeh.GetErrorMsg().c_str()); - mexPrintf(" Singularity in block %d", block_num+1); - } - iter++; - } - if (!cvg) - { - ostringstream tmp; - tmp << " in Solve Forward simple, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - else - { - for (it_ = y_kmin; it_ < periods+y_kmin; it_++) - { - cvg = false; - iter = 0; - Per_y_ = it_*y_size; - while (!(cvg || (iter > maxit_))) - { - it_code = begining; - Per_y_ = it_*y_size; - compute_block_time(0, false, block_num, size, steady_state); - double rr; - if (fabs(1+y[Per_y_+Block_Contain[0].Variable]) > eps) - rr = r[0]/(1+y[Per_y_+Block_Contain[0].Variable]); - else - rr = r[0]; - cvg = (fabs(rr) < solve_tolf); - if (cvg) - continue; - try - { - y[Per_y_+Block_Contain[0].Variable] += -divide(r[0], g1[0]); - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s \n", fpeh.GetErrorMsg().c_str()); - mexPrintf(" Singularity in block %d", block_num+1); - } - iter++; - } - if (!cvg) - { - ostringstream tmp; - tmp << " in Solve Forward simple, convergence not achieved in block " << Block_Count+1 << ", at time " << it_ << ", after " << iter << " iterations\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - } - mxFree(g1); - mxFree(r); + solve_simple_over_periods(true); break; case SOLVE_BACKWARD_SIMPLE: #ifdef DEBUG mexPrintf("SOLVE_BACKWARD_SIMPLE\n"); + mexEvalString("drawnow;"); #endif - g1 = (double *) mxMalloc(size*size*sizeof(double)); - r = (double *) mxMalloc(size*sizeof(double)); - begining = it_code; - if (steady_state) - { - cvg = false; - iter = 0; - while (!(cvg || (iter > maxit_))) - { - it_code = begining; - Per_y_ = it_*y_size; - compute_block_time(0, false, block_num, size, steady_state); - double rr; - rr = r[0]; - cvg = (fabs(rr) < solve_tolf); - if (cvg) - continue; - try - { - y[Block_Contain[0].Variable] += -divide(r[0], g1[0]); - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s \n", fpeh.GetErrorMsg().c_str()); - mexPrintf(" Singularity in block %d", block_num+1); - } - iter++; - } - if (!cvg) - { - ostringstream tmp; - tmp << " in Solve Backward simple, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - else - { - for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--) - { - cvg = false; - iter = 0; - Per_y_ = it_*y_size; - while (!(cvg || (iter > maxit_))) - { - it_code = begining; - Per_y_ = it_*y_size; - compute_block_time(0, false, block_num, size, steady_state); - double rr; - if (fabs(1+y[Per_y_+Block_Contain[0].Variable]) > eps) - rr = r[0]/(1+y[Per_y_+Block_Contain[0].Variable]); - else - rr = r[0]; - cvg = (fabs(rr) < solve_tolf); - if (cvg) - continue; - try - { - y[Per_y_+Block_Contain[0].Variable] += -divide(r[0], g1[0]); - } - catch (FloatingPointExceptionHandling &fpeh) - { - mexPrintf("%s \n", fpeh.GetErrorMsg().c_str()); - mexPrintf(" Singularity in block %d", block_num+1); - } - - iter++; - } - if (!cvg) - { - ostringstream tmp; - tmp << " in Solve Backward simple, convergence not achieved in block " << Block_Count+1 << ", at time " << it_ << ", after " << iter << " iterations\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - } - mxFree(g1); - mxFree(r); + solve_simple_over_periods(false); break; case SOLVE_FORWARD_COMPLETE: #ifdef DEBUG mexPrintf("SOLVE_FORWARD_COMPLETE\n"); + mexEvalString("drawnow;"); #endif fixe_u(&u, u_count_int, u_count_int); - Read_SparseMatrix(bin_basename, size, 1, 0, 0, steady_state, false, stack_solve_algo, solve_algo); - g1 = (double *) mxMalloc(size*size*sizeof(double)); - r = (double *) mxMalloc(size*sizeof(double)); - begining = it_code; + Read_SparseMatrix(bin_base_name, size, 1, 0, 0, false, stack_solve_algo, solve_algo); + start_code = it_code; Per_u_ = 0; - if (steady_state) - { - if (!is_linear) - { - max_res_idx = 0; - cvg = false; - iter = 0; - glambda2 = g0 = very_big; - try_at_iteration = 0; - while (!(cvg || (iter > maxit_))) - { - it_code = begining; - error_not_printed = true; - res2 = 0; - res1 = 0; - max_res = 0; - compute_block_time(0, false, block_num, size, steady_state); - if (!(isnan(res1) || isinf(res1))) - { - for (i = 0; i < size; i++) - { - double rr; - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - cvg = (max_res < solve_tolf); - } - else - cvg = false; - if (cvg) - continue; - int prev_iter = iter; - singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, 0, 0, 0, size, print_it, cvg, iter, true, stack_solve_algo, solve_algo); - if (singular_system) - SingularDisplay(0, false, block_num, size, steady_state, begining); - iter++; - if (iter > prev_iter) - { - g0 = res2; - gp0 = -res2; - try_at_iteration = 0; - } - } - if (!cvg || !result) - { - ostringstream tmp; - tmp << " in Solve Forward complete, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - else - { - it_code = begining; - Per_y_ = it_*y_size; - iter = 0; - res1 = 0; - res2 = 0; - max_res = 0; - max_res_idx = 0; - error_not_printed = true; - compute_block_time(0, false, block_num, size, steady_state); - if (!(isnan(res1) || isinf(res1))) - { - for (i = 0; i < size; i++) - { - double rr; - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - cvg = (max_res < solve_tolf); - } - else - cvg = false; - singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, 0, 0, 0, size, print_it, cvg, iter, true, stack_solve_algo, solve_algo); - if (singular_system) - SingularDisplay(0, false, block_num, size, steady_state, begining); - if (!result) - { - mexPrintf(" in Solve Forward complete, convergence not achieved in block %d\n", Block_Count+1); - return ERROR_ON_EXIT; - } - } - } - else - { - if (!is_linear) - { - max_res_idx = 0; - for (it_ = y_kmin; it_ < periods+y_kmin; it_++) - { - cvg = false; - iter = 0; - glambda2 = g0 = very_big; - try_at_iteration = 0; - Per_y_ = it_*y_size; - while (!(cvg || (iter > maxit_))) - { - it_code = begining; - error_not_printed = true; - res2 = 0; - res1 = 0; - max_res = 0; - compute_block_time(0, false, block_num, size, steady_state); - if (!(isnan(res1) || isinf(res1))) - { - for (i = 0; i < size; i++) - { - double rr; - if (fabs(1+y[Per_y_+Block_Contain[i].Variable]) > eps) - rr = r[i]/(1+y[Per_y_+Block_Contain[i].Variable]); - else - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - cvg = (max_res < solve_tolf); - } - else - cvg = false; - if (cvg) - continue; - int prev_iter = iter; - singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, print_it, cvg, iter, false, stack_solve_algo, solve_algo); - if (singular_system) - SingularDisplay(0, false, block_num, size, steady_state, begining); - iter++; - if (iter > prev_iter) - { - g0 = res2; - gp0 = -res2; - try_at_iteration = 0; - } - } - if (!cvg) - { - ostringstream tmp; - tmp << " in Solve Forward complete, convergence not achieved in block " << Block_Count+1 << ", at time " << it_ << ", after " << iter << " iterations\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - } - else - { - for (it_ = y_kmin; it_ < periods+y_kmin; it_++) - { - it_code = begining; - Per_y_ = it_*y_size; - iter = 0; - res1 = 0; - res2 = 0; - max_res = 0; max_res_idx = 0; - error_not_printed = true; - compute_block_time(0, false, block_num, size, steady_state); - if (!(isnan(res1) || isinf(res1))) - { - for (i = 0; i < size; i++) - { - double rr; - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - cvg = (max_res < solve_tolf); - } - else - cvg = false; - singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, print_it, cvg, iter, false, stack_solve_algo, solve_algo); - if (singular_system) - SingularDisplay(0, false, block_num, size, steady_state, begining); - } - } - } + Simulate_Newton_One_Boundary(true); + + mxFree(u); mxFree(index_equa); mxFree(index_vara); memset(direction, 0, size_of_direction); - mxFree(g1); - mxFree(r); - mxFree(u); + End_Solver(); break; case SOLVE_BACKWARD_COMPLETE: #ifdef DEBUG mexPrintf("SOLVE_BACKWARD_COMPLETE\n"); + mexEvalString("drawnow;"); #endif fixe_u(&u, u_count_int, u_count_int); - Read_SparseMatrix(bin_basename, size, 1, 0, 0, steady_state, false, stack_solve_algo, solve_algo); - g1 = (double *) mxMalloc(size*size*sizeof(double)); - r = (double *) mxMalloc(size*sizeof(double)); - begining = it_code; - if (steady_state) - { - if (!is_linear) - { - max_res_idx = 0; - cvg = false; - iter = 0; - glambda2 = g0 = very_big; - try_at_iteration = 0; - while (!(cvg || (iter > maxit_))) - { - it_code = begining; - error_not_printed = true; - res2 = 0; - res1 = 0; - max_res = 0; - compute_block_time(0, false, block_num, size, steady_state); - if (!(isnan(res1) || isinf(res1))) - { - for (i = 0; i < size; i++) - { - double rr; - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - cvg = (max_res < solve_tolf); - } - else - cvg = false; - if (cvg) - continue; - int prev_iter = iter; - singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, 0, 0, 0, size, print_it, cvg, iter, true, stack_solve_algo, solve_algo); - if (singular_system) - SingularDisplay(0, false, block_num, size, steady_state, begining); - iter++; - if (iter > prev_iter) - { - g0 = res2; - gp0 = -res2; - try_at_iteration = 0; - } - } - if (!cvg || !result) - { - ostringstream tmp; - tmp << " in Solve Backward complete, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - else - { - it_code = begining; - Per_y_ = it_*y_size; - iter = 0; - res1 = 0; - res2 = 0; - max_res = 0; max_res_idx = 0; - error_not_printed = true; - compute_block_time(0, false, block_num, size, steady_state); - if (!(isnan(res1) || isinf(res1))) - { - for (i = 0; i < size; i++) - { - double rr; - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - cvg = (max_res < solve_tolf); - } - else - cvg = false; - singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, 0, 0, 0, size, print_it, cvg, iter, true, stack_solve_algo, solve_algo); - if (singular_system) - SingularDisplay(0, false, block_num, size, steady_state, begining); - if (!result) - { - mexPrintf(" in Solve Backward complete, convergence not achieved in block %d\n", Block_Count+1); - return ERROR_ON_EXIT; - } - } - } - else - { - if (!is_linear) - { - max_res_idx = 0; - for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--) - { - cvg = false; - iter = 0; - glambda2 = g0 = very_big; - try_at_iteration = 0; - Per_y_ = it_*y_size; - while (!(cvg || (iter > maxit_))) - { - it_code = begining; - error_not_printed = true; - res2 = 0; - res1 = 0; - max_res = 0; - compute_block_time(0, false, block_num, size, steady_state); - if (!(isnan(res1) || isinf(res1))) - { - for (i = 0; i < size; i++) - { - double rr; - if (fabs(1+y[Per_y_+Block_Contain[i].Variable]) > eps) - rr = r[i]/(1+y[Per_y_+Block_Contain[i].Variable]); - else - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - cvg = (max_res < solve_tolf); - } - else - cvg = false; - if (cvg) - continue; - int prev_iter = iter; - singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, print_it, cvg, iter, false, stack_solve_algo, solve_algo); - if (singular_system) - SingularDisplay(0, false, block_num, size, steady_state, begining); - iter++; - if (iter > prev_iter) - { - g0 = res2; - gp0 = -res2; - try_at_iteration = 0; - } - } - if (!cvg) - { - ostringstream tmp; - tmp << " in Solve Backward complete, convergence not achieved in block " << Block_Count+1 << ", at time " << it_ << ", after " << iter << " iterations\n"; - throw FatalExceptionHandling(tmp.str()); - } - } - } - else - { - for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--) - { - it_code = begining; - Per_y_ = it_*y_size; - error_not_printed = true; - compute_block_time(0, false, block_num, size, steady_state); - if (!(isnan(res1) || isinf(res1))) - { - for (i = 0; i < size; i++) - { - double rr; - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - cvg = (max_res < solve_tolf); - } - else - cvg = false; - singular_system = Simulate_Newton_One_Boundary(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, print_it, cvg, iter, false, stack_solve_algo, solve_algo); - if (singular_system) - SingularDisplay(0, false, block_num, size, steady_state, begining); - } - } - } + Read_SparseMatrix(bin_base_name, size, 1, 0, 0, false, stack_solve_algo, solve_algo); + start_code = it_code; + Per_u_ = 0; + + Simulate_Newton_One_Boundary(false); + mxFree(index_equa); mxFree(index_vara); memset(direction, 0, size_of_direction); - mxFree(g1); - mxFree(r); mxFree(u); + End_Solver(); break; case SOLVE_TWO_BOUNDARIES_SIMPLE: case SOLVE_TWO_BOUNDARIES_COMPLETE: #ifdef DEBUG mexPrintf("SOLVE_TWO_BOUNDARIES\n"); + mexEvalString("drawnow;"); #endif if (steady_state) { @@ -2370,11 +468,12 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name, return ERROR_ON_EXIT; } fixe_u(&u, u_count_int, u_count_int); - Read_SparseMatrix(bin_basename, size, periods, y_kmin, y_kmax, steady_state, true, stack_solve_algo, solve_algo); + Read_SparseMatrix(bin_base_name, size, periods, y_kmin, y_kmax, true, stack_solve_algo, solve_algo); u_count = u_count_int*(periods+y_kmax+y_kmin); r = (double *) mxMalloc(size*sizeof(double)); + res = (double *) mxMalloc(size*periods*sizeof(double)); y_save = (double *) mxMalloc(y_size*sizeof(double)*(periods+y_kmax+y_kmin)); - begining = it_code; + start_code = it_code; iter = 0; if (!is_linear) { @@ -2384,47 +483,25 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name, int u_count_saved = u_count; while (!(cvg || (iter > maxit_))) { - res2 = 0; res1 = 0; max_res = 0; max_res_idx = 0; + double res1_, res2_, max_res_; + int max_res_idx_; memcpy(y_save, y, y_size*sizeof(double)*(periods+y_kmax+y_kmin)); - for (it_ = y_kmin; it_ < periods+y_kmin; it_++) - { - Per_u_ = (it_-y_kmin)*u_count_int; - Per_y_ = it_*y_size; - it_code = begining; - error_not_printed = true; - compute_block_time(Per_u_, false, block_num, size, steady_state); - if (isnan(res1) || isinf(res1)) - { - memcpy(y, y_save, y_size*sizeof(double)*(periods+y_kmax+y_kmin)); - break; - } - for (i = 0; i < size; i++) - { - double rr; - if (fabs(1+y[Per_y_+Block_Contain[i].Variable]) > eps) - rr = r[i]/(1+y[Per_y_+Block_Contain[i].Variable]); - else - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - } - if (isnan(res1) || isinf(res1)) - cvg = false; - else + + compute_complete_2b(false, &res1, &res2, &max_res, &max_res_idx); + end_code = it_code; + + if (!(isnan(res1) || isinf(res1))) cvg = (max_res < solve_tolf); + if (isnan(res1) || isinf(res1) || (stack_solve_algo == 4 && iter > 0)) + memcpy(y, y_save, y_size*sizeof(double)*(periods+y_kmax+y_kmin)); u_count = u_count_saved; int prev_iter = iter; - Simulate_Newton_Two_Boundaries(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, periods, print_it, cvg, iter, minimal_solving_periods, stack_solve_algo, endo_name_length, P_endo_names); + + Simulate_Newton_Two_Boundaries(block_num, symbol_table_endo_nbr, y_kmin, y_kmax, size, periods, cvg, minimal_solving_periods, stack_solve_algo, endo_name_length, P_endo_names); iter++; if (iter > prev_iter) { @@ -2437,7 +514,7 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name, if (!cvg) { ostringstream tmp; - tmp << " in Solve two boundaries, convergence not achieved in block " << Block_Count+1 << ", after " << iter << " iterations\n"; + tmp << " in Solve two boundaries, convergence not achieved in block " << block_num+1 << ", after " << iter << " iterations\n"; throw FatalExceptionHandling(tmp.str()); } } @@ -2446,34 +523,24 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name, res1 = 0; res2 = 0; max_res = 0; max_res_idx = 0; - for (it_ = y_kmin; it_ < periods+y_kmin; it_++) - { - Per_u_ = (it_-y_kmin)*u_count_int; - Per_y_ = it_*y_size; - it_code = begining; - compute_block_time(Per_u_, false, block_num, size, steady_state); - for (i = 0; i < size; i++) - { - double rr; - rr = r[i]; - if (max_res < fabs(rr)) - { - max_res = fabs(rr); - max_res_idx = i; - } - res2 += rr*rr; - res1 += fabs(rr); - } - } + double res1_, res2_, max_res_; + int max_res_idx_; + + compute_complete_2b(false, &res1, &res2, &max_res, &max_res_idx); + end_code = it_code; + cvg = false; - Simulate_Newton_Two_Boundaries(Block_Count, symbol_table_endo_nbr, it_, y_kmin, y_kmax, size, periods, print_it, cvg, iter, minimal_solving_periods, stack_solve_algo, endo_name_length, P_endo_names); + Simulate_Newton_Two_Boundaries(block_num, symbol_table_endo_nbr, y_kmin, y_kmax, size, periods, cvg, minimal_solving_periods, stack_solve_algo, endo_name_length, P_endo_names); } + it_code = end_code; mxFree(r); mxFree(y_save); mxFree(u); mxFree(index_vara); mxFree(index_equa); + mxFree(res); memset(direction, 0, size_of_direction); + End_Solver(); break; default: ostringstream tmp; @@ -2485,12 +552,13 @@ Interpreter::simulate_a_block(const int size, const int type, string file_name, } void -Interpreter::print_a_block(const int size, const int type, string bin_basename, bool steady_state, int block_num, - const bool is_linear, const int symbol_table_endo_nbr, const int Block_List_Max_Lag, - const int Block_List_Max_Lead, const int u_count_int, int block) +Interpreter::print_a_block() { it_code_type begining; - mexPrintf("\nBlock %d\n", block_num+1); + if (block < 0) + mexPrintf("\nBlock %d\n", block_num+1); + else + mexPrintf("\nBlock %d\n", block+1); mexPrintf("----------\n"); if (steady_state) residual = vector(size); @@ -2525,7 +593,7 @@ Interpreter::print_a_block(const int size, const int type, string bin_basename, } bool -Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_state, bool evaluate, int block, int &nb_blocks, bool print_it) +Interpreter::compute_blocks(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks) { bool result = true; @@ -2582,9 +650,9 @@ Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_s FBEGINBLOCK_ *fb = (FBEGINBLOCK_ *) it_code->second; Block_Contain = fb->get_Block_Contain(); it_code++; + set_block(fb->get_size(), fb->get_type(), file_name, bin_basename, Block_Count, fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int(), block); if (print) - print_a_block(fb->get_size(), fb->get_type(), bin_basename, steady_state, Block_Count, - fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int(), block); + print_a_block(); else if (evaluate) { #ifdef DEBUG @@ -2607,8 +675,7 @@ Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_s else residual = vector(fb->get_size()*(periods+y_kmin)); } - evaluate_a_block(fb->get_size(), fb->get_type(), bin_basename, steady_state, Block_Count, - fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int(), block); + evaluate_a_block(); } else { @@ -2616,8 +683,7 @@ Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_s mexPrintf("endo in block=%d, type=%d, steady_state=%d, print_it=%d, Block_Count=%d, fb->get_is_linear()=%d, fb->get_endo_nbr()=%d, fb->get_Max_Lag()=%d, fb->get_Max_Lead()=%d, fb->get_u_count_int()=%d\n", fb->get_size(), fb->get_type(), steady_state, print_it, Block_Count, fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int()); #endif - result = simulate_a_block(fb->get_size(), fb->get_type(), file_name, bin_basename, true, steady_state, print_it,Block_Count, - fb->get_is_linear(), fb->get_endo_nbr(), fb->get_Max_Lag(), fb->get_Max_Lead(), fb->get_u_count_int()); + result = simulate_a_block(); if (result == ERROR_ON_EXIT) return ERROR_ON_EXIT; } @@ -2678,16 +744,14 @@ Interpreter::compute_blocks(string file_name, string bin_basename, bool steady_s it_code++; break; default: - mexPrintf("Error\n"); ostringstream tmp; - tmp << " in compute_blocks, unknown command " << it_code->first << "\n"; + tmp << " in compute_blocks, unknown command " << it_code->first << " (block=" << Block_Count << ")\n"; throw FatalExceptionHandling(tmp.str()); } } - mxFree(Init_Code->second); nb_blocks = Block_Count+1; - if (T and !global_temporary_terms) + if (T && !global_temporary_terms) mxFree(T); return result; } diff --git a/mex/sources/bytecode/Interpreter.hh b/mex/sources/bytecode/Interpreter.hh index 18fefd89c..b33b608d8 100644 --- a/mex/sources/bytecode/Interpreter.hh +++ b/mex/sources/bytecode/Interpreter.hh @@ -27,6 +27,7 @@ #define BYTE_CODE #include "CodeInterpreter.hh" #include "SparseMatrix.hh" +#include "Evaluate.hh" #ifdef LINBCG # include "linbcg.hh" #endif @@ -40,50 +41,29 @@ using namespace std; -#define pow_ pow -class Interpreter : public SparseMatrix +class Interpreter : public dynSparseMatrix { private: - unsigned int EQN_dvar1, EQN_dvar2, EQN_dvar3; - int EQN_lag1, EQN_lag2, EQN_lag3; - mxArray *GlobalTemporaryTerms; protected: - double pow1(double a, double b); - double divide(double a, double b); - double log1(double a); - double log10_1(double a); - void compute_block_time(int Per_u_, bool evaluate, int block_num, int size, bool steady_state); - void evaluate_a_block(const int size, const int type, string bin_basename, bool steady_state, int block_num, - const bool is_linear = false, const int symbol_table_endo_nbr = 0, const int Block_List_Max_Lag = 0, const int Block_List_Max_Lead = 0, const int u_count_int = 0, int block = -1); - int simulate_a_block(const int size, const int type, string file_name, string bin_basename, bool Gaussian_Elimination, bool steady_state, bool print_it, int block_num, - const bool is_linear = false, const int symbol_table_endo_nbr = 0, const int Block_List_Max_Lag = 0, const int Block_List_Max_Lead = 0, const int u_count_int = 0); - void print_a_block(const int size, const int type, string bin_basename, bool steady_state, int block_num, - const bool is_linear, const int symbol_table_endo_nbr, const int Block_List_Max_Lag, - const int Block_List_Max_Lead, const int u_count_int, int block); - void SingularDisplay(int Per_u_, bool evaluate, int Block_Count, int size, bool steady_state, it_code_type begining); - vector Block_Contain; - code_liste_type code_liste; - it_code_type it_code; - int Block_Count, Per_u_, Per_y_; - int it_, maxit_, size_of_direction; - double solve_tolf; - bool GaussSeidel; - map, int>, int> IM_i; - int equation, derivative_equation, derivative_variable; - string filename; - int minimal_solving_periods; - int stack_solve_algo, solve_algo; - bool global_temporary_terms; - bool print, print_error; + void evaluate_a_block(); + int simulate_a_block(); + void print_a_block(); public: ~Interpreter(); Interpreter(double *params_arg, double *y_arg, double *ya_arg, double *x_arg, double *steady_y_arg, double *steady_x_arg, - double *direction_arg, int y_size_arg, int nb_row_x_arg, - int nb_row_xd_arg, int periods_arg, int y_kmin_arg, int y_kmax_arg, int maxit_arg_, double solve_tolf_arg, int size_o_direction_arg, - double slowc_arg, int y_decal_arg, double markowitz_c_arg, string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg, - bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg); - bool compute_blocks(string file_name, string bin_basename, bool steady_state, bool evaluate, int block, int &nb_blocks, bool print_it); + double *direction_arg, size_t y_size_arg, + size_t nb_row_x_arg, size_t nb_row_xd_arg, int periods_arg, int y_kmin_arg, int y_kmax_arg, + int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg, + string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg, + bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg, + bool steady_state_arg, bool print_it_arg +#ifdef CUDA + , const int CUDA_device, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg +#endif + ); + bool compute_blocks(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks); + inline mxArray * get_jacob(int block_num) { diff --git a/mex/sources/bytecode/SparseMatrix.cc b/mex/sources/bytecode/SparseMatrix.cc index 692a708bc..131c4314a 100644 --- a/mex/sources/bytecode/SparseMatrix.cc +++ b/mex/sources/bytecode/SparseMatrix.cc @@ -1,4 +1,4 @@ -/* + /* * Copyright (C) 2007-2012 Dynare Team * * This file is part of Dynare. @@ -17,15 +17,88 @@ * along with Dynare. If not, see . */ -//#define _GLIBCXX_USE_C99_FENV_TR1 1 -//#include +//define _GLIBCXX_USE_C99_FENV_TR1 1 +//include #include #include #include +//#include +//#include #include "SparseMatrix.hh" -SparseMatrix::SparseMatrix() +#ifdef CUDA +#include "SparseMatrix_kernel.cu" +#endif + +using namespace std; +#if (defined _MSC_VER || defined OCTAVE_MEX_FILE) + +#ifdef _MSC_VER +#include +HINSTANCE hinstLib; +#else +#include +#include +void* hinstLib; +#endif +#define UMFPACK_INFO 90 +#define UMFPACK_CONTROL 20 +/* used in all UMFPACK_report_* routines: */ +#define UMFPACK_PRL 0 /* print level */ +/* returned by all routines that use Info: */ +#define UMFPACK_OK (0) +#define UMFPACK_STATUS 0 /* UMFPACK_OK, or other result */ + + + + +typedef void (*t_umfpack_dl_free_numeric)(void **Numeric); +t_umfpack_dl_free_numeric umfpack_dl_free_numeric; +typedef void (*t_umfpack_dl_free_symbolic)(void **Symbolic); +t_umfpack_dl_free_symbolic umfpack_dl_free_symbolic; +typedef int64_t (*t_umfpack_dl_solve)(int64_t sys, + const int64_t Ap [ ], + const int64_t Ai [ ], + const double Ax [ ], + double X [ ], + const double B [ ], + void *Numeric, + const double Control [UMFPACK_CONTROL], + double Info [UMFPACK_INFO]); +t_umfpack_dl_solve umfpack_dl_solve; +typedef int64_t (*t_umfpack_dl_numeric)(const int64_t Ap [ ], + const int64_t Ai [ ], + const double Ax [ ], + void *Symbolic, + void **Numeric, + const double Control [UMFPACK_CONTROL], + double Info [UMFPACK_INFO]); +t_umfpack_dl_numeric umfpack_dl_numeric; +typedef int64_t (*t_umfpack_dl_symbolic)(int64_t n_row, + int64_t n_col, + const int64_t Ap [ ], + const int64_t Ai [ ], + const double Ax [ ], + void **Symbolic, + const double Control [UMFPACK_CONTROL], + double Info [UMFPACK_INFO]); +t_umfpack_dl_symbolic umfpack_dl_symbolic; +typedef void (*t_umfpack_dl_report_info)(const double Control [UMFPACK_CONTROL], + const double Info [UMFPACK_INFO]); +t_umfpack_dl_report_info umfpack_dl_report_info; +typedef void (*t_umfpack_dl_report_status)(const double Control [UMFPACK_CONTROL], + int64_t status); +t_umfpack_dl_report_status umfpack_dl_report_status; +typedef void (*t_umfpack_dl_defaults)(double Control [UMFPACK_CONTROL]); +t_umfpack_dl_defaults umfpack_dl_defaults; + +#endif + + + + +dynSparseMatrix::dynSparseMatrix() { pivotva = NULL; g_save_op = NULL; @@ -42,29 +115,304 @@ SparseMatrix::SparseMatrix() restart = 0; IM_i.clear(); lu_inc_tol = 1e-10; + Symbolic = NULL; + Numeric = NULL; +#if (defined _MSC_VER || defined OCTAVE_MEX_FILE) + // Get a handle to the DLL module. +#ifdef _MSC_VER + hinstLib = LoadLibrary(TEXT("libmwumfpack.dll")); +#else + hinstLib = dlopen("libmwumfpack.dll",RTLD_LAZY); +#endif + // If the handle is valid, try to get the function address. + if (hinstLib) + { +#ifdef _MSC_VER + umfpack_dl_free_numeric = (t_umfpack_dl_free_numeric) GetProcAddress(hinstLib, "umfpack_dl_free_numeric"); + if (!umfpack_dl_free_numeric) + { + mexPrintf("umfpack_dl_free_numeric not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_free_numeric is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_free_symbolic = (t_umfpack_dl_free_symbolic) GetProcAddress(hinstLib, "umfpack_dl_free_symbolic"); + if (!umfpack_dl_free_symbolic) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_free_symbolic is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_solve = (t_umfpack_dl_solve) GetProcAddress(hinstLib, "umfpack_dl_free_solve"); + if (!umfpack_dl_solve) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_solve is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_numeric = (t_umfpack_dl_numeric) GetProcAddress(hinstLib, "umfpack_dl_numeric"); + if (!umfpack_dl_numeric) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_numeric is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_symbolic = (t_umfpack_dl_symbolic) GetProcAddress(hinstLib, "umfpack_dl_symbolic"); + if (!umfpack_dl_symbolic) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_symbolic is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_report_info = (t_umfpack_dl_report_info) GetProcAddress(hinstLib, "umfpack_dl_report_info"); + if (!umfpack_dl_report_info) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_report_info is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_report_status = (t_umfpack_dl_report_status) GetProcAddress(hinstLib, "umfpack_dl_report_status"); + if (!umfpack_dl_report_status) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_report_status is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_defaults = (t_umfpack_dl_defaults) GetProcAddress(hinstLib, "umfpack_dl_defaults"); + if (!umfpack_dl_defaults) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_defaults is not found."; + throw FatalExceptionHandling(tmp.str()); + } +/*#else + mexPrintf("loading libmwumpfpack\n"); + // reset errors + dlerror(); + umfpack_dl_free_numeric = (t_umfpack_dl_free_numeric) dlsym(hinstLib, "umfpack_dl_free_numeric"); + const char* dlsym_error = dlerror(); + if (dlsym_error) + { + mexPrintf("umfpack_dl_free_numeric not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_free_numeric is not found."; + throw FatalExceptionHandling(tmp.str()); + } + // reset errors + dlerror(); + umfpack_dl_free_symbolic = (t_umfpack_dl_free_symbolic) dlsym(hinstLib, "umfpack_dl_free_symbolic"); + dlsym_error = dlerror(); + if (dlsym_error) + { + mexPrintf("umfpack_dl_free_symbolic not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_free_symbolic is not found."; + throw FatalExceptionHandling(tmp.str()); + } + // reset errors + dlerror(); + umfpack_dl_solve = (t_umfpack_dl_solve) dlsym(hinstLib, "umfpack_dl_solve"); + dlsym_error = dlerror(); + if (dlsym_error) + { + mexPrintf("umfpack_dl_solve not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_solve is not found."; + throw FatalExceptionHandling(tmp.str()); + } + // reset errors + dlerror(); + umfpack_dl_numeric = (t_umfpack_dl_numeric) dlsym(hinstLib, "umfpack_dl_numeric"); + dlsym_error = dlerror(); + if (dlsym_error) + { + mexPrintf("umfpack_dl_numeric not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_numeric is not found."; + throw FatalExceptionHandling(tmp.str()); + } + // reset errors + dlerror(); + umfpack_dl_symbolic = (t_umfpack_dl_symbolic) dlsym(hinstLib, "umfpack_dl_symbolic"); + dlsym_error = dlerror(); + if (dlsym_error) + { + mexPrintf("umfpack_dl_symbolic not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_symbolic is not found."; + throw FatalExceptionHandling(tmp.str()); + } + // reset errors + dlerror(); + umfpack_dl_report_info = (t_umfpack_dl_report_info) dlsym(hinstLib, "umfpack_dl_report_info"); + dlsym_error = dlerror(); + if (dlsym_error) + { + mexPrintf("umfpack_dl_report_info not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_report_info is not found."; + throw FatalExceptionHandling(tmp.str()); + } + // reset errors + dlerror(); + umfpack_dl_report_status = (t_umfpack_dl_report_status) dlsym(hinstLib, "umfpack_dl_report_status"); + dlsym_error = dlerror(); + if (dlsym_error) + { + mexPrintf("umfpack_dl_report_status not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_report_status is not found."; + throw FatalExceptionHandling(tmp.str()); + } + // reset errors + dlerror(); + umfpack_dl_defaults = (t_umfpack_dl_defaults) dlsym(hinstLib, "umfpack_dl_defaults"); + dlsym_error = dlerror(); + if (dlsym_error) + { + mexPrintf("umfpack_dl_defaults not found\n"); + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_defaults is not found."; + throw FatalExceptionHandling(tmp.str()); + }*/ +#endif + } + else + { + mexPrintf("library loading error\n"); + ostringstream tmp; + tmp << " in main, libmwumfpack.dll not found. \n Check that \\Program files\\MATLAB\\RXXXXX\\bin\\win64 is in the current path."; + throw FatalExceptionHandling(tmp.str()); + } +#endif } +dynSparseMatrix::dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, + const int minimal_solving_periods_arg +#ifdef CUDA + , const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg +#endif + ): + Evaluate(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg) +{ + pivotva = NULL; + g_save_op = NULL; + g_nop_all = 0; + mem_mngr.init_Mem(); + symbolic = true; + alt_symbolic = false; + alt_symbolic_count = 0; + max_u = 0; + min_u = 0x7FFFFFFF; + res1a = 9.0e60; + tbreak_g = 0; + start_compare = 0; + restart = 0; + IM_i.clear(); + lu_inc_tol = 1e-10; + Symbolic = NULL; + Numeric = NULL; +#ifdef CUDA + CUDA_device = CUDA_device_arg; + cublas_handle = cublas_handle_arg; + cusparse_handle = cusparse_handle_arg; + CUDA_descr = descr_arg; +#endif +#ifdef _MSC_VER + // Get a handle to the DLL module. + hinstLib = LoadLibrary(TEXT("libmwumfpack.dll")); + // If the handle is valid, try to get the function address. + if (hinstLib != NULL) + { + umfpack_dl_free_numeric = (t_umfpack_dl_free_numeric) GetProcAddress(hinstLib, "umfpack_dl_free_numeric"); + if (!umfpack_dl_free_numeric) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_free_numeric is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_free_symbolic = (t_umfpack_dl_free_symbolic) GetProcAddress(hinstLib, "umfpack_dl_free_symbolic"); + if (!umfpack_dl_free_symbolic) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_free_symbolic is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_report_info = (t_umfpack_dl_report_info) GetProcAddress(hinstLib, "umfpack_dl_report_info"); + if (!umfpack_dl_report_info) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_report_info is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_solve = (t_umfpack_dl_solve) GetProcAddress(hinstLib, "umfpack_dl_solve"); + if (!umfpack_dl_solve) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_solve is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_numeric = (t_umfpack_dl_numeric) GetProcAddress(hinstLib, "umfpack_dl_numeric"); + if (!umfpack_dl_numeric) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_numeric is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_symbolic = (t_umfpack_dl_symbolic) GetProcAddress(hinstLib, "umfpack_dl_symbolic"); + if (!umfpack_dl_symbolic) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_symbolic is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_report_status = (t_umfpack_dl_report_status) GetProcAddress(hinstLib, "umfpack_dl_report_status"); + if (!umfpack_dl_report_status) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_report_status is not found."; + throw FatalExceptionHandling(tmp.str()); + } + umfpack_dl_defaults = (t_umfpack_dl_defaults) GetProcAddress(hinstLib, "umfpack_dl_defaults"); + if (!umfpack_dl_defaults) + { + ostringstream tmp; + tmp << " in libmwumfpack.dll, the function umfpack_dl_defaults is not found."; + throw FatalExceptionHandling(tmp.str()); + } + } + else + { + mexPrintf("library loading error\n"); + ostringstream tmp; + tmp << " in main, libmwumfpack.dll not found. \n Check that \\Program files\\MATLAB\\RXXXXX\\bin\\win64 in the current path."; + throw FatalExceptionHandling(tmp.str()); + } +#endif +} + + int -SparseMatrix::NRow(int r) +dynSparseMatrix::NRow(int r) { return NbNZRow[r]; } int -SparseMatrix::NCol(int c) +dynSparseMatrix::NCol(int c) { return NbNZCol[c]; } int -SparseMatrix::At_Row(int r, NonZeroElem **first) +dynSparseMatrix::At_Row(int r, NonZeroElem **first) { (*first) = FNZE_R[r]; return NbNZRow[r]; } int -SparseMatrix::Union_Row(int row1, int row2) +dynSparseMatrix::Union_Row(int row1, int row2) { NonZeroElem *first1, *first2; int n1 = At_Row(row1, &first1); @@ -97,7 +445,7 @@ SparseMatrix::Union_Row(int row1, int row2) } int -SparseMatrix::At_Pos(int r, int c, NonZeroElem **first) +dynSparseMatrix::At_Pos(int r, int c, NonZeroElem **first) { (*first) = FNZE_R[r]; while ((*first)->c_index != c) @@ -106,14 +454,14 @@ SparseMatrix::At_Pos(int r, int c, NonZeroElem **first) } int -SparseMatrix::At_Col(int c, NonZeroElem **first) +dynSparseMatrix::At_Col(int c, NonZeroElem **first) { (*first) = FNZE_C[c]; return NbNZCol[c]; } int -SparseMatrix::At_Col(int c, int lag, NonZeroElem **first) +dynSparseMatrix::At_Col(int c, int lag, NonZeroElem **first) { (*first) = FNZE_C[c]; int i = 0; @@ -139,7 +487,7 @@ SparseMatrix::At_Col(int c, int lag, NonZeroElem **first) } void -SparseMatrix::Delete(const int r, const int c) +dynSparseMatrix::Delete(const int r, const int c) { NonZeroElem *first = FNZE_R[r], *firsta = NULL; @@ -173,7 +521,7 @@ SparseMatrix::Delete(const int r, const int c) } void -SparseMatrix::Print(int Size, int *b) +dynSparseMatrix::Print(int Size, int *b) { int a, i, j, k, l; mexPrintf(" "); @@ -221,7 +569,7 @@ SparseMatrix::Print(int Size, int *b) } void -SparseMatrix::Insert(const int r, const int c, const int u_index, const int lag_index) +dynSparseMatrix::Insert(const int r, const int c, const int u_index, const int lag_index) { NonZeroElem *firstn, *first, *firsta, *a; firstn = mem_mngr.mxMalloc_NZE(); @@ -275,12 +623,14 @@ SparseMatrix::Insert(const int r, const int c, const int u_index, const int lag_ } void -SparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool steady_state, bool two_boundaries, int stack_solve_algo, int solve_algo) +dynSparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool two_boundaries, int stack_solve_algo, int solve_algo) { unsigned int eq, var; - int i, j, lag; + int lag; filename = file_name; mem_mngr.fixe_file_name(file_name); + /*mexPrintf("steady_state=%d, size=%d, solve_algo=%d, stack_solve_algo=%d, two_boundaries=%d\n",steady_state, Size, solve_algo, stack_solve_algo, two_boundaries); + mexEvalString("drawnow;");*/ if (!SaveCode.is_open()) { if (steady_state) @@ -302,28 +652,44 @@ SparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, i { if (stack_solve_algo == 5) { - for (i = 0; i < u_count_init-Size; i++) + for (int i = 0; i < u_count_init-Size; i++) { + int val; SaveCode.read(reinterpret_cast(&eq), sizeof(eq)); SaveCode.read(reinterpret_cast(&var), sizeof(var)); SaveCode.read(reinterpret_cast(&lag), sizeof(lag)); - SaveCode.read(reinterpret_cast(&j), sizeof(j)); - IM_i[make_pair(make_pair(eq, var), lag)] = j; + SaveCode.read(reinterpret_cast(&val), sizeof(val)); + IM_i[make_pair(make_pair(eq, var), lag)] = val; } - for (j = 0; j < Size; j++) + for (int j = 0; j < Size; j++) IM_i[make_pair(make_pair(j, Size*(periods+y_kmax)), 0)] = j; } - else if (stack_solve_algo >= 0 || stack_solve_algo <= 4) + else if (stack_solve_algo >= 0 && stack_solve_algo <= 4) { - for (i = 0; i < u_count_init-Size; i++) + for (int i = 0; i < u_count_init-Size; i++) { + int val; SaveCode.read(reinterpret_cast(&eq), sizeof(eq)); SaveCode.read(reinterpret_cast(&var), sizeof(var)); SaveCode.read(reinterpret_cast(&lag), sizeof(lag)); - SaveCode.read(reinterpret_cast(&j), sizeof(j)); - IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = j; + SaveCode.read(reinterpret_cast(&val), sizeof(val)); + IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = val; } - for (j = 0; j < Size; j++) + for (int j = 0; j < Size; j++) + IM_i[make_pair(make_pair(Size*(periods+y_kmax), 0), j)] = j; + } + else if (stack_solve_algo == 7) + { + for (int i = 0; i < u_count_init-Size; i++) + { + int val; + SaveCode.read(reinterpret_cast(&eq), sizeof(eq)); + SaveCode.read(reinterpret_cast(&var), sizeof(var)); + SaveCode.read(reinterpret_cast(&lag), sizeof(lag)); + SaveCode.read(reinterpret_cast(&val), sizeof(val)); + IM_i[make_pair(make_pair(eq, lag), var - lag * Size)] = val; + } + for (int j = 0; j < Size; j++) IM_i[make_pair(make_pair(Size*(periods+y_kmax), 0), j)] = j; } @@ -332,41 +698,48 @@ SparseMatrix::Read_SparseMatrix(string file_name, const int Size, int periods, i { if ((stack_solve_algo == 5 && !steady_state) || (solve_algo == 5 && steady_state)) { - for (i = 0; i < u_count_init; i++) + for (int i = 0; i < u_count_init; i++) { + int val; SaveCode.read(reinterpret_cast(&eq), sizeof(eq)); SaveCode.read(reinterpret_cast(&var), sizeof(var)); SaveCode.read(reinterpret_cast(&lag), sizeof(lag)); - SaveCode.read(reinterpret_cast(&j), sizeof(j)); - IM_i[make_pair(make_pair(eq, var), lag)] = j; + SaveCode.read(reinterpret_cast(&val), sizeof(val)); + IM_i[make_pair(make_pair(eq, var), lag)] = val; } } else if (((stack_solve_algo >= 0 || stack_solve_algo <= 4) && !steady_state) || ((solve_algo >= 6 || solve_algo <= 8) && steady_state)) { - for (i = 0; i < u_count_init; i++) + for (int i = 0; i < u_count_init; i++) { + int val; SaveCode.read(reinterpret_cast(&eq), sizeof(eq)); SaveCode.read(reinterpret_cast(&var), sizeof(var)); SaveCode.read(reinterpret_cast(&lag), sizeof(lag)); - SaveCode.read(reinterpret_cast(&j), sizeof(j)); - IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = j; + SaveCode.read(reinterpret_cast(&val), sizeof(val)); + IM_i[make_pair(make_pair(var - lag*Size, -lag), eq)] = val; } } } index_vara = (int *) mxMalloc(Size*(periods+y_kmin+y_kmax)*sizeof(int)); - for (j = 0; j < Size; j++) + for (int j = 0; j < Size; j++) SaveCode.read(reinterpret_cast(&index_vara[j]), sizeof(*index_vara)); if (periods+y_kmin+y_kmax > 1) - for (i = 1; i < periods+y_kmin+y_kmax; i++) - for (j = 0; j < Size; j++) - index_vara[j+Size*i] = index_vara[j+Size*(i-1)]+y_size; + for (int i = 1; i < periods+y_kmin+y_kmax; i++) + { +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int j = 0; j < Size; j++) + index_vara[j+Size*i] = index_vara[j+Size*(i-1)] + y_size; + } index_equa = (int *) mxMalloc(Size*sizeof(int)); - for (j = 0; j < Size; j++) + for (int j = 0; j < Size; j++) SaveCode.read(reinterpret_cast(&index_equa[j]), sizeof(*index_equa)); } void -SparseMatrix::Simple_Init(int Size, map, int>, int> &IM, bool &zero_solution) +dynSparseMatrix::Simple_Init(int Size, map, int>, int> &IM, bool &zero_solution) { int i, eq, var, lag; map, int>, int>::iterator it4; @@ -392,12 +765,14 @@ SparseMatrix::Simple_Init(int Size, map, int>, int> &IM, boo NbNZCol = (int *) mxMalloc(i); it4 = IM.begin(); eq = -1; - //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (i = 0; i < Size; i++) { line_done[i] = 0; - FNZE_C[i] = 0; - FNZE_R[i] = 0; + FNZE_C[i] = NULL; + FNZE_R[i] = NULL; temp_NZE_C[i] = 0; temp_NZE_R[i] = 0; NbNZRow[i] = 0; @@ -434,9 +809,11 @@ SparseMatrix::Simple_Init(int Size, map, int>, int> &IM, boo } it4++; } - //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) double cum_abs_sum = 0; - for (i = 0; i < Size; i++) +#if USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs_sum) +#endif + for (int i = 0; i < Size; i++) { b[i] = i; cum_abs_sum += fabs(u[i]); @@ -452,9 +829,9 @@ SparseMatrix::Simple_Init(int Size, map, int>, int> &IM, boo } void -SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m) +dynSparseMatrix::Init_Matlab_Sparse_Simple(int Size, map, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m) { - int i, eq, var; + int eq, var; double *b = mxGetPr(b_m); if (!b) { @@ -491,7 +868,10 @@ SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map, int>, throw FatalExceptionHandling(tmp.str()); } map, int>, int>::iterator it4; - for (i = 0; i < y_size*(periods+y_kmin); i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < y_size*(periods+y_kmin); i++) ya[i] = y[i]; #ifdef DEBUG unsigned int max_nze = mxGetNzmax(A_m); @@ -499,7 +879,10 @@ SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map, int>, unsigned int NZE = 0; int last_var = 0; double cum_abs_sum = 0; - for (i = 0; i < Size; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs_sum) +#endif + for (int i = 0; i < Size; i++) { b[i] = u[i]; cum_abs_sum += fabs(b[i]); @@ -511,7 +894,7 @@ SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map, int>, zero_solution = false; Aj[0] = 0; - last_var = -1; + last_var = 0; it4 = IM.begin(); while (it4 != IM.end()) { @@ -565,11 +948,757 @@ SparseMatrix::Init_Matlab_Sparse_Simple(int Size, map, int>, Aj[Size] = NZE; } + void -SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m) +dynSparseMatrix::Init_UMFPACK_Sparse_Simple(int Size, map, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m) { - int t, i, eq, var, lag, ti_y_kmin, ti_y_kmax; + int eq, var; + //double *b = mxGetPr(b_m); + *b = (double*)mxMalloc(Size * sizeof(double)); + if (!(*b)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, can't retrieve b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + double *x0 = mxGetPr(x0_m); + if (!x0) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse_Simple, can't retrieve x0 vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + *Ap = (SuiteSparse_long*)mxMalloc((Size+1) * sizeof(SuiteSparse_long)); + if (!(*Ap)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, can't allocate Ap index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + size_t prior_nz = IM.size(); + *Ai = (SuiteSparse_long*)mxMalloc(prior_nz * sizeof(SuiteSparse_long)); + if (!(*Ai)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, can't allocate Ai index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + *Ax = (double*)mxMalloc(prior_nz * sizeof(double)); + if (!(*Ax)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, can't retrieve Ax matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } + + map, int>, int>::iterator it4; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < Size; i++) + { + int eq = index_vara[i]; + ya[eq+it_*y_size] = y[eq+it_*y_size]; + } +#ifdef DEBUG + unsigned int max_nze = mxGetNzmax(A_m); +#endif + unsigned int NZE = 0; + int last_var = 0; + double cum_abs_sum = 0; + +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs_sum) +#endif + for (int i = 0; i < Size; i++) + { + (*b)[i] = u[i]; + cum_abs_sum += fabs((*b)[i]); + x0[i] = y[i]; + } + if (cum_abs_sum < 1e-20) + zero_solution = true; + else + zero_solution = false; + + (*Ap)[0] = 0; + last_var = 0; + it4 = IM.begin(); + while (it4 != IM.end()) + { + var = it4->first.first.first; + if (var != last_var) + { + (*Ap)[1+last_var ] = NZE; + last_var = var; + } + eq = it4->first.second; + int index = it4->second; +#ifdef DEBUG + if (index < 0 || index >= u_count_alloc || index > Size + Size*Size) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (NZE >= max_nze) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse_Simple, exceeds the capacity of A_m sparse matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + (*Ax)[NZE] = u[index]; + (*Ai)[NZE] = eq; + NZE++; +#ifdef DEBUG + if (eq < 0 || eq >= Size) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse_Simple, index (" << eq << ") out of range for b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (var < 0 || var >= Size) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (index_vara[var] < 0 || index_vara[var] >= y_size) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + it4++; + } + (*Ap)[Size] = NZE; +} + + +void +dynSparseMatrix::Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m) +{ + int t, eq, var, lag, ti_y_kmin, ti_y_kmax; + int n = periods * Size; + *b = (double*)mxMalloc(n * sizeof(double)); + if (!(*b)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, can't retrieve b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + double *x0 = mxGetPr(x0_m); + if (!x0) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse_Simple, can't retrieve x0 vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + *Ap = (SuiteSparse_long*)mxMalloc((n+1) * sizeof(SuiteSparse_long)); + if (!(*Ap)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, can't allocate Ap index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + size_t prior_nz = IM.size() * periods; + *Ai = (SuiteSparse_long*)mxMalloc(prior_nz * sizeof(SuiteSparse_long)); + if (!(*Ai)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, can't allocate Ai index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + *Ax = (double*)mxMalloc(prior_nz * sizeof(double)); + if (!(*Ax)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, can't retrieve Ax matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } + map, int>, int>::iterator it4; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < y_size*(periods+y_kmin); i++) + ya[i] = y[i]; +#ifdef DEBUG + unsigned int max_nze = mxGetNzmax(A_m); +#endif + unsigned int NZE = 0; + int last_var = 0; + +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < periods*Size; i++) + { + (*b)[i] = 0; + x0[i] = y[index_vara[Size*y_kmin+i]]; + } + (*Ap)[0] = 0; + /*int min_lag = 0; + int max_lag = 0;*/ + for (t = 0; t < periods; t++) + { + last_var = -1; + it4 = IM.begin(); + while (it4 != IM.end()) + { + var = it4->first.first.first; + if (var != last_var) + { + (*Ap)[1+last_var + t * Size] = NZE; + last_var = var; + } + eq = it4->first.second+Size*t; + lag = -it4->first.first.second; + /*if (t==0) + { + if (min_lag > lag) + min_lag = lag; + if (max_lag < lag) + max_lag = lag; + }*/ + int index = it4->second+ (t-lag) * u_count_init; + if (var < (periods+y_kmax)*Size) + { + ti_y_kmin = -min(t, y_kmin); + ti_y_kmax = min(periods-(t +1), y_kmax); + int ti_new_y_kmax = min(t, y_kmax); + int ti_new_y_kmin = -min(periods-(t+1), y_kmin); + if (lag <= ti_new_y_kmax && lag >= ti_new_y_kmin) /*Build the index for sparse matrix containing the jacobian : u*/ + { +#ifdef DEBUG + if (index < 0 || index >= u_count_alloc || index > Size + Size*Size) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (NZE >= max_nze) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, exceeds the capacity of A_m sparse matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + (*Ax)[NZE] = u[index]; + (*Ai)[NZE] = eq - lag * Size; + NZE++; + } + if (lag > ti_y_kmax || lag < ti_y_kmin) + { +#ifdef DEBUG + if (eq < 0 || eq >= Size * periods) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, index (" << eq << ") out of range for b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (var+Size*(y_kmin+t+lag) < 0 || var+Size*(y_kmin+t+lag) >= Size*(periods+y_kmin+y_kmax)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, index (" << var+Size*(y_kmin+t+lag) << ") out of range for index_vara vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (index_vara[var+Size*(y_kmin+t+lag)] < 0 || index_vara[var+Size*(y_kmin+t+lag)] >= y_size*(periods+y_kmin+y_kmax)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, index (" << index_vara[var+Size*(y_kmin+t+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + (*b)[eq] += u[index+lag*u_count_init]*y[index_vara[var+Size*(y_kmin+t+lag)]]; + } + } + else /* ...and store it in the u vector*/ + { +#ifdef DEBUG + if (index < 0 || index >= u_count_alloc) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, index (" << index << ") out of range for u vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (eq < 0 || eq >= (Size*periods)) + { + ostringstream tmp; + tmp << " in Init_UMFPACK_Sparse, index (" << eq << ") out of range for b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + (*b)[eq] += u[index]; + } + it4++; + } + } + (*Ap)[Size*periods] = NZE; + +#ifdef DEBUG + mexPrintf("*Ax = ["); + for (int i = 0; i < NZE; i++) + mexPrintf("%f ",(*Ax)[i]); + mexPrintf("]\n"); + + mexPrintf("*Ap = ["); + for (int i = 0; i < n+1; i++) + mexPrintf("%d ",(*Ap)[i]); + mexPrintf("]\n"); + + mexPrintf("*Ai = ["); + for (int i = 0; i < NZE; i++) + mexPrintf("%d ",(*Ai)[i]); + mexPrintf("]\n"); +#endif +} + +void +dynSparseMatrix::Init_CUDA_Sparse_Simple(int Size, map, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m) +{ + int eq, var; + + *b = (double*)mxMalloc(Size * sizeof(double)); + if (!(*b)) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, can't retrieve b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + double *Host_x0 = mxGetPr(x0_m); + if (!Host_x0) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse_Simple, can't retrieve x0 vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + *Ap = (SuiteSparse_long*)mxMalloc((Size+1) * sizeof(SuiteSparse_long)); + if (!(*Ap)) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, can't allocate Ap index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + size_t prior_nz = IM.size(); + *Ai = (SuiteSparse_long*)mxMalloc(prior_nz * sizeof(SuiteSparse_long)); + if (!(*Ai)) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, can't allocate Ai index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + *Ax = (double*)mxMalloc(prior_nz * sizeof(double)); + if (!(*Ax)) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, can't retrieve Ax matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } + + map, int>, int>::iterator it4; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < Size; i++) + { + int eq = index_vara[i]; + ya[eq+it_*y_size] = y[eq+it_*y_size]; + } + +#ifdef DEBUG + unsigned int max_nze = mxGetNzmax(A_m); +#endif + unsigned int NZE = 0; + int last_var = 0; + double cum_abs_sum = 0; + +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs_sum) +#endif + for (int i = 0; i < Size; i++) + { + (*b)[i] = u[i]; + cum_abs_sum += fabs((*b)[i]); + (*x0)[i] = y[i]; + } + if (cum_abs_sum < 1e-20) + zero_solution = true; + else + zero_solution = false; + + (*Ap)[0] = 0; + last_var = -1; + it4 = IM.begin(); + while (it4 != IM.end()) + { + var = it4->first.first.first; + if (var != last_var) + { + (*Ap)[1+last_var ] = NZE; + last_var = var; + } + eq = it4->first.second; + int index = it4->second; +#ifdef DEBUG + if (index < 0 || index >= u_count_alloc || index > Size + Size*Size) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse_Simple, index (" << index << ") out of range for u vector max = " << Size+Size*Size << " allocated = " << u_count_alloc << "\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (NZE >= max_nze) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse_Simple, exceeds the capacity of A_m sparse matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + (*Ax)[NZE] = u[index]; + (*Ai)[NZE] = eq; + NZE++; +#ifdef DEBUG + if (eq < 0 || eq >= Size) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse_Simple, index (" << eq << ") out of range for b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (var < 0 || var >= Size) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse_Simple, index (" << var << ") out of range for index_vara vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (index_vara[var] < 0 || index_vara[var] >= y_size) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse_Simple, index (" << index_vara[var] << ") out of range for y vector max=" << y_size << " (0)\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + it4++; + } + (*Ap)[Size] = NZE; +} + +#ifdef CUDA +void +dynSparseMatrix::Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner) +{ + //cudaError_t cuda_error; + int t, eq, var, lag, ti_y_kmin, ti_y_kmax; + int n = periods * Size; + size_t prior_nz = IM.size() * periods; + size_t preconditioner_size = 0; + map, int> jacob_struct; + + /* ask cuda how many devices it can find */ + int device_count; + cudaGetDeviceCount(&device_count); + + cudaSetDevice(CUDA_device); + + + double *Host_b = (double*)mxMalloc(n * sizeof(double)); + cudaChk(cudaMalloc((void**)b, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate b vector on the graphic card\n"); + + double *Host_x0 = mxGetPr(x0_m); + if (!Host_x0) + { + ostringstream tmp; + tmp << " in Init_Cuda_Sparse, can't retrieve x0 vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + cudaChk(cudaMalloc((void**)x0, n * sizeof(double)), " in Init_Cuda_Sparse, not enought memory to allocate x0 vector on the graphic card\n"); + + int* Host_Ap = (int*)mxMalloc((n+1) * sizeof(int)); + + + int* Host_Ai = (int*)mxMalloc(prior_nz * sizeof(int)); + + + double* Host_Ax = (double*)mxMalloc(prior_nz * sizeof(double)); + + int* Host_Ai_tild, * Host_Ap_tild; + if (preconditioner == 3) + { + Host_Ap_tild = (int*) mxMalloc((n+1)*sizeof(int)); + Host_Ai_tild = (int*) mxMalloc(prior_nz*sizeof(int)); + Host_Ap_tild[0] = 0; + } + + + if (preconditioner == 0) + preconditioner_size = n; + else if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) + preconditioner_size = prior_nz; + + double *Host_A_tild = (double*)mxMalloc(preconditioner_size * sizeof(double)); + + + map, int>, int>::iterator it4; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < y_size*(periods+y_kmin); i++) + ya[i] = y[i]; +#ifdef DEBUG + unsigned int max_nze = mxGetNzmax(A_m); +#endif + unsigned int NZE = 0, NZE_tild = 0; + int last_eq = 0; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < periods*Size; i++) + { + Host_b[i] = 0; + Host_x0[i] = y[index_vara[Size*y_kmin+i]]; + } + + //Ordered in CSR and not in CSC + + Host_Ap[0] = 0; + for (t = 0; t < periods; t++) + { + last_eq = -1; + it4 = IM.begin(); + while (it4 != IM.end()) + { + eq = it4->first.first.first; + if (eq != last_eq) + { +#ifdef DEBUG + if (1+last_eq + t * Size > (n + 1)) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, 1+last_eq + t * Size (" << 1+last_eq + t * Size << ") out of range for Host_Ap vector\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + Host_Ap[1+last_eq + t * Size] = NZE; + if (preconditioner == 3 && t == 0) + Host_Ap_tild[1+last_eq ] = NZE_tild; + last_eq = eq; + } + var = it4->first.second+Size*t; + lag = it4->first.first.second; + int index = it4->second+ (t /*+ lag*/) * u_count_init; + if (eq < (periods+y_kmax)*Size) + { + ti_y_kmin = -min(t, y_kmin); + ti_y_kmax = min(periods-(t + 1), y_kmax); + if ((lag <= ti_y_kmax && lag >= ti_y_kmin) || preconditioner == 3) /*Build the index for sparse matrix containing the jacobian : u*/ + { +#ifdef DEBUG + if (index < 0 || index >= u_count_alloc || index > (periods-1)* IM.size() + Size * Size + periods * Size) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector max = " << (periods-1)* IM.size() + Size * Size + periods * Size << " allocated = " << u_count_alloc << "\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (NZE >= prior_nz) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, exceeds the capacity of A_i or A_x sparse matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + bool to_store = true; + if (preconditioner == 0) + { + if (lag == 0 && it4->first.second == eq) + Host_A_tild[var] = u[index]; + } + else if (preconditioner == 1 || preconditioner == 2) + Host_A_tild[NZE] = u[index]; + else if (preconditioner == 3) + { + if (lag > ti_y_kmax || lag < ti_y_kmin) + { + Host_b[eq + t * Size] += u[index]*y[index_vara[var+Size*(y_kmin+lag)]]; + to_store = false; + } + if (t == 0) + { + map, int>::const_iterator it = jacob_struct.find(make_pair(eq + t * Size, var)); + if (it != jacob_struct.end()) + Host_A_tild[it->second] += u[index]; + else + { + jacob_struct[make_pair(eq, var)] = NZE_tild; + Host_A_tild[NZE_tild] = u[index]; + Host_Ai_tild[NZE_tild] = var; + NZE_tild++; + } + } + } + if (to_store) + { + Host_Ax[NZE] = u[index]; + Host_Ai[NZE] = var + lag * Size; + NZE++; + } + } + else + { +#ifdef DEBUG + if (var < 0 || var >= Size * periods) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (var+Size*(y_kmin+t+lag) < 0 || var+Size*(y_kmin+lag) >= Size*(periods+y_kmin+y_kmax)) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, index (" << var+Size*(y_kmin+lag) << ") out of range for index_vara vector max=" << Size*(periods+y_kmin+y_kmax) << "\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (index_vara[var+Size*(y_kmin+lag)] < 0 || index_vara[var+Size*(y_kmin+lag)] >= y_size*(periods+y_kmin+y_kmax)) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, index (" << index_vara[var+Size*(y_kmin+lag)] << ") out of range for y vector max=" << y_size*(periods+y_kmin+y_kmax) << "\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + Host_b[eq + t * Size] += u[index]*y[index_vara[var+Size*(y_kmin+lag)]]; + } + } + else // ...and store it in the u vector + { +#ifdef DEBUG + if (index < 0 || index >= u_count_alloc) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, index (" << index << ") out of range for u vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + if (var < 0 || var >= (Size*periods)) + { + ostringstream tmp; + tmp << " in Init_CUDA_Sparse, index (" << var << ") out of range for b vector\n"; + throw FatalExceptionHandling(tmp.str()); + } +#endif + Host_b[var] += u[index]; + } + it4++; + } + } + Host_Ap[Size*periods] = NZE; + if (preconditioner == 3) + { + int* tmp_Ap_tild = (int*) mxMalloc((Size + 1) * sizeof(int) ); + int* tmp_Ai_tild = (int*) mxMalloc(NZE_tild * sizeof(int) ); + double* tmp_A_tild = (double*) mxMalloc(NZE_tild * sizeof(double) ); + memcpy(tmp_Ap_tild, Host_Ap_tild, (Size + 1) * sizeof(int)); + memcpy(tmp_Ai_tild, Host_Ai_tild, NZE_tild * sizeof(int)); + memcpy(tmp_A_tild, Host_A_tild, NZE_tild * sizeof(double)); + //int NZE_tild_old = NZE_tild; + NZE_tild = 0; + Host_Ap_tild[0] = NZE_tild; + + for (int i = 0; i < Size; i++) + { + for(int j = tmp_Ap_tild[i]; j < tmp_Ap_tild[i+1]; j++) + if (abs(tmp_A_tild[j]) > 1.0e-20 ) + { + Host_A_tild[NZE_tild] = tmp_A_tild[j]; + Host_Ai_tild[NZE_tild] = tmp_Ai_tild[j]; + NZE_tild++; + } + Host_Ap_tild[i+1] = NZE_tild; + } + mxFree(tmp_Ap_tild); + mxFree(tmp_Ai_tild); + mxFree(tmp_A_tild); + } + + *nnz = NZE; + *nnz_tild = NZE_tild; + if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) + preconditioner_size = NZE; + + +#ifdef DEBUG + mexPrintf("Host_Ax = ["); + for (int i = 0; i < NZE; i++) + mexPrintf("%f ",Host_Ax[i]); + mexPrintf("]\n"); + + mexPrintf("Host_Ap = ["); + for (int i = 0; i < n+1; i++) + mexPrintf("%d ",Host_Ap[i]); + mexPrintf("]\n"); + + mexPrintf("Host_Ai = ["); + for (int i = 0; i < NZE; i++) + mexPrintf("%d ",Host_Ai[i]); + mexPrintf("]\n"); +#endif + cudaChk(cudaMalloc((void**)Ai, NZE * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai index vector on the graphic card\n"); + cudaChk(cudaMalloc((void**)Ax, NZE * sizeof(double)), " in Init_Cuda_Sparse, can't allocate Ax on the graphic card\n"); + cudaChk(cudaMalloc((void**)Ap, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap index vector on the graphic card\n"); + if (preconditioner == 3) + { + cudaChk(cudaMalloc((void**)Ai_tild, NZE_tild * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ai_tild index vector on the graphic card\n"); + cudaChk(cudaMalloc((void**)Ap_tild, (n+1) * sizeof(int)), " in Init_Cuda_Sparse, can't allocate Ap_tild index vector on the graphic card\n"); + } + cudaChk(cudaMalloc((void**)A_tild, preconditioner_size * sizeof(double)), " in Init_Cuda_Sparse, can't allocate A_tild on the graphic card\n"); + + cudaChk(cudaMemcpy(*x0, Host_x0, n * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy x0 = Host_x0 failed"); + cudaChk(cudaMemcpy(*b, Host_b, n * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy b = Host_b failed"); + cudaChk(cudaMemcpy(*Ap, Host_Ap, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap = Host_Ap failed"); + cudaChk(cudaMemcpy(*Ai, Host_Ai, NZE * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai = Host_Ai failed"); + cudaChk(cudaMemcpy(*Ax, Host_Ax, NZE * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ax = Host_Ax failed"); + if (preconditioner == 3) + { + cudaChk(cudaMemcpy(*Ap_tild, Host_Ap_tild, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ap_tild = Host_Ap_tild failed"); + cudaChk(cudaMemcpy(*Ai_tild, Host_Ai_tild, NZE_tild * sizeof(int), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy Ai_tild = Host_Ai_til failed"); + } + cudaChk(cudaMemcpy(*A_tild, Host_A_tild, preconditioner_size * sizeof(double), cudaMemcpyHostToDevice), " in Init_CUDA_Sparse, cudaMemcpy A_tild = Host_A_tild failed"); +} +#endif + + +void +PrintM(int n, double* Ax, mwIndex *Ap, mwIndex *Ai) +{ + int nnz = Ap[n]; + double *A = (double*)mxMalloc(n * n * sizeof(double)); + memset(A,0,n * n * sizeof(double)); + int k = 0; + for (int i = 0; i< n; i++) + { + for (int j = Ap[i]; j < Ap[i + 1]; j++) + { + int row = Ai[j]; + A[row *n + i] = Ax[j]; + k++; + } + } + if (nnz != k) + mexPrintf("Problem nnz(%d) != number of elements(%d)\n", nnz, k); + mexPrintf("----------------------\n"); + //mexEvalString("drawnow;"); + for (int i = 0; i < n ; i++) + { + for (int j = 0; j < n; j++) + mexPrintf("%-6.3f ",A[i * n + j]); + mexPrintf("\n"); + } + mxFree(A); +} + +void +dynSparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m) +{ + int t, eq, var, lag, ti_y_kmin, ti_y_kmax; double *b = mxGetPr(b_m); + if (!b) { ostringstream tmp; @@ -583,13 +1712,6 @@ SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, tmp << " in Init_Matlab_Sparse_Simple, can't retrieve x0 vector\n"; throw FatalExceptionHandling(tmp.str()); } - mwIndex *Ai = mxGetIr(A_m); - if (!Ai) - { - ostringstream tmp; - tmp << " in Init_Matlab_Sparse, can't allocate Ai index vector\n"; - throw FatalExceptionHandling(tmp.str()); - } mwIndex *Aj = mxGetJc(A_m); if (!Aj) { @@ -597,6 +1719,13 @@ SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, tmp << " in Init_Matlab_Sparse, can't allocate Aj index vector\n"; throw FatalExceptionHandling(tmp.str()); } + mwIndex *Ai = mxGetIr(A_m); + if (!Ai) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse, can't allocate Ai index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } double *A = mxGetPr(A_m); if (!A) { @@ -604,15 +1733,22 @@ SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, tmp << " in Init_Matlab_Sparse, can't retrieve A matrix\n"; throw FatalExceptionHandling(tmp.str()); } + map, int>, int>::iterator it4; - for (i = 0; i < y_size*(periods+y_kmin); i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < y_size*(periods+y_kmin); i++) ya[i] = y[i]; #ifdef DEBUG unsigned int max_nze = mxGetNzmax(A_m); #endif unsigned int NZE = 0; int last_var = 0; - for (i = 0; i < periods*Size; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < periods*Size; i++) { b[i] = 0; x0[i] = y[index_vara[Size*y_kmin+i]]; @@ -709,7 +1845,7 @@ SparseMatrix::Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, } void -SparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM) +dynSparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM) { int t, i, eq, var, lag, ti_y_kmin, ti_y_kmax; double tmp_b = 0.0; @@ -734,31 +1870,35 @@ SparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, mapfirst.first.second; @@ -771,6 +1911,7 @@ SparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, mapfirst.second; if (lag <= ti_y_kmax && lag >= ti_y_kmin) /*Build the index for sparse matrix containing the jacobian : u*/ { + nnz++; var += Size*t; NbNZRow[eq]++; NbNZCol[var]++; @@ -814,12 +1955,13 @@ SparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map= u_count_alloc) { @@ -956,13 +2097,16 @@ SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, in throw FatalExceptionHandling(tmp.str()); } } - double *up; - for (t = 1; t < periods-beg_t-y_kmax; t++) + /*#ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif*/ + for (int t = 1; t < periods-beg_t-y_kmax; t++) { - i = j = 0; + int i = j = 0; + double *up; while (i < nop4) { - save_op_s = (t_save_op_s *) (&(save_op[i])); + t_save_op_s *save_op_s = (t_save_op_s *) (&(save_op[i])); up = &u[save_op_s->first+t*diff1[j]]; switch (save_op_s->operat) { @@ -988,15 +2132,22 @@ SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, in } int t1 = max(1, periods-beg_t-y_kmax); int periods_beg_t = periods-beg_t; - for (t = t1; t < periods_beg_t; t++) + /*#ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif*/ + for (int t = t1; t < periods_beg_t; t++) { - i = j = 0; + int i = j = 0; + int gap = periods_beg_t-t; + /*#ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif*/ while (i < nop4) { - save_op_s = (t_save_op_s *) (&(save_op[i])); - if (save_op_s->lag < (periods_beg_t-t)) + t_save_op_s *save_op_s = (t_save_op_s *) (&(save_op[i])); + if (save_op_s->lag < gap) { - up = &u[save_op_s->first+t*diff1[j]]; + double *up = &u[save_op_s->first+t*diff1[j]]; switch (save_op_s->operat) { case IFLD: @@ -1041,9 +2192,9 @@ SparseMatrix::compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, in } int -SparseMatrix::complete(int beg_t, int Size, int periods, int *b) +dynSparseMatrix::complete(int beg_t, int Size, int periods, int *b) { - long int i, j, k, nop, nopa, nop1, cal_y, nb_var, pos, t, ti, max_var, min_var; + long int i, j, k, nop, nopa, nop1, cal_y, nb_var, pos, max_var, min_var; NonZeroElem *first; int *save_code; int *diff; @@ -1067,8 +2218,10 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b) save_code[nop+1] = 0; save_code[nop+2] = 0; save_code[nop+3] = 0; +#ifdef DEBUG if ((nop+3) >= size_of_save_code) mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code); +#endif nop += 4; for (k = 0; k < nb_var; k++) { @@ -1076,8 +2229,10 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b) save_code[nop+1] = index_vara[first->c_index]+cal_y; save_code[nop+2] = first->u_index; save_code[nop+3] = first->lag_index; +#ifdef DEBUG if ((nop+3) >= size_of_save_code) mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code); +#endif nop += 4; first = first->NZE_R_N; } @@ -1085,15 +2240,19 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b) save_code[nop+1] = b[pos]; save_code[nop+2] = 0; save_code[nop+3] = 0; +#ifdef DEBUG if ((nop+3) >= size_of_save_code) mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code); +#endif nop += 4; save_code[nop] = IFSTP; save_code[nop+1] = index_vara[j]+y_size*y_kmin; save_code[nop+2] = 0; save_code[nop+3] = 0; +#ifdef DEBUG if ((nop+2) >= size_of_save_code) mexPrintf("out of save_code[%d] (bound=%d)\n", nop+2, size_of_save_code); +#endif nop += 4; } i = beg_t*Size-1; @@ -1112,38 +2271,47 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b) { diff[nopa] = save_code[nop1+1]-(index_vara[first->c_index]+cal_y); diff[nopa+1] = save_code[nop1+2]-(first->u_index); +#ifdef DEBUG if ((nop1+2) >= size_of_save_code) mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code); if ((nopa+1) >= size_of_diff) mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff); +#endif nopa += 2; nop1 += 4; first = first->NZE_R_N; } diff[nopa] = save_code[nop1+1]-(b[pos]); diff[nopa+1] = 0; +#ifdef DEBUG if ((nop1+3) >= size_of_save_code) mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code); if ((nopa+1) >= size_of_diff) mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff); +#endif nopa += 2; nop1 += 4; diff[nopa] = save_code[nop1+1]-(index_vara[j]+y_size*y_kmin); diff[nopa+1] = 0; +#ifdef DEBUG if ((nop1+4) >= size_of_save_code) mexPrintf("out of save_code[%d] (bound=%d)\n", nop1+2, size_of_save_code); if ((nopa+1) >= size_of_diff) mexPrintf("out of diff[%d] (bound=%d)\n", nopa+2, size_of_diff); +#endif nopa += 2; nop1 += 4; } max_var = (periods+y_kmin)*y_size; min_var = y_kmin*y_size; - for (t = periods+y_kmin-1; t >= beg_t+y_kmin; t--) + /*#ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif*/ + for (int t = periods+y_kmin-1; t >= beg_t+y_kmin; t--) { - j = 0; - ti = t-y_kmin-beg_t; - for (i = 0; i < nop; i += 4) + int j = 0, k; + int ti = t-y_kmin-beg_t; + for (int i = 0; i < nop; i += 4) { switch (save_code[i]) { @@ -1175,12 +2343,15 @@ SparseMatrix::complete(int beg_t, int Size, int periods, int *b) } void -SparseMatrix::bksub(int tbreak, int last_period, int Size, double slowc_l) +dynSparseMatrix::bksub(int tbreak, int last_period, int Size, double slowc_l) { NonZeroElem *first; int i, j, k; double yy; - for (i = 0; i < y_size*(periods+y_kmin); i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < y_size*(periods+y_kmin); i++) y[i] = ya[i]; if (symbolic && tbreak) last_period = complete(tbreak, Size, periods, b); @@ -1213,12 +2384,15 @@ SparseMatrix::bksub(int tbreak, int last_period, int Size, double slowc_l) } void -SparseMatrix::simple_bksub(int it_, int Size, double slowc_l) +dynSparseMatrix::simple_bksub(int it_, int Size, double slowc_l) { int i, k; double yy; NonZeroElem *first; - for (i = 0; i < y_size; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < y_size; i++) y[i+it_*y_size] = ya[i+it_*y_size]; for (i = Size-1; i >= 0; i--) { @@ -1240,7 +2414,7 @@ SparseMatrix::simple_bksub(int it_, int Size, double slowc_l) } void -SparseMatrix::CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods, int iter) +dynSparseMatrix::CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods) { const double epsilon = 1e-7; fstream SaveResult; @@ -1311,7 +2485,7 @@ SparseMatrix::CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods, } void -SparseMatrix::Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size, double *u, int *pivot, int *b) +dynSparseMatrix::Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size, double *u, int *pivot, int *b) { const double epsilon = 1e-10; Init_GE(periods, y_kmin, y_kmax, Size, IM_i); @@ -1349,32 +2523,35 @@ SparseMatrix::Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size, } mxArray * -SparseMatrix::substract_A_B(mxArray *A_m, mxArray *B_m) +dynSparseMatrix::substract_A_B(mxArray *A_m, mxArray *B_m) { - unsigned int n_A = mxGetN(A_m); - unsigned int m_A = mxGetM(A_m); + size_t n_A = mxGetN(A_m); + size_t m_A = mxGetM(A_m); double *A_d = mxGetPr(A_m); - unsigned int n_B = mxGetN(B_m); + size_t n_B = mxGetN(B_m); double *B_d = mxGetPr(B_m); mxArray *C_m = mxCreateDoubleMatrix(m_A, n_B, mxREAL); double *C_d = mxGetPr(C_m); - for (unsigned int j = 0; j < n_A; j++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int j = 0; j < n_A; j++) for (unsigned int i = 0; i < m_A; i++) { - unsigned int index = j*m_A+i; + size_t index = j*m_A+i; C_d[index] = A_d[index] - B_d[index]; } return C_m; } mxArray * -SparseMatrix::Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m) +dynSparseMatrix::Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m) { - unsigned int n_B = mxGetN(B_m); - unsigned int m_B = mxGetM(B_m); + size_t n_B = mxGetN(B_m); + size_t m_B = mxGetM(B_m); mwIndex *B_i = mxGetIr(B_m); mwIndex *B_j = mxGetJc(B_m); - unsigned int total_nze_B = B_j[n_B]; + size_t total_nze_B = B_j[n_B]; double *B_d = mxGetPr(B_m); mxArray *C_m = mxDuplicateArray(A_m); double *C_d = mxGetPr(C_m); @@ -1391,18 +2568,18 @@ SparseMatrix::Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m) } mxArray * -SparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m) +dynSparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m) { - unsigned int n_A = mxGetN(A_m); - unsigned int m_A = mxGetM(A_m); + size_t n_A = mxGetN(A_m); + size_t m_A = mxGetM(A_m); mwIndex *A_i = mxGetIr(A_m); mwIndex *A_j = mxGetJc(A_m); - unsigned int total_nze_A = A_j[n_A]; + size_t total_nze_A = A_j[n_A]; double *A_d = mxGetPr(A_m); - unsigned int n_B = mxGetN(B_m); + size_t n_B = mxGetN(B_m); mwIndex *B_i = mxGetIr(B_m); mwIndex *B_j = mxGetJc(B_m); - unsigned int total_nze_B = B_j[n_B]; + size_t total_nze_B = B_j[n_B]; double *B_d = mxGetPr(B_m); mxArray *C_m = mxCreateSparse(m_A, n_B, m_A*n_B, mxREAL); mwIndex *C_i = mxGetIr(C_m); @@ -1415,10 +2592,10 @@ SparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m) { while (nze_A >= (unsigned int) A_j[A_col+1] && (nze_A < total_nze_A)) A_col++; - int A_row = A_i[nze_A]; + size_t A_row = A_i[nze_A]; while (nze_B >= (unsigned int) B_j[B_col+1] && (nze_B < total_nze_B)) B_col++; - int B_row = B_i[nze_B]; + size_t B_row = B_i[nze_B]; if (A_col == B_col) { if (A_row == B_row && (nze_B < total_nze_B && nze_A < total_nze_A)) @@ -1475,27 +2652,30 @@ SparseMatrix::Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m) } mxArray * -SparseMatrix::mult_SAT_B(mxArray *A_m, mxArray *B_m) +dynSparseMatrix::mult_SAT_B(mxArray *A_m, mxArray *B_m) { - unsigned int n_A = mxGetN(A_m); - unsigned int m_A = mxGetM(A_m); + size_t n_A = mxGetN(A_m); + size_t m_A = mxGetM(A_m); mwIndex *A_i = mxGetIr(A_m); mwIndex *A_j = mxGetJc(A_m); double *A_d = mxGetPr(A_m); - unsigned int n_B = mxGetN(B_m); + size_t n_B = mxGetN(B_m); double *B_d = mxGetPr(B_m); mxArray *C_m = mxCreateDoubleMatrix(m_A, n_B, mxREAL); double *C_d = mxGetPr(C_m); - unsigned int nze_A = 0; - for (unsigned int j = 0; j < n_B; j++) + //unsigned int nze_A = 0; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int j = 0; j < (int)n_B; j++) { for (unsigned int i = 0; i < n_A; i++) { double sum = 0; - nze_A = A_j[i]; + size_t nze_A = A_j[i]; while (nze_A < (unsigned int) A_j[i+1]) { - unsigned int i_A = A_i[nze_A]; + size_t i_A = A_i[nze_A]; sum += A_d[nze_A++] * B_d[i_A]; } C_d[j*n_A+i] = sum; @@ -1505,32 +2685,34 @@ SparseMatrix::mult_SAT_B(mxArray *A_m, mxArray *B_m) } mxArray * -SparseMatrix::Sparse_mult_SAT_B(mxArray *A_m, mxArray *B_m) +dynSparseMatrix::Sparse_mult_SAT_B(mxArray *A_m, mxArray *B_m) { - unsigned int n_A = mxGetN(A_m); - unsigned int m_A = mxGetM(A_m); + size_t n_A = mxGetN(A_m); + size_t m_A = mxGetM(A_m); mwIndex *A_i = mxGetIr(A_m); mwIndex *A_j = mxGetJc(A_m); double *A_d = mxGetPr(A_m); - unsigned int n_B = mxGetN(B_m); - unsigned int m_B = mxGetM(B_m); + size_t n_B = mxGetN(B_m); + size_t m_B = mxGetM(B_m); double *B_d = mxGetPr(B_m); mxArray *C_m = mxCreateSparse(m_A, n_B, m_A*n_B, mxREAL); mwIndex *C_i = mxGetIr(C_m); mwIndex *C_j = mxGetJc(C_m); double *C_d = mxGetPr(C_m); - unsigned int nze_C = 0, nze_A = 0; + unsigned int nze_C = 0; + //unsigned int nze_A = 0; unsigned int C_col = 0; C_j[C_col] = 0; + //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) for (unsigned int j = 0; j < n_B; j++) { for (unsigned int i = 0; i < n_A; i++) { double sum = 0; - nze_A = A_j[i]; + size_t nze_A = A_j[i]; while (nze_A < (unsigned int) A_j[i+1]) { - unsigned int i_A = A_i[nze_A]; + size_t i_A = A_i[nze_A]; sum += A_d[nze_A++] * B_d[i_A]; } if (fabs(sum) > 1e-10) @@ -1550,14 +2732,14 @@ SparseMatrix::Sparse_mult_SAT_B(mxArray *A_m, mxArray *B_m) } mxArray * -SparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m) +dynSparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m) { - unsigned int n_A = mxGetN(A_m); - unsigned int m_A = mxGetM(A_m); + size_t n_A = mxGetN(A_m); + size_t m_A = mxGetM(A_m); mwIndex *A_i = mxGetIr(A_m); mwIndex *A_j = mxGetJc(A_m); double *A_d = mxGetPr(A_m); - unsigned int n_B = mxGetN(B_m); + size_t n_B = mxGetN(B_m); mwIndex *B_i = mxGetIr(B_m); mwIndex *B_j = mxGetJc(B_m); double *B_d = mxGetPr(B_m); @@ -1565,7 +2747,7 @@ SparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m) mwIndex *C_i = mxGetIr(C_m); mwIndex *C_j = mxGetJc(C_m); double *C_d = mxGetPr(C_m); - unsigned int nze_B = 0, nze_C = 0, nze_A = 0; + size_t nze_B = 0, nze_C = 0, nze_A = 0; unsigned int C_col = 0; C_j[C_col] = 0; for (unsigned int j = 0; j < n_B; j++) @@ -1577,8 +2759,8 @@ SparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m) nze_A = A_j[i]; while (nze_A < (unsigned int) A_j[i+1] && nze_B < (unsigned int) B_j[j+1]) { - unsigned int i_A = A_i[nze_A]; - unsigned int i_B = B_i[nze_B]; + size_t i_A = A_i[nze_A]; + size_t i_B = B_i[nze_B]; if (i_A == i_B) sum += A_d[nze_A++] * B_d[nze_B++]; else if (i_A < i_B) @@ -1603,13 +2785,13 @@ SparseMatrix::Sparse_mult_SAT_SB(mxArray *A_m, mxArray *B_m) } mxArray * -SparseMatrix::Sparse_transpose(mxArray *A_m) +dynSparseMatrix::Sparse_transpose(mxArray *A_m) { - unsigned int n_A = mxGetN(A_m); - unsigned int m_A = mxGetM(A_m); + size_t n_A = mxGetN(A_m); + size_t m_A = mxGetM(A_m); mwIndex *A_i = mxGetIr(A_m); mwIndex *A_j = mxGetJc(A_m); - unsigned int total_nze_A = A_j[n_A]; + size_t total_nze_A = A_j[n_A]; double *A_d = mxGetPr(A_m); mxArray *C_m = mxCreateSparse(n_A, m_A, total_nze_A, mxREAL); mwIndex *C_i = mxGetIr(C_m); @@ -1617,7 +2799,7 @@ SparseMatrix::Sparse_transpose(mxArray *A_m) double *C_d = mxGetPr(C_m); unsigned int nze_C = 0, nze_A = 0; memset(C_j, 0, m_A); - map, double> B2; + map, double> B2; for (unsigned int i = 0; i < n_A; i++) { while (nze_A < (unsigned int) A_j[i+1]) @@ -1629,7 +2811,7 @@ SparseMatrix::Sparse_transpose(mxArray *A_m) } for (unsigned int i = 0; i < m_A; i++) C_j[i+1] += C_j[i]; - for (map, double>::const_iterator it = B2.begin(); it != B2.end(); it++) + for (map, double>::const_iterator it = B2.begin(); it != B2.end(); it++) { C_d[nze_C] = it->second; C_i[nze_C++] = it->first.second; @@ -1637,8 +2819,163 @@ SparseMatrix::Sparse_transpose(mxArray *A_m) return C_m; } + +#define sign(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a)) +bool +dynSparseMatrix::mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc) +{ + const double GOLD=1.618034; + const double GLIMIT=100.0; + const double TINY=1.0e-20; + + double tmp; + mexPrintf("bracketing *ax=%f, *bx=%f\n",*ax, *bx); + //mexEvalString("drawnow;"); + double ulim,u,r,q,fu; + if (!compute_complete(*ax, fa)) + return false; + if (!compute_complete(*bx, fb)) + return false; + if (*fb > *fa) + { + tmp = *ax; + *ax = *bx; + *bx = tmp; + + tmp = *fa; + *fa = *fb; + *fb = tmp; + } + *cx=(*bx)+GOLD*(*bx-*ax); + if (!compute_complete(*cx, fc)) + return false; + while (*fb > *fc) + { + r=(*bx-*ax)*(*fb-*fc); + q=(*bx-*cx)*(*fb-*fa); + u=(*bx)-((*bx-*cx)*q-(*bx-*ax)*r)/ + (2.0*sign(fmax(fabs(q-r),TINY),q-r)); + ulim=(*bx)+GLIMIT*(*cx-*bx); + if ((*bx-u)*(u-*cx) > 0.0) + { + if (!compute_complete(u, &fu)) + return false; + if (fu < *fc) + { + *ax=(*bx); + *bx=u; + *fa=(*fb); + *fb=fu; + return true; + } + else if (fu > *fb) + { + *cx=u; + *fc=fu; + return true; + } + u=(*cx)+GOLD*(*cx-*bx); + if (!compute_complete(u, &fu)) + return false; + } + else if ((*cx-u)*(u-ulim) > 0.0) + { + if (!compute_complete(u, &fu)) + return false; + if (fu < *fc) + { + *bx = *cx; + *cx = u; + u = *cx+GOLD*(*cx-*bx); + *fb = *fc; + *fc = fu; + if (!compute_complete(u, &fu)) + return false; + } + } + else if ((u-ulim)*(ulim-*cx) >= 0.0) + { + u=ulim; + if (!compute_complete(u, &fu)) + return false; + } + else + { + u=(*cx)+GOLD*(*cx-*bx); + if (!compute_complete(u, &fu)) + return false; + } + *ax = *bx; + *bx = *cx; + *cx = u; + *fa = *fb; + *fb = *fc; + *fc = fu; + } + return true; +} + +bool +dynSparseMatrix::golden(double ax, double bx, double cx, double tol, double solve_tolf, double *xmin) +{ + const double R=0.61803399; + const double C=(1.0-R); + mexPrintf("golden\n"); + //mexEvalString("drawnow;"); + double f1,f2,x0,x1,x2,x3; + int iter= 0, max_iter= 100; + x0=ax; + x3=cx; + if (fabs(cx-bx) > fabs(bx-ax)) + { + x1=bx; + x2=bx+C*(cx-bx); + } + else + { + x2=bx; + x1=bx-C*(bx-ax); + } + if (!compute_complete(x1, &f1)) + return false; + if (!compute_complete(x2, &f2)) + return false; + while ((fabs(x3-x0) > tol*(fabs(x1)+fabs(x2)) && (f1 > solve_tolf && f2 > solve_tolf)) && (iter < max_iter) && (abs(x1 - x2) > 1e-4)) + { + if (f2 < f1) + { + x0 = x1; + x1 = x2; + x2 = R*x1+C*x3; + f1 = f2; + if (!compute_complete(x2, &f2)) + return false; + } + else + { + x3 = x2; + x2 = x1; + x1 = R*x2+C*x0; + f2 = f1; + if (!compute_complete(x1, &f1)) + return false; + } + iter++; + } + if (f1 < f2) + { + *xmin=x1; + return true; + } + else + { + *xmin=x2; + return true; + } +} + void -SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int Size, double slowc_l, bool is_two_boundaries, int it_) +dynSparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int Size, double slowc_l, bool is_two_boundaries, int it_) { mxArray *B1, *C1, *A2, *B2, *A3, *b1, *b2; double *b_m_d = mxGetPr(b_m); @@ -1669,9 +3006,9 @@ SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int S tmp << " in Solve_Matlab_Relaxation, can't retrieve A matrix\n"; throw FatalExceptionHandling(tmp.str()); } - unsigned int max_nze = A_m_j[Size*periods]; + size_t max_nze = A_m_j[Size*periods]; unsigned int nze = 0; - unsigned int var = A_m_j[nze]; + size_t var = A_m_j[nze]; B1 = mxCreateSparse(Size, Size, Size*Size, mxREAL); mwIndex *B1_i = mxGetIr(B1); mwIndex *B1_j = mxGetJc(B1); @@ -1716,7 +3053,7 @@ SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int S double *b1_d = mxGetPr(b1); b2 = mxCreateDoubleMatrix(Size, 1, mxREAL); double *b2_d = mxGetPr(b2); - unsigned int eq = 0; + size_t eq = 0; /*B1 C1 A2 B2 A3*/ @@ -1803,7 +3140,7 @@ SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int S mxDestroyArray(B1_inv); mexCallMATLAB(1, &B1_inv, 1, &B1, "inv"); mwIndex *B_inv_j = mxGetJc(B1_inv); - unsigned int B_inv_nze = B_inv_j[Size]; + size_t B_inv_nze = B_inv_j[Size]; double *B_inv_d = mxGetPr(B1_inv); sumc = 0; for (unsigned int i = 0; i < B_inv_nze; i++) @@ -1912,9 +3249,9 @@ SparseMatrix::Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int S } void -SparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int it_) +dynSparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int it_) { - int n = mxGetM(A_m); + size_t n = mxGetM(A_m); mxArray *z; mxArray *rhs[2]; rhs[0] = A_m; @@ -1922,6 +3259,9 @@ SparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, doub mexCallMATLAB(1, &z, 2, rhs, "mldivide"); double *res = mxGetPr(z); if (is_two_boundaries) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int i = 0; i < n; i++) { int eq = index_vara[i+Size*y_kmin]; @@ -1930,6 +3270,9 @@ SparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, doub y[eq] += slowc_l * yy; } else +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int i = 0; i < n; i++) { int eq = index_vara[i]; @@ -1943,7 +3286,1404 @@ SparseMatrix::Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, doub } void -SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, bool steady_state, mxArray *x0_m) +dynSparseMatrix::End_Matlab_LU_UMFPack() +{ + if (Symbolic) + umfpack_dl_free_symbolic (&Symbolic) ; + if (Numeric) + umfpack_dl_free_numeric (&Numeric) ; +} + + +void +dynSparseMatrix::End_Solver() +{ + if (((stack_solve_algo == 0 || stack_solve_algo == 4) && !steady_state) || (solve_algo == 6 && steady_state)) + End_Matlab_LU_UMFPack(); +} + +void +dynSparseMatrix::Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int it_) +{ + SuiteSparse_long status, sys = 0; +#ifndef _MSC_VER + double Control [UMFPACK_CONTROL], Info [UMFPACK_INFO], res [n]; +#else + double *Control, *Info, *res; + Control = (double*)mxMalloc(UMFPACK_CONTROL * sizeof(double)); + Info = (double*)mxMalloc(UMFPACK_INFO * sizeof(double)); + res = (double*)mxMalloc(n * sizeof(double)); +#endif + + umfpack_dl_defaults(Control); + Control [UMFPACK_PRL] = 5; + status = 0; + if (iter == 0) + { + status = umfpack_dl_symbolic(n, n, Ap, Ai, Ax, &Symbolic, Control, Info); + if (status < 0) + { + umfpack_dl_report_info(Control, Info); + umfpack_dl_report_status(Control, status); + ostringstream Error; + Error << " umfpack_dl_symbolic failed\n"; + throw FatalExceptionHandling(Error.str()); + } + } + if (iter > 0) + umfpack_dl_free_numeric(&Numeric) ; + status = umfpack_dl_numeric (Ap, Ai, Ax, Symbolic, &Numeric, Control, Info); + if (status < 0) + { + umfpack_dl_report_info(Control, Info); + umfpack_dl_report_status(Control, status); + ostringstream Error; + Error << " umfpack_dl_numeric failed\n"; + throw FatalExceptionHandling(Error.str()); + } + status = umfpack_dl_solve(sys, Ap, Ai, Ax, res, b, Numeric, Control, Info); + if (status != UMFPACK_OK) + { + umfpack_dl_report_info(Control, Info); + umfpack_dl_report_status(Control, status); + ostringstream Error; + Error << " umfpack_dl_solve failed\n"; + throw FatalExceptionHandling(Error.str()); + } + + if (is_two_boundaries) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) + { + int eq = index_vara[i+Size*y_kmin]; + double yy = -(res[i] + y[eq]); + direction[eq] = yy; + y[eq] += slowc_l * yy; + } + else +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) + { + int eq = index_vara[i]; + double yy = -(res[i] + y[eq+it_*y_size]); + direction[eq] = yy; + y[eq+it_*y_size] += slowc_l * yy; + } + + mxFree(Ap); + mxFree(Ai); + mxFree(Ax); + mxFree(b); +#ifdef _MSC_VER + mxFree(Control); + mxFree(Info); + mxFree(res); +#endif +} + + +void +dynSparseMatrix::Solve_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int it_) +{ + SuiteSparse_long n = mxGetM(A_m); + + SuiteSparse_long *Ap = (SuiteSparse_long*)mxGetJc (A_m); + + SuiteSparse_long *Ai = (SuiteSparse_long*)mxGetIr(A_m); + double* Ax = mxGetPr(A_m); + double* B = mxGetPr(b_m); + SuiteSparse_long status, sys = 0; +#ifndef _MSC_VER + double Control [UMFPACK_CONTROL], Info [UMFPACK_INFO], res [n]; +#else + double *Control, *Info, *res; + Control = (double*)mxMalloc(UMFPACK_CONTROL * sizeof(double)); + Info = (double*)mxMalloc(UMFPACK_INFO * sizeof(double)); + res = (double*)mxMalloc(n * sizeof(double)); +#endif + void *Symbolic, *Numeric ; + umfpack_dl_defaults (Control) ; + + status = umfpack_dl_symbolic (n, n, Ap, Ai, Ax, &Symbolic, Control, Info) ; + if (status != UMFPACK_OK) + umfpack_dl_report_info ((double*) NULL, Info) ; + + status = umfpack_dl_numeric (Ap, Ai, Ax, Symbolic, &Numeric, Control, Info) ; + if (status != UMFPACK_OK) + umfpack_dl_report_info ((double*) NULL, Info) ; + + status = umfpack_dl_solve (sys, Ap, Ai, Ax, res, B, Numeric, Control, Info) ; + if (status != UMFPACK_OK) + umfpack_dl_report_info ((double*) NULL, Info) ; + //double *res = mxGetPr(z); + if (is_two_boundaries) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) + { + int eq = index_vara[i+Size*y_kmin]; + double yy = -(res[i] + y[eq]); + direction[eq] = yy; + y[eq] += slowc_l * yy; + } + else +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) + { + int eq = index_vara[i]; + double yy = -(res[i] + y[eq+it_*y_size]); + direction[eq] = yy; + y[eq+it_*y_size] += slowc_l * yy; + } + mxDestroyArray(A_m); + mxDestroyArray(b_m); +#ifdef _MSC_VER + mxFree(Control); + mxFree(Info); + mxFree(res); +#endif + +} + + +#ifdef CUDA +void +printM(int n,double *Ax, int* Ap, int* Ai, cusparseMatDescr_t descrA, cusparseHandle_t cusparse_handle) +{ + //cudaError_t cuda_error; + //cusparseStatus_t cusparse_status; + double * A_dense; + cudaChk(cudaMalloc((void**) &A_dense, n * n *sizeof(double)), "A_dense cudaMalloc has failed\n"); + + + cusparseChk(cusparseDcsr2dense(cusparse_handle, n, n, descrA, + Ax, Ap,Ai, A_dense, n), "cusparseDcsr2dense has failed\n"); + double *A_dense_hoste = (double*)mxMalloc(n * n * sizeof(double)); + cudaChk(cudaMemcpy(A_dense_hoste, A_dense, n * n * sizeof(double),cudaMemcpyDeviceToHost), " cudaMemcpy(A_dense_hoste, A_dense) has failed\n"); + mexPrintf("----------------------\n"); + mexPrintf("FillMode=%d, IndexBase=%d, MatType=%d, DiagType=%d\n",cusparseGetMatFillMode(descrA), cusparseGetMatIndexBase(descrA), cusparseGetMatType(descrA), cusparseGetMatDiagType(descrA)); + //mexEvalString("drawnow;"); + for (int i = 0; i < n ; i++) + { + for (int j = 0; j < n; j++) + mexPrintf("%-6.3f ",A_dense_hoste[i + j * n]); + mexPrintf("\n"); + } + mxFree(A_dense_hoste); + cudaChk(cudaFree(A_dense), "cudaFree(A_dense) has failed\n"); +} + + + +void +dynSparseMatrix::Solve_CUDA_BiCGStab_Free(double* tmp_vect_host, double* p, double* r, double* v, double* s, double* t, double* y_, double* z, double* tmp_, + int* Ai, double* Ax, int* Ap, double* x0, double* b, double* A_tild, int* A_tild_i, int* A_tild_p/*, double* Lx, int* Li, int* Lp, + double* Ux, int* Ui, int* Up, int* device_n*/, cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU, + cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner) +{ + //cudaError_t cuda_error; + //cusparseStatus_t cusparse_status; + mxFree(tmp_vect_host); + cudaChk(cudaFree(p), " in Solve_Cuda_BiCGStab, can't free p\n"); + cudaChk(cudaFree(r), " in Solve_Cuda_BiCGStab, can't free r\n"); + cudaChk(cudaFree(v), " in Solve_Cuda_BiCGStab, can't free v\n"); + cudaChk(cudaFree(s), " in Solve_Cuda_BiCGStab, can't free s\n"); + cudaChk(cudaFree(t), " in Solve_Cuda_BiCGStab, can't free t\n"); + cudaChk(cudaFree(y_), " in Solve_Cuda_BiCGStab, can't free y_\n"); + cudaChk(cudaFree(z), " in Solve_Cuda_BiCGStab, can't free z\n"); + cudaChk(cudaFree(tmp_), " in Solve_Cuda_BiCGStab, can't free tmp_\n"); + cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); + cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); + cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); + cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); + cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); + /*if (preconditioner == 0) + {*/ + cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild (1)\n"); + cudaChk(cudaFree(A_tild_i), " in Solve_Cuda_BiCGStab, can't free A_tild_i (1)\n"); + cudaChk(cudaFree(A_tild_p), " in Solve_Cuda_BiCGStab, can't free A_tild_p (1)\n"); + /*} + else + { + cudaChk(cudaFree(Lx), " in Solve_Cuda_BiCGStab, can't free Lx\n"); + cudaChk(cudaFree(Li), " in Solve_Cuda_BiCGStab, can't free Li\n"); + cudaChk(cudaFree(Lp), " in Solve_Cuda_BiCGStab, can't free Lp\n"); + cudaChk(cudaFree(Ux), " in Solve_Cuda_BiCGStab, can't free Ux\n"); + cudaChk(cudaFree(Ui), " in Solve_Cuda_BiCGStab, can't free Ui\n"); + cudaChk(cudaFree(Up), " in Solve_Cuda_BiCGStab, can't free Up\n"); + }*/ + //cudaChk(cudaFree(device_n), " in Solve_Cuda_BiCGStab, can't free device_n\n"); + if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) + { + cusparseChk(cusparseDestroySolveAnalysisInfo(infoL), + " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoL\n"); + cusparseChk(cusparseDestroySolveAnalysisInfo(infoU), + " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for infoU\n"); + } + cusparseChk(cusparseDestroyMatDescr(descrL), + " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrL\n"); + cusparseChk(cusparseDestroyMatDescr(descrU), + " in Solve_Cuda_BiCGStab, matrix descriptor destruction failed for descrU\n"); +} +#endif + +void +Solve(double* Ax, int* Ap, int* Ai, double *b, int n, bool Lower, double *x) +{ + if (Lower) + { + for (int i = 0; i < n; i++) + { + double sum = 0; + for(int j = Ap[i]; j < Ap[i+1]; j++) + { + int k = Ai[j]; + if (k < i) + sum += x[k] * Ax[j]; + } + x[i] = b[i] - sum; + } + } + else + { + for (int i = n-1 ; i >= 0; i--) + { + double sum = 0, mul = 1; + for(int j = Ap[i]; j < Ap[i+1]; j++) + { + int k = Ai[j]; + if (k > i) + sum += x[k] * Ax[j]; + else if (k == i) + mul = Ax[j]; + } + x[i] = (b[i] - sum) / mul; + } + } +} + +void +Check(int n, double* Ax, int* Ap, int* Ai, double* b, double *x, bool Lower) +{ + if (Lower) + { + for (int i = 0; i < n; i++) + { + double sum = 0; + for(int j = Ap[i]; j < Ap[i+1]; j++) + { + int k = Ai[j]; + if (k < i) + sum += x[k] * Ax[j]; + } + double err = b[i] - sum - x[i]; + if (abs(err) > 1e-10) + mexPrintf("error at i=%d\n",i); + } + } + else + { + for (int i = n-1 ; i >= 0; i--) + { + double sum = 0; + for(int j = Ap[i]; j < Ap[i+1]; j++) + { + int k = Ai[j]; + if (k >= i) + sum += x[k] * Ax[j]; + } + double err = b[i] - sum; + if (abs(err) > 1e-10) + mexPrintf("error at i=%d\n",i); + } + } +} + +#ifdef CUDA +int +dynSparseMatrix::Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, + int it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block) +{ + cusparseSolveAnalysisInfo_t info, infoL, infoU; + cusparseMatDescr_t descrL, descrU; + const double tol = 1.0e-6;//1.0e-6; + const double eps = 1.0e-16; + double *p, *r, *r0, *v, *s, *t, *y_, *z, *tmp_; + int *A_tild_i, *A_tild_p; + double *Qx; + int *Qi, *Qj; + double *Px; + int *Pi, *Pj; + int Q_nnz, P_nnz; + int W_nnz; + double bnorm; + double tmp1, tmp2; + int refinement_needed = 0, stagnation = 0; + int max_refinement = min(min(int(floor(double(n)/50)),10),n-max_iterations), max_stagnation = 3; + int nblocks = ceil(double(n) / double(1024)); + int n_threads; + if (nblocks == 0) + n_threads = n; + else + n_threads = 1024; + int periods = n / Size; + + double * tmp_vect_host = (double*)mxMalloc(n * sizeof(double)); + + cublasChk(cublasDnrm2(cublas_handle, n,b, 1, &bnorm), + " in Solve_Cuda_BiCGStab, cublasDnrm2(b) has failed\n"); + + double tolb = tol * bnorm; + + if (bnorm == 0.0) + { + // if b = 0 the A.x = 0 => x = 0 + cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); + cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); + cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); + if (preconditioner == 3) + { + cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, can't free Ai_tild\n"); + cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, can't free Ap_tild\n"); + } + cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild\n"); + cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); + cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); + if (is_two_boundaries) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) + { + int eq = index_vara[i+Size*y_kmin]; + double yy = -y[eq]; + direction[eq] = yy; + y[eq] += slowc * yy; + } + else +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) + { + int eq = index_vara[i]; + double yy = -y[eq+it_*y_size]; + direction[eq] = yy; + y[eq+it_*y_size] += slowc * yy; + } + return 0; + } + + int iteration = 0; + bool convergence = false; + double zeros = 0.0, one = 1.0, m_one = -1.0; + + cudaChk(cudaMalloc((void**)&tmp_, n * sizeof(double)), " in Solve_Cuda_Sparse, can't allocate tmp_ on the graphic card\n"); + + cudaChk(cudaMalloc((void**)&r, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate r on the graphic card\n"); + + cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n"); + + //r = b - A * x0 + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, + n, nnz, &m_one, + CUDA_descr, Ax, + Ap, Ai, + x0, &one, + r), "in Solve_Cuda_BiCGStab, cusparseDcsrmv A * x0 has failed"); + + cudaChk(cudaMemcpy(tmp_vect_host, r, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n"); + /*mexPrintf("r\n"); + for (int i = 0; i < n; i++) + mexPrintf("%f\n",tmp_vect_host[i]);*/ + + cudaChk(cudaMalloc((void**)&r0, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate r0 on the graphic card\n"); + cudaChk(cudaMemcpy(r0, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r0 = r has failed\n"); + + cublasChk(cublasDnrm2(cublas_handle, n, // numerator + r, 1, + &tmp1), + " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); + double conv_criteria = tmp1; + + convergence = conv_criteria < tolb; + if (convergence) + { + /* the initial value (x0) is solution of A x = b*/ + cudaChk(cudaFree(Ai), " in Solve_Cuda_BiCGStab, can't free Ai\n"); + cudaChk(cudaFree(Ax), " in Solve_Cuda_BiCGStab, can't free Ax\n"); + cudaChk(cudaFree(Ap), " in Solve_Cuda_BiCGStab, can't free Ap\n"); + if (preconditioner == 3) + { + cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, can't free Ai_tild\n"); + cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, can't free Ap_tild\n"); + } + cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, can't free A_tild\n"); + cudaChk(cudaFree(x0), " in Solve_Cuda_BiCGStab, can't free x0\n"); + cudaChk(cudaFree(b), " in Solve_Cuda_BiCGStab, can't free b\n"); + return 0; + } + + + if (preconditioner == 0) + { + //Apply the Jacobi preconditioner + /*VecDiv<<>>(r_, A_tild, z_, n); + cuda_error = cudaMemcpy(zz_, z_, n * sizeof(double), cudaMemcpyDeviceToDevice);*/ + } + else if (preconditioner == 1) + { + //Apply an incomplete LU decomposition of A as preconditioner + cusparseChk(cusparseCreateSolveAnalysisInfo(&info), " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo for info has failed\n"); + + cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, nnz, CUDA_descr, + A_tild, Ap, Ai, + info), + " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis(info) has failed\n"); + + cusparseChk(cusparseDcsrilu0(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, CUDA_descr, + A_tild, Ap, Ai, + info), + " in Solve_Cuda_BiCGStab, cusparseDcsrilu0 has failed\n"); + + //Make a copy of the indexes in A_tild_i and A_tild_p to use it the Bicgstab algorithm + cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_i on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild_i, Ai, nnz * sizeof(int), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Ai has failed\n"); + cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild_p, Ap, (n + 1) * sizeof(int), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Ap has failed\n"); + } + else if (preconditioner == 2) + { + //Because the Jacobian matrix A is store in CSC format in matlab + // we have to transpose it to get a CSR format used by CUDA + mwIndex* Awi, *Awp; + double* A_tild_host = (double*)mxMalloc(nnz*sizeof(double)); + Awi = (mwIndex*)mxMalloc(nnz * sizeof(mwIndex)); + Awp = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); + int* Aii = (int*)mxMalloc(nnz * sizeof(int)); + int* Aip = (int*)mxMalloc((n + 1) * sizeof(int)); + cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); + cudaChk(cudaMemcpy(Aii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai has failed\n"); + cudaChk(cudaMemcpy(Aip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ai has failed\n"); + for (int i = 0; i < nnz; i++) + Awi[i] = Aii[i]; + for (int i = 0; i < n + 1; i++) + Awp[i] = Aip[i]; + mxFree(Aii); + mxFree(Aip); + mxArray * At_m = mxCreateSparse(n,n,nnz,mxREAL); + mxSetIr(At_m, Awi); + mxSetJc(At_m, Awp); + mxSetPr(At_m, A_tild_host); + mxArray *A_m; + mexCallMATLAB(1, &A_m, 1, &At_m, "transpose"); + mxDestroyArray(At_m); + + /*mexPrintf("A_m\n"); + mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/ + /*mxFree(Awi); + mxFree(Awp);*/ + + /*[L1, U1] = ilu(g1a=;*/ + const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"}; + const int type = 0; + const int droptol = 1; + const int milu = 2; + const int udiag = 3; + const int thresh = 4; + mwSize dims[1] = {(mwSize)1 }; + mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names); + mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp")); + //mxSetFieldByNumber(Setup, 0, type, mxCreateString("nofill")); + mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol)); + mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off")); + mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0)); + mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0)); + //mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1)); + mxArray *lhs0[2]; + mxArray *rhs0[2]; + rhs0[0] = A_m; + rhs0[1] = Setup; + mexCallMATLAB(2, lhs0, 2, rhs0, "ilu"); + L1 = lhs0[0]; + U1 = lhs0[1]; + mxDestroyArray(Setup); + + + /* //ILUT preconditionner computed by Matlab (todo: in futur version of cuda replace it by a new equivalent cuda function) + const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"}; + const int type = 0; + const int droptol = 1; + const int milu = 2; + const int udiag = 3; + const int thresh = 4; + mwSize dims[1] = {(mwSize)1 }; + mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names); + mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp")); + mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol)); + mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off")); + mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0)); + mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0)); + mxArray *lhs0[2], *rhs0[2]; + rhs0[0] = A_m; + rhs0[1] = Setup; + mexCallMATLAB(1, lhs0, 2, rhs0, "ilu"); +*/ + // To store the resultng matrix in a CSR format we have to transpose it + mxArray *Wt = lhs0[0]; + mwIndex* Wtj = mxGetJc(Wt); + nnz = Wtj[n]; + mxArray* W; + mexCallMATLAB(1, &W, 1, &Wt, "transpose"); + mxDestroyArray(Wt); + double* pW = mxGetPr(W); + mwIndex* Wi = mxGetIr(W); + mwIndex* Wp = mxGetJc(W); + int *Wii = (int*)mxMalloc(nnz * sizeof(int)); + int *Wip = (int*)mxMalloc((n + 1) * sizeof(int)); + for (int i = 0; i < nnz; i++) + Wii[i] = Wi[i]; + for (int i = 0; i < n + 1; i++) + Wip[i] = Wp[i]; + + //mxFree(A_tild_host); + + cudaChk(cudaFree(A_tild), "cudaFree(A_tild) has failed\n"); + + cudaChk(cudaMalloc((void**)&A_tild, nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); + cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = A_tild_i_host has failed\n"); + cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n"); + /*mxFree(pW); + mxFree(Wi); + mxFree(Wj);*/ + mxDestroyArray(W); + mxFree(Wii); + mxFree(Wip); + } + else if (preconditioner == 3) + { + mwIndex* Aowi, *Aowp; + double* A_host = (double*)mxMalloc(nnz*sizeof(double)); + Aowi = (mwIndex*)mxMalloc(nnz * sizeof(mwIndex)); + Aowp = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); + int* Aoii = (int*)mxMalloc(nnz * sizeof(int)); + int* Aoip = (int*)mxMalloc((n + 1) * sizeof(int)); + cudaChk(cudaMemcpy(A_host, Ax, nnz*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); + cudaChk(cudaMemcpy(Aoii, Ai, nnz*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n"); + cudaChk(cudaMemcpy(Aoip, Ap, (n+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n"); + for (int i = 0; i < nnz; i++) + Aowi[i] = Aoii[i]; + for (int i = 0; i < n + 1; i++) + Aowp[i] = Aoip[i]; + mxFree(Aoii); + mxFree(Aoip); + mxArray * Ao_m = mxCreateSparse(n,n,nnz,mxREAL); + mxSetIr(Ao_m, Aowi); + mxSetJc(Ao_m, Aowp); + mxSetPr(Ao_m, A_host); + /*mexPrintf("A_m\n"); + mxArray *Aoo; + mexCallMATLAB(1, &Aoo, 1, &Ao_m, "transpose"); + mexCallMATLAB(0, NULL, 1, &Aoo, "disp_dense"); + mxDestroyArray(Ao_m); + mxDestroyArray(Aoo);*/ + + //Because the Jacobian matrix A is store in CSC format in matlab + // we have to transpose it to get a CSR format used by CUDA + mwIndex* Awi, *Awp; + double* A_tild_host = (double*)mxMalloc(nnz_tild*sizeof(double)); + Awi = (mwIndex*)mxMalloc(nnz_tild * sizeof(mwIndex)); + Awp = (mwIndex*)mxMalloc((Size + 1) * sizeof(mwIndex)); + int* Aii = (int*)mxMalloc(nnz_tild * sizeof(int)); + int* Aip = (int*)mxMalloc((Size + 1) * sizeof(int)); + cudaChk(cudaMemcpy(A_tild_host, A_tild, nnz_tild*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_host = A_tild has failed\n"); + cudaChk(cudaMemcpy(Aii, Ai_tild, nnz_tild*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aii = Ai_tild has failed\n"); + cudaChk(cudaMemcpy(Aip, Ap_tild, (Size+1)*sizeof(int), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy Aip = Ap_tild has failed\n"); + for (int i = 0; i < nnz_tild; i++) + Awi[i] = Aii[i]; + for (int i = 0; i < Size + 1; i++) + Awp[i] = Aip[i]; + /*for (int i = 0; i < nnz_tild; i++) + mexPrintf("%20.17f\n",A_tild_host[i]);*/ + mxFree(Aii); + mxFree(Aip); + mxArray * At_m = mxCreateSparse(Size,Size,nnz_tild,mxREAL); + mxSetIr(At_m, Awi); + mxSetJc(At_m, Awp); + mxSetPr(At_m, A_tild_host); + mxArray *A_m; + mexCallMATLAB(1, &A_m, 1, &At_m, "transpose"); + /*mexPrintf("A_tild_m\n"); + mexCallMATLAB(0, NULL, 1, &A_m, "disp_dense");*/ + mxDestroyArray(At_m); + mxArray *P, *Q, *L, *U; + mxArray *lhs0[4]; + mexCallMATLAB(4, lhs0, 1, &A_m, "lu"); + + mxArray *P0, *Q0, *L0, *U0; + L0 = lhs0[0]; + U0 = lhs0[1]; + P0 = lhs0[2]; + Q0 = lhs0[3]; + mexCallMATLAB(1, &P, 1, &P0, "transpose"); + mexCallMATLAB(1, &Q, 1, &Q0, "transpose"); + mexCallMATLAB(1, &L, 1, &L0, "transpose"); + mexCallMATLAB(1, &U, 1, &U0, "transpose"); + mxDestroyArray(P0); + mxDestroyArray(Q0); + mxDestroyArray(L0); + mxDestroyArray(U0); + /*L = lhs0[0]; + U = lhs0[1]; + P = lhs0[2]; + Q = lhs0[3];*/ + + /*mexPrintf("L\n"); + mexCallMATLAB(0, NULL, 1, &L, "disp_dense"); + + mexPrintf("U\n"); + mexCallMATLAB(0, NULL, 1, &U, "disp_dense"); + + mexPrintf("P\n"); + mexCallMATLAB(0, NULL, 1, &P, "disp_dense"); + + mexPrintf("Q\n"); + mexCallMATLAB(0, NULL, 1, &Q, "disp_dense");*/ + + mwIndex* Qiw_host = mxGetIr(Q); + mwIndex* Qjw_host = mxGetJc(Q); + double* Qx_host = mxGetPr(Q); + Q_nnz = Qjw_host[Size]; + mexPrintf("Q_nnz=%d\n",Q_nnz); + int *Qi_host = (int*)mxMalloc(Q_nnz * periods * sizeof(int)); + double *Q_x_host = (double*)mxMalloc(Q_nnz * periods * sizeof(double)); + int *Qj_host = (int*)mxMalloc((n + 1) * sizeof(int)); + for (int t = 0; t < periods; t++) + { + for (int i = 0; i < Q_nnz; i++) + { + Qi_host[i + t * Q_nnz] = Qiw_host[i] + t * Size; + Q_x_host[i + t * Q_nnz] = Qx_host[i]; + } + for (int i = 0; i < Size; i++) + { + Qj_host[i + t * Size] = Qjw_host[i] + t * Q_nnz; + } + } + Qj_host[periods * Size] = periods * Q_nnz; + + + /*mwIndex *Qtiw_host = (mwIndex*) mxMalloc(Q_nnz * periods * sizeof(mwIndex)); + double *Qt_x_host = (double*)mxMalloc(Q_nnz * periods * sizeof(double)); + mwIndex *Qtjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); + mexPrintf("n = %d\n",n); + for (int i = 0; i < n + 1; i++) + Qtjw_host[i] = Qj_host[i]; + for (int i = 0; i < Q_nnz * periods; i++) + { + Qtiw_host[i] = Qi_host[i]; + Qt_x_host[i] = Q_x_host[i]; + } + mxArray* Qt_m = mxCreateSparse(n,n,Q_nnz * periods,mxREAL); + mxSetIr(Qt_m, Qtiw_host); + mxSetJc(Qt_m, Qtjw_host); + mxSetPr(Qt_m, Qt_x_host); + mexPrintf("Qt_m\n"); + mexCallMATLAB(0, NULL, 1, &Qt_m, "disp_dense");*/ + + + /*mexPrintf("Qtjw_host[periods * Size=%d]=%d\n", periods * Size, Qtjw_host[periods * Size]); + for (int i = 0; i < n; i++) + for (int j = Qtjw_host[i]; j < Qtjw_host[i+1]; j++) + mexPrintf("(i=%d, j=%d) = %f\n", i, Qtiw_host[j], Qt_x_host[j]);*/ + //mxDestroyArray(Qt_m); + + + cudaChk(cudaMalloc((void**)&Qx, Q_nnz * periods * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Qx on the graphic card\n"); + cudaChk(cudaMemcpy(Qx, Q_x_host, Q_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qx = Qx_host has failed\n"); + cudaChk(cudaMalloc((void**)&Qi, Q_nnz * periods * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Qi on the graphic card\n"); + cudaChk(cudaMemcpy(Qi, Qi_host, Q_nnz * periods * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qi = Qi_host has failed\n"); + cudaChk(cudaMalloc((void**)&Qj, (Size * periods + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Qj on the graphic card\n"); + cudaChk(cudaMemcpy(Qj, Qj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Qj = Qj_host has failed\n"); + mxFree(Qi_host); + mxFree(Qj_host); + mxFree(Q_x_host); + mxDestroyArray(Q); + + + mwIndex* Piw_host = mxGetIr(P); + mwIndex* Pjw_host = mxGetJc(P); + double* Px_host = mxGetPr(P); + P_nnz = Pjw_host[Size]; + int *Pi_host = (int*)mxMalloc(P_nnz * periods * sizeof(int)); + double *P_x_host = (double*)mxMalloc(P_nnz * periods * sizeof(double)); + int *Pj_host = (int*)mxMalloc((n + 1) * sizeof(int)); + for (int t = 0; t < periods; t++) + { + for (int i = 0; i < P_nnz; i++) + { + Pi_host[i + t * P_nnz] = Piw_host[i] + t * Size; + P_x_host[i + t * P_nnz] = Px_host[i]; + } + for (int i = 0; i < Size; i++) + Pj_host[i + t * Size] = Pjw_host[i] + t * P_nnz; + } + Pj_host[periods * Size] = periods * P_nnz; + + /*mwIndex *Ptiw_host = (mwIndex*) mxMalloc(P_nnz * periods * sizeof(mwIndex)); + double *Pt_x_host = (double*)mxMalloc(P_nnz * periods * sizeof(double)); + mwIndex *Ptjw_host = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); + for (int i = 0; i < n + 1; i++) + Ptjw_host[i] = Pj_host[i]; + for (int i = 0; i < P_nnz * periods; i++) + { + Ptiw_host[i] = Pi_host[i]; + Pt_x_host[i] = P_x_host[i]; + } + mxArray* Pt_m = mxCreateSparse(n,n,P_nnz * periods,mxREAL); + mxSetIr(Pt_m, Ptiw_host); + mxSetJc(Pt_m, Ptjw_host); + mxSetPr(Pt_m, Pt_x_host); + mexPrintf("Pt_m\n"); + mexCallMATLAB(0, NULL, 1, &Pt_m, "disp_dense"); + mxDestroyArray(Pt_m);*/ + + + cudaChk(cudaMalloc((void**)&Px, P_nnz * periods * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); + cudaChk(cudaMemcpy(Px, P_x_host, P_nnz * periods * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n"); + cudaChk(cudaMalloc((void**)&Pi, P_nnz * periods * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); + cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * periods * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n"); + cudaChk(cudaMalloc((void**)&Pj, (Size * periods + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); + cudaChk(cudaMemcpy(Pj, Pj_host, (Size * periods + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n"); + mxFree(Pi_host); + mxFree(Pj_host); + mxFree(P_x_host); + mxDestroyArray(P); + + /*mwIndex* Piw_host = mxGetIr(P); + mwIndex* Pjw_host = mxGetJc(P); + double* Px_host = mxGetPr(P); + P_nnz = Pjw_host[Size]; + int *Pi_host = (int*)mxMalloc(P_nnz * sizeof(int)); + int *Pj_host = (int*)mxMalloc((Size + 1) * sizeof(int)); + for (int i = 0; i < P_nnz; i++) + Pi_host[i] = Piw_host[i]; + for (int i = 0; i < Size + 1; i++) + Pj_host[i] = Pjw_host[i]; + + cudaChk(cudaMalloc((void**)&Px, P_nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); + cudaChk(cudaMemcpy(Px, Px_host, P_nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Px = Px_host has failed\n"); + cudaChk(cudaMalloc((void**)&Pi, P_nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); + cudaChk(cudaMemcpy(Pi, Pi_host, P_nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pi = Pi_host has failed\n"); + cudaChk(cudaMalloc((void**)&Pj, (Size + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); + cudaChk(cudaMemcpy(Pj, Pj_host, (Size + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy Pj = Pj_host has failed\n"); + mxFree(Pi_host); + mxFree(Pj_host); + mxDestroyArray(P);*/ + + /*mexPrintf("L\n"); + mexCallMATLAB(0, NULL, 1, &L, "disp_dense"); + + mexPrintf("U\n"); + mexCallMATLAB(0, NULL, 1, &U, "disp_dense");*/ + + mwIndex* Liw_host = mxGetIr(L); + mwIndex* Ljw_host = mxGetJc(L); + double* Lx_host = mxGetPr(L); + int L_nnz = Ljw_host[Size]; + + mwIndex* Uiw_host = mxGetIr(U); + mwIndex* Ujw_host = mxGetJc(U); + double* Ux_host = mxGetPr(U); + int U_nnz = Ujw_host[Size]; + + double *pW = (double*)mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(double)); + int *Wi = (int*)mxMalloc((L_nnz + U_nnz - Size) * periods * sizeof(int)); + int *Wj = (int*)mxMalloc((n + 1) * sizeof(int)); + Wj[0] = 0; + W_nnz = 0; + for (int t = 0; t < periods; t++) + for (int i = 0; i < Size ; i++) + { + for (mwIndex l = Ujw_host[i]; l < Ujw_host[i+1]; l++) + { + Wi[W_nnz] = Uiw_host[l] + t * Size; + pW[W_nnz] = Ux_host[l]; + //mexPrintf("Wj[%d] = %d, Wi[%d] = Uiw_host[%d] + t * Size = %d, pW[%d]=%f\n", i + t * Size, Wj[i + t * Size], W_nnz, l, Uiw_host[l] + t * Size, W_nnz, Ux_host[l]); + W_nnz++; + } + for (mwIndex l = Ljw_host[i]; l < Ljw_host[i+1]; l++) + { + if (Liw_host[l] > i) + { + Wi[W_nnz] = Liw_host[l] + t * Size; + pW[W_nnz] = Lx_host[l]; + //mexPrintf("Wj[%d] = %d, Wi[%d] = Liw_host[%d] + t * Size = %d, pW[%d]=%f\n", i + t * Size, Wj[i + t * Size], W_nnz, l, Liw_host[l] + t * Size, W_nnz, Lx_host[l]); + W_nnz++; + } + } + Wj[i + 1 + t * Size] = W_nnz; + } + //mexPrintf("Wj[%d] = %d, n=%d\n", periods * Size, Wj[periods * Size], n); + cudaChk(cudaMalloc((void**)&A_tild, W_nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate Px on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild, pW, W_nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); + cudaChk(cudaMalloc((void**)&A_tild_i, W_nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pi on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild_i, Wi, W_nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = Wi has failed\n"); + cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Pj on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild_p, Wj, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = Wj has failed\n"); + + /*mwIndex *Wwi = (mwIndex*)mxMalloc(W_nnz * sizeof(mwIndex)); + mwIndex *Wwj = (mwIndex*)mxMalloc((n + 1) * sizeof(mwIndex)); + for (int i = 0; i < W_nnz; i++) + Wwi[i] = Wi[i]; + for (int i = 0; i < n + 1; i++) + Wwj[i] = Wj[i]; + mxFree(Wi); + mxFree(Wj); + mxArray* Ao_tild = mxCreateSparse(n,n,W_nnz,mxREAL); + mxSetIr(Ao_tild, Wwi); + mxSetJc(Ao_tild, Wwj); + mxSetPr(Ao_tild, pW); + mexPrintf("Ao_tild\n"); + mexCallMATLAB(0, NULL, 1, &Ao_tild, "disp_dense"); + mxDestroyArray(Ao_tild);*/ + + + /*ostringstream tmp; + tmp << "debugging"; + mexWarnMsgTxt(tmp.str().c_str()); + return 4;*/ + + /* /**Apply the permutation matrices (P and Q) to the b vector of system to solve : + b_tild = P-1 . b = P' . b */ + /*cudaChk(cudaMalloc((void**)&b_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, + n, n, nnz, &one, CUDA_descr, + Px, Pj, Pi, + b, &zeros, + b_tild), + " in Solve_Cuda_BiCGStab, b_tild = cusparseDcsrmv(P', b) has failed\n"); + + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_TRANSPOSE, + n, n, nnz, &one, CUDA_descr, + Px, Pj, Pi, + b, &zeros, + b), + " in Solve_Cuda_BiCGStab, b = cusparseDcsrmv(P', b) has failed\n"); + */ + /*mexPrintf("Wt = lu(A_m)\n"); + mexCallMATLAB(0, NULL, 1, &Wt, "disp_dense");*/ + /*ostringstream tmp; + tmp << "debugging"; + mexWarnMsgTxt(tmp.str().c_str()); + return 4;*/ + // To store the resultng matrix in a CSR format we have to transpose it + /*mwIndex* Wtj = mxGetJc(Wt); + nnz = Wtj[n]; + mxArray* W; + mexCallMATLAB(1, &W, 1, &Wt, "transpose"); + mxDestroyArray(Wt); + pW = mxGetPr(W); + Wwi = mxGetIr(W); + mwIndex* Wp = mxGetJc(W); + int *Wii = (int*)mxMalloc(nnz * sizeof(int)); + int *Wip = (int*)mxMalloc((n + 1) * sizeof(int)); + for (int i = 0; i < nnz; i++) + Wii[i] = Wi[i]; + for (int i = 0; i < n + 1; i++) + Wip[i] = Wp[i]; + + //mxFree(A_tild_host); + + cudaChk(cudaFree(Ai_tild), " in Solve_Cuda_BiCGStab, cudaFree(Ai_tild) has failed\n"); + cudaChk(cudaFree(Ap_tild), " in Solve_Cuda_BiCGStab, cudaFree(Ap_tild) has failed\n"); + cudaChk(cudaFree(A_tild), " in Solve_Cuda_BiCGStab, cudaFree(A_tild) has failed\n"); + + cudaChk(cudaMalloc((void**)&A_tild, nnz * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate A_tild on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild, pW, nnz * sizeof(double), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild = pW has failed\n"); + cudaChk(cudaMalloc((void**)&A_tild_i, nnz * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate Ai on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild_i, Wii, nnz * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_i = A_tild_i_host has failed\n"); + cudaChk(cudaMalloc((void**)&A_tild_p, (n + 1) * sizeof(int)), " in Solve_Cuda_BiCGStab, can't allocate A_tild_p on the graphic card\n"); + cudaChk(cudaMemcpy(A_tild_p, Wip, (n + 1) * sizeof(int), cudaMemcpyHostToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy A_tild_p = A_tild_j_host has failed\n"); + mxDestroyArray(W); + mxFree(Wii); + mxFree(Wip);*/ + } + if (preconditioner == 1 || preconditioner == 2 || preconditioner == 3) + { + cusparseChk(cusparseCreateMatDescr(&descrL), + " in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrL\n"); + cusparseChk(cusparseSetMatIndexBase(descrL, CUSPARSE_INDEX_BASE_ZERO), + " in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrL\n"); + cusparseChk(cusparseSetMatType(descrL, CUSPARSE_MATRIX_TYPE_GENERAL), + " in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrL\n"); + cusparseChk(cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER), + " in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrL\n"); + cusparseChk(cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT), + " in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrL\n"); + + cusparseChk(cusparseCreateMatDescr(&descrU), + " in Solve_Cuda_BiCGStab, cusparseCreateMatDescr has failed for descrU\n"); + cusparseChk(cusparseSetMatIndexBase(descrU, CUSPARSE_INDEX_BASE_ZERO), + " in Solve_Cuda_BiCGStab, cusparseSetMatIndexBase has failed for descrU\n"); + cusparseChk(cusparseSetMatType(descrU, CUSPARSE_MATRIX_TYPE_GENERAL), + " in Solve_Cuda_BiCGStab, cusparseSetMatType has failed for descrU\n"); + cusparseChk(cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER), + " in Solve_Cuda_BiCGStab, cusparseSetFillMod has failed for descrU\n"); + cusparseChk(cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT), + " in Solve_Cuda_BiCGStab, cusparseSetMatDiagType has failed for descrU\n"); + + int host_nnz_tild; + if (preconditioner == 3) + host_nnz_tild = W_nnz; + else + host_nnz_tild = nnz; + + if (preconditioner == 1) + cusparseChk(cusparseDestroySolveAnalysisInfo(info), + " in Solve_Cuda_BiCGStab, cusparseDestroySolveAnalysisInfo has failed for info\n"); + + cusparseChk(cusparseCreateSolveAnalysisInfo(&infoL), + " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoL\n"); + cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, host_nnz_tild, descrL, + A_tild, A_tild_p, A_tild_i, + infoL), + " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoL has failed\n"); + + cusparseChk(cusparseCreateSolveAnalysisInfo(&infoU), + " in Solve_Cuda_BiCGStab, cusparseCreateSolveAnalysisInfo has failed for infoU\n"); + cusparseChk(cusparseDcsrsv_analysis(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, host_nnz_tild, descrU, + A_tild, A_tild_p, A_tild_i, + infoU), + " in Solve_Cuda_BiCGStab, cusparseDcsrsm_analysis for infoU has failed\n"); + } + + cudaChk(cudaMalloc((void**)&v, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate v on the graphic card\n"); + cudaChk(cudaMalloc((void**)&p, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate p on the graphic card\n"); + //cudaChk(cudaMemset(p, 0, n * sizeof(double)), " in Solve_Cuda_BiCGStab, cudaMemset p = 0 has failed\n"); + cudaChk(cudaMalloc((void**)&s, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate s on the graphic card\n"); + cudaChk(cudaMalloc((void**)&t, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate t on the graphic card\n"); + cudaChk(cudaMalloc((void**)&y_, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate y_ on the graphic card\n"); + cudaChk(cudaMalloc((void**)&z, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate z on the graphic card\n"); + + double rho = 1.0, alpha = 1.0, omega = 1.0; + + + //residual = P*B*Q - L*U; + //norm(Z,1) should be close to 0 + + + while (iteration < 50/*max_iterations*/ && !convergence) + { + double rho_prev = rho; + /**store in s previous value of r*/ + cudaChk(cudaMemcpy(s, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy s = r has failed\n"); + + /**rho = r0 . r*/ + cublasChk(cublasDdot(cublas_handle, n, // numerator + r0, 1, + r, 1, + &rho), + " in Solve_Cuda_BiCGStab, rho = cublasDdot(r0, r) has failed\n"); + + mexPrintf("rho=%f\n",rho); + + double beta; + + if (iteration == 0) + { + cudaChk(cudaMemcpy(p, r, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy p = r has failed\n"); + } + else + { + /**beta = (rho / rho_prev) . (alpha / omega);*/ + beta = rho / rho_prev * alpha / omega; + + /**p = r + beta * (p - omega * v)*/ + // tmp_ = p - omega * v + VecAdd<<>>(tmp_, p, -omega, v, n); + //p = r + beta * tmp_ + VecAdd<<>>(p, r, beta, tmp_, n); + } + + /**y_ solution of A_tild * y_ = p <=> L . U . y_ = p*/ + // L tmp_ = p => tmp_ = L^-1 p, with tmp_ = U . y_ + + if (preconditioner == 3) + { + double *p_tild; + mexPrintf("n=%d\n",n); + + cudaChk(cudaMemcpy(tmp_vect_host, p, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p has failed\n"); + /*mexPrintf("p\n"); + for (int i = 0; i < n; i++) + mexPrintf("%f\n",tmp_vect_host[i]);*/ + + cudaChk(cudaMalloc((void**)&p_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, n, P_nnz * periods, &one, CUDA_descr, + Px, Pj, Pi, + p, &zeros, + p_tild), + " in Solve_Cuda_BiCGStab, p_tild = cusparseDcsrmv(P', p) has failed\n"); + + /*mexPrintf("P\n"); + printM(n, Px, Pj, Pi, CUDA_descr, cusparse_handle);*/ + + cudaChk(cudaMemcpy(tmp_vect_host, p_tild, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = p_tild has failed\n"); + /*mexPrintf("p_tild\n"); + for (int i = 0; i < n; i++) + mexPrintf("%f\n",tmp_vect_host[i]);*/ + + cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, &one, + descrL, + A_tild, A_tild_p, A_tild_i, + infoL, p_tild, + tmp_), + " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p_tild has failed\n"); + cudaChk(cudaFree(p_tild), " in Solve_Cuda_BiCGStab, can't free p_tild\n"); + + cudaChk(cudaMemcpy(tmp_vect_host, tmp_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); + /*mexPrintf("tmp_\n"); + for (int i = 0; i < n; i++) + mexPrintf("%f\n",tmp_vect_host[i]);*/ + } + else + cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, &one, + descrL, + A_tild, A_tild_p, A_tild_i, + infoL, p, + tmp_), + " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = p has failed\n"); + + // U . y_ = L^-1 p <=> U . y_ = tmp_ => y_ = U^-1 L^-1 p + cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, &one, + descrU, + A_tild, A_tild_p, A_tild_i, + infoU, tmp_, + y_), + " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . y_ = tmp_ has failed\n"); + + /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); + mexPrintf("y_\n"); + for (int i = 0; i < n; i++) + mexPrintf("%f\n",tmp_vect_host[i]);*/ + + if (preconditioner == 3) + { + double *y_tild; + cudaChk(cudaMalloc((void**)&y_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); + cudaChk(cudaMemcpy(y_tild, y_, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy y_tild = y_ has failed\n"); + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, n, Q_nnz * periods, &one, CUDA_descr, + Qx, Qj, Qi, + y_tild, &zeros, + y_), + " in Solve_Cuda_BiCGStab, y_ = cusparseDcsrmv(Q', y_tild) has failed\n"); + cudaChk(cudaFree(y_tild), " in Solve_Cuda_BiCGStab, can't free y_tild\n"); + } + /*cudaChk(cudaMemcpy(tmp_vect_host, y_, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); + mexPrintf("y_\n"); + for (int i = 0; i < n; i++) + mexPrintf("%f\n",tmp_vect_host[i]);*/ + /**v = A*y_*/ + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, n, nnz, &one, CUDA_descr, + Ax, Ap, Ai, + y_, &zeros, + v), + " in Solve_Cuda_BiCGStab, v = cusparseDcsrmv(A, y_) has failed\n"); + cudaChk(cudaMemcpy(tmp_vect_host, v, n*sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = v has failed\n"); + /*mexPrintf("v\n"); + for (int i = 0; i < n; i++) + mexPrintf("%f\n",tmp_vect_host[i]);*/ + + + + /**alpha = rho / (rr0 . v) with rr0 = r0*/ + cublasChk(cublasDdot(cublas_handle, n, // numerator + r0, 1, + v, 1, + &tmp1), + " in Solve_Cuda_BiCGStab, cublasDdot(r0, v) has failed\n"); + + alpha = rho / tmp1; + mexPrintf("rho = %f, tmp1 = %f\n", rho, tmp1); + mexPrintf("alpha = %f\n", alpha); + + if (alpha == 0 || isinf(alpha) || isnan(alpha)) + { + Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); + ostringstream tmp; + tmp << "one of the scalar quantities (alpha=" << alpha << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1; + mexWarnMsgTxt(tmp.str().c_str()); + return 4; + } + + /** Check for potential stagnation*/ + cublasChk(cublasDnrm2(cublas_handle, n, // numerator + y_, 1, + &tmp1), + " in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n"); + cublasChk(cublasDnrm2(cublas_handle, n, // denominator + x0, 1, + &tmp2), + " in Solve_Cuda_BiCGStab, cublasDnrm2(y_) has failed\n"); + mexPrintf("abs(alpha)*tmp1 = %f, alpha = %f, tmp1 = %f, tmp2 = %f, eps = %f\n",abs(alpha)*tmp1 , alpha, tmp1, tmp2, eps); + if (abs(alpha)*tmp1 < eps * tmp2) + stagnation++; + else + stagnation = 0; + + /**x = x + alpha * y_*/ + VecInc<<>>(x0, alpha, y_, n); + + /**s = r_prev - alpha *v with r_prev = s*/ + VecInc<<>>(s, -alpha, v, n); + + /**Has BiCGStab converged?*/ + cublasChk(cublasDnrm2(cublas_handle, n, // numerator + s, 1, + &tmp1), + " in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n"); + conv_criteria = tmp1; + mexPrintf("conv_criteria = %f, tolb = %f\n", conv_criteria, tolb); + convergence = conv_criteria < tolb; + + if (convergence || stagnation >= max_stagnation || refinement_needed) + { + /**s = b - A * x0*/ + cudaChk(cudaMemcpy(s, b, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy s = b has failed\n"); + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, n, nnz, &m_one, CUDA_descr, + Ax, Ap, Ai, + x0, &one, + s), + " in Solve_Cuda_BiCGStab, s = b - cusparseDcsrmv(A, x0) has failed\n"); + cublasChk(cublasDnrm2(cublas_handle, n, // numerator + s, 1, + &tmp1), + " in Solve_Cuda_BiCGStab, cublasDnrm2(s) has failed\n"); + conv_criteria = tmp1; + convergence = conv_criteria < tolb; + if (convergence) + { + break; + } + else + { + if (stagnation >= max_stagnation && refinement_needed == 0) + stagnation = 0; + refinement_needed++; + if (refinement_needed > max_refinement) + { + Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); + ostringstream tmp; + tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1; + mexWarnMsgTxt(tmp.str().c_str()); + return 3; + } + } + } + + /**z solution of A_tild * z = s*/ + // L tmp_ = s => tmp_ = L^-1 s, with tmp_ = U . z + if (preconditioner == 3) + { + double *s_tild; + cudaChk(cudaMalloc((void**)&s_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate b_tild on the graphic card\n"); + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, n, P_nnz * periods, &one, CUDA_descr, + Px, Pj, Pi, + s, &zeros, + s_tild), + " in Solve_Cuda_BiCGStab, s_tild = cusparseDcsrmv(P', s) has failed\n"); + cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, &one, + descrL, + A_tild, A_tild_p, A_tild_i, + infoL, s_tild, + tmp_), + " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s_tild has failed\n"); + cudaChk(cudaFree(s_tild), " in Solve_Cuda_BiCGStab, can't free s_tild\n"); + } + else + cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, &one, + descrL, + //Lx, Lp, Li, + A_tild, A_tild_p, A_tild_i, + infoL, s, + tmp_), + " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for L . tmp_ = s has failed\n"); + // U . z = L^-1 s <=> U . z = tmp_ => z = U^-1 L^-1 s + cusparseChk(cusparseDcsrsv_solve(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, &one, + descrU, + //Ux, Up, Ui, + A_tild, A_tild_p, A_tild_i, + infoU, tmp_, + z), + " in Solve_Cuda_BiCGStab, cusparseDcsrsv_solve for U . z = tmp_ has failed\n"); + if (preconditioner == 3) + { + double *z_tild; + cudaChk(cudaMalloc((void**)&z_tild, n * sizeof(double)), " in Solve_Cuda_BiCGStab, can't allocate z_tild on the graphic card\n"); + cudaChk(cudaMemcpy(z_tild, z, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy z_tild = z has failed\n"); + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, n, Q_nnz * periods, &one, CUDA_descr, + Qx, Qj, Qi, + z_tild, &zeros, + z), + " in Solve_Cuda_BiCGStab, z = cusparseDcsrmv(Q, z_tild) has failed\n"); + cudaChk(cudaFree(z_tild), " in Solve_Cuda_BiCGStab, can't free x_tild\n"); + } + /**t = A * z*/ + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, n, nnz, &one, CUDA_descr, + Ax, Ap, Ai, + z, &zeros, + t), + " in Solve_Cuda_BiCGStab, t = cusparseDcsrmv(A, z) has failed\n"); + + /** omega = (t' s) / (t' t)*/ + cublasChk(cublasDdot(cublas_handle, n, // numerator + t, 1, + s, 1, + &tmp1), + " in Solve_Cuda_BiCGStab, cublasDdot(t, s) has failed\n"); + + cublasChk(cublasDdot(cublas_handle, n, // numerator + t, 1, + t, 1, + &tmp2), + " in Solve_Cuda_BiCGStab, cublasDdot(t, t) has failed\n"); + + omega = tmp1 / tmp2; + + if (omega == 0 || isinf(omega) || isnan(omega)) + { + Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); + ostringstream tmp; + mexEvalString("diary off;"); + tmp << "one of the scalar quantities (omega=" << omega << ") calculated during BICGSTAB became too small or too large to continue computing, in block " << block+1; + mexWarnMsgTxt(tmp.str().c_str()); + return 4; + } + + /**x = x + omega * z*/ + VecInc<<>>(x0, omega, z, n); + + /**r = s - omega * t*/ + VecAdd<<>>(r, s, -omega, t, n); + + /**Has BiCGStab converged?*/ + cublasChk(cublasDnrm2(cublas_handle, n, // numerator + r, 1, + &tmp1), + " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); + conv_criteria = tmp1; + + convergence = conv_criteria < tolb; + + if (convergence || stagnation >= max_stagnation || refinement_needed) + { + /**r = b - A * x0*/ + cudaChk(cudaMemcpy(r, b, n * sizeof(double), cudaMemcpyDeviceToDevice), " in Solve_Cuda_BiCGStab, cudaMemcpy r = b has failed\n"); + cusparseChk(cusparseDcsrmv(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + n, n, nnz, &m_one, CUDA_descr, + Ax, Ap, Ai, + x0, &one, + r), + " in Solve_Cuda_BiCGStab, r = b - cusparseDcsrmv(A, x0) has failed\n"); + cublasChk(cublasDnrm2(cublas_handle, n, // numerator + r, 1, + &tmp1), + " in Solve_Cuda_BiCGStab, cublasDnrm2(r) has failed\n"); + conv_criteria = tmp1; + convergence = conv_criteria < tolb; + if (convergence) + { + mexPrintf("convergence achieved\n"); + break; + } + else + { + if (stagnation >= max_stagnation && refinement_needed == 0) + stagnation = 0; + refinement_needed++; + if (refinement_needed > max_refinement) + { + Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, /*Lx, Li, Lp, Ux, Ui, Up, device_n, */infoL, infoU, descrL, descrU, preconditioner); + ostringstream tmp; + mexEvalString("diary off;"); + tmp << "Error in bytecode: BiCGStab stagnated (Two consecutive iterates were the same.), in block " << block+1; + mexWarnMsgTxt(tmp.str().c_str()); + return 3; + } + } + } + + iteration++; + } + cudaChk(cudaMemcpy(tmp_vect_host, x0, n * sizeof(double), cudaMemcpyDeviceToHost), " in Solve_Cuda_BiCGStab, cudaMemcpy tmp_vect_host = x0 has failed\n"); + + if (is_two_boundaries) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) + { + int eq = index_vara[i+Size*y_kmin]; + double yy = -(tmp_vect_host[i] + y[eq]); + direction[eq] = yy; + y[eq] += slowc * yy; + } + else +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) + { + int eq = index_vara[i]; + double yy = -(tmp_vect_host[i] + y[eq+it_*y_size]); + direction[eq] = yy; + y[eq+it_*y_size] += slowc * yy; + } + Solve_CUDA_BiCGStab_Free(tmp_vect_host, p, r, v, s, t, y_, z, tmp_, Ai, Ax, Ap, x0, b, A_tild, A_tild_i, A_tild_p, infoL, infoU, descrL, descrU, preconditioner); + + if (iteration >= max_iterations) + { + ostringstream tmp; + mexEvalString("diary off;"); + tmp << "Error in bytecode: No convergence inside BiCGStab, in block " << block+1; + mexWarnMsgTxt(tmp.str().c_str()); + return 1; + } + else + return 0; +} +#endif + +void +dynSparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m) { #ifdef OCTAVE_MEX_FILE ostringstream tmp; @@ -1953,7 +4693,7 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl tmp << " GMRES method is not implemented in Octave. You cannot use stack_solve_algo=2, change stack_solve_algo.\n"; throw FatalExceptionHandling(tmp.str()); #endif - int n = mxGetM(A_m); + size_t n = mxGetM(A_m); mxArray *lhs0[2]; mxArray *rhs0[2]; rhs0[0] = A_m; @@ -1967,7 +4707,7 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl rhs[1] = b_m; rhs[2] = mxCreateDoubleScalar(Size); rhs[3] = mxCreateDoubleScalar(1e-6); - rhs[4] = mxCreateDoubleScalar(n); + rhs[4] = mxCreateDoubleScalar((double)n); rhs[5] = L1; rhs[6] = U1; rhs[7] = x0_m; @@ -2006,6 +4746,9 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl { double *res = mxGetPr(z); if (is_two_boundaries) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int i = 0; i < n; i++) { int eq = index_vara[i+Size*y_kmin]; @@ -2014,6 +4757,9 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl y[eq] += slowc * yy; } else +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int i = 0; i < n; i++) { int eq = index_vara[i]; @@ -2029,17 +4775,62 @@ SparseMatrix::Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double sl } void -SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, bool steady_state) +dynSparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int preconditioner) { - unsigned int n = mxGetM(A_m); - /*[L1, U1]=luinc(g1a,luinc_tol);*/ - mxArray *lhs0[2]; - mxArray *rhs0[2]; - rhs0[0] = A_m; - rhs0[1] = mxCreateDoubleScalar(lu_inc_tol); - mexCallMATLAB(2, lhs0, 2, rhs0, "luinc"); - mxArray *L1 = lhs0[0]; - mxArray *U1 = lhs0[1]; + /* precond = 0 => Jacobi + precond = 1 => Incomplet LU decomposition*/ + size_t n = mxGetM(A_m); + mxArray *L1, *U1, *Diag; + + mxArray *rhs0[4]; + if (preconditioner == 0) + { + mxArray *lhs0[1]; + rhs0[0] = A_m; + rhs0[1] = mxCreateDoubleScalar(0); + mexCallMATLAB(1, lhs0, 2, rhs0, "spdiags"); + mxArray* tmp = lhs0[0]; + double* tmp_val = mxGetPr(tmp); + Diag = mxCreateSparse(n, n, n, mxREAL); + mwIndex *Diag_i = mxGetIr(Diag); + mwIndex *Diag_j = mxGetJc(Diag); + double *Diag_val = mxGetPr(Diag); + for (size_t i = 0; i < n; i++) + { + Diag_val[i] = tmp_val[i]; + Diag_j[i] = i; + Diag_i[i] = i; + } + Diag_j[n] = n; + } + else if (preconditioner == 1) + { + /*[L1, U1] = ilu(g1a=;*/ + const char *field_names[] = {"type", "droptol", "milu", "udiag", "thresh"}; + const int type = 0; + const int droptol = 1; + const int milu = 2; + const int udiag = 3; + const int thresh = 4; + mwSize dims[1] = {(mwSize)1 }; + mxArray *Setup = mxCreateStructArray(1, dims, 5, field_names); + mxSetFieldByNumber(Setup, 0, type, mxCreateString("ilutp")); + //mxSetFieldByNumber(Setup, 0, type, mxCreateString("nofill")); + mxSetFieldByNumber(Setup, 0, droptol, mxCreateDoubleScalar(lu_inc_tol)); + mxSetFieldByNumber(Setup, 0, milu, mxCreateString("off")); + mxSetFieldByNumber(Setup, 0, udiag, mxCreateDoubleScalar(0)); + mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(0)); + //mxSetFieldByNumber(Setup, 0, thresh, mxCreateDoubleScalar(1)); + mxArray *lhs0[2]; + mxArray *rhs0[2]; + rhs0[0] = A_m; + rhs0[1] = Setup; + mexCallMATLAB(2, lhs0, 2, rhs0, "ilu"); + L1 = lhs0[0]; + U1 = lhs0[1]; + mxDestroyArray(Setup); + } + double flags = 2; mxArray *z; if (steady_state) /*Octave BicStab algorihtm involves a 0 division in case of a preconditionner equal to the LU decomposition of A matrix*/ @@ -2047,7 +4838,10 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double mxArray *res = mult_SAT_B(Sparse_transpose(A_m), x0_m); double *resid = mxGetPr(res); double *b = mxGetPr(b_m); - for (unsigned int i = 0; i < n; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < (int)n; i++) resid[i] = b[i] - resid[i]; mxArray *rhs[2]; mxArray *lhs[1]; @@ -2060,19 +4854,24 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double z = lhs[0]; double *phat = mxGetPr(z); double *x0 = mxGetPr(x0_m); - for (unsigned int i = 0; i < n; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < (int)n; i++) phat[i] = x0[i] + phat[i]; /*Check the solution*/ res = mult_SAT_B(Sparse_transpose(A_m), z); resid = mxGetPr(res); double cum_abs = 0; - for (unsigned int i = 0; i < n; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:cum_abs) +#endif + for (int i = 0; i < (int)n; i++) { resid[i] = b[i] - resid[i]; cum_abs += fabs(resid[i]); } - //mexPrintf("cum_abs=%g\n", cum_abs); if (cum_abs > 1e-7) flags = 2; else @@ -2080,32 +4879,55 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double mxDestroyArray(res); } //else + if (flags == 2) { - /*[za,flag1] = bicgstab(g1a,b,1e-6,Blck_size*periods,L1,U1);*/ - mxArray *rhs[7]; - rhs[0] = A_m; - rhs[1] = b_m; - rhs[2] = mxCreateDoubleScalar(1e-6); - rhs[3] = mxCreateDoubleScalar(n); - rhs[4] = L1; - rhs[5] = U1; - rhs[6] = x0_m; - mxArray *lhs[2]; - mexCallMATLAB(2, lhs, 7, rhs, "bicgstab"); - z = lhs[0]; - mxArray *flag = lhs[1]; - double *flag1 = mxGetPr(flag); - flags = flag1[0]; - mxDestroyArray(flag); - mxDestroyArray(rhs[2]); - mxDestroyArray(rhs[3]); - mxDestroyArray(rhs[4]); - mxDestroyArray(rhs[5]); + if (preconditioner == 0) + { + /*[za,flag1] = bicgstab(g1a,b,1e-6,Blck_size*periods,L1,U1);*/ + mxArray *rhs[5]; + rhs[0] = A_m; + rhs[1] = b_m; + rhs[2] = mxCreateDoubleScalar(1e-6); + rhs[3] = mxCreateDoubleScalar((double)n); + rhs[4] = Diag; + //rhs[5] = x0_m; + mxArray *lhs[2]; + mexCallMATLAB(2, lhs, 5, rhs, "bicgstab"); + z = lhs[0]; + mxArray *flag = lhs[1]; + double *flag1 = mxGetPr(flag); + flags = flag1[0]; + mxDestroyArray(flag); + mxDestroyArray(rhs[2]); + mxDestroyArray(rhs[3]); + mxDestroyArray(rhs[4]); + } + else if (preconditioner == 1) + { + /*[za,flag1] = bicgstab(g1a,b,1e-6,Blck_size*periods,L1,U1);*/ + mxArray *rhs[7]; + rhs[0] = A_m; + rhs[1] = b_m; + rhs[2] = mxCreateDoubleScalar(1e-6); + rhs[3] = mxCreateDoubleScalar((double)n); + rhs[4] = L1; + rhs[5] = U1; + rhs[6] = x0_m; + mxArray *lhs[2]; + mexCallMATLAB(2, lhs, 7, rhs, "bicgstab"); + z = lhs[0]; + mxArray *flag = lhs[1]; + double *flag1 = mxGetPr(flag); + flags = flag1[0]; + mxDestroyArray(flag); + mxDestroyArray(rhs[2]); + mxDestroyArray(rhs[3]); + mxDestroyArray(rhs[4]); + mxDestroyArray(rhs[5]); + } } - /*mexPrintf("z"); - mexCallMATLAB(0, NULL, 1, &z, "disp");*/ - mxDestroyArray(rhs0[1]); + if (flags > 0) { @@ -2131,7 +4953,10 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double { double *res = mxGetPr(z); if (is_two_boundaries) - for (unsigned int i = 0; i < n; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) { int eq = index_vara[i+Size*y_kmin]; double yy = -(res[i] + y[eq]); @@ -2139,7 +4964,10 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double y[eq] += slowc * yy; } else - for (unsigned int i = 0; i < n; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < n; i++) { int eq = index_vara[i]; double yy = -(res[i] + y[eq+it_*y_size]); @@ -2153,7 +4981,7 @@ SparseMatrix::Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double } void -SparseMatrix::Singular_display(int block, int Size, bool steady_state, it_code_type it_code) +dynSparseMatrix::Singular_display(int block, int Size) { bool zero_solution; Simple_Init(Size, IM_i, zero_solution); @@ -2162,6 +4990,9 @@ SparseMatrix::Singular_display(int block, int Size, bool steady_state, it_code_t rhs[0] = mxCreateDoubleMatrix(Size, Size, mxREAL); double *pind; pind = mxGetPr(rhs[0]); +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int j = 0; j < Size * Size; j++) pind[j] = 0.0; for (int ii = 0; ii < Size; ii++) @@ -2186,46 +5017,45 @@ SparseMatrix::Singular_display(int block, int Size, bool steady_state, it_code_t { if (abs(SVD_ps[i * (1 + Size)]) < 1e-12) { - mexPrintf(" The following equations form a linear combination:\n "); - double max_u = 0; - for (int j = 0; j < Size; j++) - if (abs(SVD_pu[j + i * Size]) > abs(max_u)) - max_u = SVD_pu[j + i * Size]; - vector equ_list; - for (int j = 0; j < Size; j++) - { - double rr = SVD_pu[j + i * Size] / max_u; - if ( rr < -1e-10) - { - equ_list.push_back(j); - if (rr != -1) - mexPrintf(" - %3.2f*Dequ_%d_dy",abs(rr),j+1); + mexPrintf(" The following equations form a linear combination:\n "); + double max_u = 0; + for (int j = 0; j < Size; j++) + if (abs(SVD_pu[j + i * Size]) > abs(max_u)) + max_u = SVD_pu[j + i * Size]; + vector equ_list; + for (int j = 0; j < Size; j++) + { + double rr = SVD_pu[j + i * Size] / max_u; + if ( rr < -1e-10) + { + equ_list.push_back(j); + if (rr != -1) + mexPrintf(" - %3.2f*Dequ_%d_dy",abs(rr),j+1); + else + mexPrintf(" - Dequ_%d_dy",j+1); + } + else if (rr > 1e-10) + { + equ_list.push_back(j); + if (j > 0) + if (rr != 1) + mexPrintf(" + %3.2f*Dequ_%d_dy",rr,j+1); else - mexPrintf(" - Dequ_%d_dy",j+1); - } - else if (rr > 1e-10) - { - equ_list.push_back(j); - if (j > 0) - if (rr != 1) - mexPrintf(" + %3.2f*Dequ_%d_dy",rr,j+1); - else - mexPrintf(" + Dequ_%d_dy",j+1); - else - if (rr != 1) - mexPrintf(" %3.2f*Dequ_%d_dy",rr,j+1); - else - mexPrintf(" Dequ_%d_dy",j+1); - } - } - mexPrintf(" = 0\n"); - /*mexPrintf(" with:\n"); - it_code = get_begin_block(block); - for (int j=0; j < Size; j++) - { - if (find(equ_list.begin(), equ_list.end(), j) != equ_list.end()) - mexPrintf(" equ_%d: %s\n",j, print_expression(it_code_expr, false, Size, block, steady_state, 0, 0, it_code, true).c_str()); - }*/ + mexPrintf(" + Dequ_%d_dy",j+1); + else if (rr != 1) + mexPrintf(" %3.2f*Dequ_%d_dy",rr,j+1); + else + mexPrintf(" Dequ_%d_dy",j+1); + } + } + mexPrintf(" = 0\n"); + /*mexPrintf(" with:\n"); + it_code = get_begin_block(block); + for (int j=0; j < Size; j++) + { + if (find(equ_list.begin(), equ_list.end(), j) != equ_list.end()) + mexPrintf(" equ_%d: %s\n",j, print_expression(it_code_expr, false, Size, block, steady_state, 0, 0, it_code, true).c_str()); + }*/ } } mxDestroyArray(lhs[0]); @@ -2241,7 +5071,7 @@ SparseMatrix::Singular_display(int block, int Size, bool steady_state, it_code_t bool -SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool steady_state, int it_) +dynSparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, int it_) { bool one; int pivj = 0, pivk = 0; @@ -2262,7 +5092,8 @@ SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool /*finding the max-pivot*/ double piv = piv_abs = 0; int nb_eq = At_Col(i, &first); - l = 0; N_max = 0; + l = 0; + N_max = 0; one = false; piv_abs = 0; for (int j = 0; j < nb_eq; j++) @@ -2409,7 +5240,7 @@ SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool bc[nb_eq_todo++] = first; first = first->NZE_C_N; } - //#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + //pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) for (int j = 0; j < nb_eq_todo; j++) { first = bc[j]; @@ -2488,8 +5319,12 @@ SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool } } double slowc_lbx = slowc; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int i = 0; i < y_size; i++) ya[i+it_*y_size] = y[i+it_*y_size]; + slowc_save = slowc; simple_bksub(it_, Size, slowc_lbx); End_GE(Size); @@ -2502,7 +5337,7 @@ SparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool } void -SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool symbolic, int Block_number) +dynSparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool symbolic, int Block_number) { /*Triangularisation at each period of a block using a simple gaussian Elimination*/ t_save_op_s *save_op_s; @@ -2521,16 +5356,28 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool pivj_v = (int *) mxMalloc(Size*sizeof(int)); pivk_v = (int *) mxMalloc(Size*sizeof(int)); NR = (int *) mxMalloc(Size*sizeof(int)); + //clock_t time00 = clock(); + NonZeroElem **bc; + bc = (NonZeroElem **) mxMalloc(Size*sizeof(first)); for (int t = 0; t < periods; t++) { + /*clock_t time11 = clock(); + mexPrintf("t=%d, record = %d\n",t, record);*/ +#ifdef OCTAVE_MEX_FILE + OCTAVE_QUIT; +#else + if ( utIsInterruptPending() ) + throw UserExceptionHandling(); +#endif + if (record && symbolic) { - if (save_op) + /*if (save_op) { mxFree(save_op); save_op = NULL; - } + }*/ save_op = (int *) mxMalloc(nop*sizeof(int)); nopa = nop; } @@ -2663,220 +5510,128 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool piv_abs = fabs(piv); } line_done[pivj] = true; - if (symbolic) - { - if (record) - { - if (nop+1 >= nopa) - { - nopa = long (mem_increasing_factor*(double) nopa); - save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); - } - save_op_s = (t_save_op_s *) (&(save_op[nop])); - save_op_s->operat = IFLD; - save_op_s->first = pivk; - save_op_s->lag = 0; - } - nop += 2; - } - if (piv_abs < eps) - { - ostringstream tmp; - if (Block_number > 1) - tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system in block " << Block_number+1 << "\n"; - else - tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system\n"; - throw FatalExceptionHandling(tmp.str()); - } - /*divide all the non zeros elements of the line pivj by the max_pivot*/ - int nb_var = At_Row(pivj, &first); - NonZeroElem **bb; - bb = (NonZeroElem **) mxMalloc(nb_var*sizeof(first)); - for (int j = 0; j < nb_var; j++) - { - bb[j] = first; - first = first->NZE_R_N; - } - for (int j = 0; j < nb_var; j++) + if (record && symbolic) { - first = bb[j]; - u[first->u_index] /= piv; - if (symbolic) + if (nop+1 >= nopa) { - if (record) - { - if (nop+j*2+1 >= nopa) - { - nopa = long (mem_increasing_factor*(double) nopa); - save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); - } - save_op_s = (t_save_op_s *) (&(save_op[nop+j*2])); - save_op_s->operat = IFDIV; - save_op_s->first = first->u_index; - save_op_s->lag = first->lag_index; - } + nopa = long (mem_increasing_factor*(double) nopa); + save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); } - } - mxFree(bb); - nop += nb_var*2; - u[b[pivj]] /= piv; - if (symbolic) - { - if (record) + save_op_s = (t_save_op_s *) (&(save_op[nop])); + save_op_s->operat = IFLD; + save_op_s->first = pivk; + save_op_s->lag = 0; + nop += 2; + if (piv_abs < eps) { - if (nop+1 >= nopa) + ostringstream tmp; + if (Block_number > 1) + tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system in block " << Block_number+1 << "\n"; + else + tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system\n"; + throw FatalExceptionHandling(tmp.str()); + } + /*divide all the non zeros elements of the line pivj by the max_pivot*/ + int nb_var = At_Row(pivj, &first); + for (int j = 0; j < nb_var; j++) + { + u[first->u_index] /= piv; + if (nop+j*2+1 >= nopa) { nopa = long (mem_increasing_factor*(double) nopa); save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); } - save_op_s = (t_save_op_s *) (&(save_op[nop])); + save_op_s = (t_save_op_s *) (&(save_op[nop+j*2])); save_op_s->operat = IFDIV; - save_op_s->first = b[pivj]; - save_op_s->lag = 0; + save_op_s->first = first->u_index; + save_op_s->lag = first->lag_index; + first = first->NZE_R_N; } + nop += nb_var*2; + u[b[pivj]] /= piv; + if (nop+1 >= nopa) + { + nopa = long (mem_increasing_factor*(double) nopa); + save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); + } + save_op_s = (t_save_op_s *) (&(save_op[nop])); + save_op_s->operat = IFDIV; + save_op_s->first = b[pivj]; + save_op_s->lag = 0; nop += 2; - } - /*substract the elements on the non treated lines*/ - nb_eq = At_Col(i, &first); - NonZeroElem *first_piva; - int nb_var_piva = At_Row(pivj, &first_piva); + /*substract the elements on the non treated lines*/ + nb_eq = At_Col(i, &first); + NonZeroElem *first_piva; + int nb_var_piva = At_Row(pivj, &first_piva); - NonZeroElem **bc; - bc = (NonZeroElem **) mxMalloc(nb_eq*sizeof(first)); - int nb_eq_todo = 0; - for (int j = 0; j < nb_eq && first; j++) - { - if (!line_done[first->r_index]) - bc[nb_eq_todo++] = first; - first = first->NZE_C_N; - } - //#pragma omp parallel for num_threads(2) shared(nb_var_piva, first_piva, nopa, nop, save_op, record) - for (int j = 0; j < nb_eq_todo; j++) - { - t_save_op_s *save_op_s_l; - first = bc[j]; - int row = first->r_index; - double first_elem = u[first->u_index]; - if (symbolic) + int nb_eq_todo = 0; + for (int j = 0; j < nb_eq && first; j++) { - if (record) - { - if (nop+1 >= nopa) - { - nopa = long (mem_increasing_factor*(double) nopa); - save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); - } - save_op_s_l = (t_save_op_s *) (&(save_op[nop])); - save_op_s_l->operat = IFLD; - save_op_s_l->first = first->u_index; - save_op_s_l->lag = abs(first->lag_index); - } - nop += 2; + if (!line_done[first->r_index]) + bc[nb_eq_todo++] = first; + first = first->NZE_C_N; } - - int nb_var_piv = nb_var_piva; - NonZeroElem *first_piv = first_piva; - NonZeroElem *first_sub; - int nb_var_sub = At_Row(row, &first_sub); - int l_sub = 0; - int l_piv = 0; - int sub_c_index = first_sub->c_index; - int piv_c_index = first_piv->c_index; - int tmp_lag = first_sub->lag_index; - while (l_sub < nb_var_sub || l_piv < nb_var_piv) +//#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop) + for (int j = 0; j < nb_eq_todo; j++) { - if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv)) + t_save_op_s *save_op_s_l; + NonZeroElem *first = bc[j]; + int row = first->r_index; + double first_elem = u[first->u_index]; + if (nop+1 >= nopa) { - //There is no nonzero element at row pivot for this column=> Nothing to do for the current element got to next column - first_sub = first_sub->NZE_R_N; - if (first_sub) - sub_c_index = first_sub->c_index; - else - sub_c_index = Size*periods; - l_sub++; + nopa = long (mem_increasing_factor*(double) nopa); + save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); } - else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub) + save_op_s_l = (t_save_op_s *) (&(save_op[nop])); + save_op_s_l->operat = IFLD; + save_op_s_l->first = first->u_index; + save_op_s_l->lag = abs(first->lag_index); + nop += 2; + + int nb_var_piv = nb_var_piva; + NonZeroElem *first_piv = first_piva; + NonZeroElem *first_sub; + int nb_var_sub = At_Row(row, &first_sub); + int l_sub = 0; + int l_piv = 0; + int sub_c_index = first_sub->c_index; + int piv_c_index = first_piv->c_index; + int tmp_lag = first_sub->lag_index; + while (l_sub < (nb_var_sub/*=NRow(row)*/) || l_piv < nb_var_piv) { - // There is an nonzero element at row pivot but not at the current row=> insert a negative element in the current row - tmp_u_count = Get_u(); - lag = first_piv->c_index/Size-row/Size; - //#pragma omp critical - { - Insert(row, first_piv->c_index, tmp_u_count, lag); - } - u[tmp_u_count] = -u[first_piv->u_index]*first_elem; - if (symbolic) + if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv)) { - if (record) - { - if (nop+2 >= nopa) - { - nopa = long (mem_increasing_factor*(double) nopa); - save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); - } - save_op_s_l = (t_save_op_s *) (&(save_op[nop])); - save_op_s_l->operat = IFLESS; - save_op_s_l->first = tmp_u_count; - save_op_s_l->second = first_piv->u_index; - save_op_s_l->lag = max(first_piv->lag_index, abs(tmp_lag)); - } - nop += 3; - } - first_piv = first_piv->NZE_R_N; - if (first_piv) - piv_c_index = first_piv->c_index; - else - piv_c_index = Size*periods; - l_piv++; - } - else /*first_sub->c_index==first_piv->c_index*/ - { - if (i == sub_c_index) - { - NonZeroElem *firsta = first; - NonZeroElem *first_suba = first_sub->NZE_R_N; - Delete(first_sub->r_index, first_sub->c_index); - first = firsta->NZE_C_N; - first_sub = first_suba; - if (first_sub) - sub_c_index = first_sub->c_index; - else - sub_c_index = Size*periods; - l_sub++; - first_piv = first_piv->NZE_R_N; - if (first_piv) - piv_c_index = first_piv->c_index; - else - piv_c_index = Size*periods; - l_piv++; - } - else - { - u[first_sub->u_index] -= u[first_piv->u_index]*first_elem; - if (symbolic) - { - if (record) - { - if (nop+3 >= nopa) - { - nopa = long (mem_increasing_factor*(double) nopa); - save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); - } - save_op_s_l = (t_save_op_s *) (&(save_op[nop])); - save_op_s_l->operat = IFSUB; - save_op_s_l->first = first_sub->u_index; - save_op_s_l->second = first_piv->u_index; - save_op_s_l->lag = max(abs(tmp_lag), first_piv->lag_index); - } - nop += 3; - } + //There is no nonzero element at row pivot for this column=> Nothing to do for the current element got to next column first_sub = first_sub->NZE_R_N; if (first_sub) sub_c_index = first_sub->c_index; else sub_c_index = Size*periods; l_sub++; + } + else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub) + { + // There is an nonzero element at row pivot but not at the current row=> insert a negative element in the current row + tmp_u_count = Get_u(); + lag = first_piv->c_index/Size-row/Size; + //#pragma omp critical + { + Insert(row, first_piv->c_index, tmp_u_count, lag); + } + u[tmp_u_count] = -u[first_piv->u_index]*first_elem; + if (nop+2 >= nopa) + { + nopa = long (mem_increasing_factor*(double) nopa); + save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); + } + save_op_s_l = (t_save_op_s *) (&(save_op[nop])); + save_op_s_l->operat = IFLESS; + save_op_s_l->first = tmp_u_count; + save_op_s_l->second = first_piv->u_index; + save_op_s_l->lag = max(first_piv->lag_index, abs(tmp_lag)); + nop += 3; first_piv = first_piv->NZE_R_N; if (first_piv) piv_c_index = first_piv->c_index; @@ -2884,29 +5639,200 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool piv_c_index = Size*periods; l_piv++; } - } - } - u[b[row]] -= u[b[pivj]]*first_elem; - - if (symbolic) - { - if (record) - { - if (nop+3 >= nopa) + else /*first_sub->c_index==first_piv->c_index*/ { - nopa = long (mem_increasing_factor*(double) nopa); - save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); + if (i == sub_c_index) + { + NonZeroElem *firsta = first; + NonZeroElem *first_suba = first_sub->NZE_R_N; + //#pragma omp critical + { + Delete(first_sub->r_index, first_sub->c_index); + } + first = firsta->NZE_C_N; + first_sub = first_suba; + if (first_sub) + sub_c_index = first_sub->c_index; + else + sub_c_index = Size*periods; + l_sub++; + first_piv = first_piv->NZE_R_N; + if (first_piv) + piv_c_index = first_piv->c_index; + else + piv_c_index = Size*periods; + l_piv++; + } + else + { + u[first_sub->u_index] -= u[first_piv->u_index]*first_elem; + if (nop+3 >= nopa) + { + nopa = long (mem_increasing_factor*(double) nopa); + save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); + } + save_op_s_l = (t_save_op_s *) (&(save_op[nop])); + save_op_s_l->operat = IFSUB; + save_op_s_l->first = first_sub->u_index; + save_op_s_l->second = first_piv->u_index; + save_op_s_l->lag = max(abs(tmp_lag), first_piv->lag_index); + nop += 3; + first_sub = first_sub->NZE_R_N; + if (first_sub) + sub_c_index = first_sub->c_index; + else + sub_c_index = Size*periods; + l_sub++; + first_piv = first_piv->NZE_R_N; + if (first_piv) + piv_c_index = first_piv->c_index; + else + piv_c_index = Size*periods; + l_piv++; + } } - save_op_s_l = (t_save_op_s *) (&(save_op[nop])); - save_op_s_l->operat = IFSUB; - save_op_s_l->first = b[row]; - save_op_s_l->second = b[pivj]; - save_op_s_l->lag = abs(tmp_lag); } + u[b[row]] -= u[b[pivj]]*first_elem; + + if (nop+3 >= nopa) + { + nopa = long (mem_increasing_factor*(double) nopa); + save_op = (int *) mxRealloc(save_op, nopa*sizeof(int)); + } + save_op_s_l = (t_save_op_s *) (&(save_op[nop])); + save_op_s_l->operat = IFSUB; + save_op_s_l->first = b[row]; + save_op_s_l->second = b[pivj]; + save_op_s_l->lag = abs(tmp_lag); + nop += 3; + } + } + else if(symbolic) + { + nop += 2; + if (piv_abs < eps) + { + ostringstream tmp; + if (Block_number > 1) + tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system in block " << Block_number+1 << "\n"; + else + tmp << " in Solve_ByteCode_Symbolic_Sparse_GaussianElimination, singular system\n"; + throw FatalExceptionHandling(tmp.str()); + } + /*divide all the non zeros elements of the line pivj by the max_pivot*/ + int nb_var = At_Row(pivj, &first); + for (int j = 0; j < nb_var; j++) + { + u[first->u_index] /= piv; + first = first->NZE_R_N; + } + nop += nb_var*2; + u[b[pivj]] /= piv; + nop += 2; + /*substract the elements on the non treated lines*/ + nb_eq = At_Col(i, &first); + NonZeroElem *first_piva; + int nb_var_piva = At_Row(pivj, &first_piva); + + int nb_eq_todo = 0; + for (int j = 0; j < nb_eq && first; j++) + { + if (!line_done[first->r_index]) + bc[nb_eq_todo++] = first; + first = first->NZE_C_N; + } +//#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop) + for (int j = 0; j < nb_eq_todo; j++) + { + NonZeroElem *first = bc[j]; + int row = first->r_index; + double first_elem = u[first->u_index]; + nop += 2; + int nb_var_piv = nb_var_piva; + NonZeroElem *first_piv = first_piva; + NonZeroElem *first_sub; + int nb_var_sub = At_Row(row, &first_sub); + int l_sub = 0; + int l_piv = 0; + int sub_c_index = first_sub->c_index; + int piv_c_index = first_piv->c_index; + while (l_sub < (nb_var_sub /*= NRow(row)*/) || l_piv < nb_var_piv) + { + if (l_sub < nb_var_sub && (sub_c_index < piv_c_index || l_piv >= nb_var_piv)) + { + //There is no nonzero element at row pivot for this column=> Nothing to do for the current element got to next column + first_sub = first_sub->NZE_R_N; + if (first_sub) + sub_c_index = first_sub->c_index; + else + sub_c_index = Size*periods; + l_sub++; + } + else if (sub_c_index > piv_c_index || l_sub >= nb_var_sub) + { + // There is an nonzero element at row pivot but not at the current row=> insert a negative element in the current row + tmp_u_count = Get_u(); + lag = first_piv->c_index/Size-row/Size; + //#pragma omp critical + { + Insert(row, first_piv->c_index, tmp_u_count, lag); + } + u[tmp_u_count] = -u[first_piv->u_index]*first_elem; + nop += 3; + first_piv = first_piv->NZE_R_N; + if (first_piv) + piv_c_index = first_piv->c_index; + else + piv_c_index = Size*periods; + l_piv++; + } + else /*first_sub->c_index==first_piv->c_index*/ + { + if (i == sub_c_index) + { + NonZeroElem *firsta = first; + NonZeroElem *first_suba = first_sub->NZE_R_N; + //#pragma omp critical + { + Delete(first_sub->r_index, first_sub->c_index); + } + first = firsta->NZE_C_N; + first_sub = first_suba; + if (first_sub) + sub_c_index = first_sub->c_index; + else + sub_c_index = Size*periods; + l_sub++; + first_piv = first_piv->NZE_R_N; + if (first_piv) + piv_c_index = first_piv->c_index; + else + piv_c_index = Size*periods; + l_piv++; + } + else + { + u[first_sub->u_index] -= u[first_piv->u_index]*first_elem; + nop += 3; + first_sub = first_sub->NZE_R_N; + if (first_sub) + sub_c_index = first_sub->c_index; + else + sub_c_index = Size*periods; + l_sub++; + first_piv = first_piv->NZE_R_N; + if (first_piv) + piv_c_index = first_piv->c_index; + else + piv_c_index = Size*periods; + l_piv++; + } + } + } + u[b[row]] -= u[b[pivj]]*first_elem; nop += 3; } } - mxFree(bc); } if (symbolic) { @@ -2919,12 +5845,32 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool } else if (record && (nop == nop1)) { - if (save_opa && save_opaa) + if (t > int(periods*0.35)) + { + symbolic = false; + if (save_opaa) + { + mxFree(save_opaa); + save_opaa = NULL; + } + if (save_opa) + { + mxFree(save_opa); + save_opa = NULL; + } + if (save_op) + { + mxFree(save_op); + save_op = NULL; + } + } + else if (save_opa && save_opaa) { if (compare(save_op, save_opa, save_opaa, t, periods, nop, Size)) { tbreak = t; tbreak_g = tbreak; + //mexPrintf("time=%f\n",(1000.0*(double (clock())-double (time11)))/double (CLOCKS_PER_SEC)); break; } } @@ -2935,16 +5881,9 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool mxFree(save_opaa); save_opaa = NULL; } - save_opaa = (int *) mxMalloc(nop1*sizeof(int)); - memcpy(save_opaa, save_opa, nop1*sizeof(int)); + save_opaa = save_opa; } - if (save_opa) - { - mxFree(save_opa); - save_opa = NULL; - } - save_opa = (int *) mxMalloc(nop*sizeof(int)); - memcpy(save_opa, save_op, nop*sizeof(int)); + save_opa = save_op; } else { @@ -2968,11 +5907,16 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool nop2 = nop1; nop1 = nop; } + //mexPrintf("time=%f\n",(1000.0*(double (clock())-double (time11)))/double (CLOCKS_PER_SEC)); } + mxFree(bc); mxFree(piv_v); mxFree(pivj_v); mxFree(pivk_v); mxFree(NR); + /*mexPrintf("tbreak=%d, periods=%d time required=%f\n",tbreak,periods, (1000.0*(double (clock())-double (time00)))/double (CLOCKS_PER_SEC)); + mexEvalString("drawnow;"); + time00 = clock();*/ nop_all += nop; if (symbolic) { @@ -2986,103 +5930,289 @@ SparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool /*The backward substitution*/ double slowc_lbx = slowc; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int i = 0; i < y_size*(periods+y_kmin); i++) ya[i] = y[i]; slowc_save = slowc; bksub(tbreak, last_period, Size, slowc_lbx); + /*mexPrintf("remaining operations and bksub time required=%f\n",tbreak,periods, (1000.0*(double (clock())-double (time00)))/double (CLOCKS_PER_SEC)); + mexEvalString("drawnow;");*/ End_GE(Size); } -bool -SparseMatrix::Simulate_Newton_One_Boundary(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, bool print_it, bool cvg, int &iter, bool steady_state, int stack_solve_algo, int solve_algo) + +void +dynSparseMatrix::Grad_f_product(int n, mxArray *b_m, double* vectr, mxArray *A_m, SuiteSparse_long *Ap, SuiteSparse_long *Ai, double* Ax, double* b_) { - int i, j; - mxArray *b_m = NULL, *A_m = NULL, *x0_m = NULL; - Clear_u(); - error_not_printed = true; - bool singular_system = false; - u_count_alloc_save = u_count_alloc; - if (isnan(res1) || isinf(res1) || (res2 > 12*g0 && iter > 0)) + if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state)) { - if (iter == 0 || fabs(slowc_save) < 1e-8) + NonZeroElem *first; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) private(first) +#endif + for (int i = 0; i < n; i++) { - for (j = 0; j < y_size; j++) - { -#ifdef DEBUG - bool select = false; -#endif - for (int i = 0; i < Size; i++) - if (j == index_vara[i]) - { -#ifdef DEBUG - select = true; -#endif - break; - } -#ifdef DEBUG - if (select) - mexPrintf("-> variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]); - else - mexPrintf(" variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]); -#endif - } - if (steady_state) - { - if (iter == 0) - mexPrintf(" the initial values of endogenous variables are too far from the solution.\nChange them!\n"); - else - mexPrintf(" dynare cannot improve the simulation in block %d at time %d (variable %d)\n", blck+1, it_+1, index_vara[max_res_idx]+1); - mexEvalString("drawnow;"); - return singular_system; - } - else + double sum = 0; + first = FNZE_R[i]; + if (first) + for (int k = 0; k < NbNZRow[i]; k++) + { + sum += u[first->u_index] * u[b[first->c_index]]; + first = first->NZE_R_N; + } + vectr[i] = sum; + } + } + else + { + if (!((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state))) + { + mwIndex *Ai = mxGetIr(A_m); + if (!Ai) { ostringstream tmp; - if (iter == 0) - tmp << " in Simulate_Newton_One_Boundary, The initial values of endogenous variables are too far from the solution.\nChange them!\n"; - else - tmp << " in Simulate_Newton_One_Boundary, Dynare cannot improve the simulation in block " << blck+1 << " at time " << it_+1 << " (variable " << index_vara[max_res_idx]+1 << "%d)\n"; + tmp << " in Init_Matlab_Sparse_Simple, can't allocate Ai index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + mwIndex *Aj = mxGetJc(A_m); + if (!Aj) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse_Simple, can't allocate Aj index vector\n"; + throw FatalExceptionHandling(tmp.str()); + } + double *A = mxGetPr(A_m); + if (!A) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse_Simple, can't retrieve A matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } + b_ = mxGetPr(b_m); + if (!b_) + { + ostringstream tmp; + tmp << " in Init_Matlab_Sparse_Simple, can't retrieve b matrix\n"; throw FatalExceptionHandling(tmp.str()); } } - if (!(isnan(res1) || isinf(res1)) && !(isnan(g0) || isinf(g0))) - { - if (try_at_iteration == 0) - { - prev_slowc_save = slowc_save; - slowc_save = max(-gp0 / (2 * (res2 - g0 - gp0)), 0.1); - } - else - { - double t1 = res2 - gp0 * slowc_save - g0; - double t2 = glambda2 - gp0 * prev_slowc_save - g0; - double a = (1/(slowc_save * slowc_save) * t1 - 1/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save); - double b = (-prev_slowc_save/(slowc_save * slowc_save) * t1 + slowc_save/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save); - prev_slowc_save = slowc_save; - slowc_save = max(min(-b + sqrt(b*b - 3 * a * gp0) / (3 * a), 0.5 * slowc_save), 0.1 * slowc_save); - } - glambda2 = res2; - try_at_iteration++; - } - else + memset(vectr, 0, n * sizeof(double)); +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) /*shared(vectr)*/ +#endif + for (int i = 0; i < n; i++) + for (SuiteSparse_long j = Ap[i]; j < Ap[i+1]; j++) + vectr[Ai[j]] += Ax[j] * b_[i]; + } +} + +void +dynSparseMatrix::Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old) +{ + double top = 1.0; + double bottom = 0.1; + //mexPrintf("res2=%f > g0=%f, res1=%f, iter=%d it_=%d\n", res2, g0, res1, iter, it_); + if (isnan(res1) || isinf(res1) || (res2 > g0 && iter > 0)) + { + while ((isnan(res1) || isinf(res1))) { prev_slowc_save = slowc_save; slowc_save /= 1.1; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < size; i++) + { + int eq = index_vara[i]; + y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size]; + } + /*mexPrintf("reducing solwc_save = %e, it_=%d, y_size=%d, size=%d, y[%d]=%e, ya[%d]=%e,\n y[%d]=%e, ya[%d]=%e\n",slowc_save, it_, y_size, size-1, index_vara[0]+it_*y_size, y[index_vara[0]+it_*y_size], index_vara[0]+it_*y_size, ya[index_vara[0]+it_*y_size] + , index_vara[size-1]+it_*y_size, y[index_vara[size-1]+it_*y_size], index_vara[size-1]+it_*y_size, ya[index_vara[size-1]+it_*y_size]);*/ + //mexPrintf("->slowc_save=%f\n",slowc_save); + compute_complete(true, res1, res2, max_res, max_res_idx); } - if (print_it) + + while (res2 > g0 && slowc_save > 1e-1) + { + prev_slowc_save = slowc_save; + slowc_save /= 1.5; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < size; i++) + { + int eq = index_vara[i]; + y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size]; + } + /*mexPrintf("reducing solwc_save = %e, it_=%d, y_size=%d, size=%d, y[%d]=%e, ya[%d]=%e,\n y[%d]=%e, ya[%d]=%e\n",slowc_save, it_, y_size, size-1, index_vara[0]+it_*y_size, y[index_vara[0]+it_*y_size], index_vara[0]+it_*y_size, ya[index_vara[0]+it_*y_size] , index_vara[size-1]+it_*y_size, y[index_vara[size-1]+it_*y_size], index_vara[size-1]+it_*y_size, ya[index_vara[size-1]+it_*y_size]);*/ + //mexPrintf("->slowc_save=%f\n",slowc_save); + compute_complete(true, res1, res2, max_res, max_res_idx); + } + double ax = slowc_save-0.001, bx = slowc_save+0.001, cx = slowc_save, fa, fb, fc, xmin; + if (false/*slowc_save > 2e-1*/) + if (mnbrak(&ax, &bx, &cx, &fa, &fb, &fc)) + if (golden(ax, bx, cx, 1e-1, solve_tolf, &xmin)) + slowc_save = xmin; + //mexPrintf("cx=%f\n", cx); + //mexPrintf("ax= %f, bx=%f, cx=%f, fa=%f, fb=%f, fc=%d\n", ax, bx, cx, fa, fb, fc); + + //if (!(isnan(res1) || isinf(res1))/* && !(isnan(g0) || isinf(g0))*//*|| (res2 > g0 && iter > 1)*/) + if (false) + { + + double *p = (double*)mxMalloc(size * sizeof(double)); + Grad_f_product(size, b_m_save, p, A_m_save, Ap_save, Ai_save, Ax_save, b_save); + double slope=0.0; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) reduction(+:slope) +#endif + for (int i = 1; i < size; i++) + slope += - direction[i] * p[i]; + /*if (slope > 0) + mexPrintf("Roundoff in lnsearch\n"); + else*/ + { + prev_slowc_save = 1; + double crit_opt = res2/2; + double max_try_iteration = 100; + double small_ = 1.0e-4; + bool try_at_cvg = false; + while ((try_at_iteration < max_try_iteration) && (!try_at_cvg) && (abs(prev_slowc_save - slowc_save) > 1e-10)) + { + crit_opt = res2 / 2; + if (slowc_save < 1e-7) + { + try_at_cvg = true; + continue; + } + else if ((crit_opt <= crit_opt_old + small_ * slowc_save * slope) && !(isnan(res1) || isinf(res1))) + { + try_at_cvg = true; + continue; + } + else if (try_at_iteration == 0) + { + prev_slowc_save = slowc_save; + //slowc_save = max(- top * slope / ( (crit_opt - crit_opt_old - slope)), bottom); + slowc_save /= 1.2; + } + else + { + double t1 = crit_opt - slope * slowc_save - crit_opt_old; + double t2 = glambda2 - slope * prev_slowc_save - crit_opt_old; + double a = (1/(slowc_save * slowc_save) * t1 - 1/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save); + double b = (-prev_slowc_save/(slowc_save * slowc_save) * t1 + slowc_save/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save); + if (a == 0) + slowc_save = max(min( - slope/(2 * b) , top * slowc_save), bottom * slowc_save); + else + { + double delta = b*b - 3 * a * slope; + if (delta <= 0) + slowc_save = top * slowc_save; + else if (b <= 0) + slowc_save = max(min(-b + sqrt(delta) / (3 * a), top * slowc_save), bottom * slowc_save); + else + slowc_save = max(min(- slope / (b + sqrt(delta)), top * slowc_save), bottom * slowc_save); + } + } + if (abs(prev_slowc_save - slowc_save) < 1e-10) + slowc_save /= 1.1; + //mexPrintf("=>slowc_save=%f, prev_slowc_save=%f\n",slowc_save, prev_slowc_save); + prev_slowc_save = slowc_save; + glambda2 = crit_opt; + try_at_iteration++; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < size; i++) + { + int eq = index_vara[i]; + y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size]; + } + compute_complete(true, res1, res2, max_res, max_res_idx); + } + } + mxFree(p); + } + //if (print_it) mexPrintf("Error: Simulation diverging, trying to correct it using slowc=%f\n", slowc_save); - for (i = 0; i < y_size; i++) - y[i+it_*y_size] = ya[i+it_*y_size] + slowc_save*direction[i+it_*y_size]; - iter--; - return singular_system; +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < size; i++) + { + int eq = index_vara[i]; + y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size]; + } + compute_complete(false, res1, res2, max_res, max_res_idx); } - if (cvg) + else { - return singular_system; + //mexPrintf("slowc_save=%f res1=%f\n",slowc_save, res1); + for (int i = 0; i < size; i++) + { + int eq = index_vara[i]; + y[eq+it_*y_size] = ya[eq+it_*y_size] + slowc_save * direction[eq+it_*y_size]; + } + } + slowc_save = slowc; +} + +bool +dynSparseMatrix::Simulate_One_Boundary(int block_num, int y_size, int y_kmin, int y_kmax, int size, bool cvg) +{ + //int i; + mxArray *b_m = NULL, *A_m = NULL, *x0_m = NULL; + SuiteSparse_long *Ap = NULL, *Ai = NULL; + double *Ax = NULL, *b = NULL; + + + try_at_iteration = 0; + Clear_u(); + bool singular_system = false; + u_count_alloc_save = u_count_alloc; + + if (isnan(res1) || isinf(res1)) + { +#ifdef DEBUG + for (int j = 0; j < y_size; j++) + { + bool select = false; + for (int i = 0; i < size; i++) + if (j == index_vara[i]) + { + select = true; + break; + } + if (select) + mexPrintf("-> variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]); + else + mexPrintf(" variable %s (%d) at time %d = %f direction = %f\n", get_variable(eEndogenous, j).c_str(), j+1, it_, y[j+it_*y_size], direction[j+it_*y_size]); + } +#endif + if (steady_state) + { + if (iter == 0) + mexPrintf(" the initial values of endogenous variables are too far from the solution.\nChange them!\n"); + else + mexPrintf(" dynare cannot improve the simulation in block %d at time %d (variable %d)\n", block_num+1, it_+1, index_vara[max_res_idx]+1); + mexEvalString("drawnow;"); + //return singular_system; + } + else + { + ostringstream tmp; + if (iter == 0) + tmp << " in Simulate_One_Boundary, The initial values of endogenous variables are too far from the solution.\nChange them!\n"; + else + tmp << " in Simulate_One_Boundary, Dynare cannot improve the simulation in block " << block_num+1 << " at time " << it_+1 << " (variable " << index_vara[max_res_idx]+1 << "%d)\n"; + throw FatalExceptionHandling(tmp.str()); + } } if (print_it) { - //mexPrintf("solwc=%f g0=%f res2=%f glambda2=%f\n",slowc_save,g0, res2, glambda2); if (steady_state) { switch (solve_algo) @@ -3125,36 +6255,60 @@ SparseMatrix::Simulate_Newton_One_Boundary(int blck, int y_size, int it_, int y_ mexPrintf("-----------------------------------\n"); } bool zero_solution; + if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state)) - Simple_Init(Size, IM_i, zero_solution); + Simple_Init(size, IM_i, zero_solution); else { - b_m = mxCreateDoubleMatrix(Size, 1, mxREAL); + b_m = mxCreateDoubleMatrix(size, 1, mxREAL); if (!b_m) { ostringstream tmp; - tmp << " in Simulate_Newton_One_Boundary, can't allocate b_m vector\n"; + tmp << " in Simulate_One_Boundary, can't allocate b_m vector\n"; throw FatalExceptionHandling(tmp.str()); } - A_m = mxCreateSparse(Size, Size, min(int (IM_i.size()*2), Size*Size), mxREAL); + A_m = mxCreateSparse(size, size, min(int (IM_i.size()*2), size * size), mxREAL); if (!A_m) { ostringstream tmp; - tmp << " in Simulate_Newton_One_Boundary, can't allocate A_m matrix\n"; + tmp << " in Simulate_One_Boundary, can't allocate A_m matrix\n"; throw FatalExceptionHandling(tmp.str()); } - x0_m = mxCreateDoubleMatrix(Size, 1, mxREAL); + x0_m = mxCreateDoubleMatrix(size, 1, mxREAL); if (!x0_m) { ostringstream tmp; - tmp << " in Simulate_Newton_One_Boundary, can't allocate x0_m vector\n"; + tmp << " in Simulate_One_Boundary, can't allocate x0_m vector\n"; throw FatalExceptionHandling(tmp.str()); } - Init_Matlab_Sparse_Simple(Size, IM_i, A_m, b_m, zero_solution, x0_m); + if (!((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 4) && !steady_state))) + { + Init_Matlab_Sparse_Simple(size, IM_i, A_m, b_m, zero_solution, x0_m); + A_m_save = mxDuplicateArray(A_m); + b_m_save = mxDuplicateArray(b_m); + } + else + { + Init_UMFPACK_Sparse_Simple(size, IM_i, &Ap, &Ai, &Ax, &b, zero_solution, x0_m); + if (Ap_save[size] != Ap[size]) + { + mxFree(Ai_save); + mxFree(Ax_save); + Ai_save = (SuiteSparse_long*)mxMalloc(Ap[size] * sizeof(SuiteSparse_long)); + Ax_save = (double*)mxMalloc(Ap[size] * sizeof(double)); + } + memcpy(Ap_save, Ap, (size + 1) * sizeof(SuiteSparse_long)); + memcpy(Ai_save, Ai, Ap[size] * sizeof(SuiteSparse_long)); + memcpy(Ax_save, Ax, Ap[size] * sizeof(double)); + memcpy(b_save, b, size * sizeof(double)); + } } if (zero_solution) { - for (int i = 0; i < Size; i++) +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif + for (int i = 0; i < size; i++) { int eq = index_vara[i]; double yy = -(y[eq+it_*y_size]); @@ -3165,27 +6319,173 @@ SparseMatrix::Simulate_Newton_One_Boundary(int blck, int y_size, int it_, int y_ else { if ((solve_algo == 5 && steady_state) || (stack_solve_algo == 5 && !steady_state)) - singular_system = Solve_ByteCode_Sparse_GaussianElimination(Size, blck, steady_state, it_); + singular_system = Solve_ByteCode_Sparse_GaussianElimination(size, block_num, it_); else if ((solve_algo == 7 && steady_state) || (stack_solve_algo == 2 && !steady_state)) - Solve_Matlab_GMRES(A_m, b_m, Size, slowc, blck, false, it_, steady_state, x0_m); + Solve_Matlab_GMRES(A_m, b_m, size, slowc, block_num, false, it_, x0_m); else if ((solve_algo == 8 && steady_state) || (stack_solve_algo == 3 && !steady_state)) - Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, false, it_, x0_m, steady_state); - else if ((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1) && !steady_state)) - Solve_Matlab_LU_UMFPack(A_m, b_m, Size, slowc, false, it_); + Solve_Matlab_BiCGStab(A_m, b_m, size, slowc, block_num, false, it_, x0_m, 1); + else if ((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state)) + Solve_LU_UMFPack(Ap, Ai, Ax, b, size, size, slowc, true, 0); } return singular_system; } -void -SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, int periods, bool print_it, bool cvg, int &iter, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names) + + + +bool +dynSparseMatrix::solve_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size, const int iter) { + bool cvg = false; + double crit_opt_old = res2/2; + compute_complete(false, res1, res2, max_res, max_res_idx); + cvg = (max_res < solve_tolf); + if (!cvg || isnan(res1) || isinf(res1)) + { + if (iter) + Check_and_Correct_Previous_Iteration(block_num, y_size, size, crit_opt_old); + bool singular_system = Simulate_One_Boundary(block_num, y_size, y_kmin, y_kmax, size, cvg); + if (singular_system) + Singular_display(block_num, size); + } + return cvg; +} + +void +dynSparseMatrix::solve_non_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size) + +{ + max_res_idx = 0; + bool cvg = false; + iter = 0; + glambda2 = g0 = very_big; + //try_at_iteration = 0; + while ((!cvg) && (iter < maxit_)) + { + cvg = solve_linear(block_num, y_size, y_kmin, y_kmax, size, iter); + g0 = res2; + iter++; + } + if (!cvg) + { + ostringstream tmp; + if (steady_state) + tmp << " in Solve Forward complete, convergence not achieved in block " << block_num+1 << ", after " << iter << " iterations\n"; + else + tmp << " in Solve Forward complete, convergence not achieved in block " << block_num+1 << ", at time " << it_ << ", after " << iter << " iterations\n"; + throw FatalExceptionHandling(tmp.str()); + } +} + +void +dynSparseMatrix::Simulate_Newton_One_Boundary(const bool forward) +{ + g1 = (double *) mxMalloc(size*size*sizeof(double)); + r = (double *) mxMalloc(size*sizeof(double)); + //mexPrintf("Simulate_Newton_One_Boundary, block_num=%d, size=%d, steady=%d, forward=%d, iter=%d, is_linear=%d\n", block_num, size, steady_state, forward, iter, is_linear); + iter = 0; + if ((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state)) + { + Ap_save = (SuiteSparse_long*)mxMalloc((size + 1) * sizeof(SuiteSparse_long)); + Ap_save[size] = 0; + Ai_save = (SuiteSparse_long*)mxMalloc(1 * sizeof(SuiteSparse_long)); + Ax_save = (double*)mxMalloc(1 * sizeof(double)); + b_save = (double*)mxMalloc((size) * sizeof(SuiteSparse_long)); + } + if (steady_state) + { + it_ = 0; + if (!is_linear) + solve_non_linear(block_num, y_size, 0, 0, size); + else + solve_linear(block_num, y_size, 0, 0, size, 0); + } + else if (forward) + { + if (!is_linear) + { + for (it_ = y_kmin; it_ < periods+y_kmin; it_++) + solve_non_linear(block_num, y_size, y_kmin, y_kmax, size); + } + else + { + for (int it_ = y_kmin; it_ < periods+y_kmin; it_++) + solve_linear(block_num, y_size, y_kmin, y_kmax, size, 0); + } + } + else + { + if (!is_linear) + { + for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--) + solve_non_linear(block_num, y_size, y_kmin, y_kmax, size); + } + else + { + for (it_ = periods+y_kmin-1; it_ >= y_kmin; it_--) + solve_linear(block_num, y_size, y_kmin, y_kmax, size, 0); + } + } + if ((solve_algo == 6 && steady_state) || ((stack_solve_algo == 0 || stack_solve_algo == 1 || stack_solve_algo == 4) && !steady_state)) + { + mxFree(Ap_save); + mxFree(Ai_save); + mxFree(Ax_save); + mxFree(b_save); + } + mxFree(g1); + mxFree(r); +} + +string +dynSparseMatrix::preconditioner_print_out(string s, int preconditioner) +{ + int n = s.length(); + string tmp = ", preconditioner="; + switch(preconditioner) + { + case 0: + tmp.append("Jacobi on dynamic jacobian"); + break; + case 1: + tmp.append("incomplet lu0 on dynamic jacobian"); + break; + case 2: + tmp.append("incomplet lut on dynamic jacobian"); + break; + case 3: + tmp.append("lu on static jacobian"); + break; + } + s.insert(n - 2, tmp); + return s; +} + +void +dynSparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names) +{ + double top = 0.5; + double bottom = 0.1; +#ifdef CUDA + int nnz, nnz_tild; + int *Ap_i, *Ai_i; + int *Ap_i_tild, *Ai_i_tild; + double *x0, *A_tild; + +#endif + int preconditioner = 2; if (start_compare == 0) start_compare = y_kmin; u_count_alloc_save = u_count_alloc; clock_t t1 = clock(); nop1 = 0; - error_not_printed = true; mxArray *b_m = NULL, *A_m = NULL, *x0_m = NULL; + double *Ax = NULL, *b; + SuiteSparse_long *Ap = NULL, *Ai = NULL; + + + + if (iter > 0) { if (print_it) @@ -3222,7 +6522,6 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int Error << " in Simulate_Newton_Two_Boundaries, the initial values of endogenous variables are too far from the solution.\nChange them!\n"; else Error << " in Simulate_Newton_Two_Boundaries, dynare cannot improve the simulation in block " << blck+1 << " at time " << it_+1 << " (variable " << index_vara[max_res_idx]+1 << ")\n"; - //Error << filename << " stopped"; throw FatalExceptionHandling(Error.str()); } if (!(isnan(res1) || isinf(res1)) && !(isnan(g0) || isinf(g0)) && (stack_solve_algo == 4 || stack_solve_algo == 5)) @@ -3230,7 +6529,7 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int if (try_at_iteration == 0) { prev_slowc_save = slowc_save; - slowc_save = max(-gp0 / (2 * (res2 - g0 - gp0)), 0.1); + slowc_save = max(-gp0 / (2 * (res2 - g0 - gp0)), bottom); } else { @@ -3239,12 +6538,15 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int double a = (1/(slowc_save * slowc_save) * t1 - 1/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save); double b = (-prev_slowc_save/(slowc_save * slowc_save) * t1 + slowc_save/(prev_slowc_save * prev_slowc_save) * t2) / (slowc_save - prev_slowc_save); prev_slowc_save = slowc_save; - slowc_save = max(min(-b + sqrt(b*b - 3 * a * gp0) / (3 * a), 0.5 * slowc_save), 0.1 * slowc_save); + slowc_save = max(min(-b + sqrt(b*b - 3 * a * gp0) / (3 * a), top * slowc_save), bottom * slowc_save); } glambda2 = res2; try_at_iteration++; - if (slowc_save <= 0.1) + if (slowc_save <= bottom) { +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int i = 0; i < y_size*(periods+y_kmin); i++) y[i] = ya[i]+direction[i]; g0 = res2; @@ -3266,13 +6568,14 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int else mexPrintf("Simulation diverging, trying to correct it using slowc=%f\n", slowc_save); } - +#ifdef USE_OMP +#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) +#endif for (int i = 0; i < y_size*(periods+y_kmin); i++) y[i] = ya[i]+slowc_save*direction[i]; iter--; return; } - u_count += u_count_init; if (stack_solve_algo == 5) { @@ -3322,10 +6625,10 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int mexPrintf("MODEL SIMULATION: (method=Relaxation)\n"); break; case 2: - mexPrintf("MODEL SIMULATION: (method=GMRES)\n"); + mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=GMRES)\n", preconditioner).c_str()); break; case 3: - mexPrintf("MODEL SIMULATION: (method=BiCGStab)\n"); + mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=BiCGStab)\n", preconditioner).c_str()); break; case 4: mexPrintf("MODEL SIMULATION: (method=Sparse LU & optimal path length)\n"); @@ -3333,6 +6636,9 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int case 5: mexPrintf("MODEL SIMULATION: (method=ByteCode own solver)\n"); break; + case 7: + mexPrintf(preconditioner_print_out("MODEL SIMULATION: (method=GPU BiCGStab)\n", preconditioner).c_str()); + break; default: mexPrintf("MODEL SIMULATION: (method=Unknown - %d - )\n", stack_solve_algo); } @@ -3369,26 +6675,44 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int tmp << " in Simulate_Newton_Two_Boundaries, can't allocate x0_m vector\n"; throw FatalExceptionHandling(tmp.str()); } - A_m = mxCreateSparse(periods*Size, periods*Size, IM_i.size()* periods*2, mxREAL); - if (!A_m) + if (stack_solve_algo != 0 && stack_solve_algo != 4 && stack_solve_algo != 7) { - ostringstream tmp; - tmp << " in Simulate_Newton_Two_Boundaries, can't allocate A_m matrix\n"; - throw FatalExceptionHandling(tmp.str()); + A_m = mxCreateSparse(periods*Size, periods*Size, IM_i.size()* periods*2, mxREAL); + if (!A_m) + { + ostringstream tmp; + tmp << " in Simulate_Newton_Two_Boundaries, can't allocate A_m matrix\n"; + throw FatalExceptionHandling(tmp.str()); + } } - Init_Matlab_Sparse(periods, y_kmin, y_kmax, Size, IM_i, A_m, b_m, x0_m); + if (stack_solve_algo == 0 || stack_solve_algo == 4) + Init_UMFPACK_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap, &Ai, &Ax, &b, x0_m); +#ifdef CUDA + else if (stack_solve_algo == 7) + Init_CUDA_Sparse(periods, y_kmin, y_kmax, Size, IM_i, &Ap_i, &Ai_i, &Ax, &Ap_i_tild, &Ai_i_tild, &A_tild, &b, &x0, x0_m, &nnz, &nnz_tild, preconditioner); +#endif + else + Init_Matlab_Sparse(periods, y_kmin, y_kmax, Size, IM_i, A_m, b_m, x0_m); + } + //if (iter > 0) + /*mexPrintf("--> stack_solve_algo=%d\n", stack_solve_algo); + mexEvalString("drawnow;");*/ if (stack_solve_algo == 0 || stack_solve_algo == 4) - Solve_Matlab_LU_UMFPack(A_m, b_m, Size, slowc, true, 0); + Solve_LU_UMFPack(Ap, Ai, Ax, b, Size * periods, Size, slowc, true, 0); else if (stack_solve_algo == 1) Solve_Matlab_Relaxation(A_m, b_m, Size, slowc, true, 0); else if (stack_solve_algo == 2) - Solve_Matlab_GMRES(A_m, b_m, Size, slowc, blck, true, 0, false, x0_m); + Solve_Matlab_GMRES(A_m, b_m, Size, slowc, blck, true, 0, x0_m); else if (stack_solve_algo == 3) - Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, true, 0, x0_m, false); + Solve_Matlab_BiCGStab(A_m, b_m, Size, slowc, blck, true, 0, x0_m, 1); else if (stack_solve_algo == 5) Solve_ByteCode_Symbolic_Sparse_GaussianElimination(Size, symbolic, blck); +#ifdef CUDA + else if (stack_solve_algo == 7) + Solve_CUDA_BiCGStab(Ap_i, Ai_i, Ax, Ap_i_tild, Ai_i_tild, A_tild, b, x0, Size * periods, Size, slowc, true, 0, nnz, nnz_tild, preconditioner, Size * periods, blck); +#endif } if (print_it) { @@ -3396,7 +6720,21 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int mexPrintf("(** %f milliseconds **)\n", 1000.0*(double (t2) - double (t1))/double (CLOCKS_PER_SEC)); mexEvalString("drawnow;"); } + if ((!steady_state && (stack_solve_algo == 4 /*|| stack_solve_algo == 0*/))/* || steady_state*/) + { + clock_t t2 = clock(); + double ax = -0.1, bx = 1.1, cx = 0.5, fa, fb, fc, xmin; + if (!mnbrak(&ax, &bx, &cx, &fa, &fb, &fc)) + return; + //mexPrintf("ax= %f, bx=%f, cx=%f, fa=%f, fb=%f, fc=%d\n", ax, bx, cx, fa, fb, fc); + if (!golden(ax, bx, cx, 1e-1, solve_tolf, &xmin)) + return; + slowc = xmin; + clock_t t3 = clock(); + mexPrintf("(** %f milliseconds **)\n", 1000.0*(double (t3) - double (t2))/double (CLOCKS_PER_SEC)); + mexEvalString("drawnow;"); + } time00 = clock(); if (tbreak_g == 0) tbreak_g = periods; @@ -3404,7 +6742,7 @@ SparseMatrix::Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int } void -SparseMatrix::fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1) +dynSparseMatrix::fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1) { u_count = u_count_int * periods; u_count_alloc = 2*u_count; diff --git a/mex/sources/bytecode/SparseMatrix.hh b/mex/sources/bytecode/SparseMatrix.hh index 5303b186f..4f2fb2117 100644 --- a/mex/sources/bytecode/SparseMatrix.hh +++ b/mex/sources/bytecode/SparseMatrix.hh @@ -19,21 +19,62 @@ #ifndef SPARSEMATRIX_HH_INCLUDED #define SPARSEMATRIX_HH_INCLUDED +#define PRINTF_ printf -#include #include #include #include #include +#include "dynblas.h" +#if !(defined _MSC_VER) +#include "dynumfpack.h" +#endif -#ifdef OCTAVE_MEX_FILE -# define CHAR_LENGTH 1 -#else -# define CHAR_LENGTH 2 +#ifdef CUDA +#include "cuda.h" +#include "cuda_runtime_api.h" +#include "cublas_v2.h" +#include "cusparse_v2.h" #endif #include "Mem_Mngr.hh" #include "ErrorHandling.hh" +//#include "Interpreter.hh" +#include "Evaluate.hh" + +#define cudaChk(x, y) \ + { \ + cudaError_t cuda_error = x; \ + if (cuda_error != cudaSuccess) \ + { \ + ostringstream tmp; \ + tmp << y; \ + throw FatalExceptionHandling(tmp.str()); \ + } \ + }; + +#define cusparseChk(x, y) \ + { \ + cusparseStatus_t cusparse_status = x; \ + if (cusparse_status != CUSPARSE_STATUS_SUCCESS) \ + { \ + ostringstream tmp; \ + tmp << y; \ + throw FatalExceptionHandling(tmp.str()); \ + } \ + }; + +#define cublasChk(x, y) \ + { \ + cublasStatus_t cublas_status = x; \ + if (cublas_status != CUBLAS_STATUS_SUCCESS) \ + { \ + ostringstream tmp; \ + tmp << y; \ + throw FatalExceptionHandling(tmp.str()); \ + } \ + }; + #define NEW_ALLOC #define MARKOVITZ @@ -53,41 +94,76 @@ const int IFLDZ = 4; const int IFMUL = 5; const int IFSTP = 6; const int IFADD = 7; -const double eps = 1e-10; +const double eps = 1e-15; const double very_big = 1e24; const int alt_symbolic_count_max = 1; const double mem_increasing_factor = 1.1; -class SparseMatrix : public ErrorMsg + + +class dynSparseMatrix : public Evaluate { public: - SparseMatrix(); - void Simulate_Newton_Two_Boundaries(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, int periods, bool print_it, bool cvg, int &iter, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names) /*throw(ErrorHandlingException)*/; - bool Simulate_Newton_One_Boundary(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, bool print_it, bool cvg, int &iter, bool steady_state, int stack_solve_algo, int solve_algo); - void Direct_Simulate(int blck, int y_size, int it_, int y_kmin, int y_kmax, int Size, int periods, bool print_it, int iter); + #if (defined _MSC_VER) + typedef int64_t SuiteSparse_long; + #endif + dynSparseMatrix(); + dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg +#ifdef CUDA + ,const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg +#endif + ); + void Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names); + void Simulate_Newton_One_Boundary(bool forward); void fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1); - void Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool steady_state, bool two_boundaries, int stack_solve_algo, int solve_algo); + void Read_SparseMatrix(string file_name, const int Size, int periods, int y_kmin, int y_kmax, bool two_boundaries, int stack_solve_algo, int solve_algo); void Read_file(string file_name, int periods, int u_size1, int y_size, int y_kmin, int y_kmax, int &nb_endo, int &u_count, int &u_count_init, double *u); - void Singular_display(int block, int Size, bool steady_state, it_code_type it_code); - double g0, gp0, glambda2, try_at_iteration; + void Singular_display(int block, int Size); + void End_Solver(); + double g0, gp0, glambda2; + int try_at_iteration; private: void Init_GE(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM); void Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m); + void Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m); +#ifdef CUDA + void Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner); +#endif void Init_Matlab_Sparse_Simple(int Size, map, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m); + void Init_UMFPACK_Sparse_Simple(int Size, map, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m); + void Init_CUDA_Sparse_Simple(int Size, map, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m); void Simple_Init(int Size, std::map, int>, int> &IM, bool &zero_solution); void End_GE(int Size); + bool mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc); + bool golden(double ax, double bx, double cx, double tol, double solve_tolf, double *xmin); void Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bool symbolic, int Block_number); - bool Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, bool steady_state, int it_); + bool Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, int it_); void Solve_Matlab_Relaxation(mxArray *A_m, mxArray *b_m, unsigned int Size, double slowc_l, bool is_two_boundaries, int it_); void Solve_Matlab_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int it_); - void Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, bool steady_state, mxArray *x0_m); - void Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, bool steady_state); + void Solve_LU_UMFPack(mxArray *A_m, mxArray *b_m, int Size, double slowc_l, bool is_two_boundaries, int it_); + void Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int it_); + void End_Matlab_LU_UMFPack(); +#ifdef CUDA + void Solve_CUDA_BiCGStab_Free(double* tmp_vect_host, double* p, double* r, double* v, double* s, double* t, double* y_, double* z, double* tmp_, + int* Ai, double* Ax, int* Ap, double* x0, double* b, double* A_tild, int* A_tild_i, int* A_tild_p, + cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU, + cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner); + int Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, int it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block); +#endif + void Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m); + void Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int precond); + void Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old); + bool Simulate_One_Boundary(int blck, int y_size, int y_kmin, int y_kmax, int Size, bool cvg); + bool solve_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size, const int iter); + void solve_non_linear(const int block_num, const int y_size, const int y_kmin, const int y_kmax, const int size); + string preconditioner_print_out(string s, int preconditioner); bool compare(int *save_op, int *save_opa, int *save_opaa, int beg_t, int periods, long int nop4, int Size #ifdef PROFILER , long int *ndiv, long int *nsub #endif ); + void Grad_f_product(int n, mxArray *b_m, double* vectr, mxArray *A_m, SuiteSparse_long *Ap, SuiteSparse_long *Ai, double* Ax, double *b); void Insert(const int r, const int c, const int u_index, const int lag_index); void Delete(const int r, const int c); int At_Row(int r, NonZeroElem **first); @@ -102,7 +178,8 @@ private: void Delete_u(int pos); void Clear_u(); void Print_u(); - void CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods, int iter); + void *Symbolic, *Numeric ; + void CheckIt(int y_size, int y_kmin, int y_kmax, int Size, int periods); void Check_the_Solution(int periods, int y_kmin, int y_kmax, int Size, double *u, int *pivot, int *b); int complete(int beg_t, int Size, int periods, int *b); void bksub(int tbreak, int last_period, int Size, double slowc_l @@ -118,12 +195,18 @@ private: mxArray *Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m); mxArray *Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m); mxArray *substract_A_B(mxArray *A_m, mxArray *B_m); - +#ifdef CUDA + int CUDA_device; + cublasHandle_t cublas_handle; + cusparseHandle_t cusparse_handle; + cusparseMatDescr_t CUDA_descr; +#endif +protected: stack Stack; int nb_prologue_table_u, nb_first_table_u, nb_middle_table_u, nb_last_table_u; int nb_prologue_table_y, nb_first_table_y, nb_middle_table_y, nb_last_table_y; int middle_count_loop; - char type; + //char type; fstream SaveCode; string filename; int max_u, min_u; @@ -154,19 +237,19 @@ protected: int u_count_alloc, u_count_alloc_save; vector jac; double *jcb; - double res1, res2, max_res; - int max_res_idx; double slowc, slowc_save, prev_slowc_save, markowitz_c; int y_decal; - int *index_vara, *index_equa; + int *index_equa; int u_count, tbreak_g; int iter; - double *direction; int start_compare; int restart; - bool error_not_printed; double g_lambda1, g_lambda2, gp_0; double lu_inc_tol; +//private: + SuiteSparse_long *Ap_save, *Ai_save; + double *Ax_save, *b_save; + mxArray *A_m_save, *b_m_save; }; #endif diff --git a/mex/sources/bytecode/bytecode.cc b/mex/sources/bytecode/bytecode.cc index 9d3e9321e..3dde9e031 100644 --- a/mex/sources/bytecode/bytecode.cc +++ b/mex/sources/bytecode/bytecode.cc @@ -18,12 +18,18 @@ */ #include #include "Interpreter.hh" +#include "ErrorHandling.hh" +#include +#include #ifdef DEBUG_EX using namespace std; # include + + + string Get_Argument(const char *argv) { @@ -33,14 +39,17 @@ Get_Argument(const char *argv) #else +void (*prev_fn)(int); + + string Get_Argument(const mxArray *prhs) { const mxArray *mxa = prhs; - int buflen = mxGetM(mxa) * mxGetN(mxa) + 1; + mwSize buflen = mwSize(mxGetM(mxa) * mxGetN(mxa) + 1); char *first_argument; first_argument = (char *) mxCalloc(buflen, sizeof(char)); - int status = mxGetString(mxa, first_argument, buflen); + size_t status = mxGetString(mxa, first_argument, buflen); if (status != 0) mexWarnMsgTxt("Not enough space. The first argument is truncated."); string f(first_argument); @@ -49,6 +58,178 @@ Get_Argument(const mxArray *prhs) } #endif + +//#include +#include + + +#ifdef CUDA +int +GPU_Test_and_Info(cublasHandle_t *cublas_handle, cusparseHandle_t *cusparse_handle, cusparseMatDescr_t *descr) +{ + cudaDeviceProp deviceProp; + int device_count, device, version, version_max = 0; + cublasStatus_t cublas_status; + cudaError_t cuda_error; + *descr=0; + + /* ask cuda how many devices it can find */ + cudaGetDeviceCount(&device_count); + if (device_count < 1) + { + /* if it couldn't find any fail out */ + ostringstream tmp; + tmp << " Unable to find a CUDA device. Unable to implement CUDA solvers\n"; + throw FatalExceptionHandling(tmp.str()); + } + else + { + mexPrintf("-----------------------------------------\n"); + for (int i = 0; i < device_count; i++) + { + cudaSetDevice(i); + // Statistics about the GPU device + cuda_error = cudaGetDeviceProperties(&deviceProp, i); + if (cuda_error != cudaSuccess) + { + ostringstream tmp; + tmp << " bytecode cudaGetDeviceProperties failed\n"; + throw FatalExceptionHandling(tmp.str()); + } + mexPrintf("> GPU device %d: \"%s\" has:\n - %d Multi-Processors,\n - %d threads per multiprocessor,\n", i, deviceProp.name, deviceProp.multiProcessorCount, deviceProp.maxThreadsPerMultiProcessor); + mexEvalString("drawnow;"); + version = (deviceProp.major * 0x10 + deviceProp.minor); + if (version >= version_max) + { + device = i; + version_max = version; + } + mexPrintf(" - %4.2fMhz clock rate,\n - %2.0fMb of memory,\n - %d.%d compute capabilities.\n", double(deviceProp.clockRate) / (1024 * 1024), double(deviceProp.totalGlobalMem) / (1024 * 1024), deviceProp.major, deviceProp.minor); + mexEvalString("drawnow;"); + } + } + mexPrintf("> Device %d selected\n", device); + mexEvalString("drawnow;"); + + cuda_error = cudaSetDevice(device); + if (cuda_error != cudaSuccess) + { + ostringstream tmp; + tmp << " bytecode cudaSetDevice failed\n"; + throw FatalExceptionHandling(tmp.str()); + } + + if(version_max < 0x11) + { + ostringstream tmp; + tmp << " bytecode requires a minimum CUDA compute 1.1 capability\n"; + cudaDeviceReset(); + throw FatalExceptionHandling(tmp.str()); + } + + // Initialize CuBlas library + cublas_status = cublasCreate(cublas_handle); + if (cublas_status != CUBLAS_STATUS_SUCCESS) + { + ostringstream tmp; + switch(cublas_status) + { + case CUBLAS_STATUS_NOT_INITIALIZED: + tmp << " the CUBLAS initialization failed.\n"; + break; + case CUBLAS_STATUS_ALLOC_FAILED: + tmp << " the resources could not be allocated.\n"; + break; + default: + tmp << " unknown error during the initialization of cusparse library.\n"; + } + throw FatalExceptionHandling(tmp.str()); + } + + // Initialize the CuSparse library + cusparseStatus_t cusparse_status; + cusparse_status = cusparseCreate(cusparse_handle); + if (cusparse_status != CUSPARSE_STATUS_SUCCESS) + { + ostringstream tmp; + switch(cusparse_status) + { + case CUSPARSE_STATUS_NOT_INITIALIZED: + tmp << " the CUDA Runtime initialization failed.\n"; + break; + case CUSPARSE_STATUS_ALLOC_FAILED: + tmp << " the resources could not be allocated.\n"; + break; + case CUSPARSE_STATUS_ARCH_MISMATCH: + tmp << " the device compute capability (CC) is less than 1.1. The CC of at least 1.1 is required.\n"; + break; + default: + tmp << " unknown error during the initialization of cusparse library.\n"; + } + throw FatalExceptionHandling(tmp.str()); + } + + // Create and setup matrix descriptor + cusparse_status = cusparseCreateMatDescr(descr); + if (cusparse_status != CUSPARSE_STATUS_SUCCESS) + { + ostringstream tmp; + tmp << " Matrix descriptor initialization failed\n"; + throw FatalExceptionHandling(tmp.str()); + } + cusparseSetMatType(*descr, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(*descr, CUSPARSE_INDEX_BASE_ZERO); + + mexPrintf("> Driver version:\n"); + int cuda_version; + cuda_error = cudaDriverGetVersion(&cuda_version); + if (cuda_error != cudaSuccess) + { + ostringstream tmp; + tmp << " cudaGetVersion has failed\n"; + throw FatalExceptionHandling(tmp.str()); + } + mexPrintf(" - CUDA version %5.3f\n", double(cuda_version) / 1000); + int cublas_version; + cublas_status = cublasGetVersion(*cublas_handle, &cublas_version); + if (cublas_status != CUBLAS_STATUS_SUCCESS) + { + ostringstream tmp; + tmp << " cublasGetVersion has failed\n"; + throw FatalExceptionHandling(tmp.str()); + } + mexPrintf(" - CUBLAS version %5.3f\n", double(cublas_version) / 1000); + int cusparse_version; + cusparse_status = cusparseGetVersion(*cusparse_handle, &cusparse_version); + if (cusparse_status != CUSPARSE_STATUS_SUCCESS) + { + ostringstream tmp; + tmp << " cusparseGetVersion has failed\n"; + throw FatalExceptionHandling(tmp.str()); + } + mexPrintf(" - CUSPARSE version %5.3f\n", double(cusparse_version) / 1000); + mexPrintf("-----------------------------------------\n"); + return device; +} + +void +GPU_close(cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle, cusparseMatDescr_t descr) +{ + cublasChk(cublasDestroy(cublas_handle),"in bytecode cublasDestroy failed\n"); + cusparseChk(cusparseDestroyMatDescr(descr), "in bytecode cusparseDestroyMatDescr failed\n"); + cusparseChk(cusparseDestroy(cusparse_handle),"in bytecode cusparseDestroy failed\n"); +} + +#endif +string +deblank(string x) +{ + for(int i = 0; i < x.length(); i++) + if (x[i] == ' ') + x.erase(i--, 1); + return x; +} + void Get_Arguments_and_global_variables(int nrhs, #ifndef DEBUG_EX @@ -57,10 +238,10 @@ Get_Arguments_and_global_variables(int nrhs, const char *prhs[], #endif int &count_array_argument, - double *yd[], unsigned int &row_y, unsigned int &col_y, - double *xd[], unsigned int &row_x, unsigned int &col_x, - double *params[], - double *steady_yd[], unsigned int &steady_row_y, unsigned int &steady_col_y, + double *yd[], size_t &row_y, size_t &col_y, + double *xd[], size_t &row_x, size_t &col_x, + double *params[], + double *steady_yd[], size_t &steady_row_y, size_t &steady_col_y, unsigned int &periods, #ifndef DEBUG_EX mxArray *block_structur[], @@ -69,8 +250,10 @@ Get_Arguments_and_global_variables(int nrhs, mxArray *M_[], mxArray *oo_[], mxArray *options_[], bool &global_temporary_terms, bool &print, bool &print_error, - mxArray *GlobalTemporaryTerms[]) + mxArray *GlobalTemporaryTerms[], + string *plan_struct_name, string *pfplan_struct_name) { + size_t pos; #ifdef DEBUG_EX for (int i = 2; i < nrhs; i++) #else @@ -101,7 +284,7 @@ Get_Arguments_and_global_variables(int nrhs, steady_col_y = mxGetN(prhs[i]); break; case 4: - periods = mxGetScalar(prhs[i]); + periods = int(mxGetScalar(prhs[i])); break; case 5: *block_structur = mxDuplicateArray(prhs[i]); @@ -111,7 +294,7 @@ Get_Arguments_and_global_variables(int nrhs, *GlobalTemporaryTerms = mxDuplicateArray(prhs[i]); break; default: - //mexPrintf("Unknown argument count_array_argument=%d\n",count_array_argument); + mexPrintf("Unknown argument count_array_argument=%d\n",count_array_argument); break; } count_array_argument++; @@ -132,16 +315,34 @@ Get_Arguments_and_global_variables(int nrhs, print_error = false; else { - int pos = Get_Argument(prhs[i]).find("block"); - if (pos != (int) string::npos) + ; + if ((pos = Get_Argument(prhs[i]).find("block")) != (int) string::npos) { - int pos1 = Get_Argument(prhs[i]).find("=", pos+5); + size_t pos1 = Get_Argument(prhs[i]).find("=", pos+5); if (pos1 != (int) string::npos) pos = pos1 + 1; else pos += 5; block = atoi(Get_Argument(prhs[i]).substr(pos, string::npos).c_str())-1; } + else if ((pos = Get_Argument(prhs[i]).find("pfplan")) != (int) string::npos) + { + size_t pos1 = Get_Argument(prhs[i]).find("=", pos+6); + if (pos1 != (int) string::npos) + pos = pos1 + 1; + else + pos += 6; + *pfplan_struct_name = deblank(Get_Argument(prhs[i]).substr(pos, string::npos)); + } + else if ((pos = Get_Argument(prhs[i]).find("plan")) != (int) string::npos) + { + size_t pos1 = Get_Argument(prhs[i]).find("=", pos+4); + if (pos1 != (int) string::npos) + pos = pos1 + 1; + else + pos += 4; + *plan_struct_name = deblank(Get_Argument(prhs[i]).substr(pos, string::npos)); + } else { ostringstream tmp; @@ -185,6 +386,7 @@ Get_Arguments_and_global_variables(int nrhs, } } + #ifdef DEBUG_EX int main(int nrhs, const char *prhs[]) @@ -203,9 +405,9 @@ main(int nrhs, const char *prhs[]) char *plhs[1]; load_global((char *) prhs[1]); #endif - //ErrorHandlingException error_handling; - unsigned int i, row_y = 0, col_y = 0, row_x = 0, col_x = 0, nb_row_xd = 0; - unsigned int steady_row_y, steady_col_y; + mxArray *plan_struct = NULL, *pfplan_struct = NULL; + size_t i, row_y = 0, col_y = 0, row_x = 0, col_x = 0, nb_row_xd = 0; + size_t steady_row_y, steady_col_y; int y_kmin = 0, y_kmax = 0, y_decal = 0; unsigned int periods = 1; double *direction; @@ -218,13 +420,22 @@ main(int nrhs, const char *prhs[]) bool global_temporary_terms = false; bool print = false, print_error = true, print_it = false; double *steady_yd = NULL, *steady_xd = NULL; - + string plan, pfplan; + + vector splan, spfplan; + +#ifdef CUDA + int CUDA_device = -1; + cublasHandle_t cublas_handle; + cusparseHandle_t cusparse_handle; + cusparseMatDescr_t descr; +#endif try { Get_Arguments_and_global_variables(nrhs, prhs, count_array_argument, &yd, row_y, col_y, &xd, row_x, col_x, - ¶ms, + ¶ms, &steady_yd, steady_row_y, steady_col_y, periods, #ifndef DEBUG_EX @@ -232,123 +443,401 @@ main(int nrhs, const char *prhs[]) #endif steady_state, evaluate, block, &M_, &oo_, &options_, global_temporary_terms, - print, print_error, &GlobalTemporaryTerms); + print, print_error, &GlobalTemporaryTerms, + &plan, &pfplan); } catch (GeneralExceptionHandling &feh) { DYN_MEX_FUNC_ERR_MSG_TXT(feh.GetErrorMsg().c_str()); } - if (!count_array_argument) - params = mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "params"))); + { + int field = mxGetFieldNumber(M_, "params"); + if (field < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("params is not a field of M_"); + params = mxGetPr(mxGetFieldByNumber(M_, 0, field)); + } + ErrorMsg emsg; + + + if (plan.length()>0) + { + mxArray* plan_struct = mexGetVariable("base", plan.c_str()); + if (plan_struct == NULL) + { + string tmp = plan; + tmp.insert(0,"Can't find the plan: "); + DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str()); + } + size_t n_plan = mxGetN(plan_struct); + splan.resize(n_plan); + for (int i = 0; i < n_plan; i++) + { + splan[i].var = ""; + splan[i].exo = ""; + mxArray* tmp = mxGetField(plan_struct, i, "exo"); + if (tmp) + { + char name [100]; + mxGetString(tmp, name, 100); + splan[i].var = name; + SymbolType variable_type; + int exo_num = emsg.get_ID(name, &variable_type); + if (variable_type == eExogenous || variable_type == eExogenousDet) + splan[i].var_num = exo_num; + else + { + string tmp = name; + tmp.insert(0,"the variable '"); + tmp.append("' defined as var in plan is not an exogenous or a deterministic exogenous\n"); + DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str()); + } + } + tmp = mxGetField(plan_struct, i, "var"); + if (tmp) + { + char name [100]; + mxGetString(tmp, name, 100); + splan[i].exo = name; + SymbolType variable_type; + int exo_num = emsg.get_ID(name, &variable_type); + if (variable_type == eEndogenous) + splan[i].exo_num = exo_num; + else + { + string tmp = name; + tmp.insert(0,"the variable '"); + tmp.append("' defined as exo in plan is not an endogenous variable\n"); + DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str()); + } + } + tmp = mxGetField(plan_struct, i, "per_value"); + if (tmp) + { + size_t num_shocks = mxGetM(tmp); + (splan[i]).per_value.resize(num_shocks); + double * per_value = mxGetPr(tmp); + for (int j = 0; j < num_shocks; j++) + (splan[i]).per_value[j] = make_pair(ceil(per_value[j]), per_value[j + num_shocks]); + } + } + int i; + for (vector::iterator it = splan.begin(); it != splan.end(); it++) + { + mexPrintf("----------------------------------------------------------------------------------------------------\n"); + mexPrintf("suprise n°%d\n", i+1); + if (it->exo.length()) + mexPrintf(" plan fliping var=%s (%d) exo=%s (%d) for the following periods and with the following values:\n", it->var.c_str(), it->var_num, it->exo.c_str(), it->exo_num); + else + mexPrintf(" plan shocks on var=%s for the following periods and with the following values:\n", it->var.c_str()); + for (vector >::iterator it1 = it->per_value.begin(); it1 != it->per_value.end(); it1++) + { + mexPrintf(" %3d %10.5f\n",it1->first, it1->second); + } + i++; + } + } + + if (pfplan.length()>0) + { + pfplan_struct = mexGetVariable("base", pfplan.c_str()); + if (!pfplan_struct) + { + string tmp = pfplan; + tmp.insert(0,"Can't find the pfplan: "); + DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str()); + } + size_t n_plan = mxGetN(pfplan_struct); + spfplan.resize(n_plan); + for (int i = 0; i < n_plan; i++) + { + spfplan[i].var = ""; + spfplan[i].exo = ""; + mxArray* tmp = mxGetField(pfplan_struct, i, "var"); + if (tmp) + { + char name [100]; + mxGetString(tmp, name, 100); + spfplan[i].var = name; + SymbolType variable_type; + int exo_num = emsg.get_ID(name, &variable_type); + if (variable_type == eExogenous || variable_type == eExogenousDet) + splan[i].var_num = exo_num; + else + { + string tmp = name; + tmp.insert(0,"the variable '"); + tmp.append("' defined as var in pfplan is not an exogenous or a deterministic exogenous\n"); + DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str()); + } + } + tmp = mxGetField(pfplan_struct, i, "exo"); + if (tmp) + { + char name [100]; + mxGetString(tmp, name, 100); + spfplan[i].exo = name; + SymbolType variable_type; + int exo_num = emsg.get_ID(name, &variable_type); + if (variable_type == eEndogenous) + spfplan[i].exo_num = exo_num; + else + { + string tmp = name; + tmp.insert(0,"the variable '"); + tmp.append("' defined as exo in pfplan is not an endogenous variable\n"); + DYN_MEX_FUNC_ERR_MSG_TXT(tmp.c_str()); + } + } + tmp = mxGetField(pfplan_struct, i, "per_value"); + if (tmp) + { + size_t num_shocks = mxGetM(tmp); + double * per_value = mxGetPr(tmp); + (spfplan[i]).per_value.resize(num_shocks); + for (int j = 0; j < num_shocks; j++) + spfplan[i].per_value[j] = make_pair(ceil(per_value[j]), per_value[j+ num_shocks]); + } + } + int i; + for (vector::iterator it = spfplan.begin(); it != spfplan.end(); it++) + { + mexPrintf("----------------------------------------------------------------------------------------------------\n"); + mexPrintf("perfect foresight n°%d\n", i+1); + if (it->exo.length()) + mexPrintf(" plan flipping var=%s (%d) exo=%s (%d) for the following periods and with the following values:\n", it->var.c_str(), it->var_num, it->exo.c_str(), it->exo_num); + else + mexPrintf(" plan shocks on var=%s (%d) for the following periods and with the following values:\n", it->var.c_str(), it->var_num); + for (vector >::iterator it1 = it->per_value.begin(); it1 != it->per_value.end(); it1++) + { + mexPrintf(" %3d %10.5f\n",it1->first, it1->second); + } + i++; + } + } + + + + int field_steady_state = mxGetFieldNumber(oo_, "steady_state"); + if (field_steady_state < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("steady_state is not a field of oo_"); + int field_exo_steady_state = mxGetFieldNumber(oo_, "exo_steady_state"); + if (field_exo_steady_state < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("exo_steady_state is not a field of oo_"); + if (!steady_state) { + int field_endo_simul = mxGetFieldNumber(oo_, "endo_simul"); + if (field_endo_simul < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("endo_simul is not a field of oo_"); + + int field_exo_simul = mxGetFieldNumber(oo_, "exo_simul"); + if (field_exo_simul < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("exo_simul is not a field of oo_"); + if (!count_array_argument) { - yd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "endo_simul"))); - row_y = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "endo_simul"))); - col_y = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "endo_simul"))); - xd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_simul"))); - row_x = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_simul"))); - col_x = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_simul"))); + mxArray* endo_sim_arr = mxGetFieldByNumber(oo_, 0, field_endo_simul); + yd = mxGetPr(endo_sim_arr); + row_y = mxGetM(endo_sim_arr); + col_y = mxGetN(endo_sim_arr); + mxArray* exo_sim_arr = mxGetFieldByNumber(oo_, 0, field_exo_simul); + xd = mxGetPr(exo_sim_arr); + row_x = mxGetM(exo_sim_arr); + col_x = mxGetN(exo_sim_arr); nb_row_xd = row_x; } + int field = mxGetFieldNumber(M_, "maximum_lag"); + if (field >= 0) + y_kmin = int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, field))))); + else + DYN_MEX_FUNC_ERR_MSG_TXT("maximum_lag is not a field of M_"); + field = mxGetFieldNumber(M_, "maximum_lead"); + if (field >= 0) + y_kmax = int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, field))))); + else + DYN_MEX_FUNC_ERR_MSG_TXT("maximum_lead is not a field of M_"); + field = mxGetFieldNumber(M_, "maximum_endo_lag"); + if (field >= 0) + y_decal = max(0, y_kmin-int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, field)))))); + else + DYN_MEX_FUNC_ERR_MSG_TXT("maximum_endo_lag is not a field of M_"); - y_kmin = int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "maximum_lag")))))); - y_kmax = int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "maximum_lead")))))); - y_decal = max(0, y_kmin-int (floor(*(mxGetPr(mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "maximum_endo_lag"))))))); if (!count_array_argument) - periods = int (floor(*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "periods")))))); + { + int field = mxGetFieldNumber(options_, "periods"); + if (field >= 0) + periods = int (floor(*(mxGetPr(mxGetFieldByNumber(options_, 0, field))))); + else + DYN_MEX_FUNC_ERR_MSG_TXT("options_ is not a field of options_"); + } + if (!steady_yd ) { - steady_yd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state"))); - steady_row_y = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state"))); - steady_col_y = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state")));; + mxArray* steady_state_arr = mxGetFieldByNumber(oo_, 0, field_steady_state); + steady_yd = mxGetPr(steady_state_arr); + steady_row_y = mxGetM(steady_state_arr); + steady_col_y = mxGetN(steady_state_arr); } - steady_xd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_steady_state"))); + steady_xd = mxGetPr(mxGetFieldByNumber(oo_, 0, field_exo_steady_state)); } else { if (!count_array_argument) { - yd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state"))); - row_y = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state"))); - col_y = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "steady_state")));; + mxArray* steady_state_arr = mxGetFieldByNumber(oo_, 0, field_steady_state); + yd = mxGetPr(steady_state_arr); + row_y = mxGetM(steady_state_arr); + col_y = mxGetN(steady_state_arr); - xd = mxGetPr(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_steady_state"))); - row_x = mxGetM(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_steady_state"))); - col_x = mxGetN(mxGetFieldByNumber(oo_, 0, mxGetFieldNumber(oo_, "exo_steady_state"))); + mxArray* exo_steady_state_arr = mxGetFieldByNumber(oo_, 0, field_exo_steady_state); + xd = mxGetPr(exo_steady_state_arr); + row_x = mxGetM(exo_steady_state_arr); + col_x = mxGetN(exo_steady_state_arr); nb_row_xd = row_x; } } - int verbose= int(*mxGetPr((mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "verbosity"))))); + int field = mxGetFieldNumber(options_, "verbosity"); + int verbose = 0; + if (field >= 0) + verbose = int(*mxGetPr((mxGetFieldByNumber(options_, 0, field)))); + else + DYN_MEX_FUNC_ERR_MSG_TXT("verbosity is not a field of options_"); if (verbose) print_it = true; - int maxit_ = int (floor(*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "maxit_")))))); - double slowc = double (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "slowc"))))); - double markowitz_c = double (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "markowitz"))))); - int minimal_solving_periods = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "minimal_solving_periods"))))); - int stack_solve_algo = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "stack_solve_algo"))))); + field = mxGetFieldNumber(options_, "maxit_"); + if (field < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("maxit_ is not a field of options_"); + int maxit_ = int (floor(*(mxGetPr(mxGetFieldByNumber(options_, 0, field))))); + field = mxGetFieldNumber(options_, "slowc"); + if (field < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("slows is not a field of options_"); + double slowc = double (*(mxGetPr(mxGetFieldByNumber(options_, 0, field)))); + field = mxGetFieldNumber(options_, "markowitz"); + if (field < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("markowitz is not a field of options_"); + double markowitz_c = double (*(mxGetPr(mxGetFieldByNumber(options_, 0, field)))); + field = mxGetFieldNumber(options_, "minimal_solving_periods"); + if (field < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("minimal_solving_periods is not a field of options_"); + int minimal_solving_periods = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, field)))); + field = mxGetFieldNumber(options_, "stack_solve_algo"); + if (field < 0) + DYN_MEX_FUNC_ERR_MSG_TXT("stack_solve_algo is not a field of options_"); + int stack_solve_algo = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, field)))); int solve_algo; double solve_tolf; + if (steady_state) { - solve_algo = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "solve_algo"))))); - solve_tolf = *(mxGetPr(mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "solve_tolf")))); + int field = mxGetFieldNumber(options_, "solve_algo"); + if (field >= 0) + solve_algo = int (*(mxGetPr(mxGetFieldByNumber(options_, 0, field)))); + else + DYN_MEX_FUNC_ERR_MSG_TXT("solve_algo is not a field of options_"); + field = mxGetFieldNumber(options_, "solve_tolf"); + if (field >= 0) + solve_tolf = *(mxGetPr(mxGetFieldByNumber(options_, 0, field))); + else + DYN_MEX_FUNC_ERR_MSG_TXT("solve_tolf is not a field of options_"); } else { solve_algo = stack_solve_algo; - mxArray *dynatol = mxGetFieldByNumber(options_, 0, mxGetFieldNumber(options_, "dynatol")); - solve_tolf= *mxGetPr((mxGetFieldByNumber(dynatol, 0, mxGetFieldNumber(dynatol, "f")))); + int field = mxGetFieldNumber(options_, "dynatol"); + mxArray *dynatol; + if (field >= 0) + dynatol = mxGetFieldByNumber(options_, 0, field); + else + DYN_MEX_FUNC_ERR_MSG_TXT("dynatol is not a field of options_"); + field = mxGetFieldNumber(dynatol, "f"); + if (field >= 0) + solve_tolf= *mxGetPr((mxGetFieldByNumber(dynatol, 0, field))); + else + DYN_MEX_FUNC_ERR_MSG_TXT("f is not a field of options_.dynatol"); } - - mxArray *mxa = mxGetFieldByNumber(M_, 0, mxGetFieldNumber(M_, "fname")); - int buflen = mxGetM(mxa) * mxGetN(mxa) + 1; + field = mxGetFieldNumber(M_, "fname"); + mxArray *mxa; + if (field >= 0) + mxa = mxGetFieldByNumber(M_, 0, field); + else + DYN_MEX_FUNC_ERR_MSG_TXT("fname is not a field of M_"); + size_t buflen = mxGetM(mxa) * mxGetN(mxa) + 1; char *fname; fname = (char *) mxCalloc(buflen+1, sizeof(char)); - int status = mxGetString(mxa, fname, buflen); + size_t status = mxGetString(mxa, fname, int(buflen)); fname[buflen] = ' '; if (status != 0) mexWarnMsgTxt("Not enough space. Filename is truncated."); string file_name = fname; - int size_of_direction = col_y*row_y*sizeof(double); - double *y = (double *) mxMalloc(size_of_direction); - double *ya = (double *) mxMalloc(size_of_direction); - direction = (double *) mxMalloc(size_of_direction); - memset(direction, 0, size_of_direction); - - double *x = (double *) mxMalloc(col_x*row_x*sizeof(double)); - for (i = 0; i < row_x*col_x; i++) - x[i] = double (xd[i]); - for (i = 0; i < row_y*col_y; i++) - { - y[i] = double (yd[i]); - ya[i] = double (yd[i]); - } - int y_size = row_y; - int nb_row_x = row_x; - clock_t t0 = clock(); - - Interpreter interprete(params, y, ya, x, steady_yd, steady_xd, direction, y_size, nb_row_x, nb_row_xd, periods, y_kmin, y_kmax, maxit_, solve_tolf, size_of_direction, slowc, y_decal, markowitz_c, file_name, minimal_solving_periods, stack_solve_algo, solve_algo, global_temporary_terms, print, print_error, GlobalTemporaryTerms); - - string f(fname); - mxFree(fname); - int nb_blocks = 0; - double *pind; - bool no_error = true; - +#ifdef CUDA try { - interprete.compute_blocks(f, f, steady_state, evaluate, block, nb_blocks,print_it); + if (stack_solve_algo == 7 && !steady_state) + CUDA_device = GPU_Test_and_Info(&cublas_handle, &cusparse_handle, &descr); } catch (GeneralExceptionHandling &feh) { DYN_MEX_FUNC_ERR_MSG_TXT(feh.GetErrorMsg().c_str()); } +#else + if (stack_solve_algo == 7 && !steady_state) + DYN_MEX_FUNC_ERR_MSG_TXT("bytecode has not been compiled with CUDA option. Bytecode Can't use options_.stack_solve_algo=7\n"); +#endif + + size_t size_of_direction = col_y*row_y*sizeof(double); + double *y = (double *) mxMalloc(size_of_direction); + double *ya = (double *) mxMalloc(size_of_direction); + direction = (double *) mxMalloc(size_of_direction); + memset(direction, 0, size_of_direction); + double *x = (double *) mxMalloc(col_x*row_x*sizeof(double)); + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (i = 0; i < row_x*col_x; i++) + { + x[i] = double (xd[i]); + } + + #ifdef USE_OMP + #pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) + #endif + for (i = 0; i < row_y*col_y; i++) + { + y[i] = double (yd[i]); + ya[i] = double (yd[i]); + } + size_t y_size = row_y; + size_t nb_row_x = row_x; + clock_t t0 = clock(); + Interpreter interprete(params, y, ya, x, steady_yd, steady_xd, direction, y_size, nb_row_x, nb_row_xd, periods, y_kmin, y_kmax, maxit_, solve_tolf, size_of_direction, slowc, y_decal, + markowitz_c, file_name, minimal_solving_periods, stack_solve_algo, solve_algo, global_temporary_terms, print, print_error, GlobalTemporaryTerms, steady_state, + print_it +#ifdef CUDA + , CUDA_device, cublas_handle, cusparse_handle, descr +#endif + ); + string f(fname); + mxFree(fname); + int nb_blocks = 0; + double *pind; + bool no_error = true; + try + { + interprete.compute_blocks(f, f, evaluate, block, nb_blocks); + } + catch (GeneralExceptionHandling &feh) + { + DYN_MEX_FUNC_ERR_MSG_TXT(feh.GetErrorMsg().c_str()); + } + +#ifdef CUDA + if (stack_solve_algo == 7 && !steady_state) + GPU_close(cublas_handle, cusparse_handle, descr); +#endif clock_t t1 = clock(); if (!steady_state && !evaluate && no_error && print) @@ -370,14 +859,14 @@ main(int nrhs, const char *prhs[]) if (evaluate) { vector residual = interprete.get_residual(); - plhs[1] = mxCreateDoubleMatrix(residual.size()/col_y, col_y, mxREAL); + plhs[1] = mxCreateDoubleMatrix(int(residual.size()/double(col_y)), int(col_y), mxREAL); pind = mxGetPr(plhs[1]); for (i = 0; i < residual.size(); i++) pind[i] = residual[i]; } else { - plhs[1] = mxCreateDoubleMatrix(row_y, col_y, mxREAL); + plhs[1] = mxCreateDoubleMatrix(int(row_y), int(col_y), mxREAL); pind = mxGetPr(plhs[1]); for (i = 0; i < row_y*col_y; i++) pind[i] = y[i]; @@ -385,7 +874,7 @@ main(int nrhs, const char *prhs[]) } else { - plhs[1] = mxCreateDoubleMatrix(row_y, col_y, mxREAL); + plhs[1] = mxCreateDoubleMatrix(int(row_y), int(col_y), mxREAL); pind = mxGetPr(plhs[1]); if (evaluate) { @@ -409,7 +898,7 @@ main(int nrhs, const char *prhs[]) jacob_exo_field_number = 1; jacob_exo_det_field_number = 2; jacob_other_endo_field_number = 3; - mwSize dims[1] = {nb_blocks }; + mwSize dims[1] = {(mwSize)nb_blocks }; plhs[2] = mxCreateStructArray(1, dims, 4, field_names); } else if (!mxIsStruct(block_structur)) @@ -456,15 +945,15 @@ main(int nrhs, const char *prhs[]) } if (nlhs > 3) { - plhs[3] = mxCreateDoubleMatrix(row_y, col_y, mxREAL); + plhs[3] = mxCreateDoubleMatrix(int(row_y), int(col_y), mxREAL); pind = mxGetPr(plhs[3]); for (i = 0; i < row_y*col_y; i++) pind[i] = y[i]; if (nlhs > 4) { mxArray *GlobalTemporaryTerms = interprete.get_Temporary_Terms(); - unsigned int nb_temp_terms = mxGetM(GlobalTemporaryTerms); - plhs[4] = mxCreateDoubleMatrix(nb_temp_terms, 1, mxREAL); + size_t nb_temp_terms = mxGetM(GlobalTemporaryTerms); + plhs[4] = mxCreateDoubleMatrix(int(nb_temp_terms), 1, mxREAL); pind = mxGetPr(plhs[4]); double *tt = mxGetPr(GlobalTemporaryTerms); for (i = 0; i < nb_temp_terms; i++) @@ -486,4 +975,8 @@ main(int nrhs, const char *prhs[]) mxFree(ya); if (direction) mxFree(direction); +#ifdef _MSC_VER_ + /*fFreeResult =*/ FreeLibrary(hinstLib); +#endif + return; } diff --git a/mex/sources/dynblas.h b/mex/sources/dynblas.h index 415ca224f..ba159ae12 100644 --- a/mex/sources/dynblas.h +++ b/mex/sources/dynblas.h @@ -41,7 +41,7 @@ typedef ptrdiff_t blas_int; typedef int blas_int; #endif -#if defined(MATLAB_MEX_FILE) && defined(_WIN32) +#if defined(MATLAB_MEX_FILE) && defined(_WIN32) && !defined(_MSC_VER) # define FORTRAN_WRAPPER(x) x #else # define FORTRAN_WRAPPER(x) x ## _