Bytecode: remove CUDA-specific stuff

time-shift
Sébastien Villemot 2021-02-01 14:03:57 +01:00
parent 5ef440ad9b
commit eb29dc003a
No known key found for this signature in database
GPG Key ID: 2CECE9350ECEBE4A
6 changed files with 9 additions and 1979 deletions

View File

@ -33,16 +33,8 @@ Interpreter::Interpreter(double *params_arg, double *y_arg, double *ya_arg, doub
int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg
#ifdef CUDA
, const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
#endif
)
: dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg
#ifdef CUDA
, CUDA_device_arg, cublas_handle_arg, cusparse_handle_arg, descr_arg
#endif
)
bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg)
: dynSparseMatrix(y_size_arg, y_kmin_arg, y_kmax_arg, print_it_arg, steady_state_arg, periods_arg, minimal_solving_periods_arg, slowc_arg)
{
params = params_arg;
y = y_arg;

View File

@ -1,5 +1,5 @@
/*
* Copyright © 2007-2017 Dynare Team
* Copyright © 2007-2021 Dynare Team
*
* This file is part of Dynare.
*
@ -58,11 +58,7 @@ public:
int maxit_arg_, double solve_tolf_arg, size_t size_of_direction_arg, double slowc_arg, int y_decal_arg, double markowitz_c_arg,
string &filename_arg, int minimal_solving_periods_arg, int stack_solve_algo_arg, int solve_algo_arg,
bool global_temporary_terms_arg, bool print_arg, bool print_error_arg, mxArray *GlobalTemporaryTerms_arg,
bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg
#ifdef CUDA
, const int CUDA_device, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
#endif
);
bool steady_state_arg, bool print_it_arg, int col_x_arg, int col_y_arg);
bool extended_path(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks, int nb_periods, vector<s_plan> sextended_path, vector<s_plan> sconstrained_extended_path, vector<string> dates, table_conditional_global_type table_conditional_global);
bool compute_blocks(string file_name, string bin_basename, bool evaluate, int block, int &nb_blocks);
void check_for_controlled_exo_validity(FBEGINBLOCK_ *fb, vector<s_plan> sconstrained_extended_path);

File diff suppressed because it is too large Load Diff

View File

@ -28,51 +28,11 @@
#include "dynblas.h"
#include "dynumfpack.h"
#ifdef CUDA
# include "cuda.h"
# include "cuda_runtime_api.h"
# include "cublas_v2.h"
# include "cusparse_v2.h"
#endif
#include "Mem_Mngr.hh"
#include "ErrorHandling.hh"
//#include "Interpreter.hh"
#include "Evaluate.hh"
#define cudaChk(x, y) \
{ \
cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
{ \
ostringstream tmp; \
tmp << y; \
throw FatalExceptionHandling(tmp.str()); \
} \
};
#define cusparseChk(x, y) \
{ \
cusparseStatus_t cusparse_status = x; \
if (cusparse_status != CUSPARSE_STATUS_SUCCESS) \
{ \
ostringstream tmp; \
tmp << y; \
throw FatalExceptionHandling(tmp.str()); \
} \
};
#define cublasChk(x, y) \
{ \
cublasStatus_t cublas_status = x; \
if (cublas_status != CUBLAS_STATUS_SUCCESS) \
{ \
ostringstream tmp; \
tmp << y; \
throw FatalExceptionHandling(tmp.str()); \
} \
};
#define NEW_ALLOC
#define MARKOVITZ
@ -101,11 +61,7 @@ class dynSparseMatrix : public Evaluate
{
public:
dynSparseMatrix();
dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg, const double slowc_arg
#ifdef CUDA
, const int CUDA_device_arg, cublasHandle_t cublas_handle_arg, cusparseHandle_t cusparse_handle_arg, cusparseMatDescr_t descr_arg
#endif
);
dynSparseMatrix(const int y_size_arg, const int y_kmin_arg, const int y_kmax_arg, const bool print_it_arg, const bool steady_state_arg, const int periods_arg, const int minimal_solving_periods_arg, const double slowc_arg);
void Simulate_Newton_Two_Boundaries(int blck, int y_size, int y_kmin, int y_kmax, int Size, int periods, bool cvg, int minimal_solving_periods, int stack_solve_algo, unsigned int endo_name_length, char *P_endo_names, vector_table_conditional_local_type vector_table_conditional_local);
void Simulate_Newton_One_Boundary(bool forward);
void fixe_u(double **u, int u_count_int, int max_lag_plus_max_lead_plus_1);
@ -123,12 +79,8 @@ private:
void Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM);
void Init_Matlab_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, mxArray *x0_m);
void Init_UMFPACK_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, mxArray *x0_m, vector_table_conditional_local_type vector_table_conditional_local, int block_num);
#ifdef CUDA
void Init_CUDA_Sparse(int periods, int y_kmin, int y_kmax, int Size, map<pair<pair<int, int>, int>, int> &IM, int **Ap, int **Ai, double **Ax, int **Ap_tild, int **Ai_tild, double **A_tild, double **b, double **x0, mxArray *x0_m, int *nnz, int *nnz_tild, int preconditioner);
#endif
void Init_Matlab_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, mxArray *A_m, mxArray *b_m, bool &zero_solution, mxArray *x0_m);
void Init_UMFPACK_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, bool &zero_solution, mxArray *x0_m);
void Init_CUDA_Sparse_Simple(int Size, map<pair<pair<int, int>, int>, int> &IM, SuiteSparse_long **Ap, SuiteSparse_long **Ai, double **Ax, double **b, double **x0, bool &zero_solution, mxArray *x0_m);
void Simple_Init(int Size, std::map<std::pair<std::pair<int, int>, int>, int> &IM, bool &zero_solution);
void End_GE(int Size);
bool mnbrak(double *ax, double *bx, double *cx, double *fa, double *fb, double *fc);
@ -145,13 +97,6 @@ private:
void Solve_LU_UMFPack(SuiteSparse_long *Ap, SuiteSparse_long *Ai, double *Ax, double *b, int n, int Size, double slowc_l, bool is_two_boundaries, int it_);
void End_Matlab_LU_UMFPack();
#ifdef CUDA
void Solve_CUDA_BiCGStab_Free(double *tmp_vect_host, double *p, double *r, double *v, double *s, double *t, double *y_, double *z, double *tmp_,
int *Ai, double *Ax, int *Ap, double *x0, double *b, double *A_tild, int *A_tild_i, int *A_tild_p,
cusparseSolveAnalysisInfo_t infoL, cusparseSolveAnalysisInfo_t infoU,
cusparseMatDescr_t descrL, cusparseMatDescr_t descrU, int preconditioner);
int Solve_CUDA_BiCGStab(int *Ap, int *Ai, double *Ax, int *Ap_tild, int *Ai_tild, double *A_tild, double *b, double *x0, int n, int Size, double slowc_l, bool is_two_boundaries, int it_, int nnz, int nnz_tild, int preconditioner, int max_iterations, int block);
#endif
void Solve_Matlab_GMRES(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m);
void Solve_Matlab_BiCGStab(mxArray *A_m, mxArray *b_m, int Size, double slowc, int block, bool is_two_boundaries, int it_, mxArray *x0_m, int precond);
void Check_and_Correct_Previous_Iteration(int block_num, int y_size, int size, double crit_opt_old);
@ -196,12 +141,6 @@ private:
mxArray *Sparse_substract_SA_SB(mxArray *A_m, mxArray *B_m);
mxArray *Sparse_substract_A_SB(mxArray *A_m, mxArray *B_m);
mxArray *substract_A_B(mxArray *A_m, mxArray *B_m);
#ifdef CUDA
int CUDA_device;
cublasHandle_t cublas_handle;
cusparseHandle_t cusparse_handle;
cusparseMatDescr_t CUDA_descr;
#endif
protected:
stack<double> Stack;
int nb_prologue_table_u, nb_first_table_u, nb_middle_table_u, nb_last_table_u;

View File

@ -1,121 +0,0 @@
/*
* Copyright (C) 2007-2012 Dynare Team
*
* This file is part of Dynare.
*
* Dynare is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Dynare is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Dynare. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef SPARMATRIX_KERNEL
#define SPARMATRIX_KERNEL
// Kernel definition of vector division
__global__ void
VecDiv(double* A, double* B, double* C, int n)
{
int i = blockIdx.x * 1024 + threadIdx.x;
if (i < n)
C[i] = (B[i] != 0.0 ? A[i] / B[i] : A[i]);
}
__global__ void
VecAdd(double* res, double* r, double alpha, double* x, int n)
{
int i = blockIdx.x * 1024 + threadIdx.x;
if (i < n)
res[i] = r[i] + alpha * x[i];
}
__global__ void
VecInc(double* res, double alpha, double* x, int n)
{
int i = blockIdx.x * 1024 + threadIdx.x;
if (i < n)
res[i] += alpha * x[i];
}
__global__ void
update_x(double* x, double alpha, double* y, double omega, double *z)
{
int i = threadIdx.x;
x[i] += alpha * y[i] + omega * z[i];
}
__global__ void
Get_LU_dim(int *n, int* A_tild_i, int *A_tild_p, int *nnz_l, int *nnz_u)
{
nnz_u[0] = 0;
nnz_l[0] = 0;
for (int i = 0; i < n[0]; i++)
{
for (int j = A_tild_p[i]; j < A_tild_p[i+1]; j++)
{
if (A_tild_i[j] < i)
nnz_l[0]++;
else if (A_tild_i[j] == i)
{
nnz_u[0]++;
//nnz_l[0]++;
}
else
nnz_u[0]++;
}
}
}
__global__ void
Get_LU1_dim(int* n, int *nnz_l, int *nnz_u)
{
nnz_u[0] = 3+n[0];
nnz_l[0] = 1+n[0];
}
__global__ void
Get_L_and_U(int *n, double* A_tild_x, int* A_tild_i, int *A_tild_p, double* Lx, int* Li, int *Lp, double* Ux, int* Ui, int* Up)
{
int nnz_u = 0, nnz_l = 0;
Lp[0] = 0;
Up[0] = 0;
for (int i = 0; i < n[0]; i++)
{
for (int j = A_tild_p[i]; j < A_tild_p[i+1]; j++)
{
if (A_tild_i[j] < i)
{
Lx[nnz_l] = A_tild_x[j];
Li[nnz_l] = A_tild_i[j];
nnz_l++;
}
else if (A_tild_i[j] == i)
{
Ux[nnz_u] = A_tild_x[j];
Lx[nnz_l] = 1.0;
Li[nnz_l] = Ui[nnz_u] = A_tild_i[j];
nnz_u++;
//nnz_l++;
}
else
{
Ux[nnz_u] = A_tild_x[j];
Ui[nnz_u] = A_tild_i[j];
nnz_u++;
}
}
Lp[i+1] = nnz_l;
Up[i+1] = nnz_u;
}
}
#endif

View File

@ -57,164 +57,6 @@ Get_Argument(const mxArray *prhs)
//#include <windows.h>
#include <cstdio>
#ifdef CUDA
int
GPU_Test_and_Info(cublasHandle_t *cublas_handle, cusparseHandle_t *cusparse_handle, cusparseMatDescr_t *descr)
{
cudaDeviceProp deviceProp;
int device_count, device, version, version_max = 0;
cublasStatus_t cublas_status;
cudaError_t cuda_error;
*descr = 0;
/* ask cuda how many devices it can find */
cudaGetDeviceCount(&device_count);
if (device_count < 1)
{
/* if it couldn't find any fail out */
ostringstream tmp;
tmp << " Unable to find a CUDA device. Unable to implement CUDA solvers\n";
throw FatalExceptionHandling(tmp.str());
}
else
{
mexPrintf("-----------------------------------------\n");
for (int i = 0; i < device_count; i++)
{
cudaSetDevice(i);
// Statistics about the GPU device
cuda_error = cudaGetDeviceProperties(&deviceProp, i);
if (cuda_error != cudaSuccess)
{
ostringstream tmp;
tmp << " bytecode cudaGetDeviceProperties failed\n";
throw FatalExceptionHandling(tmp.str());
}
mexPrintf("> GPU device %d: \"%s\" has:\n - %d Multi-Processors,\n - %d threads per multiprocessor,\n", i, deviceProp.name, deviceProp.multiProcessorCount, deviceProp.maxThreadsPerMultiProcessor);
mexEvalString("drawnow;");
version = (deviceProp.major * 0x10 + deviceProp.minor);
if (version >= version_max)
{
device = i;
version_max = version;
}
mexPrintf(" - %4.2fMhz clock rate,\n - %2.0fMb of memory,\n - %d.%d compute capabilities.\n", double (deviceProp.clockRate) / (1024 * 1024), double (deviceProp.totalGlobalMem) / (1024 * 1024), deviceProp.major, deviceProp.minor);
mexEvalString("drawnow;");
}
}
mexPrintf("> Device %d selected\n", device);
mexEvalString("drawnow;");
cuda_error = cudaSetDevice(device);
if (cuda_error != cudaSuccess)
{
ostringstream tmp;
tmp << " bytecode cudaSetDevice failed\n";
throw FatalExceptionHandling(tmp.str());
}
if (version_max < 0x11)
{
ostringstream tmp;
tmp << " bytecode requires a minimum CUDA compute 1.1 capability\n";
cudaDeviceReset();
throw FatalExceptionHandling(tmp.str());
}
// Initialize CuBlas library
cublas_status = cublasCreate(cublas_handle);
if (cublas_status != CUBLAS_STATUS_SUCCESS)
{
ostringstream tmp;
switch (cublas_status)
{
case CUBLAS_STATUS_NOT_INITIALIZED:
tmp << " the CUBLAS initialization failed.\n";
break;
case CUBLAS_STATUS_ALLOC_FAILED:
tmp << " the resources could not be allocated.\n";
break;
default:
tmp << " unknown error during the initialization of cusparse library.\n";
}
throw FatalExceptionHandling(tmp.str());
}
// Initialize the CuSparse library
cusparseStatus_t cusparse_status;
cusparse_status = cusparseCreate(cusparse_handle);
if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
{
ostringstream tmp;
switch (cusparse_status)
{
case CUSPARSE_STATUS_NOT_INITIALIZED:
tmp << " the CUDA Runtime initialization failed.\n";
break;
case CUSPARSE_STATUS_ALLOC_FAILED:
tmp << " the resources could not be allocated.\n";
break;
case CUSPARSE_STATUS_ARCH_MISMATCH:
tmp << " the device compute capability (CC) is less than 1.1. The CC of at least 1.1 is required.\n";
break;
default:
tmp << " unknown error during the initialization of cusparse library.\n";
}
throw FatalExceptionHandling(tmp.str());
}
// Create and setup matrix descriptor
cusparse_status = cusparseCreateMatDescr(descr);
if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
{
ostringstream tmp;
tmp << " Matrix descriptor initialization failed\n";
throw FatalExceptionHandling(tmp.str());
}
cusparseSetMatType(*descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(*descr, CUSPARSE_INDEX_BASE_ZERO);
mexPrintf("> Driver version:\n");
int cuda_version;
cuda_error = cudaDriverGetVersion(&cuda_version);
if (cuda_error != cudaSuccess)
{
ostringstream tmp;
tmp << " cudaGetVersion has failed\n";
throw FatalExceptionHandling(tmp.str());
}
mexPrintf(" - CUDA version %5.3f\n", double (cuda_version) / 1000);
int cublas_version;
cublas_status = cublasGetVersion(*cublas_handle, &cublas_version);
if (cublas_status != CUBLAS_STATUS_SUCCESS)
{
ostringstream tmp;
tmp << " cublasGetVersion has failed\n";
throw FatalExceptionHandling(tmp.str());
}
mexPrintf(" - CUBLAS version %5.3f\n", double (cublas_version) / 1000);
int cusparse_version;
cusparse_status = cusparseGetVersion(*cusparse_handle, &cusparse_version);
if (cusparse_status != CUSPARSE_STATUS_SUCCESS)
{
ostringstream tmp;
tmp << " cusparseGetVersion has failed\n";
throw FatalExceptionHandling(tmp.str());
}
mexPrintf(" - CUSPARSE version %5.3f\n", double (cusparse_version) / 1000);
mexPrintf("-----------------------------------------\n");
return device;
}
void
GPU_close(cublasHandle_t cublas_handle, cusparseHandle_t cusparse_handle, cusparseMatDescr_t descr)
{
cublasChk(cublasDestroy(cublas_handle), "in bytecode cublasDestroy failed\n");
cusparseChk(cusparseDestroyMatDescr(descr), "in bytecode cusparseDestroyMatDescr failed\n");
cusparseChk(cusparseDestroy(cusparse_handle), "in bytecode cusparseDestroy failed\n");
}
#endif
string
deblank(string x)
{
@ -437,12 +279,6 @@ main(int nrhs, const char *prhs[])
int max_periods = 0;
#ifdef CUDA
int CUDA_device = -1;
cublasHandle_t cublas_handle;
cusparseHandle_t cusparse_handle;
cusparseMatDescr_t descr;
#endif
try
{
Get_Arguments_and_global_variables(nrhs, prhs, count_array_argument,
@ -1005,20 +841,9 @@ main(int nrhs, const char *prhs[])
mexWarnMsgTxt("Not enough space. Filename is truncated.");
string file_name = fname;
#ifdef CUDA
try
{
if (stack_solve_algo == 7 && !steady_state)
CUDA_device = GPU_Test_and_Info(&cublas_handle, &cusparse_handle, &descr);
}
catch (GeneralExceptionHandling &feh)
{
mexErrMsgTxt(feh.GetErrorMsg().c_str());
}
#else
if (stack_solve_algo == 7 && !steady_state)
mexErrMsgTxt("bytecode has not been compiled with CUDA option. Bytecode Can't use options_.stack_solve_algo=7\n");
#endif
mexErrMsgTxt("Bytecode: Can't use option stack_solve_algo=7\n");
size_t size_of_direction = col_y*row_y*sizeof(double);
auto *y = static_cast<double *>(mxMalloc(size_of_direction));
error_msg.test_mxMalloc(y, __LINE__, __FILE__, __func__, size_of_direction);
@ -1045,11 +870,7 @@ main(int nrhs, const char *prhs[])
clock_t t0 = clock();
Interpreter interprete(params, y, ya, x, steady_yd, steady_xd, direction, y_size, nb_row_x, nb_row_xd, periods, y_kmin, y_kmax, maxit_, solve_tolf, size_of_direction, slowc, y_decal,
markowitz_c, file_name, minimal_solving_periods, stack_solve_algo, solve_algo, global_temporary_terms, print, print_error, GlobalTemporaryTerms, steady_state,
print_it, col_x, col_y
#ifdef CUDA
, CUDA_device, cublas_handle, cusparse_handle, descr
#endif
);
print_it, col_x, col_y);
string f(fname);
mxFree(fname);
int nb_blocks = 0;
@ -1078,11 +899,6 @@ main(int nrhs, const char *prhs[])
}
}
#ifdef CUDA
if (stack_solve_algo == 7 && !steady_state)
GPU_close(cublas_handle, cusparse_handle, descr);
#endif
clock_t t1 = clock();
if (!steady_state && !evaluate && print)
mexPrintf("Simulation Time=%f milliseconds\n", 1000.0*(double (t1)-double (t0))/double (CLOCKS_PER_SEC));