Parallelizes local_state_space_iteration_fortran

trustregion
NormannR 2022-01-04 11:31:02 +01:00
parent 3f00b51950
commit 5f828e241f
12 changed files with 285 additions and 33 deletions

View File

@ -7,7 +7,9 @@ nodist_libkordersim_a_SOURCES = \
sort.f08 \
partitions.f08 \
simulation.f08 \
struct.f08
struct.f08 \
pthread.F08 \
pparticle.f08
BUILT_SOURCES = $(nodist_libkordersim_a_SOURCES)
CLEANFILES = $(nodist_libkordersim_a_SOURCES)
@ -25,5 +27,8 @@ partitions.mod: partitions.o
simulation.o: partitions.mod lapack.mod
simulation.mod: simulation.o
pparticle.o: simulation.mod matlab_mex.mod
pparticle.mod: pparticle.o
%.f08: $(top_srcdir)/../../sources/libkordersim/%.f08
$(LN_S) -f $< $@

View File

@ -1,11 +1,11 @@
mex_PROGRAMS = local_state_space_iteration_fortran
local_state_space_iteration_fortran_FCFLAGS = $(AM_FCFLAGS) -I../libkordersim
local_state_space_iteration_fortran_FCFLAGS = $(AM_FCFLAGS) -I../libkordersim -pthread
nodist_local_state_space_iteration_fortran_SOURCES = \
mexFunction.f08
local_state_space_iteration_fortran_LDADD = ../libkordersim/libkordersim.a
local_state_space_iteration_fortran_LDADD = ../libkordersim/libkordersim.a
BUILT_SOURCES = $(nodist_local_state_space_iteration_fortran_SOURCES)
CLEANFILES = $(nodist_local_state_space_iteration_fortran_SOURCES)

View File

@ -56,6 +56,9 @@ AM_PROG_AR
AX_CXX11_THREAD
# Get the size of a pthread_t instance, needed by pthread.F08
AC_CHECK_SIZEOF([pthread_t], [], [#include <pthread.h>])
# Check for dlopen(), needed by k_order_perturbation DLL
AC_CHECK_LIB([dl], [dlopen], [LIBADD_DLOPEN="-ldl"], [])
AC_SUBST([LIBADD_DLOPEN])

View File

@ -32,7 +32,7 @@ clean-local:
cd $(top_srcdir)/../../matlab && rm -f $(PROGRAMS); \
fi
# Rules for the Fortran 2008 interface to MEX and BLAS/LAPACK functions
# Rules for the Fortran 2008 interface to MEX, BLAS/LAPACK and pthread functions
matlab_mat.mod: matlab_mex.o
matlab_mex.mod: matlab_mex.o
@ -44,3 +44,8 @@ lapack.mod: blas_lapack.o
blas_lapack.F08: $(top_srcdir)/../../sources/blas_lapack.F08
$(LN_S) -f $< $@
pthread.mod: pthread.o
pthread.F08: $(top_srcdir)/../../sources/pthread.F08
$(LN_S) -f $< $@

View File

@ -52,6 +52,9 @@ AM_PROG_AR
AX_CXX11_THREAD
# Get the size of a pthread_t instance, needed by pthread.F08
AC_CHECK_SIZEOF([pthread_t], [], [#include <pthread.h>])
# Check for dlopen(), needed by k_order_perturbation DLL
AC_CHECK_LIB([dl], [dlopen], [LIBADD_DLOPEN="-ldl"], [])
AC_SUBST([LIBADD_DLOPEN])

View File

@ -40,7 +40,7 @@ clean-local:
cd $(top_srcdir)/../../octave && rm -f $(PROGRAMS); \
fi
# Rules for the Fortran 2008 interface to MEX and BLAS/LAPACK functions
# Rules for the Fortran 2008 interface to MEX, BLAS/LAPACK and pthread functions
matlab_mat.mod: matlab_mex.o
matlab_mex.mod: matlab_mex.o
@ -52,3 +52,8 @@ lapack.mod: blas_lapack.o
blas_lapack.F08: $(top_srcdir)/../../sources/blas_lapack.F08
$(LN_S) -f $< $@
pthread.mod: pthread.o
pthread.F08: $(top_srcdir)/../../sources/pthread.F08
$(LN_S) -f $< $@

View File

@ -0,0 +1,78 @@
! Copyright © 2021 Dynare Team
!
! This file is part of Dynare.
!
! Dynare is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! Dynare is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU General Public License for more details.
!
! You should have received a copy of the GNU General Public License
! along with Dynare. If not, see <https://www.gnu.org/licenses/>.
! Routines and data structures for multithreading over particles in local_state_space_iteration_fortran
module pparticle
use iso_c_binding
use simulation
use matlab_mex
implicit none
type tdata
integer :: nm, nys, endo_nbr, nvar, order, nrestricted, nparticles
real(real64), allocatable :: yhat(:,:), e(:,:), ynext(:,:), ys_reordered(:), restrict_var_list(:)
type(pol), dimension(:), allocatable :: udr
end type tdata
type(tdata) :: thread_data
contains
subroutine thread_eval(arg) bind(c)
type(c_ptr), intent(in), value :: arg
integer, pointer :: im
integer :: i, j, start, end, q, r, ind
type(horner), dimension(:), allocatable :: h
real(real64), dimension(:), allocatable :: dyu
! Checking that the thread number got passed as argument
if (.not. c_associated(arg)) then
call mexErrMsgTxt("No argument was passed to thread_eval")
end if
call c_f_pointer(arg, im)
! Allocating local arrays
allocate(h(0:thread_data%order), dyu(thread_data%nvar))
do i=0, thread_data%order
allocate(h(i)%c(thread_data%endo_nbr, thread_data%nvar**i))
end do
! Specifying bounds for the curent thread
q = thread_data%nparticles / thread_data%nm
r = mod(thread_data%nparticles, thread_data%nm)
start = (im-1)*q+1
if (im < thread_data%nm) then
end = start+q-1
else
end = thread_data%nparticles
end if
! Using the Horner algorithm to evaluate the decision rule at the chosen yhat and epsilon
do j=start,end
dyu(1:thread_data%nys) = thread_data%yhat(:,j)
dyu(thread_data%nys+1:) = thread_data%e(:,j)
call eval(h, dyu, thread_data%udr, thread_data%endo_nbr, thread_data%nvar, thread_data%order)
do i=1,thread_data%nrestricted
ind = int(thread_data%restrict_var_list(i))
thread_data%ynext(i,j) = h(0)%c(ind,1) + thread_data%ys_reordered(ind)
end do
end do
end subroutine thread_eval
end module pparticle

View File

@ -30,21 +30,21 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
use iso_c_binding
use struct
use matlab_mex
use partitions
use simulation
use pthread
use pparticle
implicit none
type(c_ptr), dimension(*), intent(in), target :: prhs
type(c_ptr), dimension(*), intent(out) :: plhs
integer(c_int), intent(in), value :: nlhs, nrhs
type(c_ptr) :: M_mx, options_mx, dr_mx, yhat_mx, epsilon_mx, udr_mx, tmp
type(pol), dimension(:), allocatable, target :: udr
integer :: order, nstatic, npred, nboth, nfwrd, exo_nbr, endo_nbr, nparticles, nys, nvar, nrestricted
real(real64), dimension(:), allocatable :: ys_reordered, dyu
type(pol), dimension(:), allocatable :: udr
integer :: order, nstatic, npred, nboth, nfwrd, exo_nbr, endo_nbr, nparticles, nys, nvar, nrestricted, nm
real(real64), dimension(:), pointer, contiguous :: order_var, ys, restrict_var_list
real(real64), dimension(:,:), allocatable :: yhat, e, ynext, ynext_all
type(horner), dimension(:), allocatable :: h
integer :: i, j, m, n
real(real64), allocatable :: yhat(:,:), e(:,:), ynext(:,:), ys_reordered(:)
integer :: i, m, n, rc
type(c_pthread_t), allocatable :: threads(:)
integer, allocatable, target :: routines(:)
character(kind=c_char, len=10) :: fieldname
yhat_mx = prhs(1)
@ -114,6 +114,13 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
restrict_var_list => mxGetPr(restrict_var_list_mx)
end associate
associate (thread_mx => mxGetField(options_mx, 1_mwIndex, "threads"))
if (.not. c_associated(thread_mx)) then
call mexErrMsgTxt("Cannot find `threads' in options_")
end if
nm = get_int_field(thread_mx, "local_state_space_iteration_fortran")
end associate
nparticles = int(mxGetN(yhat_mx));
if (int(mxGetN(epsilon_mx)) /= nparticles) then
call mexErrMsgTxt("epsilon and yhat don't have the same number of columns")
@ -125,11 +132,12 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
call mexErrMsgTxt("epsilon should be a double precision matrix with exo_nbr rows")
end if
allocate(yhat(nys, nparticles), e(exo_nbr, nparticles), ynext(nrestricted, nparticles), ynext_all(endo_nbr, nparticles))
allocate(yhat(nys, nparticles), e(exo_nbr, nparticles), ynext(nrestricted, nparticles))
yhat = reshape(mxGetPr(yhat_mx), [nys, nparticles])
e = reshape(mxGetPr(epsilon_mx), [exo_nbr, nparticles])
allocate(h(0:order), udr(0:order))
allocate(udr(0:order))
do i = 0, order
write (fieldname, '(a2, i1)') "g_", i
tmp = mxGetField(udr_mx, 1_mwIndex, trim(fieldname))
@ -138,23 +146,45 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
end if
m = int(mxGetM(tmp))
n = int(mxGetN(tmp))
allocate(udr(i)%g(m,n), h(i)%c(endo_nbr, nvar**i))
allocate(udr(i)%g(m,n))
udr(i)%g(1:m,1:n) = reshape(mxGetPr(tmp), [m,n])
end do
! Using the Horner algorithm to evaluate the decision rule at the chosen yhat and epsilon
allocate(dyu(nvar))
do j=1,nparticles
dyu(1:nys) = yhat(:,j)
dyu(nys+1:) = e(:,j)
call eval(h, dyu, udr, endo_nbr, nvar, order)
ynext_all(:,j) = h(0)%c(:,1) + ys_reordered
do i=1,nrestricted
ynext(i,j) = ynext_all(int(restrict_var_list(i)),j)
end do
end do
! Initializing the global structure containing
! useful information for threads
thread_data%nm = nm
thread_data%nys = nys
thread_data%endo_nbr = endo_nbr
thread_data%nvar = nvar
thread_data%order = order
thread_data%nrestricted = nrestricted
thread_data%nparticles = nparticles
thread_data%yhat = yhat
thread_data%e = e
thread_data%ynext = ynext
thread_data%udr = udr
thread_data%ys_reordered = ys_reordered
thread_data%restrict_var_list = restrict_var_list
allocate(threads(nm), routines(nm))
routines = [ (i, i = 1, nm) ]
if (nm == 1) then
call thread_eval(c_loc(routines(1)))
else
! Creating the threads
do i = 1, nm
rc = c_pthread_create(threads(i), c_null_ptr, c_funloc(thread_eval), c_loc(routines(i)))
end do
! Joining the threads
do i = 1, nm
rc = c_pthread_join(threads(i), c_loc(routines(i)))
end do
end if
! Returning the result
plhs(1) = mxCreateDoubleMatrix(int(size(restrict_var_list), mwSize), int(nparticles, mwSize), mxREAL)
mxGetPr(plhs(1)) = reshape(ynext, [size(ynext)])
mxGetPr(plhs(1)) = reshape(thread_data%ynext, [size(thread_data%ynext)])
end subroutine mexFunction

56
mex/sources/pthread.F08 Normal file
View File

@ -0,0 +1,56 @@
! Copyright © 2022 Dynare Team
!
! This file is part of Dynare.
!
! Dynare is free software: you can redistribute it and/or modify
! it under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! Dynare is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU General Public License for more details.
!
! You should have received a copy of the GNU General Public License
! along with Dynare. If not, see <https://www.gnu.org/licenses/>.
! Wrapper around pthread_create and pthread_join C routines for POSIX multithreading
module pthread
use iso_c_binding
implicit none
private
public :: c_pthread_create
public :: c_pthread_join
! SIZEOF_PTHREAD_T is set by the AC_CHECK_SIZEOF routine
integer, parameter :: pthread_size = SIZEOF_PTHREAD_T
type, bind(c), public :: c_pthread_t
private
integer(kind=c_char) :: hidden(pthread_size)
end type c_pthread_t
interface
function c_pthread_create(thread, attr, start_routine, arg) bind(c, name='pthread_create')
import :: c_int, c_ptr, c_funptr, c_pthread_t
implicit none
type(c_pthread_t), intent(inout) :: thread
type(c_ptr), intent(in), value :: attr
type(c_funptr), intent(in), value :: start_routine
type(c_ptr), intent(in), value :: arg
integer(kind=c_int) :: c_pthread_create
end function c_pthread_create
function c_pthread_join(thread, value_ptr) bind(c, name='pthread_join')
import :: c_int, c_ptr, c_pthread_t
implicit none
type(c_pthread_t), intent(in), value :: thread
type(c_ptr), intent(in) :: value_ptr
integer(kind=c_int) :: c_pthread_join
end function c_pthread_join
end interface
end module pthread

View File

@ -618,7 +618,8 @@ PARTICLEFILES = \
particle/dsge_base2.mod \
particle/first_spec.mod \
particle/first_spec_MCMC.mod \
particle/local_state_space_iteration_k_test.mod
particle/local_state_space_iteration_k_test.mod \
particle/local_it_k_test_parallel.mod
XFAIL_MODFILES = ramst_xfail.mod \
@ -928,6 +929,9 @@ discretionary_policy/Gali_2015_chapter_3_nonlinear.o.trs: discretionary_policy/G
particle/local_state_space_iteration_k_test.m.trs: particle/first_spec.m.trs
particle/local_state_space_iteration_k_test.o.trs: particle/first_spec.o.trs
particle/local_it_k_test_parallel.m.trs: particle/first_spec.m.trs
particle/local_it_k_test_parallel.o.trs: particle/first_spec.o.trs
pruning/AS_pruned_state_space_red_shock.m.trs: pruning/AnSchorfheide_pruned_state_space.m.trs
pruning/AS_pruned_state_space_red_shock.o.trs: pruning/AnSchorfheide_pruned_state_space.o.trs

View File

@ -0,0 +1,58 @@
/*
Tests that the parallelized version of local_state_space_iteration_k and local_state_space_iteration_fortran
(for k=3) return the same results.
This file must be run after first_spec.mod (both are based on the same model).
*/
@#include "first_spec_common.inc"
varobs q ca;
shocks;
var eeps = 0.04^2;
var nnu = 0.03^2;
var q = 0.01^2;
var ca = 0.01^2;
end;
// Initialize various structures
estimation(datafile='my_data.mat',order=2,mode_compute=0,mh_replic=0,filter_algorithm=sis,nonlinear_filter_initialization=2
,cova_compute=0 %tell program that no covariance matrix was computed
);
stoch_simul(order=2, periods=200, irf=0, k_order_solver);
// Really perform the test
nparticles = options_.particle.number_of_particles;
nsims = 1e6/nparticles;
/* We generate particles using realistic distributions (though this is not
strictly needed) */
state_idx = oo_.dr.order_var((M_.nstatic+1):(M_.nstatic+M_.npred+M_.nboth));
yhat = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
stoch_simul(order=3, periods=200, irf=0, k_order_solver);
options_.threads.local_state_space_iteration_k = 2;
options_.threads.local_state_space_iteration_fortran = 2;
tStart1 = tic; for i=1:nsims, ynext1 = local_state_space_iteration_k(yhat, epsilon, oo_.dr, M_, options_); end, tElapsed1 = toc(tStart1);
tStart2 = tic; [udr] = folded_to_unfolded_dr(oo_.dr, M_, options_); for i=1:nsims, ynext2 = local_state_space_iteration_fortran(yhat, epsilon, oo_.dr, M_, options_, udr); end, tElapsed2 = toc(tStart2);
if max(max(abs(ynext1-ynext2))) > 1e-14
error('Inconsistency between local_state_space_iteration_k and local_state_space_iteration_fortran')
end
if tElapsed2<tElapsed1
skipline()
dprintf('local_state_space_iteration_fortran is %5.2f times faster than local_state_space_iteration_k', tElapsed1/tElapsed2)
skipline()
else
skipline()
dprintf('local_state_space_iteration_fortran is %5.2f times slower than local_state_space_iteration_k', tElapsed2/tElapsed1)
skipline()
end

View File

@ -1,6 +1,8 @@
/*
Tests that local_state_space_iteration_2 and local_state_space_iteration_k
(for k=2) return the same results.
Tests that :
(i) local_state_space_iteration_2, local_state_space_iteration_k and local_state_space_iteration_fortran
(for k=2) return the same results
(ii) local_state_space_iteration_k and local_state_space_iteration_fortran return the same results for k=3 without parallelization.
This file must be run after first_spec.mod (both are based on the same model).
*/
@ -35,6 +37,9 @@ yhat = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
dr = oo_.dr;
options_.threads.local_state_space_iteration_fortran = 1;
options_.threads.local_state_space_iteration_k = 1;
// rf stands for Reduced Form
rf_ghx = dr.ghx(dr.restrict_var_list, :);
@ -86,7 +91,7 @@ tStart31 = tic; for i=1:nsims, ynext31 = local_state_space_iteration_k(yhat, eps
tStart32 = tic; [udr] = folded_to_unfolded_dr(oo_.dr, M_, options_); for i=1:nsims, ynext32 = local_state_space_iteration_fortran(yhat, epsilon, oo_.dr, M_, options_, udr); end, tElapsed32 = toc(tStart32);
if max(max(abs(ynext31-ynext32))) > 1e-14
error('Inconsistency between local_state_space_iteration_2 and local_state_space_iteration_fortran')
error('Inconsistency between local_state_space_iteration_k and local_state_space_iteration_fortran')
end
if tElapsed32<tElapsed31