Parallelizes local_state_space_iteration_fortran

2022-01-04 11:31:02 +01:00 · 2022-01-04 11:31:02 +01:00 · 5f828e241f
parent 3f00b51950
commit 5f828e241f
12 changed files with 285 additions and 33 deletions
--- a/mex/build/libkordersim.am
+++ b/mex/build/libkordersim.am
@ -7,7 +7,9 @@ nodist_libkordersim_a_SOURCES = \
 	sort.f08 \
 	partitions.f08 \
 	simulation.f08 \
-	struct.f08
+	struct.f08 \
+	pthread.F08 \
+	pparticle.f08

 BUILT_SOURCES = $(nodist_libkordersim_a_SOURCES)
 CLEANFILES = $(nodist_libkordersim_a_SOURCES)
@ -25,5 +27,8 @@ partitions.mod: partitions.o
 simulation.o: partitions.mod lapack.mod
 simulation.mod: simulation.o

+pparticle.o: simulation.mod matlab_mex.mod
+pparticle.mod: pparticle.o
+
 %.f08: $(top_srcdir)/../../sources/libkordersim/%.f08
 	$(LN_S) -f $< $@
--- a/mex/build/local_state_space_iteration_fortran.am
+++ b/mex/build/local_state_space_iteration_fortran.am
@ -1,11 +1,11 @@
 mex_PROGRAMS = local_state_space_iteration_fortran 

-local_state_space_iteration_fortran_FCFLAGS = $(AM_FCFLAGS) -I../libkordersim
+local_state_space_iteration_fortran_FCFLAGS = $(AM_FCFLAGS) -I../libkordersim -pthread

 nodist_local_state_space_iteration_fortran_SOURCES = \
 	mexFunction.f08

-local_state_space_iteration_fortran_LDADD = ../libkordersim/libkordersim.a
+local_state_space_iteration_fortran_LDADD = ../libkordersim/libkordersim.a 

 BUILT_SOURCES = $(nodist_local_state_space_iteration_fortran_SOURCES)
 CLEANFILES = $(nodist_local_state_space_iteration_fortran_SOURCES)
--- a/mex/build/matlab/configure.ac
+++ b/mex/build/matlab/configure.ac
@ -56,6 +56,9 @@ AM_PROG_AR

 AX_CXX11_THREAD

+# Get the size of a pthread_t instance, needed by pthread.F08
+AC_CHECK_SIZEOF([pthread_t], [], [#include <pthread.h>])
+
 # Check for dlopen(), needed by k_order_perturbation DLL
 AC_CHECK_LIB([dl], [dlopen], [LIBADD_DLOPEN="-ldl"], [])
 AC_SUBST([LIBADD_DLOPEN])
--- a/mex/build/matlab/mex.am
+++ b/mex/build/matlab/mex.am
@ -32,7 +32,7 @@ clean-local:
 		cd $(top_srcdir)/../../matlab && rm -f $(PROGRAMS); \
 	fi

-# Rules for the Fortran 2008 interface to MEX and BLAS/LAPACK functions
+# Rules for the Fortran 2008 interface to MEX, BLAS/LAPACK and pthread functions
 matlab_mat.mod: matlab_mex.o
 matlab_mex.mod: matlab_mex.o

@ -44,3 +44,8 @@ lapack.mod: blas_lapack.o

 blas_lapack.F08: $(top_srcdir)/../../sources/blas_lapack.F08
 	$(LN_S) -f $< $@
+
+pthread.mod: pthread.o
+
+pthread.F08: $(top_srcdir)/../../sources/pthread.F08
+	$(LN_S) -f $< $@
--- a/mex/build/octave/configure.ac
+++ b/mex/build/octave/configure.ac
@ -52,6 +52,9 @@ AM_PROG_AR

 AX_CXX11_THREAD

+# Get the size of a pthread_t instance, needed by pthread.F08
+AC_CHECK_SIZEOF([pthread_t], [], [#include <pthread.h>])
+
 # Check for dlopen(), needed by k_order_perturbation DLL
 AC_CHECK_LIB([dl], [dlopen], [LIBADD_DLOPEN="-ldl"], [])
 AC_SUBST([LIBADD_DLOPEN])
--- a/mex/build/octave/mex.am
+++ b/mex/build/octave/mex.am
@ -40,7 +40,7 @@ clean-local:
 		cd $(top_srcdir)/../../octave && rm -f $(PROGRAMS); \
 	fi

-# Rules for the Fortran 2008 interface to MEX and BLAS/LAPACK functions
+# Rules for the Fortran 2008 interface to MEX, BLAS/LAPACK and pthread functions
 matlab_mat.mod: matlab_mex.o
 matlab_mex.mod: matlab_mex.o

@ -52,3 +52,8 @@ lapack.mod: blas_lapack.o

 blas_lapack.F08: $(top_srcdir)/../../sources/blas_lapack.F08
 	$(LN_S) -f $< $@
+
+pthread.mod: pthread.o
+
+pthread.F08: $(top_srcdir)/../../sources/pthread.F08
+	$(LN_S) -f $< $@
--- a/mex/sources/libkordersim/pparticle.f08
+++ b/mex/sources/libkordersim/pparticle.f08
@ -0,0 +1,78 @@
+! Copyright © 2021 Dynare Team
+!
+! This file is part of Dynare.
+!
+! Dynare is free software: you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation, either version 3 of the License, or
+! (at your option) any later version.
+!
+! Dynare is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License
+! along with Dynare.  If not, see <https://www.gnu.org/licenses/>.
+
+! Routines and data structures for multithreading over particles in local_state_space_iteration_fortran
+module pparticle
+   use iso_c_binding
+   use simulation
+   use matlab_mex
+
+   implicit none
+
+   type tdata
+      integer :: nm, nys, endo_nbr, nvar, order, nrestricted, nparticles 
+      real(real64), allocatable :: yhat(:,:), e(:,:), ynext(:,:), ys_reordered(:), restrict_var_list(:)
+      type(pol), dimension(:), allocatable :: udr
+   end type tdata
+
+   type(tdata) :: thread_data
+
+contains
+
+   subroutine thread_eval(arg) bind(c)
+      type(c_ptr), intent(in), value :: arg
+      integer, pointer :: im
+      integer :: i, j, start, end, q, r, ind
+      type(horner), dimension(:), allocatable :: h
+      real(real64), dimension(:), allocatable :: dyu
+
+      ! Checking that the thread number got passed as argument
+      if (.not. c_associated(arg)) then
+         call mexErrMsgTxt("No argument was passed to thread_eval")
+      end if
+      call c_f_pointer(arg, im)
+
+      ! Allocating local arrays
+      allocate(h(0:thread_data%order), dyu(thread_data%nvar)) 
+      do i=0, thread_data%order
+         allocate(h(i)%c(thread_data%endo_nbr, thread_data%nvar**i))
+      end do
+
+      ! Specifying bounds for the curent thread
+      q = thread_data%nparticles / thread_data%nm
+      r = mod(thread_data%nparticles, thread_data%nm)
+      start = (im-1)*q+1
+      if (im < thread_data%nm) then
+         end = start+q-1
+      else
+         end = thread_data%nparticles
+      end if
+
+      ! Using the Horner algorithm to evaluate the decision rule at the chosen yhat and epsilon
+      do j=start,end
+         dyu(1:thread_data%nys) = thread_data%yhat(:,j) 
+         dyu(thread_data%nys+1:) = thread_data%e(:,j) 
+         call eval(h, dyu, thread_data%udr, thread_data%endo_nbr, thread_data%nvar, thread_data%order)
+         do i=1,thread_data%nrestricted
+            ind = int(thread_data%restrict_var_list(i))
+            thread_data%ynext(i,j) = h(0)%c(ind,1) + thread_data%ys_reordered(ind)
+         end do
+      end do
+
+   end subroutine thread_eval
+
+end module pparticle
--- a/mex/sources/local_state_space_iteration_fortran/mexFunction.f08
+++ b/mex/sources/local_state_space_iteration_fortran/mexFunction.f08
@ -30,21 +30,21 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
   use iso_c_binding
   use struct
   use matlab_mex
-   use partitions
-   use simulation
+   use pthread
+   use pparticle
   implicit none

   type(c_ptr), dimension(*), intent(in), target :: prhs
   type(c_ptr), dimension(*), intent(out) :: plhs
   integer(c_int), intent(in), value :: nlhs, nrhs
   type(c_ptr) :: M_mx, options_mx, dr_mx, yhat_mx, epsilon_mx, udr_mx, tmp
-   type(pol), dimension(:), allocatable, target :: udr
-   integer :: order, nstatic, npred, nboth, nfwrd, exo_nbr, endo_nbr, nparticles, nys, nvar, nrestricted
-   real(real64), dimension(:), allocatable :: ys_reordered, dyu
+   type(pol), dimension(:), allocatable :: udr
+   integer :: order, nstatic, npred, nboth, nfwrd, exo_nbr, endo_nbr, nparticles, nys, nvar, nrestricted, nm
   real(real64), dimension(:), pointer, contiguous :: order_var, ys, restrict_var_list
-   real(real64), dimension(:,:), allocatable :: yhat, e, ynext, ynext_all
-   type(horner), dimension(:), allocatable :: h
-   integer :: i, j, m, n
+   real(real64), allocatable :: yhat(:,:), e(:,:), ynext(:,:), ys_reordered(:)
+   integer :: i, m, n, rc
+   type(c_pthread_t), allocatable :: threads(:)
+   integer, allocatable, target :: routines(:)
   character(kind=c_char, len=10) :: fieldname

   yhat_mx = prhs(1)
@ -114,6 +114,13 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
      restrict_var_list => mxGetPr(restrict_var_list_mx)
   end associate

+   associate (thread_mx => mxGetField(options_mx, 1_mwIndex, "threads"))
+      if (.not. c_associated(thread_mx)) then
+         call mexErrMsgTxt("Cannot find `threads' in options_")
+      end if
+      nm = get_int_field(thread_mx, "local_state_space_iteration_fortran")
+   end associate
+
   nparticles = int(mxGetN(yhat_mx));
   if (int(mxGetN(epsilon_mx)) /= nparticles) then
      call mexErrMsgTxt("epsilon and yhat don't have the same number of columns")
@ -125,11 +132,12 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
      call mexErrMsgTxt("epsilon should be a double precision matrix with exo_nbr rows")
   end if

-   allocate(yhat(nys, nparticles), e(exo_nbr, nparticles), ynext(nrestricted, nparticles), ynext_all(endo_nbr, nparticles))
+   allocate(yhat(nys, nparticles), e(exo_nbr, nparticles), ynext(nrestricted, nparticles))
   yhat = reshape(mxGetPr(yhat_mx), [nys, nparticles])
   e = reshape(mxGetPr(epsilon_mx), [exo_nbr, nparticles])

-   allocate(h(0:order), udr(0:order)) 
+
+   allocate(udr(0:order)) 
   do i = 0, order
      write (fieldname, '(a2, i1)') "g_", i
      tmp = mxGetField(udr_mx, 1_mwIndex, trim(fieldname))
@ -138,23 +146,45 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
      end if
      m = int(mxGetM(tmp))
      n = int(mxGetN(tmp))
-      allocate(udr(i)%g(m,n), h(i)%c(endo_nbr, nvar**i))
+      allocate(udr(i)%g(m,n))
      udr(i)%g(1:m,1:n) = reshape(mxGetPr(tmp), [m,n])
   end do

-   ! Using the Horner algorithm to evaluate the decision rule at the chosen yhat and epsilon
-   allocate(dyu(nvar))
-   do j=1,nparticles
-      dyu(1:nys) = yhat(:,j) 
-      dyu(nys+1:) = e(:,j) 
-      call eval(h, dyu, udr, endo_nbr, nvar, order)
-      ynext_all(:,j) = h(0)%c(:,1) + ys_reordered
-      do i=1,nrestricted
-         ynext(i,j) = ynext_all(int(restrict_var_list(i)),j) 
-      end do
-   end do
+   ! Initializing the global structure containing
+   ! useful information for threads
+   thread_data%nm = nm
+   thread_data%nys = nys
+   thread_data%endo_nbr = endo_nbr
+   thread_data%nvar = nvar
+   thread_data%order = order
+   thread_data%nrestricted = nrestricted
+   thread_data%nparticles = nparticles
+   thread_data%yhat = yhat
+   thread_data%e = e
+   thread_data%ynext = ynext
+   thread_data%udr = udr
+   thread_data%ys_reordered = ys_reordered
+   thread_data%restrict_var_list = restrict_var_list

+   allocate(threads(nm), routines(nm))
+   routines = [ (i, i = 1, nm) ]
+
+   if (nm == 1) then
+      call thread_eval(c_loc(routines(1)))
+   else
+      ! Creating the threads
+      do i = 1, nm
+         rc = c_pthread_create(threads(i), c_null_ptr, c_funloc(thread_eval), c_loc(routines(i)))
+      end do
+
+      ! Joining the threads
+      do i = 1, nm
+         rc = c_pthread_join(threads(i), c_loc(routines(i)))
+      end do
+   end if
+
+   ! Returning the result
   plhs(1) = mxCreateDoubleMatrix(int(size(restrict_var_list), mwSize), int(nparticles, mwSize), mxREAL)
-   mxGetPr(plhs(1)) = reshape(ynext, [size(ynext)])
- 
+   mxGetPr(plhs(1)) = reshape(thread_data%ynext, [size(thread_data%ynext)])
+
 end subroutine mexFunction
--- a/mex/sources/pthread.F08
+++ b/mex/sources/pthread.F08
@ -0,0 +1,56 @@
+! Copyright © 2022 Dynare Team
+!
+! This file is part of Dynare.
+!
+! Dynare is free software: you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation, either version 3 of the License, or
+! (at your option) any later version.
+!
+! Dynare is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License
+! along with Dynare.  If not, see <https://www.gnu.org/licenses/>.
+
+! Wrapper around pthread_create and pthread_join C routines for POSIX multithreading
+module pthread
+   use iso_c_binding
+   implicit none
+   private
+
+   public :: c_pthread_create
+   public :: c_pthread_join
+
+   ! SIZEOF_PTHREAD_T is set by the AC_CHECK_SIZEOF routine
+   integer, parameter :: pthread_size = SIZEOF_PTHREAD_T
+
+   type, bind(c), public :: c_pthread_t
+      private
+      integer(kind=c_char) :: hidden(pthread_size)
+   end type c_pthread_t
+
+   interface
+
+      function c_pthread_create(thread, attr, start_routine, arg) bind(c, name='pthread_create')
+         import :: c_int, c_ptr, c_funptr, c_pthread_t
+         implicit none
+         type(c_pthread_t), intent(inout)     :: thread
+         type(c_ptr), intent(in), value :: attr
+         type(c_funptr), intent(in), value :: start_routine
+         type(c_ptr), intent(in), value :: arg
+         integer(kind=c_int) :: c_pthread_create
+      end function c_pthread_create
+
+      function c_pthread_join(thread, value_ptr) bind(c, name='pthread_join')
+         import :: c_int, c_ptr, c_pthread_t
+         implicit none
+         type(c_pthread_t), intent(in), value :: thread
+         type(c_ptr),       intent(in) :: value_ptr
+         integer(kind=c_int) :: c_pthread_join
+      end function c_pthread_join
+
+    end interface
+end module pthread
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@ -618,7 +618,8 @@ PARTICLEFILES = \
 	particle/dsge_base2.mod \
 	particle/first_spec.mod \
 	particle/first_spec_MCMC.mod \
-	particle/local_state_space_iteration_k_test.mod
+	particle/local_state_space_iteration_k_test.mod \
+	particle/local_it_k_test_parallel.mod


 XFAIL_MODFILES = ramst_xfail.mod \
@ -928,6 +929,9 @@ discretionary_policy/Gali_2015_chapter_3_nonlinear.o.trs: discretionary_policy/G
 particle/local_state_space_iteration_k_test.m.trs: particle/first_spec.m.trs
 particle/local_state_space_iteration_k_test.o.trs: particle/first_spec.o.trs

+particle/local_it_k_test_parallel.m.trs: particle/first_spec.m.trs
+particle/local_it_k_test_parallel.o.trs: particle/first_spec.o.trs
+
 pruning/AS_pruned_state_space_red_shock.m.trs: pruning/AnSchorfheide_pruned_state_space.m.trs
 pruning/AS_pruned_state_space_red_shock.o.trs: pruning/AnSchorfheide_pruned_state_space.o.trs

--- a/tests/particle/local_it_k_test_parallel.mod
+++ b/tests/particle/local_it_k_test_parallel.mod
@ -0,0 +1,58 @@
+/*
+  Tests that the parallelized version of local_state_space_iteration_k and local_state_space_iteration_fortran
+  (for k=3) return the same results.
+
+  This file must be run after first_spec.mod (both are based on the same model).
+*/
+
+@#include "first_spec_common.inc"
+
+varobs q ca;
+
+shocks;
+var eeps = 0.04^2;
+var nnu = 0.03^2;
+var q = 0.01^2;
+var ca = 0.01^2;
+end;
+
+// Initialize various structures
+estimation(datafile='my_data.mat',order=2,mode_compute=0,mh_replic=0,filter_algorithm=sis,nonlinear_filter_initialization=2
+    ,cova_compute=0 %tell program that no covariance matrix was computed
+);
+
+stoch_simul(order=2, periods=200, irf=0, k_order_solver);
+
+// Really perform the test
+
+nparticles = options_.particle.number_of_particles;
+nsims = 1e6/nparticles;
+
+/* We generate particles using realistic distributions (though this is not
+   strictly needed) */
+state_idx = oo_.dr.order_var((M_.nstatic+1):(M_.nstatic+M_.npred+M_.nboth));
+yhat = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
+epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
+
+stoch_simul(order=3, periods=200, irf=0, k_order_solver);
+
+options_.threads.local_state_space_iteration_k = 2;
+options_.threads.local_state_space_iteration_fortran = 2;
+
+tStart1 = tic; for i=1:nsims, ynext1 = local_state_space_iteration_k(yhat, epsilon, oo_.dr, M_, options_); end, tElapsed1 = toc(tStart1);
+
+tStart2 = tic; [udr] = folded_to_unfolded_dr(oo_.dr, M_, options_); for i=1:nsims, ynext2 = local_state_space_iteration_fortran(yhat, epsilon, oo_.dr, M_, options_, udr); end, tElapsed2 = toc(tStart2);
+
+if max(max(abs(ynext1-ynext2))) > 1e-14
+    error('Inconsistency between local_state_space_iteration_k and local_state_space_iteration_fortran')
+end
+
+if tElapsed2<tElapsed1
+    skipline()
+    dprintf('local_state_space_iteration_fortran is %5.2f times faster than local_state_space_iteration_k', tElapsed1/tElapsed2)
+    skipline()
+else
+    skipline()
+    dprintf('local_state_space_iteration_fortran is %5.2f times slower than local_state_space_iteration_k', tElapsed2/tElapsed1)
+    skipline()
+end
--- a/tests/particle/local_state_space_iteration_k_test.mod
+++ b/tests/particle/local_state_space_iteration_k_test.mod
@ -1,6 +1,8 @@
 /*
-  Tests that local_state_space_iteration_2 and local_state_space_iteration_k
-  (for k=2) return the same results.
+  Tests that :
+  (i) local_state_space_iteration_2, local_state_space_iteration_k and local_state_space_iteration_fortran
+  (for k=2) return the same results
+  (ii) local_state_space_iteration_k and local_state_space_iteration_fortran return the same results for k=3 without parallelization.

  This file must be run after first_spec.mod (both are based on the same model).
 */
@ -35,6 +37,9 @@ yhat = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
 epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);

 dr = oo_.dr;
+options_.threads.local_state_space_iteration_fortran = 1;
+options_.threads.local_state_space_iteration_k = 1;
+

 // “rf” stands for “Reduced Form”
 rf_ghx = dr.ghx(dr.restrict_var_list, :);
@ -86,7 +91,7 @@ tStart31 = tic; for i=1:nsims, ynext31 = local_state_space_iteration_k(yhat, eps
 tStart32 = tic; [udr] = folded_to_unfolded_dr(oo_.dr, M_, options_); for i=1:nsims, ynext32 = local_state_space_iteration_fortran(yhat, epsilon, oo_.dr, M_, options_, udr); end, tElapsed32 = toc(tStart32);

 if max(max(abs(ynext31-ynext32))) > 1e-14
-    error('Inconsistency between local_state_space_iteration_2 and local_state_space_iteration_fortran')
+    error('Inconsistency between local_state_space_iteration_k and local_state_space_iteration_fortran')
 end

 if tElapsed32<tElapsed31