/*
 * Copyright © 2005 Ondra Kamenik
 * Copyright © 2019-2022 Dynare Team
 *
 * This file is part of Dynare.
 *
 * Dynare is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dynare is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dynare. If not, see <https://www.gnu.org/licenses/>.
 */

#include "faa_di_bruno.hh"
#include "fine_container.hh"

#include <cmath>
// FaaDiBruno::calculate() folded sparse code
/* We take the opportunity to refine the stack container, so as to avoid
   allocating more memory than is available. */
void
FaaDiBruno::calculate(const StackContainer<FGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      auto [max, mem_mb, p_size_mb] = estimRefinement(out.getDims(), out.nrows(), l);
      FoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
           << " stacks=" << cont.numStacks() << "→" << fine_cont.numStacks() << endrec;
    }
}

// FaaDiBruno::calculate() folded dense code
/* Here we simply evaluate multAndAdd() for the dense container. There is no
   opportunity for tuning. */
void
FaaDiBruno::calculate(const FoldedStackContainer &cont, const FGSContainer &g,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}

// FaaDiBruno::calculate() unfolded sparse code
/* This is the same as the folded sparse code above; the only difference is
   that we construct an unfolded fine container. */
void
FaaDiBruno::calculate(const StackContainer<UGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      auto [max, mem_mb, p_size_mb] = estimRefinement(out.getDims(), out.nrows(), l);
      UnfoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
           << " stacks=" << cont.numStacks() << "→" << fine_cont.numStacks() << endrec;
    }
}

// FaaDiBruno::calculate() unfolded dense code
/* Again, no tuning opportunity here. */
void
FaaDiBruno::calculate(const UnfoldedStackContainer &cont, const UGSContainer &g,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}

/* This function returns the maximum number of rows used for the refinement of
   the stacked container. We want to set the maximum so that the expected
   memory consumption for the given number of parallel threads stays below the
   available memory. On the other hand, we do not want to be too pessimistic,
   since a very fine refinement can be very slow.

   Besides the memory needed for a dense unfolded slice of a tensor from ‘f’,
   each thread needs ‘magic_mult*per_size’ bytes of memory. In the worst case,
   ‘magic_mult’ would be equal to 2: memory ‘per_size’ for the target
   temporary (permuted symmetry) tensor, plus one copy for the intermediate
   result. However, this proves to be too pessimistic, so we set ‘magic_mult’
   to 1.5. The memory for the permuted symmetry temporary tensor ‘per_size’ is
   estimated as a weighted average of the unfolded memory of the ‘out’ tensor
   and the unfolded memory of a symmetric tensor with the largest coordinate
   size. Some experiments showed that the best combination of the two is to
   take 100% of the latter, so we set ‘lambda’ to zero.

   The ‘max’ number of rows in the refined ‘cont’ must be such that each slice
   fits into the remaining memory. The number of columns of a slice is never
   greater than maxˡ. (This is not always true, since stacks corresponding to
   unit/zero matrices cannot be further refined.) We get the equation:

    nthreads·maxˡ·8·r = mem − magic_mult·nthreads·per_size·8·r

   where ‘mem’ is the available memory in bytes, ‘nthreads’ is the number of
   threads, r is the number of rows, and 8 is ‘sizeof(double)’.

   If the right-hand side is less than zero, we set ‘max’ to 10, just to let
   it do something. */
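
/* A worked example with purely illustrative numbers (not measured values).
   In the units used by estimRefinement() below, where the ‘per_size’
   variable is already in bytes, solving the equation above for ‘max’ gives

     max = ⌊((mem − magic_mult·nthreads·per_size)/(nthreads·8·r))^(1/l)⌋

   With mem = 2³¹ bytes (2 GiB), nthreads = 4, r = 1000, l = 2,
   magic_mult = 1.5 and per_size = 8·10⁷ bytes, each slice may have at most

     (2147483648 − 1.5·4·8·10⁷)/(4·8·1000) ≈ 52109 columns,

   hence max = ⌊√52109⌋ = 228. */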
std::tuple<int, int, int>
FaaDiBruno::estimRefinement(const TensorDimens &tdims, int nr, int l)
{
  int nthreads = sthread::detach_thread_group::max_parallel_threads;
  // Weighted average of the unfolded size of ‘out’ (per_size1) and of a
  // symmetric tensor with the largest coordinate size (per_size2);
  // lambda = 0 means we take 100% of the latter (see the comment above).
  long per_size1 = tdims.calcUnfoldMaxOffset();
  long per_size2 = static_cast<long>(std::pow(tdims.getNVS().getMax(), l));
  double lambda = 0.0;
  long per_size = sizeof(double)*nr
    *static_cast<long>(lambda*per_size1+(1-lambda)*per_size2);
  long mem = SystemResources::availableMemory();
  int max = 0;
  double num_cols = static_cast<double>(mem-magic_mult*nthreads*per_size)
    /nthreads/sizeof(double)/nr;
  if (num_cols > 0)
    {
      double maxd = std::pow(num_cols, 1.0/l);
      max = static_cast<int>(std::floor(maxd));
    }
  if (max == 0)
    {
      // The right-hand side of the equation was non-positive: impose a small
      // refinement anyway, just to let the computation proceed.
      max = 10;
      JournalRecord rec(journal);
      rec << "dim=" << l << " ran out of memory, imposing max=" << max;
      if (nthreads > 1)
        rec << " (decrease number of threads)";
      rec << endrec;
    }
  int avmem_mb = mem/1024/1024;
  int tmpmem_mb = nthreads*per_size/1024/1024;
  return { max, avmem_mb, tmpmem_mb };
}
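
/* A minimal usage sketch (hypothetical driver code; within Dynare these
   routines are invoked from the k-order approximation machinery). The
   identifiers ‘stacked_g’, ‘sparse_f’ and ‘out’ are assumptions for
   illustration, standing for suitably constructed containers:

     Journal journal("journal.txt");
     FaaDiBruno fdb(journal);
     fdb.calculate(stacked_g, sparse_f, out); // folded sparse variant

   where ‘stacked_g’ is a StackContainer<FGSTensor>, ‘sparse_f’ is a
   TensorContainer<FSSparseTensor>, and ‘out’ is an FGSTensor with matching
   dimensions. */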