dynare/dynare++/kord/faa_di_bruno.cc

// Copyright 2005, Ondra Kamenik
#include "faa_di_bruno.hh"
#include "fine_container.hh"
#include <cmath>
#include <tuple>
// |FaaDiBruno::calculate| folded sparse code
/* We take the opportunity to refine the stack container, in order to avoid
   allocating more memory than is available. */
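/* For each dimension $l$, |estimRefinement| below estimates the maximum
   number of rows |max| that keeps the expected memory consumption within
   the available memory; the stacks are then refined into pieces of at
   most |max| rows before |multAndAdd| is evaluated. */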
void
FaaDiBruno::calculate(const StackContainer<FGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      int max, mem_mb, p_size_mb;
      std::tie(max, mem_mb, p_size_mb) = estimRefinement(out.getDims(), out.nrows(), l);
      FoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
<< " stacks=" << cont.numStacks() << u8"" << fine_cont.numStacks() << endrec;
    }
}
// |FaaDiBruno::calculate| folded dense code
/* Here we simply evaluate |multAndAdd| for the dense container. There is
   no opportunity for tuning. */
void
FaaDiBruno::calculate(const FoldedStackContainer &cont, const FGSContainer &g,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
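      // Measure the available memory before the evaluation; it is
      // reported in the journal record below.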
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}
// |FaaDiBruno::calculate| unfolded sparse code
/* This is the same as the |FaaDiBruno::calculate| folded sparse code
   above. The only difference is that we construct an unfolded fine
   container. */
void
FaaDiBruno::calculate(const StackContainer<UGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      int max, mem_mb, p_size_mb;
      std::tie(max, mem_mb, p_size_mb) = estimRefinement(out.getDims(), out.nrows(), l);
      UnfoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
<< " stacks=" << cont.numStacks() << u8"" << fine_cont.numStacks() << endrec;
    }
}
// |FaaDiBruno::calculate| unfolded dense code
/* Again, no tuning opportunity here. */
void
FaaDiBruno::calculate(const UnfoldedStackContainer &cont, const UGSContainer &g,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}
/* This function returns the maximum number of rows used for the refinement
   of the stacked container. We want to set the maximum so that the expected
   memory consumption for the given number of parallel threads stays below
   the available memory. On the other hand, we do not want to be too
   pessimistic, since a very fine refinement can be very slow.

   Besides the memory needed for a dense unfolded slice of a tensor from
   |f|, each thread needs |magic_mult*per_size| bytes of memory. In the
   worst case, |magic_mult| would be equal to 2; this means memory
   |per_size| for the target temporary (permuted symmetry) tensor plus one
   copy for the intermediate result. However, this turns out to be too
   pessimistic, so we set |magic_mult| to 1.5. The memory for the permuted
   symmetry temporary tensor |per_size| is estimated as a weighted average
   of the unfolded memory of the |out| tensor and the unfolded memory of a
   symmetric tensor with the largest coordinate size. Some experiments
   showed that the best combination of the two is to take 100\% of the
   latter, so we set |lambda| to zero.

   The |max| number of rows in the refined |cont| must be such that each
   slice fits into the remaining memory. The number of columns of the
   slice is never greater than $max^l$. (This is not strictly true, since
   stacks corresponding to unit/zero matrices cannot be further refined.)
   We get the equation:
   $$nthreads\cdot max^l\cdot 8\cdot r = mem -
   magic\_mult\cdot nthreads\cdot per\_size,$$
   where |mem| is the available memory in bytes, |nthreads| is the number
   of threads, $r$ is the number of rows, and $8$ is |sizeof(double)|.

   If the right hand side is less than zero, we set |max| to 10, just to
   let it do something. */
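/* A worked example of the estimate, with hypothetical numbers: take
   $mem=2^{31}$ bytes (2 GiB), $nthreads=4$, $r=100$ rows,
   $per\_size=10\cdot 2^{20}$ bytes and $l=2$. Then
   $$num\_cols={2^{31}-1.5\cdot 4\cdot 10\cdot 2^{20}\over
   4\cdot 8\cdot 100}\approx 651428,$$
   and $max=\lfloor\sqrt{651428}\rfloor=807$, so each refined stack would
   have at most 807 rows. */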
std::tuple<int, int, int>
FaaDiBruno::estimRefinement(const TensorDimens &tdims, int nr, int l)
{
  int nthreads = sthread::detach_thread_group::max_parallel_threads;
  long per_size1 = tdims.calcUnfoldMaxOffset();
  long per_size2 = static_cast<long>(std::pow(tdims.getNVS().getMax(), l));
  double lambda = 0.0;
  long per_size = sizeof(double)*nr
    *static_cast<long>(lambda*per_size1+(1-lambda)*per_size2);
  long mem = SystemResources::availableMemory();
  int max = 0;
  double num_cols = static_cast<double>(mem-magic_mult*nthreads*per_size)
    /nthreads/sizeof(double)/nr;
  if (num_cols > 0)
    {
      double maxd = std::pow(num_cols, 1.0/l);
      max = static_cast<int>(std::floor(maxd));
    }
  if (max == 0)
    {
      max = 10;
      JournalRecord rec(journal);
      rec << "dim=" << l << " run out of memory, imposing max=" << max;
      if (nthreads > 1)
        rec << " (decrease number of threads)";
      rec << endrec;
    }
  int avmem_mb = mem/1024/1024;
  int tmpmem_mb = nthreads*per_size/1024/1024;
  return { max, avmem_mb, tmpmem_mb };
}
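/* A minimal usage sketch (the names |jrnl|, |zs|, |f| and |out| are
   hypothetical and assumed to be constructed elsewhere with matching
   dimensions):

     FaaDiBruno bruno(jrnl);
     bruno.calculate(zs, f, out);

   The overload chosen depends on whether the stack container and |out|
   are folded or unfolded, and on whether the second argument is the
   sparse container |f| or a dense derivative container. */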