/*
 * Copyright © 2005 Ondra Kamenik
 * Copyright © 2019-2022 Dynare Team
 *
 * This file is part of Dynare.
 *
 * Dynare is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Dynare is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Dynare. If not, see <https://www.gnu.org/licenses/>.
 */

#include "faa_di_bruno.hh"
#include "fine_container.hh"

#include <cmath>
// FaaDiBruno::calculate() folded sparse code
/* We take the opportunity to refine the stack container, so as to avoid
   allocating more memory than is available. */
void
FaaDiBruno::calculate(const StackContainer<FGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      auto [max, mem_mb, p_size_mb] = estimRefinement(out.getDims(), out.nrows(), l);
      FoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
           << " stacks=" << cont.numStacks() << "→" << fine_cont.numStacks() << endrec;
    }
}

// FaaDiBruno::calculate() folded dense code
/* Here we simply evaluate multAndAdd() for the dense container. There is no
   opportunity for tuning. */
void
FaaDiBruno::calculate(const FoldedStackContainer &cont, const FGSContainer &g,
                      FGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}

// FaaDiBruno::calculate() unfolded sparse code
/* This is the same as the folded sparse code above; the only difference is
   that we construct an unfolded fine container. */
void
FaaDiBruno::calculate(const StackContainer<UGSTensor> &cont,
                      const TensorContainer<FSSparseTensor> &f,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      auto [max, mem_mb, p_size_mb] = estimRefinement(out.getDims(), out.nrows(), l);
      UnfoldedFineContainer fine_cont(cont, max);
      fine_cont.multAndAdd(l, f, out);
      JournalRecord recc(journal);
      recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
           << " stacks=" << cont.numStacks() << "→" << fine_cont.numStacks() << endrec;
    }
}

// FaaDiBruno::calculate() unfolded dense code
/* Again, no tuning opportunity here. */
void
FaaDiBruno::calculate(const UnfoldedStackContainer &cont, const UGSContainer &g,
                      UGSTensor &out)
{
  out.zeros();
  for (int l = 1; l <= out.dimen(); l++)
    {
      long int mem = SystemResources::availableMemory();
      cont.multAndAdd(l, g, out);
      JournalRecord rec(journal);
      int mem_mb = mem/1024/1024;
      rec << "dim=" << l << " avmem=" << mem_mb << endrec;
    }
}

/* This function returns the maximum number of rows used for the refinement of
   the stacked container. We want to set the maximum so that the expected
   memory consumption for the given number of parallel threads stays below the
   available memory. On the other hand, we do not want to be too pessimistic,
   since a very fine refinement can be very slow.

   Besides the memory needed for a dense unfolded slice of a tensor from ‘f’,
   each thread needs ‘magic_mult*per_size’ bytes of memory. In the worst case,
   ‘magic_mult’ would be equal to 2: memory ‘per_size’ for the target
   temporary (permuted symmetry) tensor, plus one copy for the intermediate
   result. However, this proves to be too pessimistic, so we set ‘magic_mult’
   to 1.5. The memory for the permuted symmetry temporary tensor ‘per_size’ is
   estimated as a weighted average of the unfolded memory of the ‘out’ tensor
   and the unfolded memory of a symmetric tensor with the largest coordinate
   size. Some experiments showed that the best combination of the two is to
   take 100% of the latter, so we set ‘lambda’ to zero.

   The ‘max’ number of rows in the refined ‘cont’ must be such that each slice
   fits into the remaining memory. The number of columns of a slice is never
   greater than maxˡ. (This is not always true, since stacks corresponding to
   unit/zero matrices cannot be further refined.) We get the equation:

    nthreads·maxˡ·8·r = mem − magic_mult·nthreads·per_size·8·r

   where ‘mem’ is the available memory in bytes, ‘nthreads’ is the number of
   threads, r is the number of rows, and 8 is ‘sizeof(double)’.

   If the right-hand side is less than zero, we set ‘max’ to 10, just to let
   it do something. */
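
/* A worked example with purely illustrative numbers (not measured values).
   In the units used by estimRefinement() below, where the ‘per_size’
   variable is already in bytes, solving the equation above for ‘max’ gives

     max = ⌊((mem − magic_mult·nthreads·per_size)/(nthreads·8·r))^(1/l)⌋

   With mem = 2³¹ bytes (2 GiB), nthreads = 4, r = 1000, l = 2,
   magic_mult = 1.5 and per_size = 8·10⁷ bytes, each slice may have at most

     (2147483648 − 1.5·4·8·10⁷)/(4·8·1000) ≈ 52109 columns,

   hence max = ⌊√52109⌋ = 228. */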
std::tuple<int, int, int>
FaaDiBruno::estimRefinement(const TensorDimens &tdims, int nr, int l)
{
  int nthreads = sthread::detach_thread_group::max_parallel_threads;
  // Weighted average of the unfolded size of ‘out’ (per_size1) and of a
  // symmetric tensor with the largest coordinate size (per_size2);
  // lambda = 0 means we take 100% of the latter (see the comment above).
  long per_size1 = tdims.calcUnfoldMaxOffset();
  long per_size2 = static_cast<long>(std::pow(tdims.getNVS().getMax(), l));
  double lambda = 0.0;
  long per_size = sizeof(double)*nr
    *static_cast<long>(lambda*per_size1+(1-lambda)*per_size2);
  long mem = SystemResources::availableMemory();
  int max = 0;
  double num_cols = static_cast<double>(mem-magic_mult*nthreads*per_size)
    /nthreads/sizeof(double)/nr;
  if (num_cols > 0)
    {
      double maxd = std::pow(num_cols, 1.0/l);
      max = static_cast<int>(std::floor(maxd));
    }
  if (max == 0)
    {
      // The right-hand side of the equation was non-positive: impose a small
      // refinement anyway, just to let the computation proceed.
      max = 10;
      JournalRecord rec(journal);
      rec << "dim=" << l << " ran out of memory, imposing max=" << max;
      if (nthreads > 1)
        rec << " (decrease number of threads)";
      rec << endrec;
    }
  int avmem_mb = mem/1024/1024;
  int tmpmem_mb = nthreads*per_size/1024/1024;
  return { max, avmem_mb, tmpmem_mb };
}
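
/* A minimal usage sketch (hypothetical driver code; within Dynare these
   routines are invoked from the k-order approximation machinery). The
   identifiers ‘stacked_g’, ‘sparse_f’ and ‘out’ are assumptions for
   illustration, standing for suitably constructed containers:

     Journal journal("journal.txt");
     FaaDiBruno fdb(journal);
     fdb.calculate(stacked_g, sparse_f, out); // folded sparse variant

   where ‘stacked_g’ is a StackContainer<FGSTensor>, ‘sparse_f’ is a
   TensorContainer<FSSparseTensor>, and ‘out’ is an FGSTensor with matching
   dimensions. */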