dynare/dynare++/kord/faa_di_bruno.cweb

@q $Id: faa_di_bruno.cweb 744 2006-05-09 13:16:07Z kamenik $ @>
@q Copyright 2005, Ondra Kamenik @>

@ Start of {\tt faa\_di\_bruno.cpp} file.

@s FoldedFineContainer int
@s UnfoldedFineContainer int

@c
#include "faa_di_bruno.h"
#include "fine_container.h"

#include <climits>
#include <cmath>

double FaaDiBruno::magic_mult = 1.5;
@<|FaaDiBruno::calculate| folded sparse code@>;
@<|FaaDiBruno::calculate| folded dense code@>;
@<|FaaDiBruno::calculate| unfolded sparse code@>;
@<|FaaDiBruno::calculate| unfolded dense code@>;
@<|FaaDiBruno::estimRefinment| code@>;

@ We take an opportunity to refine the stack container to avoid
allocation of more memory than available.

@<|FaaDiBruno::calculate| folded sparse code@>=
void FaaDiBruno::calculate(const StackContainer<FGSTensor>& cont,
						   const TensorContainer<FSSparseTensor>& f,
						   FGSTensor& out)
{
	out.zeros();
	for (int l = 1; l <= out.dimen(); l++) {
		int mem_mb, p_size_mb;
		int max = estimRefinment(out.getDims(), out.nrows(), l, mem_mb, p_size_mb);
		FoldedFineContainer fine_cont(cont, max);
		fine_cont.multAndAdd(l, f, out);
		JournalRecord recc(journal);
		recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
			 << " stacks=" << cont.numStacks() << "->" << fine_cont.numStacks() << endrec;
	}
}

@ Here we just simply evaluate |multAndAdd| for the dense
container. There is no opportunity for tuning.

@<|FaaDiBruno::calculate| folded dense code@>=
/* Zero |out| and call |multAndAdd| of the dense container for every
   dimension from 1 up to |out.dimen()|. The available memory is sampled
   before each call and reported in the journal after it. */
void FaaDiBruno::calculate(const FoldedStackContainer& cont, const FGSContainer& g,
						   FGSTensor& out)
{
	out.zeros();
	for (int dim = 1; dim <= out.dimen(); ++dim) {
		/* sample before the multiplication, report after it */
		long int avail_bytes = SystemResources::availableMemory();
		int avail_mb = avail_bytes/1024/1024;
		cont.multAndAdd(dim, g, out);
		JournalRecord rec(journal);
		rec << "dim=" << dim << " avmem=" << avail_mb;
		rec << endrec;
	}
}

@ This is the same as |@<|FaaDiBruno::calculate| folded sparse
code@>|. The only difference is that we construct unfolded fine
container.

@<|FaaDiBruno::calculate| unfolded sparse code@>=
void FaaDiBruno::calculate(const StackContainer<UGSTensor>& cont,
						   const TensorContainer<FSSparseTensor>& f,
						   UGSTensor& out)
{
	out.zeros();
	for (int l = 1; l <= out.dimen(); l++) {
		int mem_mb, p_size_mb;
		int max = estimRefinment(out.getDims(), out.nrows(), l, mem_mb, p_size_mb);
		UnfoldedFineContainer fine_cont(cont, max);
		fine_cont.multAndAdd(l, f, out);
		JournalRecord recc(journal);
		recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max
			 << " stacks=" << cont.numStacks() << "->" << fine_cont.numStacks() << endrec;
	}
}

@ Again, no tuning opportunity here.
@<|FaaDiBruno::calculate| unfolded dense code@>=
/* Zero |out| and call |multAndAdd| of the dense unfolded container for
   every dimension from 1 up to |out.dimen()|. The available memory is
   sampled before each call and reported in the journal after it. */
void FaaDiBruno::calculate(const UnfoldedStackContainer& cont, const UGSContainer& g,
						   UGSTensor& out)
{
	out.zeros();
	for (int dim = 1; dim <= out.dimen(); ++dim) {
		/* sample before the multiplication, report after it */
		long int avail_bytes = SystemResources::availableMemory();
		int avail_mb = avail_bytes/1024/1024;
		cont.multAndAdd(dim, g, out);
		JournalRecord rec(journal);
		rec << "dim=" << dim << " avmem=" << avail_mb;
		rec << endrec;
	}
}

@ This function returns the maximum number of rows used for refinement
of the stacked container. We want to set the maximum so that the
expected memory consumption for the number of parallel threads is less
than the available memory. On the other hand, we do not want to be too
pessimistic, since a very fine refinement can be very slow.

Besides memory needed for a dense unfolded slice of a tensor from
|f|, each thread needs |magic_mult*per_size| bytes of memory. In the
worst case, |magic_mult| would be equal to 2; this means memory
|per_size| for the target temporary (permuted symmetry) tensor plus one
copy for an intermediate result. However, this turns out to be too
pessimistic, so we set |magic_mult| to 1.5. The memory for the permuted
symmetry temporary tensor |per_size| is estimated as a weighted
average of the unfolded memory of the |out| tensor and the unfolded
memory of a symmetric tensor with the largest coordinate size. Some
experiments showed that the best combination of the two is to take
100\% of the latter, so we set |lambda| to zero.

The |max| number of rows in the refined |cont| must be such that each
slice fits into the remaining memory. The number of columns of the slice
is assumed never to be greater than $max^l$. (This is not strictly true,
since stacks corresponding to unit/zero matrices cannot be further
refined.) We get an equation:

$$nthreads\cdot max^l\cdot 8\cdot r = mem -
magic\_mult\cdot nthreads\cdot per\_size,$$
where |mem| is available memory in bytes, |nthreads| is a number of
threads, $r$ is a number of rows, $8$ is |sizeof(double)|, and
|per_size| is already expressed in bytes (it includes the factor
$8\cdot r$).

If the right hand side is not positive, or if the resulting |max|
rounds down to zero, we set |max| to 10, just to let it do something.

@<|FaaDiBruno::estimRefinment| code@>=
/* Propose the maximum number of rows per stack for refining the
   container at dimension |l|. |tdims| and |nr| are the dimensions and
   the number of rows of the output tensor. On return |avmem_mb| holds
   the available memory and |tmpmem_mb| the estimated temporary memory
   of all threads, both in megabytes. If the estimate comes out as
   zero, the value 10 is imposed and a journal record is written. The
   result is saturated at |INT_MAX|. */
int FaaDiBruno::estimRefinment(const TensorDimens& tdims, int nr, int l,
							   int& avmem_mb, int& tmpmem_mb)
{
	int nthreads = THREAD_GROUP::max_parallel_threads;
	long int per_size1 = tdims.calcUnfoldMaxOffset();
	long int per_size2 = (long int)pow((double)tdims.getNVS().getMax(), l);
	/* weight of |per_size1| in the average; zero means that only
	   |per_size2| is used */
	double lambda = 0.0;
	/* temporary memory per thread, in bytes */
	long int per_size = sizeof(double)*nr
		*(long int)(lambda*per_size1+(1-lambda)*per_size2);
	long int mem = SystemResources::availableMemory();
	int max = 0;
	/* number of slice columns that fit into what is left per thread */
	double num_cols = ((double)(mem-magic_mult*nthreads*per_size))
		/nthreads/sizeof(double)/nr;
	if (num_cols > 0) {
		double maxd = pow(num_cols, ((double)1)/l);
		/* with a small |l| and a lot of free memory the root can exceed
		   the range of |int|; converting such a value is undefined, so
		   saturate instead */
		if (maxd >= (double)INT_MAX)
			max = INT_MAX;
		else
			max = (int)floor(maxd);
	}
	if (max == 0) {
		/* nothing fits; impose a fixed limit so that the computation
		   can at least be attempted */
		max = 10;
		JournalRecord rec(journal);
		rec << "dim=" << l << " run out of memory, imposing max=" << max;
		if (nthreads > 1)
			rec << " (decrease number of threads)";
		rec << endrec;
	}
	avmem_mb = mem/1024/1024;
	tmpmem_mb = (nthreads*per_size)/1024/1024;
	return max;
}


@ End of {\tt faa\_di\_bruno.cpp} file.
Putting dynare++ under main dynare SVN repository git-svn-id: https://www.dynare.org/svn/dynare/trunk@2905 ac1d8469-bf42-47a9-8791-bf33cf982152 2009-09-08 15:55:19 +02:00			`@q $Id: faa_di_bruno.cweb 744 2006-05-09 13:16:07Z kamenik $ @>`
			`@q Copyright 2005, Ondra Kamenik @>`

			`@ Start of {\tt faa\_di\_bruno.cpp} file.`

			`@s FoldedFineContainer int`
			`@s UnfoldedFineContainer int`

			`@c`
			`#include "faa_di_bruno.h"`
			`#include "fine_container.h"`

Dynare++ and k_order_perturbation DLL: * support Microsoft Visual C++ 2008 compiler (necessary for 64-bit platforms) * use standard C++ headers for C Standard Library support git-svn-id: https://www.dynare.org/svn/dynare/trunk@3121 ac1d8469-bf42-47a9-8791-bf33cf982152 2009-11-03 15:16:18 +01:00			`#include <cmath>`
Putting dynare++ under main dynare SVN repository git-svn-id: https://www.dynare.org/svn/dynare/trunk@2905 ac1d8469-bf42-47a9-8791-bf33cf982152 2009-09-08 15:55:19 +02:00
			`double FaaDiBruno::magic_mult = 1.5;`
			`@<\|FaaDiBruno::calculate\| folded sparse code@>;`
			`@<\|FaaDiBruno::calculate\| folded dense code@>;`
			`@<\|FaaDiBruno::calculate\| unfolded sparse code@>;`
			`@<\|FaaDiBruno::calculate\| unfolded dense code@>;`
			`@<\|FaaDiBruno::estimRefinment\| code@>;`

			`@ We take an opportunity to refine the stack container to avoid`
			`allocation of more memory than available.`

			`@<\|FaaDiBruno::calculate\| folded sparse code@>=`
			`void FaaDiBruno::calculate(const StackContainer<FGSTensor>& cont,`
			`const TensorContainer<FSSparseTensor>& f,`
			`FGSTensor& out)`
			`{`
			`out.zeros();`
			`for (int l = 1; l <= out.dimen(); l++) {`
			`int mem_mb, p_size_mb;`
			`int max = estimRefinment(out.getDims(), out.nrows(), l, mem_mb, p_size_mb);`
			`FoldedFineContainer fine_cont(cont, max);`
			`fine_cont.multAndAdd(l, f, out);`
			`JournalRecord recc(journal);`
			`recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max`
			`<< " stacks=" << cont.numStacks() << "->" << fine_cont.numStacks() << endrec;`
			`}`
			`}`

			`@ Here we just simply evaluate \|multAndAdd\| for the dense`
			`container. There is no opportunity for tuning.`

			`@<\|FaaDiBruno::calculate\| folded dense code@>=`
			`void FaaDiBruno::calculate(const FoldedStackContainer& cont, const FGSContainer& g,`
			`FGSTensor& out)`
			`{`
			`out.zeros();`
			`for (int l = 1; l <= out.dimen(); l++) {`
			`long int mem = SystemResources::availableMemory();`
			`cont.multAndAdd(l, g, out);`
			`JournalRecord rec(journal);`
			`int mem_mb = mem/1024/1024;`
			`rec << "dim=" << l << " avmem=" << mem_mb << endrec;`
			`}`
			`}`

			`@ This is the same as \|@<\|FaaDiBruno::calculate\| folded sparse`
			`code@>\|. The only difference is that we construct unfolded fine`
			`container.`

			`@<\|FaaDiBruno::calculate\| unfolded sparse code@>=`
			`void FaaDiBruno::calculate(const StackContainer<UGSTensor>& cont,`
			`const TensorContainer<FSSparseTensor>& f,`
			`UGSTensor& out)`
			`{`
			`out.zeros();`
			`for (int l = 1; l <= out.dimen(); l++) {`
			`int mem_mb, p_size_mb;`
			`int max = estimRefinment(out.getDims(), out.nrows(), l, mem_mb, p_size_mb);`
			`UnfoldedFineContainer fine_cont(cont, max);`
			`fine_cont.multAndAdd(l, f, out);`
			`JournalRecord recc(journal);`
			`recc << "dim=" << l << " avmem=" << mem_mb << " tmpmem=" << p_size_mb << " max=" << max`
			`<< " stacks=" << cont.numStacks() << "->" << fine_cont.numStacks() << endrec;`
			`}`
			`}`

			`@ Again, no tuning opportunity here.`
			`@<\|FaaDiBruno::calculate\| unfolded dense code@>=`
			`void FaaDiBruno::calculate(const UnfoldedStackContainer& cont, const UGSContainer& g,`
			`UGSTensor& out)`
			`{`
			`out.zeros();`
			`for (int l = 1; l <= out.dimen(); l++) {`
			`long int mem = SystemResources::availableMemory();`
			`cont.multAndAdd(l, g, out);`
			`JournalRecord rec(journal);`
			`int mem_mb = mem/1024/1024;`
			`rec << "dim=" << l << " avmem=" << mem_mb << endrec;`
			`}`
			`}`

			`@ This function returns a number of maximum rows used for refinement of`
			`the stacked container. We want to set the maximum so that the expected`
			`memory consumption for the number of paralel threads would be less`
			`than available memory. On the other hand we do not want to be too`
			`pesimistic since a very fine refinement can be very slow.`

			`Besides memory needed for a dense unfolded slice of a tensor from`
			`\|f\|, each thread needs \|magic_mult*per_size\| bytes of memory. In the`
			`worst case, \|magic_mult\| will be equal to 2, this means memory`
			`\|per_size\| for target temporary (permuted symmetry) tensor plus one`
			`copy for intermediate result. However, this shows to be too`
			`pesimistic, so we set \|magic_mult\| to 1.5. The memory for permuted`
			`symmetry temporary tensor \|per_size\| is estimated as a weigthed`
			`average of unfolded memory of the \|out\| tensor and unfolded memory of`
			`a symetric tensor with the largest coordinate size. Some experiments`
			`showed that the best combination of the two is to take 100\% if the`
			`latter, so we set \|lambda\| to zero.`

			`The \|max\| number of rows in the refined \|cont\| must be such that each`
			`slice fits to remaining memory. Number of columns of the slice are`
			`never greater $max^l$. (This is not true, since stacks corresponing to`
			`unit/zero matrices cannot be further refined). We get en equation:`

			`$$nthreads\cdot max^l\cdot 8\cdot r = mem -`
			`magic\_mult\cdot nthreads\cdot per\_size\cdot 8\cdot r,$$`
			`where \|mem\| is available memory in bytes, \|nthreads\| is a number of`
			`threads, $r$ is a number of rows, and $8$ is \|sizeof(double)\|.`

			`If the right hand side is less than zero, we set \|max\| to 10, just to`
			`let it do something.`

			`@<\|FaaDiBruno::estimRefinment\| code@>=`
			`int FaaDiBruno::estimRefinment(const TensorDimens& tdims, int nr, int l,`
			`int& avmem_mb, int& tmpmem_mb)`
			`{`
			`int nthreads = THREAD_GROUP::max_parallel_threads;`
			`long int per_size1 = tdims.calcUnfoldMaxOffset();`
			`long int per_size2 = (long int)pow((double)tdims.getNVS().getMax(), l);`
			`double lambda = 0.0;`
			`long int per_size = sizeof(double)*nr`
			`*(long int)(lambda*per_size1+(1-lambda)*per_size2);`
			`long int mem = SystemResources::availableMemory();`
			`int max = 0;`
			`double num_cols = ((double)(mem-magic_mult*nthreads*per_size))`
			`/nthreads/sizeof(double)/nr;`
			`if (num_cols > 0) {`
			`double maxd = pow(num_cols, ((double)1)/l);`
			`max = (int)floor(maxd);`
			`}`
			`if (max == 0) {`
			`max = 10;`
			`JournalRecord rec(journal);`
			`rec << "dim=" << l << " run out of memory, imposing max=" << max;`
			`if (nthreads > 1)`
			`rec << " (decrease number of threads)";`
			`rec << endrec;`
			`}`
			`avmem_mb = mem/1024/1024;`
			`tmpmem_mb = (nthreads*per_size)/1024/1024;`
			`return max;`
			`}`


			`@ End of {\tt faa\_di\_bruno.cpp} file.`