dynare/matlab/utilities/dataset/makedataset.m

287 lines
11 KiB
Matlab

function [dataset_, dataset_info, newdatainterface] = makedataset(options_, initialconditions, gsa_flag)
%[dataset_, dataset_info, newdatainterface] = makedataset(options_, initialconditions, gsa_flag)
% Initialize a dataset as a dseries object.
%
% INPUTS
% ======
%
% options_            [struct]    Structure of options built by Dynare's preprocessor.
% initialconditions   [double]    number of lags for VAR and DSGE_VAR (optional, defaults to 0)
% gsa_flag            [integer]   1: GSA, 0: other (optional, defaults to 0)
%
% OUTPUTS
% =======
%
% dataset_            [dseries]   The dataset.
% dataset_info        [struct]    Various informations about the dataset (descriptive statistics and missing observations).
% newdatainterface    [integer]   1 if the data are given through options_.dataset.file or options_.dataset.series
%                                 (data command), 0 if they are given through options_.datafile or if no data
%                                 are provided and gsa_flag is set.
%
% REMARKS
% =======
%
% An error is thrown if no data source is specified (unless gsa_flag is set, in which case an empty
% dseries object is returned), if the data source is ambiguous or cannot be found, or if the requested
% sample (first_obs, last_obs, nobs, initial conditions) is inconsistent with the dataset.
%
% EXAMPLE
% =======
%
% [dataset_, dataset_info] = makedataset(options_) ;
%
%
% See also dynare_estimation_init

% Copyright © 2014-2023 Dynare Team
%
% This file is part of Dynare.
%
% Dynare is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% Dynare is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with Dynare. If not, see <https://www.gnu.org/licenses/>.

if nargin<3
    gsa_flag = 0;
end

if nargin<2 || isempty(initialconditions)
    % If the sample is to be used for the estimation of a VAR or DSGE-VAR model
    % the second argument must be a strictly positive integer (the number of lags).
    initialconditions = 0;
end

% No data source at all: only allowed when called with gsa_flag set.
if isempty(options_.datafile) && isempty(options_.dataset.file) && isempty(options_.dataset.series)
    if gsa_flag
        dataset_ = dseries();
        dataset_info = struct('missing', struct('state', 0, 'aindex', [], 'vindex', [], 'number_of_observations', NaN, 'no_more_missing_observations', NaN), ...
                              'descriptive', struct('mean', [], 'covariance', [], 'correlation', [], 'autocovariance', []));
        newdatainterface=0;
        return
    else
        error('makedataset: datafile option is missing!')
    end
end

% Identify the data source and the interface used to declare it.
if isempty(options_.datafile) && ~isempty(options_.dataset.file)
    datafile = options_.dataset.file;
    newdatainterface = 1;
elseif isempty(options_.datafile) && ~isempty(options_.dataset.series)
    % The data are in a dseries object living in the base workspace.
    try
        dseriesobjectforuserdataset = evalin('base', options_.dataset.series);
    catch
        error('makedataset: %s is unknown!', options_.dataset.series)
    end
    if ~isdseries(dseriesobjectforuserdataset)
        error('makedataset: %s has to be a dseries object!', options_.dataset.series)
    end
    datafile = [];
    newdatainterface = 1;
elseif ~isempty(options_.datafile) && isempty(options_.dataset.file)
    datafile = options_.datafile;
    newdatainterface = 0;
elseif ~isempty(options_.datafile) && ~isempty(options_.dataset.file)
    error('makedataset: You cannot simultaneously use the data command and the datafile option (in the estimation command)!')
else
    error('makedataset: You have to specify the datafile!')
end

% Check extension.
if ~isempty(datafile)
    allowed_extensions = {'m','mat','csv','xls','xlsx'};
    datafile_extension = get_file_extension(datafile);
    if isempty(datafile_extension)
        % No extension was given: look in the designated folder for a unique
        % file with the same base name and one of the allowed extensions.
        available_extensions = {};
        [datafilepath, datafilename] = fileparts(datafile);
        if isempty(datafilepath)
            datafilepath = '.';
        end
        dircontent = dir(datafilepath);
        filenames = {dircontent.name};
        for i=1:length(allowed_extensions)
            if any(strcmp([datafilename '.' allowed_extensions{i}], filenames))
                available_extensions{end+1} = allowed_extensions{i};
            end
        end
        if isempty(available_extensions)
            error('makedataset: I can''t find a datafile (with allowed extension m, mat, csv, xls or xlsx)!')
        end
        if length(available_extensions)>1
            % sprintf is needed here to expand the newline (error with a single argument is not
            % guaranteed to process escape sequences).
            error(sprintf(['makedataset: You did not specify an extension for the datafile, but more than one candidate ' ...
                           'is available in the designated folder!\nPlease, add an extension to the datafile ' ...
                           '(m, mat, csv, xls or xlsx are permitted extensions).']));
        end
        datafile = [datafile '.' available_extensions{1}];
    end
end

% Load the data in a dseries object.
if ~isempty(datafile)
    is_m_file = length(datafile)>2 && strcmp(datafile(end-1:end),'.m');
    is_mat_file = length(datafile)>4 && strcmp(datafile(end-3:end),'.mat');
    if newdatainterface==0 && is_m_file
        % Load an m file with the old interface.
        dataset_ = load_m_file_data_legacy(datafile, options_.varobs);
    elseif newdatainterface==0 && is_mat_file
        % Load a mat file with the old interface.
        dataset_ = load_mat_file_data_legacy(datafile, options_.varobs);
    else
        dataset_ = dseries(datafile);
    end
else
    dataset_ = dseriesobjectforuserdataset;
    clear('dseriesobjectforuserdataset');
end

if size(unique(dataset_.name),1)~=size(dataset_.name,1)
    error('makedataset: the data set must not contain two variables with the same name and must not contain empty/non-named columns.')
end

% Select a subset of the variables.
dataset_ = dataset_{options_.varobs{:}};

% Apply log function if needed.
if options_.loglinear && ~options_.logdata
    dataset_ = dataset_.log();
end

% Test if an initial period (different from its default value) is explicitely defined in the datafile.
if isequal(dataset_.init, dates(1,1))
    dataset_default_initial_period = 1;
else
    dataset_default_initial_period = 0;
end

% Test if an initial period (different from its default value) is explicitely defined in the mod file with the set_time command.
if ~isdates(options_.initial_period) && isnan(options_.initial_period)
    set_time_default_initial_period = 1;
else
    set_time_default_initial_period = 0;
end

if ~set_time_default_initial_period && dataset_default_initial_period
    % Overwrite the initial period in dataset (it was set to default).
    % Note that the updates of freq and time members are auto-magically
    % done by dseries::subsasgn overloaded method.
    dataset_.init = options_.initial_period;
end

if set_time_default_initial_period && ~dataset_default_initial_period
    % Overwrite the global initial period defined by set_time (it was set to default).
    % Note that options_ is a local copy: this change is not returned to the caller.
    options_.initial_period = dataset_.init;
end

if ~set_time_default_initial_period && ~dataset_default_initial_period
    % Both are explicitly defined: the period set with set_time must not
    % precede the initial period of the dataset.
    if options_.initial_period<dataset_.init
        error('makedataset: The date as defined by the set_time command is not consistent with the initial period in the database!')
    end
end

% Set firstobs, lastobs and nobs
if newdatainterface
    if isempty(options_.dataset.firstobs)
        % first_obs option was not used in the data command.
        firstobs = dataset_.init;
    else
        firstobs = options_.dataset.firstobs;
    end
    if isnan(options_.dataset.nobs)
        % nobs option was not used in the data command.
        if isempty(options_.dataset.lastobs)
            % last_obs option was not used in the data command.
            nobs = dataset_.nobs;
            lastobs = dataset_.dates(end);
        else
            lastobs = options_.dataset.lastobs;
            nobs = lastobs-firstobs+1;
        end
    else
        nobs = options_.dataset.nobs;
        if isempty(options_.dataset.lastobs)
            % last_obs option was not used in the data command.
            lastobs = firstobs+(nobs-1);
        else
            % last_obs and nobs were used in the data command. Check that they are consistent (with firstobs).
            % lastobs has to be set here: it is used in the check below and when selecting the subsample.
            lastobs = options_.dataset.lastobs;
            if ~isequal(lastobs,firstobs+(nobs-1))
                error('makedataset: Options last_obs (%s), first_obs (%s) and nobs (%s) are not consistent!', char(lastobs), char(firstobs), num2str(nobs));
            end
        end
    end
else
    % Old interface: first_obs is an index in the dataset, not a date.
    if isnan(options_.first_obs)
        firstobs = dataset_.init;
    else
        firstobs = dataset_.dates(options_.first_obs);
    end
    if isnan(options_.nobs)
        lastobs = dataset_.dates(end);
        nobs = lastobs-firstobs+1;
    else
        nobs = options_.nobs;
        lastobs = firstobs+(nobs-1);
    end
end

% Add initial conditions if needed
FIRSTOBS = firstobs-initialconditions;

% Check that firstobs belongs to dataset_.dates
if firstobs<dataset_.init
    error('makedataset: first_obs (%s) cannot be less than the first date in the dataset (%s)!', char(firstobs), char(dataset_.init))
end

% Check that FIRSTOBS belongs to dataset_.dates
if initialconditions && FIRSTOBS<dataset_.init
    error('makedataset: first_obs (%s) - %i cannot be less than the first date in the dataset (%s)!\nReduce the number of lags in the VAR model or increase the value of first_obs\nto at least first_obs=%i.', char(firstobs), initialconditions, char(dataset_.init), initialconditions+1);
end

% Check that lastobs belongs to dataset_.dates...
if newdatainterface
    if lastobs>dataset_.dates(end)
        error('makedataset: last_obs (%s) cannot be greater than the last date in the dataset (%s)!', char(lastobs), char(dataset_.dates(end)))
    end
else
    % ... or check that nobs is smaller than the number of observations in dataset_.
    if nobs>dataset_.nobs
        error('makedataset: nobs (%s) cannot be greater than the number of observations in the dataset (%s)!', num2str(nobs), num2str(dataset_.nobs))
    end
end

% Select a subsample.
dataset_ = dataset_(FIRSTOBS:lastobs);

% Initialize dataset_info structure.
dataset_info = struct('missing', struct('state', NaN, 'aindex', [], 'vindex', [], 'number_of_observations', NaN, 'no_more_missing_observations', NaN), ...
                      'descriptive', struct('mean', [], 'covariance', [], 'correlation', [], 'autocovariance', []));

% Fill dataset_info.missing if some observations are missing
dataset_info.missing.state = isanynan(dataset_.data);
if dataset_info.missing.state
    [dataset_info.missing.aindex, dataset_info.missing.number_of_observations, dataset_info.missing.no_more_missing_observations, dataset_info.missing.vindex] = ...
        describe_missing_data(dataset_.data);
else
    % No missing observation: one cell per period, each holding the indices of all the variables.
    dataset_info.missing.aindex = num2cell(transpose(repmat(1:dataset_.vobs,dataset_.nobs,1)),1);
    dataset_info.missing.no_more_missing_observations = 1;
end

% Compute the empirical mean of the observed variables.
dataset_info.descriptive.mean = nanmean(dataset_.data,1);

% Compute the empirical covariance matrix of the observed variables.
dataset_info.descriptive.covariance = nancovariance(dataset_.data);

% Compute the empirical correlation matrix of the observed variables.
normalization_matrix = diag(1./sqrt(diag(dataset_info.descriptive.covariance)));
dataset_info.descriptive.correlation = normalization_matrix*dataset_info.descriptive.covariance*normalization_matrix;

% Compute the empirical autocovariances (options_.ar is passed to nanautocovariance).
dataset_info.descriptive.autocovariance = nanautocovariance(dataset_.data, options_.ar);

% Save raw data.
dataset_info.rawdata = dataset_.data;

% Prefilter the data if needed (remove the mean).
if isequal(options_.prefilter, 1)
    dataset_ = dataset_.detrend();
end