function [dataset_, dataset_info, newdatainterface] = makedataset(options_, initialconditions, gsa_flag)
%[dataset_, dataset_info, newdatainterface] = makedataset(options_, initialconditions, gsa_flag)
% Initialize a dataset as a dseries object.
%
% INPUTS
% ======
%
%     options_            [struct]    Structure of options built by Dynare's preprocessor.
%     initialconditions   [double]    Number of lags for VAR and DSGE_VAR (defaults to 0 if
%                                     missing or empty). The selected sample is extended
%                                     backward by this number of periods.
%     gsa_flag            [integer]   1: GSA, 0: other (defaults to 0). With GSA a missing
%                                     datafile is not an error: an empty dataset is returned.
%
% OUTPUTS
% =======
%
%     dataset_            [dseries]   The dataset.
%     dataset_info        [struct]    Various information about the dataset (descriptive
%                                     statistics and missing observations).
%     newdatainterface    [integer]   1 if the data come from options_.dataset (file or
%                                     series), 0 if they come from options_.datafile.
%
% EXAMPLE
% =======
%
%     [dataset_, dataset_info] = makedataset(options_) ;
%
%
% See also dynare_estimation_init

% Copyright © 2014-2023 Dynare Team
%
% This file is part of Dynare.
%
% Dynare is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% Dynare is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with Dynare.  If not, see <https://www.gnu.org/licenses/>.

if nargin<3
    gsa_flag = 0;
end

if nargin<2 || isempty(initialconditions)
    % If the sample is to be used for the estimation of a VAR or DSGE-VAR model
    % the second argument must be a strictly positive integer (the number of lags).
    initialconditions = 0;
end

% No data source at all: only acceptable for GSA, which gets an empty dataset.
if isempty(options_.datafile) && isempty(options_.dataset.file) && isempty(options_.dataset.series)
    if gsa_flag
        dataset_ = dseries();
        dataset_info = struct('missing', struct('state', 0, 'aindex', [], 'vindex', [], 'number_of_observations', NaN, 'no_more_missing_observations', NaN), ...
                              'descriptive', struct('mean', [], 'covariance', [], 'correlation', [], 'autocovariance', []));
        newdatainterface = 0;
        return
    else
        error('makedataset: datafile option is missing!')
    end
end

% Identify the data source and the interface (old: options_.datafile, new: options_.dataset).
if isempty(options_.datafile) && ~isempty(options_.dataset.file)
    datafile = options_.dataset.file;
    newdatainterface = 1;
elseif isempty(options_.datafile) && ~isempty(options_.dataset.series)
    % The user provided the name of a dseries object living in the base workspace.
    try
        dseriesobjectforuserdataset = evalin('base', options_.dataset.series);
    catch
        error('makedataset: %s is unknown!', options_.dataset.series)
    end
    if ~isdseries(dseriesobjectforuserdataset)
        error('makedataset: %s has to be a dseries object!', options_.dataset.series)
    end
    datafile = [];
    newdatainterface = 1;
elseif ~isempty(options_.datafile) && isempty(options_.dataset.file)
    datafile = options_.datafile;
    newdatainterface = 0;
elseif ~isempty(options_.datafile) && ~isempty(options_.dataset.file)
    error('makedataset: You cannot simultaneously use the data command and the datafile option (in the estimation command)!')
else
    error('makedataset: You have to specify the datafile!')
end

% Check extension. If none is given, look in the designated folder for a unique
% file with the same base name and one of the allowed extensions.
if ~isempty(datafile)
    allowed_extensions = {'m','mat','csv','xls','xlsx'};
    datafile_extension = get_file_extension(datafile);
    if isempty(datafile_extension)
        available_extensions = {};
        [datafilepath, datafilename] = fileparts(datafile);
        if isempty(datafilepath)
            datafilepath = '.';
        end
        dircontent = dir(datafilepath);
        filenames = {dircontent.name};
        for i = 1:length(allowed_extensions)
            % Exact (case sensitive) match on the file name.
            if any(strcmp([datafilename '.' allowed_extensions{i}], filenames))
                available_extensions{end+1} = allowed_extensions{i};
            end
        end
        if isempty(available_extensions)
            error('makedataset: I can''t find a datafile (with allowed extension m, mat, csv, xls or xlsx)!')
        end
        if length(available_extensions)>1
            % sprintf is needed here so that \n is interpreted (error is called with a single argument).
            error(sprintf(['makedataset: You did not specify an extension for the datafile, but more than one candidate ' ...
                           'is available in the designated folder!\nPlease, add an extension to the datafile ' ...
                           '(m, mat, csv, xls or xlsx are permitted extensions).']));
        end
        datafile = [datafile '.' available_extensions{1}];
    end
end

% Load the data in a dseries object.
if ~isempty(datafile)
    is_m_file = length(datafile)>2 && strcmp(datafile(end-1:end), '.m');
    is_mat_file = length(datafile)>4 && strcmp(datafile(end-3:end), '.mat');
    if newdatainterface==0 && is_m_file
        % Load an m file with the old interface.
        dataset_ = load_m_file_data_legacy(datafile, options_.varobs);
    elseif newdatainterface==0 && is_mat_file
        % Load a mat file with the old interface.
        dataset_ = load_mat_file_data_legacy(datafile, options_.varobs);
    else
        dataset_ = dseries(datafile);
    end
else
    dataset_ = dseriesobjectforuserdataset;
    clear('dseriesobjectforuserdataset');
end

if size(unique(dataset_.name), 1)~=size(dataset_.name, 1)
    error('makedataset: the data set must not contain two variables with the same name and must not contain empty/non-named columns.')
end

% Select a subset of the variables.
dataset_ = dataset_{options_.varobs{:}};

% Apply log function if needed.
if options_.loglinear && ~options_.logdata
    dataset_ = dataset_.log();
end

% Test if an initial period (different from its default value) is explicitly defined in the datafile.
if isequal(dataset_.init, dates(1,1))
    dataset_default_initial_period = 1;
else
    dataset_default_initial_period = 0;
end

% Test if an initial period (different from its default value) is explicitly defined in the mod file with the set_time command.
if ~isdates(options_.initial_period) && isnan(options_.initial_period)
    set_time_default_initial_period = 1;
else
    set_time_default_initial_period = 0;
end

if ~set_time_default_initial_period && dataset_default_initial_period
    % Overwrite the initial period in dataset (it was set to default).
    % Note that the updates of freq and time members are auto-magically
    % done by dseries::subsasgn overloaded method.
    dataset_.init = options_.initial_period;
end

if set_time_default_initial_period && ~dataset_default_initial_period
    % Overwrite the global initial period defined by set_time (it was set to default).
    % Note that options_ is a local copy: this only affects the remainder of this function.
    options_.initial_period = dataset_.init;
end

if ~set_time_default_initial_period && ~dataset_default_initial_period
    % Both are explicitly defined: the date set by set_time must not be
    % anterior to the first period of the dataset.
    if options_.initial_period<dataset_.init
        error('makedataset: The date as defined by the set_time command is not consistent with the initial period in the database!')
    end
end

% Set firstobs, lastobs and nobs
if newdatainterface
    if isempty(options_.dataset.firstobs)
        % first_obs option was not used in the data command.
        firstobs = dataset_.init;
    else
        firstobs = options_.dataset.firstobs;
    end
    if isnan(options_.dataset.nobs)
        % nobs option was not used in the data command.
        if isempty(options_.dataset.lastobs)
            % last_obs option was not used in the data command.
            nobs = dataset_.nobs;
            lastobs = dataset_.dates(end);
        else
            lastobs = options_.dataset.lastobs;
            nobs = lastobs-firstobs+1;
        end
    else
        nobs = options_.dataset.nobs;
        if isempty(options_.dataset.lastobs)
            % last_obs option was not used in the data command.
            lastobs = firstobs+(nobs-1);
        else
            % last_obs and nobs were used in the data command. Check that they are consistent (with firstobs).
            % lastobs has to be set first: it is used in the test, in the
            % error message and later on when selecting the subsample.
            lastobs = options_.dataset.lastobs;
            if ~isequal(lastobs, firstobs+(nobs-1))
                error('makedataset: Options last_obs (%s), first_obs (%s) and nobs (%s) are not consistent!', char(lastobs), char(firstobs), num2str(nobs));
            end
        end
    end
else
    % Old interface: first_obs is an index in the dataset, not a date.
    if isnan(options_.first_obs)
        firstobs = dataset_.init;
    else
        firstobs = dataset_.dates(options_.first_obs);
    end
    if isnan(options_.nobs)
        lastobs = dataset_.dates(end);
        nobs = lastobs-firstobs+1;
    else
        nobs = options_.nobs;
        lastobs = firstobs+(nobs-1);
    end
end

% Add initial conditions if needed
FIRSTOBS = firstobs-initialconditions;

% Check that firstobs belongs to dataset_.dates
if firstobs<dataset_.init
    error('makedataset: first_obs (%s) cannot be less than the first date in the dataset (%s)!', char(firstobs), char(dataset_.init))
end

% Check that FIRSTOBS belongs to dataset_.dates
if initialconditions && FIRSTOBS<dataset_.init
    error('makedataset: first_obs (%s) - %i cannot be less than the first date in the dataset (%s)!\nReduce the number of lags in the VAR model or increase the value of first_obs\nto at least first_obs=%i.', char(firstobs), initialconditions, char(dataset_.init), initialconditions+1);
end

% Check that lastobs belongs to dataset_.dates...
if newdatainterface
    if lastobs>dataset_.dates(end)
        error('makedataset: last_obs (%s) cannot be greater than the last date in the dataset (%s)!', char(lastobs), char(dataset_.dates(end)))
    end
else
    % ... or check that nobs is smaller than the number of observations in dataset_.
    if nobs>dataset_.nobs
        error('makedataset: nobs (%s) cannot be greater than the number of observations in the dataset (%s)!', num2str(nobs), num2str(dataset_.nobs))
    end
end

% Select a subsample.
dataset_ = dataset_(FIRSTOBS:lastobs);

% Initialize dataset_info structure.
dataset_info = struct('missing', struct('state', NaN, 'aindex', [], 'vindex', [], 'number_of_observations', NaN, 'no_more_missing_observations', NaN), ...
                      'descriptive', struct('mean', [], 'covariance', [], 'correlation', [], 'autocovariance', []));

% Fill dataset_info.missing if some observations are missing
dataset_info.missing.state = isanynan(dataset_.data);
if dataset_info.missing.state
    [dataset_info.missing.aindex, dataset_info.missing.number_of_observations, dataset_info.missing.no_more_missing_observations, dataset_info.missing.vindex] = ...
        describe_missing_data(dataset_.data);
else
    % No missing observation: in each period all the variables (1:vobs) are observed.
    dataset_info.missing.aindex = num2cell(transpose(repmat(1:dataset_.vobs, dataset_.nobs, 1)), 1);
    dataset_info.missing.no_more_missing_observations = 1;
end

% Compute the empirical mean of the observed variables.
dataset_info.descriptive.mean = nanmean(dataset_.data, 1);

% Compute the empirical covariance matrix of the observed variables.
dataset_info.descriptive.covariance = nancovariance(dataset_.data);

% Compute the empirical correlation matrix of the observed variables.
normalization_matrix = diag(1./sqrt(diag(dataset_info.descriptive.covariance)));
dataset_info.descriptive.correlation = normalization_matrix*dataset_info.descriptive.covariance*normalization_matrix;

% Compute autocorrelation function.
dataset_info.descriptive.autocovariance = nanautocovariance(dataset_.data, options_.ar);

% Save raw data.
dataset_info.rawdata = dataset_.data;

% Prefilter the data if needed (remove the mean).
if isequal(options_.prefilter, 1)
    dataset_ = dataset_.detrend();
end