dynare/matlab/missing/corrcoef/corrcoef.m

function [R,sig,ci1,ci2,nan_sig] = corrcoef(X,Y,varargin)
% CORRCOEF calculates the correlation matrix from pairwise correlations.
%   The input data can contain missing values encoded with NaN.
%   Missing data (NaN's) are handled by pairwise deletion [15].
%   In order to avoid possible pitfalls, use case-wise deletion or
%   or check the correlation of NaN's with your data (see below).
%   A significance test for testing the Hypothesis
%   'correlation coefficient R is significantly different to zero'
%   is included.
%
% [...] = CORRCOEF(X);
%      calculates the (auto-)correlation matrix of X
% [...] = CORRCOEF(X,Y);
%      calculates the crosscorrelation between X and Y
%
% [...] = CORRCOEF(..., Mode);
%       Mode='Pearson' or 'parametric' [default]
%               gives the correlation coefficient
%               also known as the 'product-moment coefficient of correlation'
%               or 'Pearson''s correlation' [1]
%       Mode='Spearman'     gives 'Spearman''s Rank Correlation Coefficient'
%               This replaces SPEARMAN.M
%       Mode='Rank'         gives a nonparametric Rank Correlation Coefficient
%               This is the "Spearman rank correlation with proper handling of ties"
%               This replaces RANKCORR.M
%
% [...] = CORRCOEF(..., param1, value1, param2, value2, ... );
%       param       value
%       'Mode'          type of correlation
%               'Pearson','parametric'
%               'Spearman'
%               'rank'
%       'rows'          how do deal with missing values encoded as NaN's.
%               'complete': remove all rows with at least one NaN
%               'pairwise': [default]
%       'alpha'         0.01    : significance level to compute confidence interval
%
% [R,p,ci1,ci2,nansig] = CORRCOEF(...);
%   R is the correlation matrix
%       R(i,j) is the correlation coefficient r between X(:,i) and Y(:,j)
%  p    gives the significance of R
%       It tests the null hypothesis that the product moment correlation coefficient is zero
%       using Student's t-test on the statistic t = r*sqrt(N-2)/sqrt(1-r^2)
%       where N is the number of samples (Statistics, M. Spiegel, Schaum series).
%  p > alpha: do not reject the Null hypothesis: 'R is zero'.
%  p < alpha: The alternative hypothesis 'R is larger than zero' is true with probability (1-alpha).
%  ci1  lower (1-alpha) confidence interval
%  ci2  upper (1-alpha) confidence interval
%       If no alpha is provided, the default alpha is 0.01. This can be changed with function flag_implicit_significance.
%  nan_sig  p-value whether H0: 'NaN''s are not correlated' could be correct
%       if nan_sig < alpha, H1 ('NaNs are correlated') is very likely.
%
% The result is only valid if the occurence of NaN's is uncorrelated. In
% order to avoid this pitfall, the correlation of NaN's should be checked
% or case-wise deletion should be applied.
%   Case-Wise deletion can be implemented
%    ix = ~any(isnan([X,Y]),2);
%    [...] = CORRCOEF(X(ix,:),Y(ix,:),...);
%
%  Correlation (non-random distribution) of NaN's can be checked with
%       [nan_R,nan_sig]=corrcoef(X,isnan(X))
%   or  [nan_R,nan_sig]=corrcoef([X,Y],isnan([X,Y]))
%   or  [R,p,ci1,ci2] = CORRCOEF(...);
%
% Further recommandation related to the correlation coefficient:
% + LOOK AT THE SCATTERPLOTS to make sure that the relationship is linear
% + Correlation is not causation because
%       it is not clear which parameter is 'cause' and which is 'effect' and
%       the observed correlation between two variables might be due to the action of other, unobserved variables.
%
% see also: SUMSKIPNAN, COVM, COV, COR, SPEARMAN, RANKCORR, RANKS,
%       PARTCORRCOEF, flag_implicit_significance
%
% REFERENCES:
% on the correlation coefficient
% [ 1] http://mathworld.wolfram.com/CorrelationCoefficient.html
% [ 2] http://www.geography.btinternet.co.uk/spearman.htm
% [ 3] Hogg, R. V. and Craig, A. T. Introduction to Mathematical Statistics, 5th ed.  New York: Macmillan, pp. 338 and 400, 1995.
% [ 4] Lehmann, E. L. and D'Abrera, H. J. M. Nonparametrics: Statistical Methods Based on Ranks, rev. ed. Englewood Cliffs, NJ: Prentice-Hall, pp. 292, 300, and 323, 1998.
% [ 5] Press, W. H.; Flannery, B. P.; Teukolsky, S. A.; and Vetterling, W. T. Numerical Recipes in FORTRAN: The Art of Scientific Computing, 2nd ed. Cambridge, England: Cambridge University Press, pp. 634-637, 1992
% [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
% on the significance test of the correlation coefficient
% [11] http://www.met.rdg.ac.uk/cag/STATS/corr.html
% [12] http://www.janda.org/c10/Lectures/topic06/L24-significanceR.htm
% [13] http://faculty.vassar.edu/lowry/ch4apx.html
% [14] http://davidmlane.com/hyperstat/B134689.html
% [15] http://www.statsoft.com/textbook/stbasic.html%Correlations
% others
% [20] http://www.tufts.edu/~gdallal/corr.htm
% [21] Fisher transformation http://en.wikipedia.org/wiki/Fisher_transformation

%       $Id: corrcoef.m 9387 2011-12-15 10:42:14Z schloegl $
%       Copyright (C) 2000-2004,2008,2009,2011 by Alois Schloegl <alois.schloegl@gmail.com>
%       Copyright (C) 2014-2017 Dynare Team
%       This function is part of the NaN-toolbox
%       http://pub.ist.ac.at/~schloegl/matlab/NaN/

%    This program is free software: you can redistribute it and/or modify
%    it under the terms of the GNU General Public License as published by
%    the Free Software Foundation, either version 3 of the License, or
%    (at your option) any later version.
%
%    This program is distributed in the hope that it will be useful,
%    but WITHOUT ANY WARRANTY; without even the implied warranty of
%    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%    GNU General Public License for more details.
%
%    You should have received a copy of the GNU General Public License
%    along with this program.  If not, see <http://www.gnu.org/licenses/>.

% Features:
% + handles missing values (encoded as NaN's)
%       + pairwise deletion of missing data
%       + checks independence of missing values (NaNs)
% + parametric and non-parametric (rank) correlation
%       + Pearson's correlation
%       + Spearman's rank correlation
%       + Rank correlation (non-parametric, Spearman rank correlation with proper handling of ties)
% + is fast, using an efficient algorithm O(n.log(n)) for calculating the ranks
% + significance test for null-hypthesis: r=0
% + confidence interval included
% - rank correlation works for cell arrays, too (no check for missing values).
% + compatible with Octave and Matlab

global FLAG_NANS_OCCURED;

NARG = nargout; % needed because nargout is not reentrant in Octave, and corrcoef is recursive
mode = [];

if nargin==1
    Y = [];
    Mode='Pearson';
elseif nargin==0
    fprintf(2,'Error CORRCOEF: Missing argument(s)\n');
elseif nargin>1
    if ischar(Y)
        varg = [Y,varargin];
        Y=[];
    else
        varg = varargin;
    end

    if length(varg)<1
        Mode = 'Pearson';
    elseif length(varg)==1
        Mode = varg{1};
    else
        for k = 2:2:length(varg)
            mode = setfield(mode,lower(varg{k-1}),varg{k});
        end
        if isfield(mode,'mode')
            Mode = mode.mode;
        end
    end
end
if isempty(Mode), Mode='pearson'; end
Mode=[Mode,'        '];


FLAG_WARNING = warning;         % save warning status
warning('off');

[r1,c1]=size(X);
if ~isempty(Y)
    [r2,c2]=size(Y);
    if r1~=r2
        fprintf(2,'Error CORRCOEF: X and Y must have the same number of observations (rows).\n');
        return
    end
    NN = real(~isnan(X)')*real(~isnan(Y));
else
    [r2,c2]=size(X);
    NN = real(~isnan(X)')*real(~isnan(X));
end

%%%%% generate combinations using indices for pairwise calculation of the correlation
YESNAN = any(isnan(X(:))) | any(isnan(Y(:)));
if YESNAN
    FLAG_NANS_OCCURED=(1==1);
    if isfield(mode,'rows')
        if strcmp(mode.rows,'complete')
            ix = ~any([X,Y],2);
            X = X(ix,:);
            if ~isempty(Y)
                Y = Y(ix,:);
            end
            YESNAN = 0;
            NN = size(X,1);
        elseif strcmp(mode.rows,'all')
            fprintf(1,'Warning: data contains NaNs, rows=pairwise is used.');
            %%NN(NN < size(X,1)) = NaN;
        elseif strcmp(mode.rows,'pairwise')
            %%% default
        end
    end
end
if isempty(Y)
    IX = ones(c1)-diag(ones(c1,1));
    [jx, jy ] = find(IX);
    [jxo,jyo] = find(IX);
    R = eye(c1);
else
    IX = sparse([],[],[],c1+c2,c1+c2,c1*c2);
    IX(1:c1,c1+(1:c2)) = 1;
    [jx,jy] = find(IX);

    IX = ones(c1,c2);
    [jxo,jyo] = find(IX);
    R = zeros(c1,c2);
end

if strcmp(lower(Mode(1:7)),'pearson')
    % see http://mathworld.wolfram.com/CorrelationCoefficient.html
    if ~YESNAN
        [S,N,SSQ] = sumskipnan(X,1);
        if ~isempty(Y)
            [S2,N2,SSQ2] = sumskipnan(Y,1);
            CC = X'*Y;
            M1 = S./N;
            M2 = S2./N2;
            cc = CC./NN - M1'*M2;
            R  = cc./sqrt((SSQ./N-M1.*M1)'*(SSQ2./N2-M2.*M2));
        else
            CC = X'*X;
            M  = S./N;
            cc = CC./NN - M'*M;
            v  = SSQ./N - M.*M; %max(N-1,0);
            R  = cc./sqrt(v'*v);
        end
    else
        if ~isempty(Y)
            X  = [X,Y];
        end
        for k = 1:length(jx)
            %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
            ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
            [s,n,s2] = sumskipnan(X(ik,[jx(k),jy(k)]),1);
            v  = (s2-s.*s./n)./n;
            cc = X(ik,jx(k))'*X(ik,jy(k));
            cc = cc/n(1) - prod(s./n);
            %r(k) = cc./sqrt(prod(v));
            R(jxo(k),jyo(k)) = cc./sqrt(prod(v));
        end
    end

elseif strcmp(lower(Mode(1:4)),'rank')
    % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
    if ~YESNAN
        if isempty(Y)
            R = corrcoef(ranks(X));
        else
            R = corrcoef(ranks(X),ranks(Y));
        end
    else
        if ~isempty(Y)
            X = [X,Y];
        end
        for k = 1:length(jx)
            %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
            ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
            il = ranks(X(ik,[jx(k),jy(k)]));
            R(jxo(k),jyo(k)) = corrcoef(il(:,1),il(:,2));
        end
        X = ranks(X);
    end

elseif strcmp(lower(Mode(1:8)),'spearman')
    % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
    if ~isempty(Y)
        X = [X,Y];
    end

    n = repmat(nan,c1,c2);

    if ~YESNAN
        iy = ranks(X);  %  calculates ranks;

        for k = 1:length(jx)
            [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((iy(:,jx(k)) - iy(:,jy(k))).^2);       % NN is the number of non-missing values
        end
    else
        for k = 1:length(jx)
            %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
            ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
            il = ranks(X(ik,[jx(k),jy(k)]));
            % NN is the number of non-missing values
            [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((il(:,1) - il(:,2)).^2);
        end
        X = ranks(X);
    end
    R = 1 - 6 * R ./ (n.*(n.*n-1));

elseif strcmp(lower(Mode(1:7)),'partial')
    fprintf(2,'Error CORRCOEF: use PARTCORRCOEF \n',Mode);

    return

elseif strcmp(lower(Mode(1:7)),'kendall')
    fprintf(2,'Error CORRCOEF: mode ''%s'' not implemented yet.\n',Mode);

    return
else
    fprintf(2,'Error CORRCOEF: unknown mode ''%s''\n',Mode);
end

if (NARG<2)
    warning(FLAG_WARNING);  % restore warning status
    return
end


% CONFIDENCE INTERVAL
if isfield(mode,'alpha')
    alpha = mode.alpha;
elseif exist('flag_implicit_significance','file')
    alpha = flag_implicit_significance;
else
    alpha = 0.01;
end
% fprintf(1,'CORRCOEF: confidence interval is based on alpha=%f\n',alpha);


% SIGNIFICANCE TEST
R(isnan(R))=0;
tmp = 1 - R.*R;
tmp(tmp<0) = 0;         % prevent tmp<0 i.e. imag(t)~=0
t   = R.*sqrt(max(NN-2,0)./tmp);

if exist('t_cdf','file')
    sig = t_cdf(t,NN-2);
elseif exist('tcdf','file')>1
    sig = tcdf(t,NN-2);
else
    fprintf('CORRCOEF: significance test not completed because of missing TCDF-function\n')
    sig = repmat(nan,size(R));
end
sig  = 2 * min(sig,1 - sig);


if NARG<3
    warning(FLAG_WARNING);  % restore warning status
    return
end


tmp = R;
%tmp(ix1 | ix2) = nan;          % avoid division-by-zero warning
z   = log((1+tmp)./(1-tmp))/2;  % Fisher transformation [21]
                                %sz = 1./sqrt(NN-3);            % standard error of z
sz  = sqrt(2)*erfinv(1-alpha)./sqrt(NN-3);      % confidence interval for alpha of z

ci1 = tanh(z-sz);
ci2 = tanh(z+sz);

%ci1(isnan(ci1))=R(isnan(ci1)); % in case of isnan(ci), the interval limits are exactly the R value
%ci2(isnan(ci2))=R(isnan(ci2));

if (NARG<5) || ~YESNAN
    nan_sig = repmat(NaN,size(R));
    warning(FLAG_WARNING);  % restore warning status
    return
end

%%%%% ----- check independence of NaNs (missing values) -----
[nan_R, nan_sig] = corrcoef(X,double(isnan(X)));

% remove diagonal elements, because these have not any meaning %
nan_sig(isnan(nan_R)) = nan;
% remove diagonal elements, because these have not any meaning %
nan_R(isnan(nan_R)) = 0;

if 0, any(nan_sig(:) < alpha)
    tmp = nan_sig(:);                       % Hack to skip NaN's in MIN(X)
    min_sig = min(tmp(~isnan(tmp)));    % Necessary, because Octave returns NaN rather than min(X) for min(NaN,X)
    fprintf(1,'CORRCOFF Warning: Missing Values (i.e. NaNs) are not independent of data (p-value=%f)\n', min_sig);
    fprintf(1,'   Its recommended to remove all samples (i.e. rows) with any missing value (NaN).\n');
    fprintf(1,'   The null-hypotheses (NaNs are uncorrelated) is rejected for the following parameter pair(s).\n');
    [ix,iy] = find(nan_sig < alpha);
    disp([ix,iy])
end

%%%%% ----- end of independence check ------

warning(FLAG_WARNING);  % restore warning status