2014-11-24 16:06:05 +01:00
function [R,sig,ci1,ci2,nan_sig] = corrcoef ( X,Y,varargin)
% CORRCOEF calculates the correlation matrix from pairwise correlations.
% The input data can contain missing values encoded with NaN.
% Missing data (NaN's) are handled by pairwise deletion [15].
% In order to avoid possible pitfalls, use case-wise deletion or
% or check the correlation of NaN's with your data (see below).
% A significance test for testing the Hypothesis
% 'correlation coefficient R is significantly different to zero'
% is included.
%
% [...] = CORRCOEF(X);
% calculates the (auto-)correlation matrix of X
% [...] = CORRCOEF(X,Y);
% calculates the crosscorrelation between X and Y
%
% [...] = CORRCOEF(..., Mode);
% Mode='Pearson' or 'parametric' [default]
% gives the correlation coefficient
% also known as the 'product-moment coefficient of correlation'
% or 'Pearson''s correlation' [1]
% Mode='Spearman' gives 'Spearman''s Rank Correlation Coefficient'
% This replaces SPEARMAN.M
% Mode='Rank' gives a nonparametric Rank Correlation Coefficient
% This is the "Spearman rank correlation with proper handling of ties"
% This replaces RANKCORR.M
%
% [...] = CORRCOEF(..., param1, value1, param2, value2, ... );
2017-05-16 15:10:20 +02:00
% param value
% 'Mode' type of correlation
% 'Pearson','parametric'
% 'Spearman'
% 'rank'
% 'rows' how do deal with missing values encoded as NaN's.
% 'complete': remove all rows with at least one NaN
% 'pairwise': [default]
% 'alpha' 0.01 : significance level to compute confidence interval
2014-11-24 16:06:05 +01:00
%
% [R,p,ci1,ci2,nansig] = CORRCOEF(...);
% R is the correlation matrix
2017-05-16 15:10:20 +02:00
% R(i,j) is the correlation coefficient r between X(:,i) and Y(:,j)
2014-11-24 16:06:05 +01:00
% p gives the significance of R
2017-05-16 15:10:20 +02:00
% It tests the null hypothesis that the product moment correlation coefficient is zero
2014-11-24 16:06:05 +01:00
% using Student's t-test on the statistic t = r*sqrt(N-2)/sqrt(1-r^2)
% where N is the number of samples (Statistics, M. Spiegel, Schaum series).
% p > alpha: do not reject the Null hypothesis: 'R is zero'.
% p < alpha: The alternative hypothesis 'R is larger than zero' is true with probability (1-alpha).
2017-05-16 15:10:20 +02:00
% ci1 lower (1-alpha) confidence interval
% ci2 upper (1-alpha) confidence interval
% If no alpha is provided, the default alpha is 0.01. This can be changed with function flag_implicit_significance.
2014-11-24 16:06:05 +01:00
% nan_sig p-value whether H0: 'NaN''s are not correlated' could be correct
% if nan_sig < alpha, H1 ('NaNs are correlated') is very likely.
%
% The result is only valid if the occurence of NaN's is uncorrelated. In
% order to avoid this pitfall, the correlation of NaN's should be checked
% or case-wise deletion should be applied.
% Case-Wise deletion can be implemented
% ix = ~any(isnan([X,Y]),2);
% [...] = CORRCOEF(X(ix,:),Y(ix,:),...);
%
% Correlation (non-random distribution) of NaN's can be checked with
% [nan_R,nan_sig]=corrcoef(X,isnan(X))
% or [nan_R,nan_sig]=corrcoef([X,Y],isnan([X,Y]))
% or [R,p,ci1,ci2] = CORRCOEF(...);
%
% Further recommandation related to the correlation coefficient:
% + LOOK AT THE SCATTERPLOTS to make sure that the relationship is linear
% + Correlation is not causation because
2017-05-16 15:10:20 +02:00
% it is not clear which parameter is 'cause' and which is 'effect' and
2014-11-24 16:06:05 +01:00
% the observed correlation between two variables might be due to the action of other, unobserved variables.
%
% see also: SUMSKIPNAN, COVM, COV, COR, SPEARMAN, RANKCORR, RANKS,
% PARTCORRCOEF, flag_implicit_significance
%
% REFERENCES:
% on the correlation coefficient
% [ 1] http://mathworld.wolfram.com/CorrelationCoefficient.html
% [ 2] http://www.geography.btinternet.co.uk/spearman.htm
% [ 3] Hogg, R. V. and Craig, A. T. Introduction to Mathematical Statistics, 5th ed. New York: Macmillan, pp. 338 and 400, 1995.
% [ 4] Lehmann, E. L. and D'Abrera, H. J. M. Nonparametrics: Statistical Methods Based on Ranks, rev. ed. Englewood Cliffs, NJ: Prentice-Hall, pp. 292, 300, and 323, 1998.
% [ 5] Press, W. H.; Flannery, B. P.; Teukolsky, S. A.; and Vetterling, W. T. Numerical Recipes in FORTRAN: The Art of Scientific Computing, 2nd ed. Cambridge, England: Cambridge University Press, pp. 634-637, 1992
% [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
% on the significance test of the correlation coefficient
% [11] http://www.met.rdg.ac.uk/cag/STATS/corr.html
% [12] http://www.janda.org/c10/Lectures/topic06/L24-significanceR.htm
% [13] http://faculty.vassar.edu/lowry/ch4apx.html
% [14] http://davidmlane.com/hyperstat/B134689.html
% [15] http://www.statsoft.com/textbook/stbasic.html%Correlations
% others
% [20] http://www.tufts.edu/~gdallal/corr.htm
% [21] Fisher transformation http://en.wikipedia.org/wiki/Fisher_transformation
% $Id: corrcoef.m 9387 2011-12-15 10:42:14Z schloegl $
% Copyright (C) 2000-2004,2008,2009,2011 by Alois Schloegl <alois.schloegl@gmail.com>
% Copyright (C) 2014 Dynare Team
% This function is part of the NaN-toolbox
% http://pub.ist.ac.at/~schloegl/matlab/NaN/
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU General Public License as published by
% the Free Software Foundation, either version 3 of the License, or
% (at your option) any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU General Public License for more details.
%
% You should have received a copy of the GNU General Public License
% along with this program. If not, see <http://www.gnu.org/licenses/>.
% Features:
% + handles missing values (encoded as NaN's)
% + pairwise deletion of missing data
% + checks independence of missing values (NaNs)
% + parametric and non-parametric (rank) correlation
% + Pearson's correlation
% + Spearman's rank correlation
% + Rank correlation (non-parametric, Spearman rank correlation with proper handling of ties)
% + is fast, using an efficient algorithm O(n.log(n)) for calculating the ranks
% + significance test for null-hypthesis: r=0
% + confidence interval included
% - rank correlation works for cell arrays, too (no check for missing values).
% + compatible with Octave and Matlab
global FLAG_NANS_OCCURED ;
2017-05-16 15:10:20 +02:00
NARG = nargout ; % needed because nargout is not reentrant in Octave, and corrcoef is recursive
2014-11-24 16:06:05 +01:00
mode = [ ] ;
if nargin == 1
2017-05-16 15:10:20 +02:00
Y = [ ] ;
Mode = ' Pearson' ;
2014-11-24 16:06:05 +01:00
elseif nargin == 0
2017-05-16 15:10:20 +02:00
fprintf ( 2 , ' Error CORRCOEF: Missing argument(s)\n' ) ;
2014-11-24 16:06:05 +01:00
elseif nargin > 1
2017-05-16 15:10:20 +02:00
if ischar ( Y )
varg = [ Y , varargin ] ;
Y = [ ] ;
else
varg = varargin ;
end
2014-11-24 16:06:05 +01:00
2017-05-16 15:10:20 +02:00
if length ( varg ) < 1
Mode = ' Pearson' ;
elseif length ( varg ) == 1
Mode = varg { 1 } ;
else
for k = 2 : 2 : length ( varg )
mode = setfield ( mode , lower ( varg { k - 1 } ) , varg { k } ) ;
end
if isfield ( mode , ' mode' )
Mode = mode . mode ;
2017-05-16 12:42:01 +02:00
end
2017-05-16 15:10:20 +02:00
end
2017-05-16 12:42:01 +02:00
end
if isempty ( Mode ) , Mode = ' pearson' ; end
2014-11-24 16:06:05 +01:00
Mode = [ Mode , ' ' ] ;
2017-05-16 15:10:20 +02:00
FLAG_WARNING = warning ; % save warning status
2014-11-24 16:06:05 +01:00
warning ( ' off' ) ;
[ r1 , c1 ] = size ( X ) ;
if ~ isempty ( Y )
2017-05-16 15:10:20 +02:00
[ r2 , c2 ] = size ( Y ) ;
if r1 ~= r2
fprintf ( 2 , ' Error CORRCOEF: X and Y must have the same number of observations (rows).\n' ) ;
return
end
NN = real ( ~ isnan ( X ) ' ) * real ( ~ isnan ( Y ) ) ;
2014-11-24 16:06:05 +01:00
else
2017-05-16 15:10:20 +02:00
[ r2 , c2 ] = size ( X ) ;
NN = real ( ~ isnan ( X ) ' ) * real ( ~ isnan ( X ) ) ;
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
%%%%% generate combinations using indices for pairwise calculation of the correlation
YESNAN = any ( isnan ( X ( : ) ) ) | any ( isnan ( Y ( : ) ) ) ;
2017-05-16 12:42:01 +02:00
if YESNAN
2014-11-24 16:06:05 +01:00
FLAG_NANS_OCCURED = ( 1 == 1 ) ;
if isfield ( mode , ' rows' )
if strcmp ( mode . rows , ' complete' )
ix = ~ any ( [ X , Y ] , 2 ) ;
X = X ( ix , : ) ;
if ~ isempty ( Y )
Y = Y ( ix , : ) ;
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
YESNAN = 0 ;
NN = size ( X , 1 ) ;
elseif strcmp ( mode . rows , ' all' )
fprintf ( 1 , ' Warning: data contains NaNs, rows=pairwise is used.' ) ;
%%NN(NN < size(X,1)) = NaN;
elseif strcmp ( mode . rows , ' pairwise' )
%%% default
2017-05-16 12:42:01 +02:00
end
end
end
if isempty ( Y )
2017-05-16 15:10:20 +02:00
IX = ones ( c1 ) - diag ( ones ( c1 , 1 ) ) ;
[ jx , jy ] = find ( IX ) ;
[ jxo , jyo ] = find ( IX ) ;
2014-11-24 16:06:05 +01:00
R = eye ( c1 ) ;
else
2017-05-16 15:10:20 +02:00
IX = sparse ( [ ] , [ ] , [ ] , c1 + c2 , c1 + c2 , c1 * c2 ) ;
IX ( 1 : c1 , c1 + ( 1 : c2 ) ) = 1 ;
[ jx , jy ] = find ( IX ) ;
2014-11-24 16:06:05 +01:00
2017-05-16 15:10:20 +02:00
IX = ones ( c1 , c2 ) ;
[ jxo , jyo ] = find ( IX ) ;
2014-11-24 16:06:05 +01:00
R = zeros ( c1 , c2 ) ;
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
2017-05-16 12:42:01 +02:00
if strcmp ( lower ( Mode ( 1 : 7 ) ) , ' pearson' )
2017-05-16 15:10:20 +02:00
% see http://mathworld.wolfram.com/CorrelationCoefficient.html
2017-05-16 12:42:01 +02:00
if ~ YESNAN
2017-05-16 15:10:20 +02:00
[ S , N , SSQ ] = sumskipnan ( X , 1 ) ;
if ~ isempty ( Y )
[ S2 , N2 , SSQ2 ] = sumskipnan ( Y , 1 ) ;
CC = X ' * Y ;
M1 = S ./ N ;
M2 = S2 ./ N2 ;
cc = CC ./ NN - M1 ' * M2 ;
R = cc ./ sqrt ( ( SSQ ./ N - M1 .* M1 ) ' * ( SSQ2 ./ N2 - M2 .* M2 ) ) ;
2014-11-24 16:06:05 +01:00
else
2017-05-16 15:10:20 +02:00
CC = X ' * X ;
M = S ./ N ;
cc = CC ./ NN - M ' * M ;
v = SSQ ./ N - M .* M ; %max(N-1,0);
R = cc ./ sqrt ( v ' * v ) ;
end
else
if ~ isempty ( Y )
X = [ X , Y ] ;
end
for k = 1 : length ( jx )
%ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
ik = ~ isnan ( X ( : , jx ( k ) ) ) & ~ isnan ( X ( : , jy ( k ) ) ) ;
[ s , n , s2 ] = sumskipnan ( X ( ik , [ jx ( k ) , jy ( k ) ] ) , 1 ) ;
v = ( s2 - s .* s ./ n ) ./ n ;
cc = X ( ik , jx ( k ) ) ' * X ( ik , jy ( k ) ) ;
cc = cc / n ( 1 ) - prod ( s ./ n ) ;
%r(k) = cc./sqrt(prod(v));
R ( jxo ( k ) , jyo ( k ) ) = cc ./ sqrt ( prod ( v ) ) ;
end
2014-11-24 16:06:05 +01:00
end
2017-05-16 12:42:01 +02:00
elseif strcmp ( lower ( Mode ( 1 : 4 ) ) , ' rank' )
2017-05-16 15:10:20 +02:00
% see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
2017-05-16 12:42:01 +02:00
if ~ YESNAN
2017-05-16 15:10:20 +02:00
if isempty ( Y )
R = corrcoef ( ranks ( X ) ) ;
2014-11-24 16:06:05 +01:00
else
2017-05-16 15:10:20 +02:00
R = corrcoef ( ranks ( X ) , ranks ( Y ) ) ;
end
else
if ~ isempty ( Y )
X = [ X , Y ] ;
end
for k = 1 : length ( jx )
%ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
ik = ~ isnan ( X ( : , jx ( k ) ) ) & ~ isnan ( X ( : , jy ( k ) ) ) ;
il = ranks ( X ( ik , [ jx ( k ) , jy ( k ) ] ) ) ;
R ( jxo ( k ) , jyo ( k ) ) = corrcoef ( il ( : , 1 ) , il ( : , 2 ) ) ;
end
2014-11-24 16:06:05 +01:00
X = ranks ( X ) ;
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
2017-05-16 12:42:01 +02:00
elseif strcmp ( lower ( Mode ( 1 : 8 ) ) , ' spearman' )
2017-05-16 15:10:20 +02:00
% see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
if ~ isempty ( Y )
X = [ X , Y ] ;
end
2014-11-24 16:06:05 +01:00
2017-05-16 15:10:20 +02:00
n = repmat ( nan , c1 , c2 ) ;
2014-11-24 16:06:05 +01:00
2017-05-16 15:10:20 +02:00
if ~ YESNAN
iy = ranks ( X ) ; % calculates ranks;
2014-11-24 16:06:05 +01:00
2017-05-16 15:10:20 +02:00
for k = 1 : length ( jx )
[ R ( jxo ( k ) , jyo ( k ) ) , n ( jxo ( k ) , jyo ( k ) ) ] = sumskipnan ( ( iy ( : , jx ( k ) ) - iy ( : , jy ( k ) ) ) .^ 2 ) ; % NN is the number of non-missing values
2017-05-16 12:42:01 +02:00
end
2017-05-16 15:10:20 +02:00
else
for k = 1 : length ( jx )
%ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
ik = ~ isnan ( X ( : , jx ( k ) ) ) & ~ isnan ( X ( : , jy ( k ) ) ) ;
il = ranks ( X ( ik , [ jx ( k ) , jy ( k ) ] ) ) ;
% NN is the number of non-missing values
[ R ( jxo ( k ) , jyo ( k ) ) , n ( jxo ( k ) , jyo ( k ) ) ] = sumskipnan ( ( il ( : , 1 ) - il ( : , 2 ) ) .^ 2 ) ;
end
X = ranks ( X ) ;
end
R = 1 - 6 * R ./ ( n .* ( n .* n - 1 ) ) ;
2014-11-24 16:06:05 +01:00
2017-05-16 12:42:01 +02:00
elseif strcmp ( lower ( Mode ( 1 : 7 ) ) , ' partial' )
2017-05-16 15:10:20 +02:00
fprintf ( 2 , ' Error CORRCOEF: use PARTCORRCOEF \n' , Mode ) ;
2014-11-24 16:06:05 +01:00
2017-05-16 15:10:20 +02:00
return
2014-11-24 16:06:05 +01:00
2017-05-16 12:42:01 +02:00
elseif strcmp ( lower ( Mode ( 1 : 7 ) ) , ' kendall' )
2017-05-16 15:10:20 +02:00
fprintf ( 2 , ' Error CORRCOEF: mode ' ' %s' ' not implemented yet.\n' , Mode ) ;
2014-11-24 16:06:05 +01:00
2017-05-16 15:10:20 +02:00
return
2014-11-24 16:06:05 +01:00
else
2017-05-16 15:10:20 +02:00
fprintf ( 2 , ' Error CORRCOEF: unknown mode ' ' %s' ' \n' , Mode ) ;
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
2017-05-16 12:42:01 +02:00
if ( NARG < 2 )
2017-05-16 15:10:20 +02:00
warning ( FLAG_WARNING ) ; % restore warning status
return
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
% CONFIDENCE INTERVAL
if isfield ( mode , ' alpha' )
alpha = mode . alpha ;
2017-05-16 12:42:01 +02:00
elseif exist ( ' flag_implicit_significance' , ' file' )
2017-05-16 15:10:20 +02:00
alpha = flag_implicit_significance ;
2014-11-24 16:06:05 +01:00
else
alpha = 0.01 ;
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
% fprintf(1,'CORRCOEF: confidence interval is based on alpha=%f\n',alpha);
% SIGNIFICANCE TEST
R ( isnan ( R ) ) = 0 ;
tmp = 1 - R .* R ;
2017-05-16 15:10:20 +02:00
tmp ( tmp < 0 ) = 0 ; % prevent tmp<0 i.e. imag(t)~=0
2014-11-24 16:06:05 +01:00
t = R .* sqrt ( max ( NN - 2 , 0 ) ./ tmp ) ;
2017-05-16 12:42:01 +02:00
if exist ( ' t_cdf' , ' file' )
2017-05-16 15:10:20 +02:00
sig = t_cdf ( t , NN - 2 ) ;
2017-05-16 12:42:01 +02:00
elseif exist ( ' tcdf' , ' file' ) > 1
2017-05-16 15:10:20 +02:00
sig = tcdf ( t , NN - 2 ) ;
2014-11-24 16:06:05 +01:00
else
2017-05-16 15:10:20 +02:00
fprintf ( ' CORRCOEF: significance test not completed because of missing TCDF-function\n' )
sig = repmat ( nan , size ( R ) ) ;
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
sig = 2 * min ( sig , 1 - sig ) ;
2017-05-16 12:42:01 +02:00
if NARG < 3
2014-11-24 16:06:05 +01:00
warning ( FLAG_WARNING ) ; % restore warning status
2017-05-16 15:10:20 +02:00
return
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
tmp = R ;
2017-05-16 15:10:20 +02:00
%tmp(ix1 | ix2) = nan; % avoid division-by-zero warning
2014-11-24 16:06:05 +01:00
z = log ( ( 1 + tmp ) ./ ( 1 - tmp ) ) / 2 ; % Fisher transformation [21]
2017-05-16 15:10:20 +02:00
%sz = 1./sqrt(NN-3); % standard error of z
sz = sqrt ( 2 ) * erfinv ( 1 - alpha ) ./ sqrt ( NN - 3 ) ; % confidence interval for alpha of z
2014-11-24 16:06:05 +01:00
ci1 = tanh ( z - sz ) ;
ci2 = tanh ( z + sz ) ;
2017-05-16 15:10:20 +02:00
%ci1(isnan(ci1))=R(isnan(ci1)); % in case of isnan(ci), the interval limits are exactly the R value
2014-11-24 16:06:05 +01:00
%ci2(isnan(ci2))=R(isnan(ci2));
2017-05-16 12:42:01 +02:00
if ( NARG < 5 ) || ~ YESNAN
2014-11-24 16:06:05 +01:00
nan_sig = repmat ( NaN , size ( R ) ) ;
warning ( FLAG_WARNING ) ; % restore warning status
2017-05-16 12:42:01 +02:00
return
end
2014-11-24 16:06:05 +01:00
%%%%% ----- check independence of NaNs (missing values) -----
[ nan_R , nan_sig ] = corrcoef ( X , double ( isnan ( X ) ) ) ;
% remove diagonal elements, because these have not any meaning %
nan_sig ( isnan ( nan_R ) ) = nan ;
% remove diagonal elements, because these have not any meaning %
nan_R ( isnan ( nan_R ) ) = 0 ;
2017-05-16 12:42:01 +02:00
if 0 , any ( nan_sig ( : ) < alpha )
2017-05-16 15:10:20 +02:00
tmp = nan_sig ( : ) ; % Hack to skip NaN's in MIN(X)
min_sig = min ( tmp ( ~ isnan ( tmp ) ) ) ; % Necessary, because Octave returns NaN rather than min(X) for min(NaN,X)
fprintf ( 1 , ' CORRCOFF Warning: Missing Values (i.e. NaNs) are not independent of data (p-value=%f)\n' , min_sig ) ;
fprintf ( 1 , ' Its recommended to remove all samples (i.e. rows) with any missing value (NaN).\n' ) ;
fprintf ( 1 , ' The null-hypotheses (NaNs are uncorrelated) is rejected for the following parameter pair(s).\n' ) ;
[ ix , iy ] = find ( nan_sig < alpha ) ;
disp ( [ ix , iy ] )
2017-05-16 12:42:01 +02:00
end
2014-11-24 16:06:05 +01:00
%%%%% ----- end of independence check ------
warning ( FLAG_WARNING ) ; % restore warning status