dynare/mex/sources/libslicot/MB02ND.f

890 lines
32 KiB
Fortran

SUBROUTINE MB02ND( M, N, L, RANK, THETA, C, LDC, X, LDX, Q, INUL,
$ TOL, RELTOL, IWORK, DWORK, LDWORK, BWORK,
$ IWARN, INFO )
C
C SLICOT RELEASE 5.0.
C
C Copyright (c) 2002-2009 NICONET e.V.
C
C This program is free software: you can redistribute it and/or
C modify it under the terms of the GNU General Public License as
C published by the Free Software Foundation, either version 2 of
C the License, or (at your option) any later version.
C
C This program is distributed in the hope that it will be useful,
C but WITHOUT ANY WARRANTY; without even the implied warranty of
C MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
C GNU General Public License for more details.
C
C You should have received a copy of the GNU General Public License
C along with this program. If not, see
C <http://www.gnu.org/licenses/>.
C
C PURPOSE
C
C To solve the Total Least Squares (TLS) problem using a Partial
C Singular Value Decomposition (PSVD) approach.
C The TLS problem assumes an overdetermined set of linear equations
C AX = B, where both the data matrix A as well as the observation
C matrix B are inaccurate. The routine also solves determined and
C underdetermined sets of equations by computing the minimum norm
C solution.
C It is assumed that all preprocessing measures (scaling, coordinate
C transformations, whitening, ... ) of the data have been performed
C in advance.
C
C ARGUMENTS
C
C Input/Output Parameters
C
C M (input) INTEGER
C The number of rows in the data matrix A and the
C observation matrix B. M >= 0.
C
C N (input) INTEGER
C The number of columns in the data matrix A. N >= 0.
C
C L (input) INTEGER
C The number of columns in the observation matrix B.
C L >= 0.
C
C RANK (input/output) INTEGER
C On entry, if RANK < 0, then the rank of the TLS
C approximation [A+DA|B+DB] (r say) is computed by the
C routine.
C Otherwise, RANK must specify the value of r.
C RANK <= min(M,N).
C On exit, if RANK < 0 on entry and INFO = 0, then RANK
C contains the computed rank of the TLS approximation
C [A+DA|B+DB].
C Otherwise, the user-supplied value of RANK may be
C changed by the routine on exit if the RANK-th and the
C (RANK+1)-th singular values of C = [A|B] are considered
C to be equal, or if the upper triangular matrix F (as
C defined in METHOD) is (numerically) singular.
C
C THETA (input/output) DOUBLE PRECISION
C On entry, if RANK < 0, then the rank of the TLS
C approximation [A+DA|B+DB] is computed using THETA as
C (min(M,N+L) - d), where d is the number of singular
C values of [A|B] <= THETA. THETA >= 0.0.
C Otherwise, THETA is an initial estimate (t say) for
C computing a lower bound on the RANK largest singular
C values of [A|B]. If THETA < 0.0 on entry however, then
C t is computed by the routine.
C On exit, if RANK >= 0 on entry, then THETA contains the
C computed bound such that precisely RANK singular values
C of C = [A|B] are greater than THETA + TOL.
C Otherwise, THETA is unchanged.
C
C C (input/output) DOUBLE PRECISION array, dimension (LDC,N+L)
C On entry, the leading M-by-(N+L) part of this array must
C contain the matrices A and B. Specifically, the first N
C columns must contain the data matrix A and the last L
C columns the observation matrix B (right-hand sides).
C On exit, if INFO = 0, the first N+L components of the
C columns of this array whose index i corresponds with
C INUL(i) = .TRUE., are the possibly transformed (N+L-RANK)
C base vectors of the right singular subspace corresponding
C to the singular values of C = [A|B] which are less than or
C equal to THETA. Specifically, if L = 0, or if RANK = 0 and
C IWARN <> 2, these vectors are indeed the base vectors
C above. Otherwise, these vectors form the matrix V2,
C transformed as described in Step 4 of the PTLS algorithm
C (see METHOD). The TLS solution is computed from these
C vectors. The other columns of array C contain no useful
C information.
C
C LDC INTEGER
C The leading dimension of array C. LDC >= max(1,M,N+L).
C
C X (output) DOUBLE PRECISION array, dimension (LDX,L)
C If INFO = 0, the leading N-by-L part of this array
C contains the solution X to the TLS problem specified by
C A and B.
C
C LDX INTEGER
C The leading dimension of array X. LDX >= max(1,N).
C
C Q (output) DOUBLE PRECISION array, dimension
C (max(1,2*min(M,N+L)-1))
C This array contains the partially diagonalized bidiagonal
C matrix J computed from C, at the moment that the desired
C singular subspace has been found. Specifically, the
C leading p = min(M,N+L) entries of Q contain the diagonal
C elements q(1),q(2),...,q(p) and the entries Q(p+1),Q(p+2),
C ...,Q(2*p-1) contain the superdiagonal elements e(1),e(2),
C ...,e(p-1) of J.
C
C INUL (output) LOGICAL array, dimension (N+L)
C The indices of the elements of this array with value
C .TRUE. indicate the columns in C containing the base
C vectors of the right singular subspace of C from which
C the TLS solution has been computed.
C
C Tolerances
C
C TOL DOUBLE PRECISION
C This parameter defines the multiplicity of singular values
C by considering all singular values within an interval of
C length TOL as coinciding. TOL is used in checking how many
C singular values are less than or equal to THETA. Also in
C computing an appropriate upper bound THETA by a bisection
C method, TOL is used as a stopping criterion defining the
C minimum (absolute) subinterval width. TOL is also taken
C as an absolute tolerance for negligible elements in the
C QR/QL iterations. If the user sets TOL to be less than or
C equal to 0, then the tolerance is taken as specified in
C SLICOT Library routine MB04YD document.
C
C RELTOL DOUBLE PRECISION
C This parameter specifies the minimum relative width of an
C interval. When an interval is narrower than TOL, or than
C RELTOL times the larger (in magnitude) endpoint, then it
C is considered to be sufficiently small and bisection has
C converged. If the user sets RELTOL to be less than
C BASE * EPS, where BASE is machine radix and EPS is machine
C precision (see LAPACK Library routine DLAMCH), then the
C tolerance is taken as BASE * EPS.
C
C Workspace
C
C IWORK INTEGER array, dimension (N+2*L)
C
C DWORK DOUBLE PRECISION array, dimension (LDWORK)
C On exit, if INFO = 0, DWORK(1) returns the optimal value
C of LDWORK, and DWORK(2) returns the reciprocal of the
C condition number of the matrix F.
C
C LDWORK INTEGER
C The length of the array DWORK.
C LDWORK = max(2, max(M,N+L) + 2*min(M,N+L),
C min(M,N+L) + LW + max(6*(N+L)-5,
C L*L+max(N+L,3*L)),
C where
C LW = (N+L)*(N+L-1)/2, if M >= N+L,
C LW = M*(N+L-(M-1)/2), if M < N+L.
C For optimum performance LDWORK should be larger.
C
C BWORK LOGICAL array, dimension (N+L)
C
C Warning Indicator
C
C IWARN INTEGER
C = 0: no warnings;
C = 1: if the rank of matrix C has been lowered because a
C singular value of multiplicity greater than 1 was
C found;
C = 2: if the rank of matrix C has been lowered because the
C upper triangular matrix F is (numerically) singular.
C
C Error Indicator
C
C INFO INTEGER
C = 0: successful exit;
C < 0: if INFO = -i, the i-th argument had an illegal
C value;
C = 1: if the maximum number of QR/QL iteration steps
C (30*MIN(M,N)) has been exceeded;
C = 2: if the computed rank of the TLS approximation
C [A+DA|B+DB] exceeds MIN(M,N). Try increasing the
C value of THETA or set the value of RANK to min(M,N).
C
C METHOD
C
C The method used is the Partial Total Least Squares (PTLS) approach
C proposed by Van Huffel and Vandewalle [5].
C
C Let C = [A|B] denote the matrix formed by adjoining the columns of
C B to the columns of A on the right.
C
C Total Least Squares (TLS) definition:
C -------------------------------------
C
C Given matrices A and B, find a matrix X satisfying
C
C (A + DA) X = B + DB,
C
C where A and DA are M-by-N matrices, B and DB are M-by-L matrices
C and X is an N-by-L matrix.
C The solution X must be such that the Frobenius norm of [DA|DB]
C is a minimum and each column of B + DB is in the range of
C A + DA. Whenever the solution is not unique, the routine singles
C out the minimum norm solution X.
C
C Let V denote the right singular subspace of C. Since the TLS
C solution can be computed from any orthogonal basis of the subspace
C of V corresponding to the smallest singular values of C, the
C Partial Singular Value Decomposition (PSVD) can be used instead of
C the classical SVD. The dimension of this subspace of V may be
C determined by the rank of C or by an upper bound for those
C smallest singular values.
C
C The PTLS algorithm proceeds as follows (see [2 - 5]):
C
C Step 1: Bidiagonalization phase
C -----------------------
C (a) If M is large enough than N + L, transform C into upper
C triangular form R by Householder transformations.
C (b) Transform C (or R) into upper bidiagonal form
C (p = min(M,N+L)):
C
C |q(1) e(1) 0 ... 0 |
C (0) | 0 q(2) e(2) . |
C J = | . . |
C | . e(p-1)|
C | 0 ... q(p) |
C
C if M >= N + L, or lower bidiagonal form:
C
C |q(1) 0 0 ... 0 0 |
C (0) |e(1) q(2) 0 . . |
C J = | . . . |
C | . q(p) . |
C | 0 ... e(p-1) q(p)|
C
C if M < N + L, using Householder transformations.
C In the second case, transform the matrix to the upper
C bidiagonal form by applying Givens rotations.
C (c) Initialize the right singular base matrix with the identity
C matrix.
C
C Step 2: Partial diagonalization phase
C -----------------------------
C If the upper bound THETA is not given, then compute THETA such
C that precisely p - RANK singular values (p=min(M,N+L)) of the
C bidiagonal matrix are less than or equal to THETA, using a
C bisection method [5]. Diagonalize the given bidiagonal matrix J
C partially, using either QL iterations (if the upper left diagonal
C element of the considered bidiagonal submatrix is smaller than the
C lower right diagonal element) or QR iterations, such that J is
C split into unreduced bidiagonal submatrices whose singular values
C are either all larger than THETA or are all less than or equal
C to THETA. Accumulate the Givens rotations in V.
C
C Step 3: Back transformation phase
C -------------------------
C Apply the Householder transformations of Step 1(b) onto the base
C vectors of V associated with the bidiagonal submatrices with all
C singular values less than or equal to THETA.
C
C Step 4: Computation of F and Y
C ----------------------
C Let V2 be the matrix of the columns of V corresponding to the
C (N + L - RANK) smallest singular values of C.
C Compute with Householder transformations the matrices F and Y
C such that:
C
C |VH Y|
C V2 x Q = | |
C |0 F|
C
C where Q is an orthogonal matrix, VH is an N-by-(N-RANK) matrix,
C Y is an N-by-L matrix and F is an L-by-L upper triangular matrix.
C If F is singular, then reduce the value of RANK by one and repeat
C Steps 2, 3 and 4.
C
C Step 5: Computation of the TLS solution
C -------------------------------
C If F is non-singular then the solution X is obtained by solving
C the following equations by forward elimination:
C
C X F = -Y.
C
C Notes:
C If RANK is lowered in Step 4, some additional base vectors must
C be computed in Step 2. The additional computations are kept to
C a minimum.
C If RANK is lowered in Step 4 but the multiplicity of the RANK-th
C singular value is larger than 1, then the value of RANK is further
C lowered with its multiplicity defined by the parameter TOL. This
C is done at the beginning of Step 2 by calling SLICOT Library
C routine MB03MD (from MB04YD), which estimates THETA using a
C bisection method. If F in Step 4 is singular, then the computed
C solution is infinite and hence does not satisfy the second TLS
C criterion (see TLS definition). For these cases, Golub and
C Van Loan [1] claim that the TLS problem has no solution. The
C properties of these so-called nongeneric problems are described
C in [6] and the TLS computations are generalized in order to solve
C them. As proven in [6], the proposed generalization satisfies the
C TLS criteria for any number L of observation vectors in B provided
C that, in addition, the solution | X| is constrained to be
C |-I|
C orthogonal to all vectors of the form |w| which belong to the
C |0|
C space generated by the columns of the submatrix |Y|.
C |F|
C
C REFERENCES
C
C [1] Golub, G.H. and Van Loan, C.F.
C An Analysis of the Total Least-Squares Problem.
C SIAM J. Numer. Anal., 17, pp. 883-893, 1980.
C
C [2] Van Huffel, S., Vandewalle, J. and Haegemans, A.
C An Efficient and Reliable Algorithm for Computing the
C Singular Subspace of a Matrix Associated with its Smallest
C Singular Values.
C J. Comput. and Appl. Math., 19, pp. 313-330, 1987.
C
C [3] Van Huffel, S.
C Analysis of the Total Least Squares Problem and its Use in
C Parameter Estimation.
C Doctoral dissertation, Dept. of Electr. Eng., Katholieke
C Universiteit Leuven, Belgium, June 1987.
C
C [4] Chan, T.F.
C An Improved Algorithm for Computing the Singular Value
C Decomposition.
C ACM TOMS, 8, pp. 72-83, 1982.
C
C [5] Van Huffel, S. and Vandewalle, J.
C The Partial Total Least Squares Algorithm.
C J. Comput. Appl. Math., 21, pp. 333-341, 1988.
C
C [6] Van Huffel, S. and Vandewalle, J.
C Analysis and Solution of the Nongeneric Total Least Squares
C Problem.
C SIAM J. Matr. Anal. and Appl., 9, pp. 360-372, 1988.
C
C NUMERICAL ASPECTS
C
C The computational efficiency of the PTLS algorithm compared with
C the classical TLS algorithm (see [2 - 5]) is obtained by making
C use of PSVD (see [1]) instead of performing the entire SVD.
C Depending on the gap between the RANK-th and the (RANK+1)-th
C singular values of C, the number (N + L - RANK) of base vectors to
C be computed with respect to the column dimension (N + L) of C and
C the desired accuracy RELTOL, the algorithm used by this routine is
C approximately twice as fast as the classical TLS algorithm at the
C expense of extra storage requirements, namely:
C (N + L) x (N + L - 1)/2 if M >= N + L or
C M x (N + L - (M - 1)/2) if M < N + L.
C This is because the Householder transformations performed on the
C rows of C in the bidiagonalization phase (see Step 1) must be kept
C until the end (Step 5).
C
C CONTRIBUTOR
C
C Release 3.0: V. Sima, Katholieke Univ. Leuven, Belgium, Apr. 1997.
C Supersedes Release 2.0 routine MB02BD by S. Van Huffel, Katholieke
C University, Leuven, Belgium.
C
C REVISIONS
C
C June 30, 1997, Oct. 19, 2003, Feb. 15, 2004.
C
C KEYWORDS
C
C Least-squares approximation, singular subspace, singular value
C decomposition, singular values, total least-squares.
C
C ******************************************************************
C
C .. Parameters ..
DOUBLE PRECISION ZERO, ONE, TWO
PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, TWO = 2.0D0 )
C .. Scalar Arguments ..
INTEGER INFO, IWARN, L, LDC, LDWORK, LDX, M, N, RANK
DOUBLE PRECISION RELTOL, THETA, TOL
C .. Array Arguments ..
LOGICAL BWORK(*), INUL(*)
INTEGER IWORK(*)
DOUBLE PRECISION C(LDC,*), DWORK(*), Q(*), X(LDX,*)
C .. Local Scalars ..
LOGICAL LFIRST, SUFWRK
INTEGER I, I1, IFAIL, IHOUSH, IJ, IOFF, ITAUP, ITAUQ,
$ IWARM, J, J1, JF, JV, JWORK, K, KF, KJ, LDF, LW,
$ MC, MJ, MNL, N1, NJ, NL, P, WRKOPT
DOUBLE PRECISION CS, EPS, FIRST, FNORM, HH, INPROD, RCOND, SN,
$ TEMP
C .. Local Arrays ..
DOUBLE PRECISION DUMMY(1)
C .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
DOUBLE PRECISION DLAMCH, DLANGE, DLANTR
EXTERNAL DLAMCH, DLANGE, DLANTR, ILAENV, LSAME
C .. External Subroutines ..
EXTERNAL DCOPY, DGEBRD, DGEQRF, DGERQF, DLARF, DLARFG,
$ DLARTG, DLASET, DORMBR, DORMRQ, DTRCON, DTRSM,
$ MB04YD, XERBLA
C .. Intrinsic Functions ..
INTRINSIC INT, MAX, MIN
C .. Executable Statements ..
C
IWARN = 0
INFO = 0
NL = N + L
K = MAX( M, NL )
P = MIN( M, NL )
IF ( M.GE.NL ) THEN
LW = ( NL*( NL - 1 ) )/2
ELSE
LW = M*NL - ( M*( M - 1 ) )/2
END IF
JV = P + LW + MAX( 6*NL - 5, L*L + MAX( NL, 3*L ) )
C
C Test the input scalar arguments.
C
IF( M.LT.0 ) THEN
INFO = -1
ELSE IF( N.LT.0 ) THEN
INFO = -2
ELSE IF( L.LT.0 ) THEN
INFO = -3
ELSE IF( RANK.GT.MIN( M, N ) ) THEN
INFO = -4
ELSE IF( ( RANK.LT.0 ) .AND. ( THETA.LT.ZERO ) ) THEN
INFO = -5
ELSE IF( LDC.LT.MAX( 1, K ) ) THEN
INFO = -7
ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
INFO = -9
ELSE IF( LDWORK.LT.MAX( 2, K + 2*P, JV ) ) THEN
INFO = -16
END IF
C
IF ( INFO.NE.0 ) THEN
C
C Error return.
C
CALL XERBLA( 'MB02ND', -INFO )
RETURN
END IF
C
C Quick return if possible.
C
IF ( MIN( M, NL ).EQ.0 ) THEN
IF ( M.EQ.0 ) THEN
CALL DLASET( 'Full', NL, NL, ZERO, ONE, C, LDC )
CALL DLASET( 'Full', N, L, ZERO, ZERO, X, LDX )
C
DO 10 I = 1, NL
INUL(I) = .TRUE.
10 CONTINUE
C
END IF
IF ( RANK.GE.0 )
$ THETA = ZERO
RANK = 0
DWORK(1) = TWO
DWORK(2) = ONE
RETURN
END IF
C
WRKOPT = 2
N1 = N + 1
C
EPS = DLAMCH( 'Precision' )
LFIRST = .TRUE.
C
C Initializations.
C
DO 20 I = 1, P
INUL(I) = .FALSE.
BWORK(I) = .FALSE.
20 CONTINUE
C
DO 40 I = P + 1, NL
INUL(I) = .TRUE.
BWORK(I) = .FALSE.
40 CONTINUE
C
C Subroutine MB02ND solves a set of linear equations by a Total
C Least Squares Approximation, based on the Partial SVD.
C
C Step 1: Bidiagonalization phase
C -----------------------
C 1.a): If M is large enough than N+L, transform C into upper
C triangular form R by Householder transformations.
C
C (Note: Comments in the code beginning "Workspace:" describe the
C minimal amount of real workspace needed at that point in the
C code, as well as the preferred amount for good performance.
C NB refers to the optimal block size for the immediately
C following subroutine, as returned by ILAENV.)
C
IF ( M.GE.MAX( NL,
$ ILAENV( 6, 'DGESVD', 'N' // 'N', M, NL, 0, 0 ) ) )
$ THEN
C
C Workspace: need 2*(N+L),
C prefer N+L + (N+L)*NB.
C
ITAUQ = 1
JWORK = ITAUQ + NL
CALL DGEQRF( M, NL, C, LDC, DWORK(ITAUQ), DWORK(JWORK),
$ LDWORK-JWORK+1, IFAIL )
WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
IF ( NL.GT.1 )
$ CALL DLASET( 'Lower', NL-1, NL-1, ZERO, ZERO, C(2,1), LDC )
MNL = NL
ELSE
MNL = M
END IF
C
C 1.b): Transform C (or R) into bidiagonal form Q using Householder
C transformations.
C Workspace: need 2*min(M,N+L) + max(M,N+L),
C prefer 2*min(M,N+L) + (M+N+L)*NB.
C
ITAUP = 1
ITAUQ = ITAUP + P
JWORK = ITAUQ + P
CALL DGEBRD( MNL, NL, C, LDC, Q, Q(P+1), DWORK(ITAUQ),
$ DWORK(ITAUP), DWORK(JWORK), LDWORK-JWORK+1, IFAIL )
WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
C If the matrix is lower bidiagonal, rotate to be upper bidiagonal
C by applying Givens rotations on the left.
C
IF ( M.LT.NL ) THEN
IOFF = 0
C
DO 60 I = 1, P - 1
CALL DLARTG( Q(I), Q(P+I), CS, SN, TEMP )
Q(I) = TEMP
Q(P+I) = SN*Q(I+1)
Q(I+1) = CS*Q(I+1)
60 CONTINUE
C
ELSE
IOFF = 1
END IF
C
C Store the Householder transformations performed onto the rows of C
C in the extra storage locations DWORK(IHOUSH).
C Workspace: need LDW = min(M,N+L) + (N+L)*(N+L-1)/2, if M >= N+L,
C LDW = min(M,N+L) + M*(N+L-(M-1)/2), if M < N+L;
C prefer LDW = min(M,N+L) + (N+L)**2, if M >= N+L,
C LDW = min(M,N+L) + M*(N+L), if M < N+L.
C
IHOUSH = ITAUQ
MC = NL - IOFF
KF = IHOUSH + P*NL
SUFWRK = LDWORK.GE.( KF + MAX( 6*(N+L)-5,
$ NL**2 + MAX( NL, 3*L ) - 1 ) )
IF ( SUFWRK ) THEN
C
C Enough workspace for a fast algorithm.
C
CALL DLACPY( 'Upper', P, NL, C, LDC, DWORK(IHOUSH), P )
KJ = KF
WRKOPT = MAX( WRKOPT, KF - 1 )
ELSE
C
C Not enough workspace for a fast algorithm.
C
KJ = IHOUSH
C
DO 80 NJ = 1, MIN( P, MC )
J = MC - NJ + 1
CALL DCOPY( J, C(NJ,NJ+IOFF), LDC, DWORK(KJ), 1 )
KJ = KJ + J
80 CONTINUE
C
END IF
C
C 1.c): Initialize the right singular base matrix V with the
C identity matrix (V overwrites C).
C
CALL DLASET( 'Full', NL, NL, ZERO, ONE, C, LDC )
JV = KJ
IWARM = 0
C
C REPEAT
C
C Compute the Householder matrix Q and matrices F and Y such that
C F is nonsingular.
C
C Step 2: Partial diagonalization phase.
C -----------------------------
C Diagonalize the bidiagonal Q partially until convergence to
C the desired right singular subspace.
C Workspace: LDW + 6*(N+L)-5.
C
100 CONTINUE
JWORK = JV
CALL MB04YD( 'No U', 'Update V', P, NL, RANK, THETA, Q, Q(P+1),
$ DUMMY, 1, C, LDC, INUL, TOL, RELTOL, DWORK(JWORK),
$ LDWORK-JWORK+1, IWARN, INFO )
WRKOPT = MAX( WRKOPT, JWORK + 6*NL - 6 )
C
IWARN = MAX( IWARN, IWARM )
IF ( INFO.GT.0 )
$ RETURN
C
C Set pointers to the selected base vectors in the right singular
C matrix of C.
C
K = 0
C
DO 120 I = 1, NL
IF ( INUL(I) ) THEN
K = K + 1
IWORK(K) = I
END IF
120 CONTINUE
C
IF ( K.LT.L ) THEN
C
C Rank of the TLS approximation is larger than min(M,N).
C
INFO = 2
RETURN
END IF
C
C Step 3: Back transformation phase.
C -------------------------
C Apply in backward order the Householder transformations (stored
C in DWORK(IHOUSH)) performed onto the rows of C during the
C bidiagonalization phase, to the selected base vectors (specified
C by INUL(I) = .TRUE.). Already transformed vectors are those for
C which BWORK(I) = .TRUE..
C
KF = K
IF ( SUFWRK.AND.LFIRST ) THEN
C
C Enough workspace for a fast algorithm and first pass.
C
IJ = JV
C
DO 140 J = 1, K
CALL DCOPY (NL, C(1,IWORK(J)), 1, DWORK(IJ), 1 )
IJ = IJ + NL
140 CONTINUE
C
C Workspace: need LDW + (N+L)*K + K,
C prefer LDW + (N+L)*K + K*NB.
C
IJ = JV
JWORK = IJ + NL*K
CALL DORMBR( 'P vectors', 'Left', 'No transpose', NL, K,
$ MNL, DWORK(IHOUSH), P, DWORK(ITAUP), DWORK(IJ),
$ NL, DWORK(JWORK), LDWORK-JWORK+1, IFAIL )
WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
DO 160 I = 1, NL
IF ( INUL(I) .AND. ( .NOT. BWORK(I) ) )
$ BWORK(I) = .TRUE.
160 CONTINUE
C
ELSE
C
C Not enough workspace for a fast algorithm or subsequent passes.
C
DO 180 I = 1, NL
IF ( INUL(I) .AND. ( .NOT. BWORK(I) ) ) THEN
KJ = JV
C
DO 170 NJ = MIN( P, MC ), 1, -1
J = MC - NJ + 1
KJ = KJ - J
FIRST = DWORK(KJ)
DWORK(KJ) = ONE
CALL DLARF( 'Left', J, 1, DWORK(KJ), 1,
$ DWORK(ITAUP+NJ-1), C(NJ+IOFF,I), LDC,
$ DWORK(JWORK) )
DWORK(KJ) = FIRST
170 CONTINUE
C
BWORK(I) = .TRUE.
END IF
180 CONTINUE
END IF
C
IF ( RANK.LE.0 )
$ RANK = 0
IF ( MIN( RANK, L ).EQ.0 ) THEN
IF ( SUFWRK.AND.LFIRST )
$ CALL DLACPY( 'Full', NL, K, DWORK(JV), NL, C, LDC )
DWORK(1) = WRKOPT
DWORK(2) = ONE
RETURN
END IF
C
C Step 4: Compute matrices F and Y
C ------------------------
C using Householder transformation Q.
C
C Compute the orthogonal matrix Q (in factorized form) and the
C matrices F and Y using RQ factorization. It is assumed that,
C generically, the last L rows of V2 matrix have full rank.
C The code could not be the most efficient when RANK has been
C lowered, because the already created zero pattern of the last
C L rows of V2 matrix is not exploited.
C
IF ( SUFWRK.AND.LFIRST ) THEN
C
C Enough workspace for a fast algorithm and first pass.
C Workspace: need LDW1 + 2*L,
C prefer LDW1 + L + L*NB, where
C LDW1 = LDW + (N+L)*K;
C
ITAUQ = JWORK
JWORK = ITAUQ + L
CALL DGERQF( L, K, DWORK(JV+N), NL, DWORK(ITAUQ), DWORK(JWORK),
$ LDWORK-JWORK+1, INFO )
WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
C Workspace: need LDW1 + N+L,
C prefer LDW1 + L + N*NB.
C
CALL DORMRQ( 'Right', 'Transpose', N, K, L, DWORK(JV+N), NL,
$ DWORK(ITAUQ), DWORK(JV), NL, DWORK(JWORK),
$ LDWORK-JWORK+1, INFO )
WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
JF = JV + NL*(K-L) + N
LDF = NL
JWORK = JF + LDF*L - N
CALL DLASET( 'Full', L, K-L, ZERO, ZERO, DWORK(JV+N), LDF )
IF ( L.GT.1 )
$ CALL DLASET( 'Lower', L-1, L-1, ZERO, ZERO, DWORK(JF+1),
$ LDF )
IJ = JV
C
DO 200 J = 1, K
CALL DCOPY( NL, DWORK(IJ), 1, C(1,IWORK(J)), 1 )
IJ = IJ + NL
200 CONTINUE
C
ELSE
C
C Not enough workspace for a fast algorithm or subsequent passes.
C Workspace: LDW2 + N+L, where LDW2 = LDW + L*L.
C
I = NL
JF = JV
LDF = L
JWORK = JF + LDF*L
WRKOPT = MAX( WRKOPT, JWORK+NL-1 )
C
C WHILE ( ( K >= 1 ) .AND. ( I > N ) ) DO
220 CONTINUE
IF ( ( K.GE.1 ) .AND. ( I.GT.N ) ) THEN
C
DO 240 J = 1, K
DWORK(JWORK+J-1) = C(I,IWORK(J))
240 CONTINUE
C
C Compute Householder transformation.
C
CALL DLARFG( K, DWORK(JWORK+K-1), DWORK(JWORK), 1, TEMP )
C(I,IWORK(K)) = DWORK(JWORK+K-1)
IF ( TEMP.NE.ZERO ) THEN
C
C Apply Householder transformation onto the selected base
C vectors.
C
DO 300 I1 = 1, I - 1
INPROD = C(I1,IWORK(K))
C
DO 260 J = 1, K - 1
INPROD = INPROD + DWORK(JWORK+J-1)*C(I1,IWORK(J))
260 CONTINUE
C
HH = INPROD*TEMP
C(I1,IWORK(K)) = C(I1,IWORK(K)) - HH
C
DO 280 J = 1, K - 1
J1 = IWORK(J)
C(I1,J1) = C(I1,J1) - DWORK(JWORK+J-1)*HH
C(I,J1) = ZERO
280 CONTINUE
C
300 CONTINUE
C
END IF
CALL DCOPY( I-N, C(N1,IWORK(K)), 1, DWORK(JF+(I-N-1)*L), 1 )
K = K - 1
I = I - 1
GO TO 220
END IF
C END WHILE 220
END IF
C
C Estimate the reciprocal condition number of the matrix F.
C If F singular, lower the rank of the TLS approximation.
C Workspace: LDW1 + 3*L or
C LDW2 + 3*L.
C
CALL DTRCON( '1-norm', 'Upper', 'Non-unit', L, DWORK(JF), LDF,
$ RCOND, DWORK(JWORK), IWORK(KF+1), INFO )
WRKOPT = MAX( WRKOPT, JWORK + 3*L - 1 )
C
DO 320 J = 1, L
CALL DCOPY( N, C(1,IWORK(KF-L+J)), 1, X(1,J), 1 )
320 CONTINUE
C
FNORM = DLANTR( '1-norm', 'Upper', 'Non-unit', L, L, DWORK(JF),
$ LDF, DWORK(JWORK) )
IF ( RCOND.LE.EPS*FNORM ) THEN
RANK = RANK - 1
GO TO 340
END IF
IF ( FNORM.LE.EPS*DLANGE( '1-norm', N, L, X, LDX,
$ DWORK(JWORK) ) ) THEN
RANK = RANK - L
GO TO 340
ELSE
GO TO 400
END IF
C
340 CONTINUE
IWARM = 2
THETA = -ONE
IF ( SUFWRK.AND.LFIRST ) THEN
C
C Rearrange the stored Householder transformations for
C subsequent passes, taking care to avoid overwriting.
C
IF ( P.LT.NL ) THEN
KJ = IHOUSH + NL*(NL - 1)
MJ = IHOUSH + P*(NL - 1)
C
DO 360 NJ = 1, NL
DO 350 J = P - 1, 0, -1
DWORK(KJ+J) = DWORK(MJ+J)
350 CONTINUE
KJ = KJ - NL
MJ = MJ - P
360 CONTINUE
C
END IF
KJ = IHOUSH
MJ = IHOUSH + NL*IOFF
C
DO 380 NJ = 1, MIN( P, MC )
DO 370 J = 0, MC - NJ
DWORK(KJ) = DWORK(MJ+J*P)
KJ = KJ + 1
370 CONTINUE
MJ = MJ + NL + 1
380 CONTINUE
C
JV = KJ
LFIRST = .FALSE.
END IF
GO TO 100
C UNTIL ( F nonsingular, i.e., RCOND.GT.EPS*FNORM or
C FNORM.GT.EPS*norm(Y) )
400 CONTINUE
C
C Step 5: Compute TLS solution.
C --------------------
C Solve X F = -Y by forward elimination (F is upper triangular).
C
CALL DTRSM( 'Right', 'Upper', 'No transpose', 'Non-unit', N, L,
$ -ONE, DWORK(JF), LDF, X, LDX )
C
C Set the optimal workspace and reciprocal condition number of F.
C
DWORK(1) = WRKOPT
DWORK(2) = RCOND
C
RETURN
C *** Last line of MB02ND ***
END