SUBROUTINE MB02KD( LDBLK, TRANS, K, L, M, N, R, ALPHA, BETA, $ TC, LDTC, TR, LDTR, B, LDB, C, LDC, DWORK, $ LDWORK, INFO ) C C SLICOT RELEASE 5.0. C C Copyright (c) 2002-2009 NICONET e.V. C C This program is free software: you can redistribute it and/or C modify it under the terms of the GNU General Public License as C published by the Free Software Foundation, either version 2 of C the License, or (at your option) any later version. C C This program is distributed in the hope that it will be useful, C but WITHOUT ANY WARRANTY; without even the implied warranty of C MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the C GNU General Public License for more details. C C You should have received a copy of the GNU General Public License C along with this program. If not, see C . C C PURPOSE C C To compute the matrix product C C C = alpha*op( T )*B + beta*C, C C where alpha and beta are scalars and T is a block Toeplitz matrix C specified by its first block column TC and first block row TR; C B and C are general matrices of appropriate dimensions. C C ARGUMENTS C C Mode Parameters C C LDBLK CHARACTER*1 C Specifies where the (1,1)-block of T is stored, as C follows: C = 'C': in the first block of TC; C = 'R': in the first block of TR. C C TRANS CHARACTER*1 C Specifies the form of op( T ) to be used in the matrix C multiplication as follows: C = 'N': op( T ) = T; C = 'T': op( T ) = T'; C = 'C': op( T ) = T'. C C Input/Output Parameters C C K (input) INTEGER C The number of rows in the blocks of T. K >= 0. C C L (input) INTEGER C The number of columns in the blocks of T. L >= 0. C C M (input) INTEGER C The number of blocks in the first block column of T. C M >= 0. C C N (input) INTEGER C The number of blocks in the first block row of T. N >= 0. C C R (input) INTEGER C The number of columns in B and C. R >= 0. C C ALPHA (input) DOUBLE PRECISION C The scalar alpha. When alpha is zero then TC, TR and B C are not referenced. C C BETA (input) DOUBLE PRECISION C The scalar beta. When beta is zero then C need not be set C before entry. C C TC (input) DOUBLE PRECISION array, dimension (LDTC,L) C On entry with LDBLK = 'C', the leading M*K-by-L part of C this array must contain the first block column of T. C On entry with LDBLK = 'R', the leading (M-1)*K-by-L part C of this array must contain the 2nd to the M-th blocks of C the first block column of T. C C LDTC INTEGER C The leading dimension of the array TC. C LDTC >= MAX(1,M*K), if LDBLK = 'C'; C LDTC >= MAX(1,(M-1)*K), if LDBLK = 'R'. C C TR (input) DOUBLE PRECISION array, dimension (LDTR,k) C where k is (N-1)*L when LDBLK = 'C' and is N*L when C LDBLK = 'R'. C On entry with LDBLK = 'C', the leading K-by-(N-1)*L part C of this array must contain the 2nd to the N-th blocks of C the first block row of T. C On entry with LDBLK = 'R', the leading K-by-N*L part of C this array must contain the first block row of T. C C LDTR INTEGER C The leading dimension of the array TR. LDTR >= MAX(1,K). C C B (input) DOUBLE PRECISION array, dimension (LDB,R) C On entry with TRANS = 'N', the leading N*L-by-R part of C this array must contain the matrix B. C On entry with TRANS = 'T' or TRANS = 'C', the leading C M*K-by-R part of this array must contain the matrix B. C C LDB INTEGER C The leading dimension of the array B. C LDB >= MAX(1,N*L), if TRANS = 'N'; C LDB >= MAX(1,M*K), if TRANS = 'T' or TRANS = 'C'. C C C (input/output) DOUBLE PRECISION array, dimension (LDC,R) C On entry with TRANS = 'N', the leading M*K-by-R part of C this array must contain the matrix C. C On entry with TRANS = 'T' or TRANS = 'C', the leading C N*L-by-R part of this array must contain the matrix C. C On exit with TRANS = 'N', the leading M*K-by-R part of C this array contains the updated matrix C. C On exit with TRANS = 'T' or TRANS = 'C', the leading C N*L-by-R part of this array contains the updated matrix C. C C LDC INTEGER C The leading dimension of the array C. C LDC >= MAX(1,M*K), if TRANS = 'N'; C LDC >= MAX(1,N*L), if TRANS = 'T' or TRANS = 'C'. C C Workspace C C DWORK DOUBLE PRECISION array, dimension (LDWORK) C On exit, if INFO = 0, DWORK(1) returns the optimal C value of LDWORK. C On exit, if INFO = -19, DWORK(1) returns the minimum C value of LDWORK. C C LDWORK INTEGER C The length of the array DWORK. LDWORK >= 1. C For optimum performance LDWORK should be larger. C C Error Indicator C C INFO INTEGER C = 0: successful exit; C < 0: if INFO = -i, the i-th argument had an illegal C value. C C METHOD C C For point Toeplitz matrices or sufficiently large block Toeplitz C matrices, this algorithm uses convolution algorithms based on C the fast Hartley transforms [1]. Otherwise, TC is copied in C reversed order into the workspace such that C can be computed from C barely M matrix-by-matrix multiplications. C C REFERENCES C C [1] Van Loan, Charles. C Computational frameworks for the fast Fourier transform. C SIAM, 1992. C C NUMERICAL ASPECTS C C The algorithm requires O( (K*L+R*L+K*R)*(N+M)*log(N+M) + K*L*R ) C floating point operations. C C CONTRIBUTOR C C D. Kressner, Technical Univ. Berlin, Germany, May 2001. C C REVISIONS C C V. Sima, Research Institute for Informatics, Bucharest, June 2001, C March 2004. C C KEYWORDS C C Convolution, elementary matrix operations, C fast Hartley transform, Toeplitz matrix. C C ****************************************************************** C C .. Parameters .. DOUBLE PRECISION ZERO, ONE, TWO, THREE, FOUR, THOM50 PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0, TWO = 2.0D0, $ THREE = 3.0D0, FOUR = 4.0D0, THOM50 = .95D3 ) C .. Scalar Arguments .. CHARACTER LDBLK, TRANS INTEGER INFO, K, L, LDB, LDC, LDTC, LDTR, LDWORK, M, N, $ R DOUBLE PRECISION ALPHA, BETA C .. Array Arguments .. DOUBLE PRECISION B(LDB,*), C(LDC,*), DWORK(*), TC(LDTC,*), $ TR(LDTR,*) C .. Local Scalars .. LOGICAL FULLC, LMULT, LTRAN CHARACTER*1 WGHT INTEGER DIMB, DIMC, I, ICP, ICQ, IERR, IR, J, JJ, KK, $ LEN, LL, LN, METH, MK, NL, P, P1, P2, PB, PC, $ PDW, PP, PT, Q1, Q2, R1, R2, S1, S2, SHFT, WPOS, $ WRKOPT DOUBLE PRECISION CF, COEF, PARAM, SCAL, SF, T1, T2, TH C .. External Functions .. LOGICAL LSAME EXTERNAL LSAME C .. External Subroutines .. EXTERNAL DAXPY, DCOPY, DG01OD, DGEMM, DLACPY, DLASET, $ DSCAL, XERBLA C .. Intrinsic Functions .. INTRINSIC ATAN, COS, DBLE, MAX, MIN, SIN C C .. Executable Statements .. C C Decode the scalar input parameters. C INFO = 0 FULLC = LSAME( LDBLK, 'C' ) LTRAN = LSAME( TRANS, 'T' ) .OR. LSAME( TRANS, 'C' ) LMULT = ALPHA.NE.ZERO MK = M*K NL = N*L C C Check the scalar input parameters. C IF ( .NOT.( FULLC .OR. LSAME( LDBLK, 'R' ) ) ) THEN INFO = -1 ELSE IF ( .NOT.( LTRAN .OR. LSAME( TRANS, 'N' ) ) ) THEN INFO = -2 ELSE IF ( K.LT.0 ) THEN INFO = -3 ELSE IF ( L.LT.0 ) THEN INFO = -4 ELSE IF ( M.LT.0 ) THEN INFO = -5 ELSE IF ( N.LT.0 ) THEN INFO = -6 ELSE IF ( R.LT.0 ) THEN INFO = -7 ELSE IF ( LMULT .AND. FULLC .AND. LDTC.LT.MAX( 1, MK ) ) THEN INFO = -11 ELSE IF ( LMULT .AND. .NOT.FULLC .AND. $ LDTC.LT.MAX( 1,( M - 1 )*K ) ) THEN INFO = -11 ELSE IF ( LMULT .AND. LDTR.LT.MAX( 1, K ) ) THEN INFO = -13 ELSE IF ( LMULT .AND. .NOT.LTRAN .AND. LDB.LT.MAX( 1, NL ) ) THEN INFO = -15 ELSE IF ( LMULT .AND. LTRAN .AND. LDB.LT.MAX( 1, MK ) ) THEN INFO = -15 ELSE IF ( .NOT.LTRAN .AND. LDC.LT.MAX( 1, MK ) ) THEN INFO = -17 ELSE IF ( LTRAN .AND. LDC.LT.MAX( 1, NL ) ) THEN INFO = -17 ELSE IF ( LDWORK.LT.1 ) THEN DWORK(1) = ONE INFO = -19 END IF C C Return if there were illegal values. C IF ( INFO.NE.0 ) THEN CALL XERBLA( 'MB02KD', -INFO ) RETURN END IF C C Scale C beforehand. C IF ( BETA.EQ.ZERO ) THEN IF ( LTRAN ) THEN CALL DLASET( 'All', NL, R, ZERO, ZERO, C, LDC ) ELSE CALL DLASET( 'All', MK, R, ZERO, ZERO, C, LDC ) END IF ELSE IF ( BETA.NE.ONE ) THEN IF ( LTRAN ) THEN C DO 10 I = 1, R CALL DSCAL( NL, BETA, C(1,I), 1 ) 10 CONTINUE C ELSE C DO 20 I = 1, R CALL DSCAL( MK, BETA, C(1,I), 1 ) 20 CONTINUE C END IF END IF C C Quick return if possible. C IF ( .NOT.LMULT .OR. MIN( MK, NL, R ).EQ.0 ) THEN DWORK(1) = ONE RETURN END IF C C The parameter PARAM is the watershed between conventional C multiplication and convolution. This is of course depending C on the used computer architecture. The lower this value is set C the more likely the routine will use convolution to compute C op( T )*B. Note that if there is enough workspace available, C convolution is always used for point Toeplitz matrices. C PARAM = THOM50 C C Decide which method to choose, based on the block sizes and C the available workspace. C LEN = 1 P = 0 C 30 CONTINUE IF ( LEN.LT.M+N-1 ) THEN LEN = LEN*2 P = P + 1 GO TO 30 END IF C COEF = THREE*DBLE( M*N )*DBLE( K*L )*DBLE( R ) / $ DBLE( LEN*( K*L + L*R + K*R ) ) C IF ( FULLC ) THEN P1 = MK*L SHFT = 0 ELSE P1 = ( M - 1 )*K*L SHFT = 1 END IF IF ( K*L.EQ.1 .AND. MIN( M, N ).GT.1 ) THEN WRKOPT = LEN*( 2 + R ) - P METH = 3 ELSE IF ( ( LEN.LT.M*N ) .AND. ( COEF.GE.PARAM ) ) THEN WRKOPT = LEN*( K*L + K*R + L*R + 1 ) - P METH = 3 ELSE METH = 2 WRKOPT = P1 END IF C IF ( LDWORK.LT.WRKOPT ) METH = METH - 1 IF ( LDWORK.LT.P1 ) METH = 1 C C Start computations. C IF ( METH.EQ.1 .AND. .NOT.LTRAN ) THEN C C Method 1 is the most unlucky way to multiply Toeplitz matrices C with vectors. Due to the memory restrictions it is not C possible to flip TC. C PC = 1 C DO 50 I = 1, M PT = ( I - 1 - SHFT )*K + 1 PB = 1 C DO 40 J = SHFT + 1, I CALL DGEMM( 'No Transpose', 'No Transpose', K, R, L, $ ALPHA, TC(PT,1), LDTC, B(PB,1), LDB, ONE, $ C(PC,1), LDC ) PT = PT - K PB = PB + L 40 CONTINUE C IF ( N.GT.I-SHFT ) THEN CALL DGEMM( 'No Transpose', 'No Transpose', K, R, $ (N-I+SHFT)*L, ALPHA, TR, LDTR, B(PB,1), LDB, $ ONE, C(PC,1), LDC ) END IF PC = PC + K 50 CONTINUE C ELSE IF ( METH.EQ.1 .AND. LTRAN ) THEN C PB = 1 C DO 70 I = 1, M PT = ( I - 1 - SHFT )*K + 1 PC = 1 C DO 60 J = SHFT + 1, I CALL DGEMM( 'Transpose', 'No Transpose', L, R, K, ALPHA, $ TC(PT,1), LDTC, B(PB,1), LDB, ONE, C(PC,1), $ LDC ) PT = PT - K PC = PC + L 60 CONTINUE C IF ( N.GT.I-SHFT ) THEN CALL DGEMM( 'Transpose', 'No Transpose', (N-I+SHFT)*L, $ R, K, ALPHA, TR, LDTR, B(PB,1), LDB, ONE, $ C(PC,1), LDC ) END IF PB = PB + K 70 CONTINUE C ELSE IF ( METH.EQ.2 .AND. .NOT.LTRAN ) THEN C C In method 2 TC is flipped resulting in less calls to the BLAS C routine DGEMM. Actually this seems often to be the best way to C multiply with Toeplitz matrices except the point Toeplitz C case. C PT = ( M - 1 - SHFT )*K + 1 C DO 80 I = 1, ( M - SHFT )*K*L, K*L CALL DLACPY( 'All', K, L, TC(PT,1), LDTC, DWORK(I), K ) PT = PT - K 80 CONTINUE C PT = ( M - 1 )*K*L + 1 PC = 1 C DO 90 I = 1, M CALL DGEMM( 'No Transpose', 'No Transpose', K, R, $ MIN( I-SHFT, N )*L, ALPHA, DWORK(PT), K, B, LDB, $ ONE, C(PC,1), LDC ) IF ( N.GT.I-SHFT ) THEN CALL DGEMM( 'No Transpose', 'No Transpose', K, R, $ (N-I+SHFT)*L, ALPHA, TR, LDTR, $ B((I-SHFT)*L+1,1), LDB, ONE, C(PC,1), LDC ) END IF PC = PC + K PT = PT - K*L 90 CONTINUE C ELSE IF ( METH.EQ.2 .AND. LTRAN ) THEN C PT = ( M - 1 - SHFT )*K + 1 C DO 100 I = 1, ( M - SHFT )*K*L, K*L CALL DLACPY( 'All', K, L, TC(PT,1), LDTC, DWORK(I), K ) PT = PT - K 100 CONTINUE C PT = ( M - 1 )*K*L + 1 PB = 1 C DO 110 I = 1, M CALL DGEMM( 'Tranpose', 'No Transpose', MIN( I-SHFT, N )*L, $ R, K, ALPHA, DWORK(PT), K, B(PB,1), LDB, ONE, $ C, LDC ) IF ( N.GT.I-SHFT ) THEN CALL DGEMM( 'Transpose', 'No Transpose', (N-I+SHFT)*L, R, $ K, ALPHA, TR, LDTR, B(PB,1), LDB, ONE, $ C((I-SHFT)*L+1,1), LDC ) END IF PB = PB + K PT = PT - K*L 110 CONTINUE C ELSE IF ( METH.EQ.3 ) THEN C C In method 3 the matrix-vector product is computed by a suitable C block convolution via fast Hartley transforms similar to the C SLICOT routine DE01PD. C C Step 1: Copy input data into the workspace arrays. C PDW = 1 IF ( LTRAN ) THEN DIMB = K DIMC = L ELSE DIMB = L DIMC = K END IF PB = LEN*K*L PC = LEN*( K*L + DIMB*R ) IF ( LTRAN ) THEN IF ( FULLC ) THEN CALL DLACPY( 'All', K, L, TC, LDTC, DWORK, LEN*K ) END IF C DO 120 I = 1, N - 1 + SHFT CALL DLACPY( 'All', K, L, TR(1,(I-1)*L+1), LDTR, $ DWORK((I-SHFT)*K+1), LEN*K ) 120 CONTINUE C PDW = N*K + 1 R1 = ( LEN - M - N + 1 )*K CALL DLASET( 'All', R1, L, ZERO, ZERO, DWORK(PDW), LEN*K ) PDW = PDW + R1 C DO 130 I = ( M - 1 - SHFT )*K + 1, K - SHFT*K + 1, -K CALL DLACPY( 'All', K, L, TC(I,1), LDTC, $ DWORK(PDW), LEN*K ) PDW = PDW + K 130 CONTINUE C PDW = PB + 1 CALL DLACPY( 'All', MK, R, B, LDB, DWORK(PDW), LEN*K ) PDW = PDW + MK CALL DLASET( 'All', (LEN-M)*K, R, ZERO, ZERO, DWORK(PDW), $ LEN*K ) ELSE IF ( .NOT.FULLC ) THEN CALL DLACPY( 'All', K, L, TR, LDTR, DWORK, LEN*K ) END IF CALL DLACPY( 'All', (M-SHFT)*K, L, TC, LDTC, $ DWORK(SHFT*K+1), LEN*K ) PDW = MK + 1 R1 = ( LEN - M - N + 1 )*K CALL DLASET( 'All', R1, L, ZERO, ZERO, DWORK(PDW), LEN*K ) PDW = PDW + R1 C DO 140 I = ( N - 2 + SHFT )*L + 1, SHFT*L + 1, -L CALL DLACPY( 'All', K, L, TR(1,I), LDTR, DWORK(PDW), $ LEN*K ) PDW = PDW + K 140 CONTINUE C PDW = PB + 1 CALL DLACPY( 'All', NL, R, B, LDB, DWORK(PDW), LEN*L ) PDW = PDW + NL CALL DLASET( 'All', (LEN-N)*L, R, ZERO, ZERO, DWORK(PDW), $ LEN*L ) END IF C C Take point Toeplitz matrices into extra consideration. C IF ( K*L.EQ.1 ) THEN WGHT = 'N' CALL DG01OD( 'OutputScrambled', WGHT, LEN, DWORK, $ DWORK(PC+1), IERR ) C DO 170 I = PB, PB + LEN*R - 1, LEN CALL DG01OD( 'OutputScrambled', WGHT, LEN, DWORK(I+1), $ DWORK(PC+1), IERR ) SCAL = ALPHA / DBLE( LEN ) DWORK(I+1) = SCAL*DWORK(I+1)*DWORK(1) DWORK(I+2) = SCAL*DWORK(I+2)*DWORK(2) SCAL = SCAL / TWO C LN = 1 C DO 160 LL = 1, P - 1 LN = 2*LN R1 = 2*LN C DO 150 P1 = LN + 1, LN + LN/2 T1 = DWORK(P1) + DWORK(R1) T2 = DWORK(P1) - DWORK(R1) TH = T2*DWORK(I+P1) DWORK(I+P1) = SCAL*( T1*DWORK(I+P1) $ + T2*DWORK(I+R1) ) DWORK(I+R1) = SCAL*( T1*DWORK(I+R1) - TH ) R1 = R1 - 1 150 CONTINUE C 160 CONTINUE C CALL DG01OD( 'InputScrambled', WGHT, LEN, DWORK(I+1), $ DWORK(PC+1), IERR ) 170 CONTINUE C PC = PB GOTO 420 END IF C C Step 2: Compute the weights for the Hartley transforms. C PDW = PC R1 = 1 LN = 1 TH = FOUR*ATAN( ONE ) / DBLE( LEN ) C DO 190 LL = 1, P - 2 LN = 2*LN TH = TWO*TH CF = COS( TH ) SF = SIN( TH ) DWORK(PDW+R1) = CF DWORK(PDW+R1+1) = SF R1 = R1 + 2 C DO 180 I = 1, LN-2, 2 DWORK(PDW+R1) = CF*DWORK(PDW+I) - SF*DWORK(PDW+I+1) DWORK(PDW+R1+1) = SF*DWORK(PDW+I) + CF*DWORK(PDW+I+1) R1 = R1 + 2 180 CONTINUE C 190 CONTINUE C P1 = 3 Q1 = R1 - 2 C DO 210 LL = P - 2, 1, -1 C DO 200 I = P1, Q1, 4 DWORK(PDW+R1) = DWORK(PDW+I) DWORK(PDW+R1+1) = DWORK(PDW+I+1) R1 = R1 + 2 200 CONTINUE C P1 = Q1 + 4 Q1 = R1 - 2 210 CONTINUE C C Step 3: Compute the Hartley transforms with scrambled output. C J = 0 KK = K C C WHILE J < (L*LEN*K + R*LEN*DIMB), C 220 CONTINUE C LN = LEN WPOS = PDW+1 C DO 270 PP = P - 1, 1, -1 LN = LN / 2 P2 = 1 Q2 = LN*KK + 1 R2 = ( LN/2 )*KK + 1 S2 = R2 + Q2 - 1 C DO 260 I = 0, LEN/( 2*LN ) - 1 C DO 230 IR = 0, KK - 1 T1 = DWORK(Q2+IR+J) DWORK(Q2+IR+J) = DWORK(P2+IR+J) - T1 DWORK(P2+IR+J) = DWORK(P2+IR+J) + T1 T1 = DWORK(S2+IR+J) DWORK(S2+IR+J) = DWORK(R2+IR+J) - T1 DWORK(R2+IR+J) = DWORK(R2+IR+J) + T1 230 CONTINUE C P1 = P2 + KK Q1 = P1 + LN*KK R1 = Q1 - 2*KK S1 = R1 + LN*KK C DO 250 JJ = WPOS, WPOS + LN - 3, 2 CF = DWORK(JJ) SF = DWORK(JJ+1) C DO 240 IR = 0, KK-1 T1 = DWORK(P1+IR+J) - DWORK(Q1+IR+J) T2 = DWORK(R1+IR+J) - DWORK(S1+IR+J) DWORK(P1+IR+J) = DWORK(P1+IR+J) + $ DWORK(Q1+IR+J) DWORK(R1+IR+J) = DWORK(R1+IR+J) + $ DWORK(S1+IR+J) DWORK(Q1+IR+J) = CF*T1 + SF*T2 DWORK(S1+IR+J) = -CF*T2 + SF*T1 240 CONTINUE C P1 = P1 + KK Q1 = Q1 + KK R1 = R1 - KK S1 = S1 - KK 250 CONTINUE C P2 = P2 + 2*KK*LN Q2 = Q2 + 2*KK*LN R2 = R2 + 2*KK*LN S2 = S2 + 2*KK*LN 260 CONTINUE C WPOS = WPOS + LN - 2 270 CONTINUE C DO 290 ICP = KK + 1, LEN*KK, 2*KK ICQ = ICP - KK C DO 280 IR = 0, KK - 1 T1 = DWORK(ICP+IR+J) DWORK(ICP+IR+J) = DWORK(ICQ+IR+J) - T1 DWORK(ICQ+IR+J) = DWORK(ICQ+IR+J) + T1 280 CONTINUE C 290 CONTINUE C J = J + LEN*KK IF ( J.EQ.L*LEN*K ) THEN KK = DIMB END IF IF ( J.LT.PC ) GOTO 220 C END WHILE 220 C C Step 4: Compute a Hadamard like product. C CALL DCOPY( LEN-P, DWORK(PDW+1), 1,DWORK(PDW+1+R*LEN*DIMC), 1 ) PDW = PDW + R*LEN*DIMC SCAL = ALPHA / DBLE( LEN ) P1 = 1 R1 = LEN*K*L + 1 S1 = R1 + LEN*DIMB*R IF ( LTRAN ) THEN KK = L LL = K ELSE KK = K LL = L END IF CALL DGEMM( TRANS, 'No Transpose', KK, R, LL, SCAL, DWORK(P1), $ LEN*K, DWORK(R1), LEN*DIMB, ZERO, DWORK(S1), $ LEN*DIMC ) P1 = P1 + K R1 = R1 + DIMB S1 = S1 + DIMC CALL DGEMM( TRANS, 'No Transpose', KK, R, LL, SCAL, DWORK(P1), $ LEN*K, DWORK(R1), LEN*DIMB, ZERO, DWORK(S1), $ LEN*DIMC ) SCAL = SCAL / TWO LN = 1 C DO 330 PP = 1, P - 1 LN = 2*LN P2 = ( 2*LN - 1 )*K + 1 R1 = PB + LN*DIMB + 1 R2 = PB + ( 2*LN - 1 )*DIMB + 1 S1 = PC + LN*DIMC + 1 S2 = PC + ( 2*LN - 1 )*DIMC + 1 C DO 320 P1 = LN*K + 1, ( LN + LN/2 )*K, K C DO 310 J = 0, LEN*K*( L - 1 ), LEN*K C DO 300 I = P1, P1 + K - 1 T1 = DWORK(P2) DWORK(P2) = DWORK(J+I) - T1 DWORK(J+I) = DWORK(J+I) + T1 P2 = P2 + 1 300 CONTINUE C P2 = P2 + ( LEN - 1 )*K 310 CONTINUE C P2 = P2 - LEN*K*L CALL DGEMM( TRANS, 'No Transpose', KK, R, LL, SCAL, $ DWORK(P1), LEN*K, DWORK(R1), LEN*DIMB, $ ZERO, DWORK(S1), LEN*DIMC ) CALL DGEMM( TRANS, 'No Transpose', KK, R, LL, SCAL, $ DWORK(P2), LEN*K, DWORK(R2), LEN*DIMB, ONE, $ DWORK(S1), LEN*DIMC ) CALL DGEMM( TRANS, 'No Transpose', KK, R, LL, SCAL, $ DWORK(P1), LEN*K, DWORK(R2), LEN*DIMB, ZERO, $ DWORK(S2), LEN*DIMC ) CALL DGEMM( TRANS, 'No Transpose', KK, R, LL, -SCAL, $ DWORK(P2), LEN*K, DWORK(R1), LEN*DIMB, ONE, $ DWORK(S2), LEN*DIMC ) P2 = P2 - K R1 = R1 + DIMB R2 = R2 - DIMB S1 = S1 + DIMC S2 = S2 - DIMC 320 CONTINUE C 330 CONTINUE C C Step 5: Hartley transform with scrambled input. C DO 410 J = PC, PC + LEN*DIMC*R, LEN*DIMC C DO 350 ICP = DIMC + 1, LEN*DIMC, 2*DIMC ICQ = ICP - DIMC C DO 340 IR = 0, DIMC - 1 T1 = DWORK(ICP+IR+J) DWORK(ICP+IR+J) = DWORK(ICQ+IR+J) - T1 DWORK(ICQ+IR+J) = DWORK(ICQ+IR+J) + T1 340 CONTINUE C 350 CONTINUE C LN = 1 WPOS = PDW + LEN - 2*P + 1 C DO 400 PP = 1, P - 1 LN = 2*LN P2 = 1 Q2 = LN*DIMC + 1 R2 = ( LN/2 )*DIMC + 1 S2 = R2 + Q2 - 1 C DO 390 I = 0, LEN/( 2*LN ) - 1 C DO 360 IR = 0, DIMC - 1 T1 = DWORK(Q2+IR +J) DWORK(Q2+IR+J) = DWORK(P2+IR+J) - T1 DWORK(P2+IR+J) = DWORK(P2+IR+J) + T1 T1 = DWORK(S2+IR+J) DWORK(S2+IR+J) = DWORK(R2+IR+J) - T1 DWORK(R2+IR+J) = DWORK(R2+IR+J) + T1 360 CONTINUE C P1 = P2 + DIMC Q1 = P1 + LN*DIMC R1 = Q1 - 2*DIMC S1 = R1 + LN*DIMC C DO 380 JJ = WPOS, WPOS + LN - 3, 2 CF = DWORK(JJ) SF = DWORK(JJ+1) C DO 370 IR = 0, DIMC - 1 T1 = CF*DWORK(Q1+IR+J) + SF*DWORK(S1+IR+J) T2 = -CF*DWORK(S1+IR+J) + SF*DWORK(Q1+IR+J) DWORK(Q1+IR+J) = DWORK(P1+IR+J) - T1 DWORK(P1+IR+J) = DWORK(P1+IR+J) + T1 DWORK(S1+IR+J) = DWORK(R1+IR+J) - T2 DWORK(R1+IR+J) = DWORK(R1+IR+J) + T2 370 CONTINUE C P1 = P1 + DIMC Q1 = Q1 + DIMC R1 = R1 - DIMC S1 = S1 - DIMC 380 CONTINUE C P2 = P2 + 2*DIMC*LN Q2 = Q2 + 2*DIMC*LN R2 = R2 + 2*DIMC*LN S2 = S2 + 2*DIMC*LN 390 CONTINUE C WPOS = WPOS - 2*LN + 2 400 CONTINUE C 410 CONTINUE C C Step 6: Copy data from workspace to output. C 420 CONTINUE C IF ( LTRAN ) THEN I = NL ELSE I = MK END IF C DO 430 J = 0, R - 1 CALL DAXPY( I, ONE, DWORK(PC+(J*LEN*DIMC) + 1), 1, $ C(1,J+1), 1 ) 430 CONTINUE C END IF DWORK(1) = DBLE( MAX( 1, WRKOPT ) ) RETURN C C *** Last line of MB02KD *** END