Teuchos - Trilinos Tools Package Version of the Day
Loading...
Searching...
No Matches
Teuchos_Details_Lapack128.cpp
1// @HEADER
2// *****************************************************************************
3// Teuchos: Common Tools Package
4//
5// Copyright 2004 NTESS and the Teuchos contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
11#ifdef HAVE_TEUCHOSCORE_QUADMATH
12# include "Teuchos_BLAS.hpp"
13#endif // HAVE_TEUCHOSCORE_QUADMATH
14
15
16#ifdef HAVE_TEUCHOSCORE_QUADMATH
17namespace Teuchos {
18namespace Details {
19
20void
21Lapack128::
22GETRF (const int M, const int N, __float128 A[],
23 const int LDA, int IPIV[], int* INFO) const
24{
25 //std::cerr << "GETRF: N = " << N << std::endl;
26
28
29 // NOTE (mfh 05 Sep 2015) This is a direct translation of LAPACK
30 // 3.5.0's DGETF2 routine. LAPACK is under a BSD license.
31
32 *INFO = 0;
33 if (M < 0) {
34 *INFO = -1;
35 } else if (N < 0) {
36 *INFO = -2;
37 } else if (LDA < std::max (1, M)) {
38 *INFO = -4;
39 }
40 if (*INFO != 0) {
41 return;
42 }
43
44 // Quick return if possible
45 if (M == 0 || N == 0) {
46 return;
47 }
48
49 // Compute machine safe minimum sfmin (such that 1/sfmin does
50 // not overflow). LAPACK 3.1 just returns for this the smallest
51 // normalized number.
52 const __float128 sfmin = FLT128_MIN;
53 const __float128 zero = 0.0;
54 const __float128 one = 1.0;
55
56 const int j_upperBound = std::min (M, N);
57 for (int j = 1; j <= j_upperBound; ++j) {
58 //std::cerr << " j = " << j << std::endl;
59
60 // Find pivot and test for singularity.
61 __float128* const A_jj = A + (j-1)*LDA + (j-1);
62
63 //std::cerr << " CALLING IAMAX" << std::endl;
64 const int jp = (j - 1) + blas.IAMAX (M - j + 1, A_jj, 1);
65 IPIV[j - 1] = jp;
66
67 const __float128* A_jp_j = A + jp + LDA*j;
68 if (*A_jp_j != zero) {
69 // Apply the interchange to columns 1:N.
70 __float128* const A_j1 = A + (j - 1);
71 __float128* const A_jp_1 = A + (jp - 1);
72
73 if (jp != j) {
74 blas.SWAP (N, A_j1, LDA, A_jp_1, LDA);
75 }
76
77 // Compute elements J+1:M of J-th column.
78 if (j < M) {
79 __float128* const A_j1_j = A + j + (j-1)*LDA;
80
81 if (fabsq (*A_jj) >= sfmin) {
82 blas.SCAL (M-j, one / *A_jj, A_j1_j, 1);
83 } else {
84 for (int i = 1; i <= M-j; ++i) {
85 __float128* const A_jpi_j = A + (j+i-1) + (j-1)*LDA;
86 *A_jpi_j /= *A_jj;
87 }
88 }
89 }
90 } else if (*INFO == 0) {
91 *INFO = j;
92 }
93
94 if (j < std::min (M, N)) {
95 //std::cerr << " UPDATE TRAILING SUBMATRIX" << std::endl;
96
97 // Update trailing submatrix.
98 const __float128* A_j1_j = A + j + (j-1)*LDA;
99 const __float128* A_j_j1 = A + (j-1) + j*LDA;
100 __float128* A_j1_j1 = A + j + j*LDA;
101 blas.GER (M-j, N-j, -one, A_j1_j, 1, A_j_j1, LDA, A_j1_j1, LDA);
102 }
103 }
104}
105
106void
107Lapack128::
108LASWP (const int N, __float128 A[], const int LDA, const int K1,
109 const int K2, const int IPIV[], const int INCX) const
110{
111 int i, i1, i2, inc, ip, ix, ix0, j, k, n32;
112 __float128 temp;
113
114 // Interchange row I with row IPIV(I) for each of rows K1 through K2.
115
116 if (INCX > 0) {
117 ix0 = K1;
118 i1 = K1;
119 i2 = K2;
120 inc = 1;
121 } else if (INCX < 0) {
122 ix0 = 1 + (1 - K2)*INCX;
123 i1 = K2;
124 i2 = K1;
125 inc = -1;
126 } else { // INCX == 0
127 return;
128 }
129
130 // The LAPACK 3.5.0 source code does 32 entries at a time,
131 // presumably for better vectorization or cache line usage.
132 n32 = (N / 32) * 32;
133
134 if (n32 != 0) {
135 for (j = 1; j <= n32; j += 32) {
136 ix = ix0;
137 // C and C++ lack Fortran's convenient range specifier,
138 // which can iterate over a range in either direction
139 // without particular fuss about the end condition.
140 for (i = i1; (inc > 0) ? (i <= i2) : (i >= i2); i += inc) {
141 ip = IPIV[ix-1];
142 if (ip != i) {
143 for (k = j; k <= j+31; ++k) {
144 temp = A[(i-1) + (k-1)*LDA]; // temp = a( i, k )
145 A[(i-1) + (k-1)*LDA] = A[(ip-1) + (k-1)*LDA]; // a( i, k ) = a( ip, k )
146 A[(ip-1) + (k-1)*LDA] = temp; // a( ip, k ) = temp
147 }
148 }
149 ix = ix + INCX;
150 }
151 }
152 }
153
154 if (n32 != N) {
155 n32 = n32 + 1;
156 ix = ix0;
157 // C and C++ lack Fortran's convenient range specifier,
158 // which can iterate over a range in either direction
159 // without particular fuss about the end condition.
160 for (i = i1; (inc > 0) ? (i <= i2) : (i >= i2); i += inc) {
161 ip = IPIV[ix-1];
162 if (ip != i) {
163 for (k = n32; k <= N; ++k) {
164 temp = A[(i-1) + (k-1)*LDA]; // temp = a( i, k )
165 A[(i-1) + (k-1)*LDA] = A[(ip-1) + (k-1)*LDA]; // a( i, k ) = a( ip, k )
166 A[(ip-1) + (k-1)*LDA] = temp; // a( ip, k ) = temp
167 }
168 }
169 ix = ix + INCX;
170 }
171 }
172}
173
174void
175Lapack128::
176GETRI (const int /* N */, __float128 /* A */ [], const int /* LDA */,
177 int /* IPIV */ [], __float128 /* WORK */ [], const int /* LWORK */,
178 int* /* INFO */) const
179{
181 (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GETRI: Not implemented yet.");
182}
183
184
185void
186Lapack128::
187GETRS (const char TRANS, const int N, const int NRHS,
188 const __float128 A[], const int LDA, const int IPIV[],
189 __float128 B[], const int LDB, int* INFO) const
190{
191 //std::cerr << "GETRS: N = " << N << std::endl;
192
194
195 // NOTE (mfh 05 Sep 2015) This is a direct translation of LAPACK
196 // 3.5.0's DGETRS routine. LAPACK is under a BSD license.
197
198 *INFO = 0;
199 const bool notran = (TRANS == 'N' || TRANS == 'n');
200 if (! notran
201 && ! (TRANS == 'T' || TRANS == 't')
202 && ! (TRANS == 'C' || TRANS == 'c')) {
203 *INFO = -1; // invalid TRANS argument
204 }
205 else if (N < 0) {
206 *INFO = -2; // invalid N (negative)
207 }
208 else if (NRHS < 0) {
209 *INFO = -3; // invalid NRHS (negative)
210 }
211 else if (LDA < std::max (1, N)) {
212 *INFO = -5; // invalid LDA (too small)
213 }
214 else if (LDB < std::max (1, N)) {
215 *INFO = -8; // invalid LDB (too small)
216 }
217 if (*INFO != 0) {
218 return;
219 }
220
221 const __float128 one = 1.0;
222
223 using Teuchos::LEFT_SIDE;
224 using Teuchos::LOWER_TRI;
225 using Teuchos::UPPER_TRI;
226 using Teuchos::NO_TRANS;
227 using Teuchos::TRANS;
229 using Teuchos::UNIT_DIAG;
231
232 if (notran) { // No transpose; solve AX=B
233 // Apply row interchanges to the right-hand sides.
234 //std::cerr << "CALLING LASWP" << std::endl;
235 LASWP (NRHS, B, LDB, 1, N, IPIV, 1);
236 // Solve L*X = B, overwriting B with X.
237 //std::cerr << "CALLING TRSM (1)" << std::endl;
238 blas.TRSM (LEFT_SIDE, LOWER_TRI, NO_TRANS, UNIT_DIAG, N, NRHS,
239 one, A, LDA, B, LDB);
240 // Solve U*X = B, overwriting B with X.
241 //std::cerr << "CALLING TRSM (2)" << std::endl;
242 blas.TRSM (LEFT_SIDE, UPPER_TRI, NO_TRANS, NON_UNIT_DIAG, N, NRHS,
243 one, A, LDA, B, LDB);
244 }
245 else { // Transpose or conjugate transpose: solve A^{T,H}X = B.
246 const Teuchos::ETransp transposeMode = (TRANS == 'T' || TRANS == 't') ?
248
249 // Solve U^{T,H}*X = B, overwriting B with X.
250 //std::cerr << "CALLING TRSM (1)" << std::endl;
251 blas.TRSM (LEFT_SIDE, UPPER_TRI, transposeMode, NON_UNIT_DIAG, N, NRHS,
252 one, A, LDA, B, LDB);
253 // Solve L^{T,H}*X = B, overwriting B with X.
254 //std::cerr << "CALLING TRSM (2)" << std::endl;
255 blas.TRSM (LEFT_SIDE, LOWER_TRI, transposeMode, UNIT_DIAG, N, NRHS,
256 one, A, LDA, B, LDB);
257 //std::cerr << "CALLING LASWP" << std::endl;
258 // Apply row interchanges to the solution vectors.
259 LASWP (NRHS, B, LDB, 1, N, IPIV, -1);
260 }
261
262 //std::cerr << "DONE WITH GETRS" << std::endl;
263}
264
265__float128
266Lapack128::
267LAPY2 (const __float128& x, const __float128& y) const
268{
269 const __float128 xabs = fabsq (x);
270 const __float128 yabs = fabsq (y);
271 const __float128 w = fmaxq (xabs, yabs);
272 const __float128 z = fminq (xabs, yabs);
273
274 if (z == 0.0) {
275 return w;
276 } else {
277 const __float128 one = 1.0;
278 const __float128 z_div_w = z / w;
279 return w * sqrtq (one + z_div_w * z_div_w);
280 }
281}
282
283void
284Lapack128::
285ORM2R (const char side, const char trans,
286 const int m, const int n, const int k,
287 const __float128 A[], const int lda,
288 const __float128* const tau,
289 __float128 C[], const int ldc,
290 __float128 work[], int* const info) const
291{
292 TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Not implemented");
293}
294
295namespace { // (anonymous)
296
297 int
298 ILADLC (const int m, const int n, const __float128 A[], const int lda)
299 {
300 const __float128 zero = 0.0;
301
302 // Quick test for the common case where one corner is non-zero.
303 if (n == 0) {
304 return n;
305 } else if (A[0 + (n-1)*lda] != zero || A[(m-1) + (n-1)*lda] != zero) {
306 return n;
307 } else {
308 // Now scan each column from the end, returning with the first non-zero.
309 for (int j = n; j > 0; --j) {
310 for (int i = 1; i <= m; ++i) {
311 if (A[(i-1) + (j-1)*lda] != zero) {
312 return j;
313 }
314 }
315 }
316 return 0;
317 }
318 }
319
320 int
321 ILADLR (const int m, const int n, const __float128 A[], const int lda)
322 {
323 const __float128 zero = 0.0;
324
325 // Quick test for the common case where one corner is non-zero.
326 if (m == 0) {
327 return m;
328 } else if (A[(m-1) + 0*lda] != zero || A[(m-1) + (n-1)*lda] != zero) {
329 return m;
330 } else {
331 // Scan up each column tracking the last zero row seen.
332 int lastZeroRow = 0;
333 for (int j = 1; j <= n; ++j) {
334 int i = m;
335 while (A[(std::max (i, 1) - 1) + (j - 1)*lda] == zero && i >= 1) {
336 i--;
337 }
338 lastZeroRow = std::max (lastZeroRow, i);
339 }
340 return lastZeroRow;
341 }
342 }
343} // namespace (anonymous)
344
345void
346Lapack128::
347LARF (const char side,
348 const int m,
349 const int n,
350 const __float128 v[],
351 const int incv,
352 const __float128 tau,
353 __float128 C[],
354 const int ldc,
355 __float128 work[]) const
356{
357 const __float128 zero = 0.0;
358 const __float128 one = 1.0;
360 const bool applyLeft = (side == 'L');
361 int lastv = 0;
362 int lastc = 0;
363 int i = 0;
364
365 if (tau != zero) {
366 // Set up variables for scanning V. LASTV begins pointing to the end of V.
367 if (applyLeft) {
368 lastv = m;
369 } else {
370 lastv = n;
371 }
372 if (incv > 0) {
373 i = 1 + (lastv - 1) * incv;
374 } else {
375 i = 1;
376 }
377 // Look for the last non-zero row in V.
378 while (lastv > 0 && v[i-1] == zero) {
379 lastv = lastv - 1;
380 i = i - incv;
381 }
382 if (applyLeft) {
383 // Scan for the last non-zero column in C(1:lastv,:).
384 lastc = ILADLC (lastv, n, C, ldc);
385 } else {
386 // Scan for the last non-zero row in C(:,1:lastv).
387 lastc = ILADLR (m, lastv, C, ldc);
388 }
389 }
390
391 // Note that lastc == 0 renders the BLAS operations null; no special
392 // case is needed at this level.
393 if (applyLeft) {
394 // Form H * C
395 if (lastv > 0) {
396 // w(1:lastc,1) := C(1:lastv,1:lastc)**T * v(1:lastv,1)
397 blas.GEMV (Teuchos::TRANS, lastv, lastc, one, C, ldc, v, incv,
398 zero, work, 1);
399 // C(1:lastv,1:lastc) := C(...) - v(1:lastv,1) * w(1:lastc,1)**T
400 blas.GER (lastv, lastc, -tau, v, incv, work, 1, C, ldc);
401 }
402 }
403 else {
404 // Form C * H
405 if (lastv > 0) {
406 // w(1:lastc,1) := C(1:lastc,1:lastv) * v(1:lastv,1)
407 blas.GEMV (Teuchos::NO_TRANS, lastc, lastv, one, C, ldc,
408 v, incv, zero, work, 1);
409 // C(1:lastc,1:lastv) := C(...) - w(1:lastc,1) * v(1:lastv,1)**T
410 blas.GER (lastc, lastv, -tau, work, 1, v, incv, C, ldc);
411 }
412 }
413}
414
415
416void
417Lapack128::
418LARFG (const int N, __float128* const ALPHA,
419 __float128 X[], const int INCX, __float128* const TAU) const
420{
421 // This is actually LARFGP.
422
423 const __float128 zero = 0.0;
424 const __float128 one = 1.0;
425 const __float128 two = 2.0;
427
428 if (N <= 0) {
429 *TAU = zero;
430 return;
431 }
432 __float128 xnorm = blas.NRM2 (N-1, X, INCX);
433
434 if (xnorm == zero) {
435 // H = [+/-1, 0; I], sign chosen so *ALPHA >= 0
436 if (*ALPHA >= zero) {
437 // When TAU.eq.ZERO, the vector is special-cased to be all zeros
438 // in the application routines. We do not need to clear it.
439 *TAU = zero;
440 } else {
441 // However, the application routines rely on explicit
442 // zero checks when TAU.ne.ZERO, and we must clear X.
443 *TAU = two;
444 for (int j = 0; j < N; ++j) {
445 X[j * INCX] = 0.0;
446 }
447 *ALPHA = -*ALPHA;
448 }
449 } else { // general case (norm of x is nonzero)
450 // This implements Fortran's two-argument SIGN intrinsic.
451 __float128 beta = copysignq (LAPY2 (*ALPHA, xnorm), *ALPHA);
452 const __float128 smlnum = FLT128_MIN / FLT128_EPSILON;
453 int knt = 0;
454
455 if (fabsq (beta) < smlnum) {
456 // XNORM, BETA may be inaccurate; scale X and recompute them
457
458 __float128 bignum = one / smlnum;
459 do {
460 knt = knt + 1;
461 blas.SCAL (N-1, bignum, X, INCX);
462 beta = beta*bignum;
463 *ALPHA = *ALPHA*bignum;
464 } while (fabsq(beta) < smlnum);
465
466 // New BETA is at most 1, at least SMLNUM
467 xnorm = blas.NRM2 (N-1, X, INCX);
468 beta = copysignq (LAPY2 (*ALPHA, xnorm), *ALPHA);
469 }
470
471 __float128 savealpha = *ALPHA;
472 *ALPHA = *ALPHA + beta;
473 if (beta < zero) {
474 beta = -beta;
475 *TAU = -*ALPHA / beta;
476 } else {
477 *ALPHA = xnorm * (xnorm / *ALPHA);
478 *TAU = *ALPHA / beta;
479 *ALPHA = -*ALPHA;
480 }
481
482 if (fabsq (*TAU) <= smlnum) {
483 // In the case where the computed TAU ends up being a
484 // denormalized number, it loses relative accuracy. This is a
485 // BIG problem. Solution: flush TAU to ZERO. This explains the
486 // next IF statement.
487 //
488 // (Bug report provided by Pat Quillen from MathWorks on Jul 29,
489 // 2009.) (Thanks Pat. Thanks MathWorks.)
490
491 if (savealpha >= zero) {
492 *TAU = zero;
493 } else {
494 *TAU = two;
495 for (int j = 0; j < N; ++j) {
496 X[j*INCX] = 0.0;
497 }
498 beta = -savealpha;
499 }
500 }
501 else { // this is the general case
502 blas.SCAL (N-1, one / *ALPHA, X, INCX);
503 }
504 // If BETA is subnormal, it may lose relative accuracy
505 for (int j = 1; j <= knt; ++j) {
506 beta = beta*smlnum;
507 }
508 *ALPHA = beta;
509 }
510}
511
512void
513Lapack128::
514GEQR2 (const int /* M */,
515 const int /* N */,
516 __float128 /* A */ [],
517 const int /* LDA */,
518 __float128 /* TAU */ [],
519 __float128 /* WORK */ [],
520 int* const /* INFO */ ) const
521{
523 (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GEQR2: Not implemented yet.");
524}
525
526void
527Lapack128::
528GEQRF (const int M,
529 const int N,
530 __float128 A[],
531 const int LDA,
532 __float128 TAU[],
533 __float128 WORK[],
534 const int LWORK,
535 int* const INFO) const
536{
537 // mfh 17 Sep 2015: We don't implement a BLAS 3 QR factorization for
538 // __float128. Instead, we call the BLAS 2 QR factorization GEQR2,
539 // which has a fixed minimum WORK array length of N. Thus, we have
540 // to roll our own LWORK query here.
541
542 if (LWORK == -1) {
543 WORK[0] = static_cast<__float128> (N);
544 }
545 else {
546 GEQR2 (M, N, A, LDA, TAU, WORK, INFO);
547 }
548}
549
550void
551Lapack128::
552ORGQR (const int /* M */,
553 const int /* N */,
554 const int /* K */,
555 __float128 /* A */ [],
556 const int /* LDA */,
557 const __float128 /* TAU */ [],
558 __float128 /* WORK */ [],
559 const int /* LWORK */,
560 int* const /* INFO */) const
561{
563 (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GEQR2: Not implemented yet.");
564}
565
566void
567Lapack128::
568UNGQR (const int /* M */,
569 const int /* N */,
570 const int /* K */,
571 __float128 /* A */ [],
572 const int /* LDA */,
573 const __float128 /* TAU */ [],
574 __float128 /* WORK */ [],
575 const int /* LWORK */,
576 int* const /* INFO */) const
577{
579 (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GEQR2: Not implemented yet.");
580}
581
582void
583Lapack128::
584LASCL (const char TYPE,
585 const int kl,
586 const int ku,
587 const __float128 cfrom,
588 const __float128 cto,
589 const int m,
590 const int n,
591 __float128* A,
592 const int lda,
593 int* info) const
594{
596 (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::LASCL: Not implemented yet.");
597}
598
599void
600Lapack128::
601GBTRF (const int m,
602 const int n,
603 const int kl,
604 const int ku,
605 __float128* A,
606 const int lda,
607 int* IPIV,
608 int* info) const
609{
611 (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GBTRF: Not implemented yet.");
612}
613
614void
615Lapack128::
616GBTRS (const char TRANS,
617 const int n,
618 const int kl,
619 const int ku,
620 const int nrhs,
621 const __float128* A,
622 const int lda,
623 const int* IPIV,
624 __float128* B,
625 const int ldb,
626 int* info) const
627{
629 (true, std::logic_error, "Teuchos::LAPACK<int, __float128>::GBTRS: Not implemented yet.");
630}
631
632} // namespace Details
633} // namespace Teuchos
634#endif // HAVE_TEUCHOSCORE_QUADMATH
Templated interface class to BLAS routines.
Declaration and definition of Teuchos::Details::Lapack128, a partial implementation of Teuchos::LAPAC...
Smart reference counting pointer class for automatic garbage collection.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Macro for throwing an exception with breakpointing to ease debugging.
Namespace of implementation details.
The Teuchos namespace contains all of the classes, structs and enums used by Teuchos,...