35#include "KnyazevLOBPCG.h"
38KnyazevLOBPCG::KnyazevLOBPCG(
const Epetra_Comm &_Comm,
const Epetra_Operator *KK,
39 const Epetra_Operator *PP,
40 double _tol,
int _maxIter,
int _verb) :
52 maxIterEigenSolve(_maxIter),
81KnyazevLOBPCG::KnyazevLOBPCG(
const Epetra_Comm &_Comm,
const Epetra_Operator *KK,
82 const Epetra_Operator *MM,
const Epetra_Operator *PP,
83 double _tol,
int _maxIter,
int _verb,
96 maxIterEigenSolve(_maxIter),
112 timeLocalUpdate(0.0),
125KnyazevLOBPCG::~KnyazevLOBPCG() {
135int KnyazevLOBPCG::solve(
int numEigen, Epetra_MultiVector &Q,
double *lambda) {
137 return KnyazevLOBPCG::reSolve(numEigen, Q, lambda);
142int KnyazevLOBPCG::reSolve(
int numEigen, Epetra_MultiVector &Q,
double *lambda,
int startingEV) {
197 if (numEigen <= startingEV) {
201 int info = myVerify.inputArguments(numEigen, K, M, Prec, Q, numEigen);
205 int myPid = MyComm.MyPID();
208 Epetra_Vector *vectWeight = 0;
210 vectWeight =
new Epetra_Vector(View, Q.Map(), normWeight);
213 int knownEV = startingEV;
214 int localVerbose = verbose*(myPid==0);
231 int xr = Q.MyLength();
232 Epetra_MultiVector X(View, Q, 0, numEigen);
234 blockSize = numEigen;
237 tmp = (M == 0) ? 6*numEigen*xr : 9*numEigen*xr;
239 double *work1 =
new (nothrow)
double[tmp];
246 memRequested +=
sizeof(double)*tmp/(1024.0*1024.0);
248 highMem = (highMem > currentSize()) ? highMem : currentSize();
250 double *tmpD = work1;
252 Epetra_MultiVector KX(View, Q.Map(), tmpD, xr, numEigen);
253 tmpD = tmpD + xr*numEigen;
255 Epetra_MultiVector MX(View, Q.Map(), (M) ? tmpD : X.Values(), xr, numEigen);
256 tmpD = (M) ? tmpD + xr*numEigen : tmpD;
258 Epetra_MultiVector R(View, Q.Map(), tmpD, xr, numEigen);
259 tmpD = tmpD + xr*numEigen;
261 Epetra_MultiVector H(View, Q.Map(), tmpD, xr, numEigen);
262 tmpD = tmpD + xr*numEigen;
264 Epetra_MultiVector KH(View, Q.Map(), tmpD, xr, numEigen);
265 tmpD = tmpD + xr*numEigen;
267 Epetra_MultiVector MH(View, Q.Map(), (M) ? tmpD : H.Values(), xr, numEigen);
268 tmpD = (M) ? tmpD + xr*numEigen : tmpD;
270 Epetra_MultiVector P(View, Q.Map(), tmpD, xr, numEigen);
271 tmpD = tmpD + xr*numEigen;
273 Epetra_MultiVector KP(View, Q.Map(), tmpD, xr, numEigen);
274 tmpD = tmpD + xr*numEigen;
276 Epetra_MultiVector MP(View, Q.Map(), (M) ? tmpD : P.Values(), xr, numEigen);
278 if (startingEV > 0) {
280 Epetra_MultiVector copyX(View, X, 0, startingEV);
281 Epetra_MultiVector copyKX(View, KX, 0, startingEV);
282 timeStifOp -= MyWatch.WallTime();
283 K->Apply(copyX, copyKX);
284 timeStifOp += MyWatch.WallTime();
285 stifOp += startingEV;
286 timeMassOp -= MyWatch.WallTime();
288 Epetra_MultiVector copyMX(View, MX, 0, startingEV);
289 M->Apply(copyX, copyMX);
291 timeMassOp += MyWatch.WallTime();
292 massOp += startingEV;
306 lwork2 = 2*numEigen + 27*numEigen*numEigen;
307 double *work2 =
new (nothrow)
double[lwork2];
316 highMem = (highMem > currentSize()) ? highMem : currentSize();
320 double *theta = tmpD;
321 tmpD = tmpD + numEigen;
323 double *normR = tmpD;
324 tmpD = tmpD + numEigen;
327 tmpD = tmpD + 9*numEigen*numEigen;
330 tmpD = tmpD + 9*numEigen*numEigen;
334 memRequested +=
sizeof(double)*lwork2/(1024.0*1024.0);
337 if (localVerbose > 2) {
338 resHistory =
new (nothrow)
double[maxIterEigenSolve*numEigen];
339 if (resHistory == 0) {
352 bool reStart =
false;
356 int firstIndex = knownEV;
359 if (localVerbose > 0) {
361 cout <<
" *|* Problem: ";
363 cout <<
"K*Q = M*Q D ";
365 cout <<
"K*Q = Q D ";
367 cout <<
" with preconditioner";
369 cout <<
" *|* Algorithm = LOBPCG (Knyazev's version)" << endl;
370 cout <<
" *|* Size of blocks = " << blockSize << endl;
371 cout <<
" *|* Number of requested eigenvalues = " << numEigen << endl;
373 cout.setf(ios::scientific, ios::floatfield);
374 cout <<
" *|* Tolerance for convergence = " << tolEigenSolve << endl;
375 cout <<
" *|* Norm used for convergence: ";
377 cout <<
"weighted L2-norm with user-provided weights" << endl;
379 cout <<
"L^2-norm" << endl;
381 cout <<
" *|* Input converged eigenvectors = " << startingEV << endl;
382 cout <<
"\n -- Start iterations -- \n";
385 timeOuterLoop -= MyWatch.WallTime();
386 for (outerIter = 1; outerIter <= maxIterEigenSolve; ++outerIter) {
388 highMem = (highMem > currentSize()) ? highMem : currentSize();
390 int workingSize = numEigen - knownEV;
392 Epetra_MultiVector XI(View, X, firstIndex, workingSize);
393 Epetra_MultiVector MXI(View, MX, firstIndex, workingSize);
394 Epetra_MultiVector KXI(View, KX, firstIndex, workingSize);
396 Epetra_MultiVector HI(View, H, firstIndex, workingSize);
397 Epetra_MultiVector MHI(View, MH, firstIndex, workingSize);
398 Epetra_MultiVector KHI(View, KH, firstIndex, workingSize);
400 Epetra_MultiVector PI(View, P, firstIndex, workingSize);
401 Epetra_MultiVector MPI(View, MP, firstIndex, workingSize);
402 Epetra_MultiVector KPI(View, KP, firstIndex, workingSize);
404 Epetra_MultiVector RI(View, R, firstIndex, workingSize);
406 if ((outerIter == 1) || (reStart ==
true)) {
409 localSize = numEigen;
412 timeMassOp -= MyWatch.WallTime();
415 timeMassOp += MyWatch.WallTime();
416 massOp += workingSize;
419 timeStifOp -= MyWatch.WallTime();
421 timeStifOp += MyWatch.WallTime();
422 stifOp += workingSize;
429 timePrecOp -= MyWatch.WallTime();
430 Prec->ApplyInverse(RI, HI);
431 timePrecOp += MyWatch.WallTime();
432 precOp += workingSize;
435 memcpy(HI.Values(), RI.Values(), xr*workingSize*
sizeof(
double));
439 timeMassOp -= MyWatch.WallTime();
442 timeMassOp += MyWatch.WallTime();
443 massOp += workingSize;
446 timeStifOp -= MyWatch.WallTime();
448 timeStifOp += MyWatch.WallTime();
449 stifOp += workingSize;
451 if (localSize == numEigen)
452 localSize += workingSize;
458 timeLocalProj -= MyWatch.WallTime();
459 modalTool.localProjection(numEigen, numEigen, xr, X.Values(), xr, KX.Values(), xr,
461 modalTool.localProjection(numEigen, numEigen, xr, X.Values(), xr, MX.Values(), xr,
463 if (localSize > numEigen) {
464 double *pointer = KK + numEigen*localSize;
465 modalTool.localProjection(numEigen, workingSize, xr, X.Values(), xr, KHI.Values(), xr,
466 pointer, localSize, S);
467 modalTool.localProjection(workingSize, workingSize, xr, HI.Values(), xr, KHI.Values(), xr,
468 pointer + numEigen, localSize, S);
469 pointer = MM + numEigen*localSize;
470 modalTool.localProjection(numEigen, workingSize, xr, X.Values(), xr, MHI.Values(), xr,
471 pointer, localSize, S);
472 modalTool.localProjection(workingSize, workingSize, xr, HI.Values(), xr, MHI.Values(), xr,
473 pointer + numEigen, localSize, S);
474 if (localSize > numEigen + workingSize) {
475 pointer = KK + (numEigen + workingSize)*localSize;
476 modalTool.localProjection(numEigen, workingSize, xr, X.Values(), xr, KPI.Values(), xr,
477 pointer, localSize, S);
478 modalTool.localProjection(workingSize, workingSize, xr, HI.Values(), xr, KPI.Values(), xr,
479 pointer + numEigen, localSize, S);
480 modalTool.localProjection(workingSize, workingSize, xr, PI.Values(), xr, KPI.Values(), xr,
481 pointer + numEigen + workingSize, localSize, S);
482 pointer = MM + (numEigen + workingSize)*localSize;
483 modalTool.localProjection(numEigen, workingSize, xr, X.Values(), xr, MPI.Values(), xr,
484 pointer, localSize, S);
485 modalTool.localProjection(workingSize, workingSize, xr, HI.Values(), xr, MPI.Values(), xr,
486 pointer + numEigen, localSize, S);
487 modalTool.localProjection(workingSize, workingSize, xr, PI.Values(), xr, MPI.Values(), xr,
488 pointer + numEigen + workingSize, localSize, S);
491 timeLocalProj += MyWatch.WallTime();
494 timeLocalSolve -= MyWatch.WallTime();
495 int nevLocal = localSize;
496 info = modalTool.directSolver(localSize, KK, localSize, MM, localSize, nevLocal,
497 S, localSize, theta, localVerbose,
498 (blockSize == 1) ? 1 : 0);
499 timeLocalSolve += MyWatch.WallTime();
507 if ((theta[0] < 0.0) || (nevLocal < numEigen)) {
508 if (localVerbose > 0) {
509 cout <<
" Iteration " << outerIter;
510 cout <<
" - Failure for spectral decomposition - RESTART with new random search\n";
512 if (workingSize == 1) {
516 Epetra_MultiVector Xinit(View, XI, 1, workingSize-1);
525 if ((localSize == numEigen+workingSize) && (nevLocal == numEigen)) {
526 for (j = 0; j < nevLocal; ++j)
527 memcpy(S+j*numEigen, S+j*localSize, numEigen*
sizeof(
double));
528 localSize = numEigen;
531 if ((localSize == numEigen+2*workingSize) && (nevLocal <= numEigen+workingSize)) {
532 for (j = 0; j < nevLocal; ++j)
533 memcpy(S+j*(numEigen+workingSize), S+j*localSize, (numEigen+workingSize)*
sizeof(
double));
534 localSize = numEigen + workingSize;
539 timeLocalUpdate -= MyWatch.WallTime();
541 memcpy(R.Values(), X.Values(), xr*numEigen*
sizeof(
double));
542 callBLAS.GEMM(
'N',
'N', xr, numEigen, numEigen, 1.0, R.Values(), xr, S, localSize,
543 0.0, X.Values(), xr);
545 memcpy(R.Values(), MX.Values(), xr*numEigen*
sizeof(
double));
546 callBLAS.GEMM(
'N',
'N', xr, numEigen, numEigen, 1.0, R.Values(), xr, S, localSize,
547 0.0, MX.Values(), xr);
549 memcpy(R.Values(), KX.Values(), xr*numEigen*
sizeof(
double));
550 callBLAS.GEMM(
'N',
'N', xr, numEigen, numEigen, 1.0, R.Values(), xr, S, localSize,
551 0.0, KX.Values(), xr);
553 if (localSize == numEigen + workingSize) {
554 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, HI.Values(), xr,
555 S + numEigen, localSize, 0.0, P.Values(), xr);
557 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, MHI.Values(), xr,
558 S + numEigen, localSize, 0.0, MP.Values(), xr);
560 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, KHI.Values(), xr,
561 S + numEigen, localSize, 0.0, KP.Values(), xr);
564 if (localSize > numEigen + workingSize) {
565 memcpy(RI.Values(), PI.Values(), xr*workingSize*
sizeof(
double));
566 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, HI.Values(), xr,
567 S + numEigen, localSize, 0.0, P.Values(), xr);
568 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, RI.Values(), xr,
569 S + numEigen + workingSize, localSize, 1.0, P.Values(), xr);
571 memcpy(RI.Values(), MPI.Values(), xr*workingSize*
sizeof(
double));
572 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, MHI.Values(), xr,
573 S + numEigen, localSize, 0.0, MP.Values(), xr);
574 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, RI.Values(), xr,
575 S + numEigen + workingSize, localSize, 1.0, MP.Values(), xr);
577 memcpy(RI.Values(), KPI.Values(), xr*workingSize*
sizeof(
double));
578 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, KHI.Values(), xr,
579 S + numEigen, localSize, 0.0, KP.Values(), xr);
580 callBLAS.GEMM(
'N',
'N', xr, numEigen, workingSize, 1.0, RI.Values(), xr,
581 S + numEigen + workingSize, localSize, 1.0, KP.Values(), xr);
584 if (localSize > numEigen) {
585 callBLAS.AXPY(xr*numEigen, 1.0, P.Values(), X.Values());
587 callBLAS.AXPY(xr*numEigen, 1.0, MP.Values(), MX.Values());
588 callBLAS.AXPY(xr*numEigen, 1.0, KP.Values(), KX.Values());
590 timeLocalUpdate += MyWatch.WallTime();
593 timeResidual -= MyWatch.WallTime();
594 memcpy(R.Values(), KX.Values(), xr*numEigen*
sizeof(
double));
595 for (j = 0; j < numEigen; ++j) {
596 callBLAS.AXPY(xr, -theta[j], MX.Values() + j*xr, R.Values() + j*xr);
598 timeResidual += MyWatch.WallTime();
599 residual += numEigen;
602 timeNorm -= MyWatch.WallTime();
604 R.NormWeighted(*vectWeight, normR);
610 for (j = 0; j < numEigen; ++j) {
611 normR[j] = (theta[j] == 0.0) ? normR[j] : normR[j]/theta[j];
613 timeNorm += MyWatch.WallTime();
617 accuracyCheck(&X, &MX, &R);
620 if (localSize == numEigen + workingSize)
621 localSize += workingSize;
624 timeNorm -= MyWatch.WallTime();
626 for (i=0; i < numEigen; ++i) {
627 if (normR[i] < tolEigenSolve) {
628 lambda[i] = theta[i];
632 timeNorm += MyWatch.WallTime();
635 if (localVerbose > 2) {
636 memcpy(resHistory + historyCount, normR, numEigen*
sizeof(
double));
637 historyCount += numEigen;
641 if (localVerbose > 0) {
642 cout <<
" Iteration " << outerIter;
643 cout <<
" - Number of converged eigenvectors " << knownEV << endl;
646 if (localVerbose > 1) {
649 cout.setf(ios::scientific, ios::floatfield);
650 for (i=0; i<numEigen; ++i) {
651 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
652 cout <<
" = " << normR[i] << endl;
656 for (i=0; i<numEigen; ++i) {
657 cout <<
" Iteration " << outerIter <<
" - Ritz eigenvalue " << i;
658 cout.setf((fabs(theta[i]) < 0.01) ? ios::scientific : ios::fixed, ios::floatfield);
659 cout <<
" = " << theta[i] << endl;
665 if (knownEV >= numEigen) {
666 if (localVerbose == 1) {
669 cout.setf(ios::scientific, ios::floatfield);
670 for (i=0; i<numEigen; ++i) {
671 cout <<
" Iteration " << outerIter <<
" - Scaled Norm of Residual " << i;
672 cout <<
" = " << normR[i] << endl;
680 if ((knownEV > 0) && (localSize > numEigen)) {
681 if (localSize == numEigen + workingSize)
682 localSize = 2*numEigen - knownEV;
683 if (localSize == numEigen + 2*workingSize)
684 localSize = 3*numEigen - 2*knownEV;
689 for (i=0; i<numEigen; ++i) {
690 if (normR[i] >= tolEigenSolve) {
697 if (firstIndex == numEigen-1)
700 while (firstIndex < knownEV) {
702 for (j = firstIndex; j < numEigen; ++j) {
703 if (normR[j] < tolEigenSolve) {
704 callFortran.SWAP(1, normR + j, 1, normR + firstIndex, 1);
705 callFortran.SWAP(xr, X.Values() + j*xr, 1, X.Values() + firstIndex*xr, 1);
706 callFortran.SWAP(xr, KX.Values() + j*xr, 1, KX.Values() + firstIndex*xr, 1);
707 callFortran.SWAP(xr, R.Values() + j*xr, 1, R.Values() + firstIndex*xr, 1);
708 callFortran.SWAP(xr, H.Values() + j*xr, 1, H.Values() + firstIndex*xr, 1);
709 callFortran.SWAP(xr, KH.Values() + j*xr, 1, KH.Values() + firstIndex*xr, 1);
710 callFortran.SWAP(xr, P.Values() + j*xr, 1, P.Values() + firstIndex*xr, 1);
711 callFortran.SWAP(xr, KP.Values() + j*xr, 1, KP.Values() + firstIndex*xr, 1);
713 callFortran.SWAP(xr, MX.Values() + j*xr, 1, MX.Values() + firstIndex*xr, 1);
714 callFortran.SWAP(xr, MH.Values() + j*xr, 1, MH.Values() + firstIndex*xr, 1);
715 callFortran.SWAP(xr, MP.Values() + j*xr, 1, MP.Values() + firstIndex*xr, 1);
721 for (i = firstIndex; i < numEigen; ++i) {
722 if (normR[i] >= tolEigenSolve) {
731 timeOuterLoop += MyWatch.WallTime();
732 highMem = (highMem > currentSize()) ? highMem : currentSize();
741 timePostProce -= MyWatch.WallTime();
742 if ((info == 0) && (knownEV > 0)) {
743 mySort.sortScalars_Vectors(knownEV, lambda, Q.Values(), Q.MyLength());
745 timePostProce += MyWatch.WallTime();
747 return (info == 0) ? knownEV : info;
752void KnyazevLOBPCG::accuracyCheck(
const Epetra_MultiVector *X,
const Epetra_MultiVector *MX,
753 const Epetra_MultiVector *R)
const {
756 cout.setf(ios::scientific, ios::floatfield);
759 int myPid = MyComm.MyPID();
764 tmp = myVerify.errorEquality(X, MX, M);
766 cout <<
" >> Difference between MX and M*X = " << tmp << endl;
768 tmp = myVerify.errorOrthonormality(X, M);
770 cout <<
" >> Error in X^T M X - I = " << tmp << endl;
773 tmp = myVerify.errorOrthonormality(X, 0);
775 cout <<
" >> Error in X^T X - I = " << tmp << endl;
780 tmp = myVerify.errorOrthogonality(X, R);
782 cout <<
" >> Orthogonality X^T R up to " << tmp << endl;
788void KnyazevLOBPCG::initializeCounters() {
807 timeLocalSolve = 0.0;
808 timeLocalUpdate = 0.0;
821void KnyazevLOBPCG::algorithmInfo()
const {
823 int myPid = MyComm.MyPID();
826 cout <<
" Algorithm: LOBPCG (Knyazev's version) with Cholesky-based local eigensolver\n";
827 cout <<
" Block Size: " << blockSize << endl;
833void KnyazevLOBPCG::historyInfo()
const {
837 cout <<
" Block Size: " << blockSize << endl;
839 cout <<
" Residuals\n";
842 cout.setf(ios::scientific, ios::floatfield);
843 for (j = 0; j < historyCount; ++j) {
844 cout << resHistory[j] << endl;
852void KnyazevLOBPCG::memoryInfo()
const {
854 int myPid = MyComm.MyPID();
856 double maxHighMem = 0.0;
857 double tmp = highMem;
858 MyComm.MaxAll(&tmp, &maxHighMem, 1);
860 double maxMemRequested = 0.0;
862 MyComm.MaxAll(&tmp, &maxMemRequested, 1);
866 cout.setf(ios::fixed, ios::floatfield);
867 cout <<
" Memory requested per processor by the eigensolver = (EST) ";
869 cout << maxMemRequested <<
" MB " << endl;
871 cout <<
" High water mark in eigensolver = (EST) ";
873 cout << maxHighMem <<
" MB " << endl;
880void KnyazevLOBPCG::operationInfo()
const {
882 int myPid = MyComm.MyPID();
885 cout <<
" --- Operations ---\n\n";
886 cout <<
" Total number of mass matrix multiplications = ";
888 cout << massOp << endl;
889 cout <<
" Total number of stiffness matrix operations = ";
891 cout << stifOp << endl;
892 cout <<
" Total number of preconditioner applications = ";
894 cout << precOp << endl;
895 cout <<
" Total number of computed eigen-residuals = ";
897 cout << residual << endl;
899 cout <<
" Total number of outer iterations = ";
901 cout << outerIter << endl;
902 cout <<
" Number of restarts = ";
904 cout << numRestart << endl;
911void KnyazevLOBPCG::timeInfo()
const {
913 int myPid = MyComm.MyPID();
916 cout <<
" --- Timings ---\n\n";
917 cout.setf(ios::fixed, ios::floatfield);
919 cout <<
" Total time for outer loop = ";
921 cout << timeOuterLoop <<
" s" << endl;
922 cout <<
" Time for mass matrix operations = ";
924 cout << timeMassOp <<
" s ";
926 cout << 100*timeMassOp/timeOuterLoop <<
" %\n";
927 cout <<
" Time for stiffness matrix operations = ";
929 cout << timeStifOp <<
" s ";
931 cout << 100*timeStifOp/timeOuterLoop <<
" %\n";
932 cout <<
" Time for preconditioner applications = ";
934 cout << timePrecOp <<
" s ";
936 cout << 100*timePrecOp/timeOuterLoop <<
" %\n";
937 cout <<
" Time for local projection = ";
939 cout << timeLocalProj <<
" s ";
941 cout << 100*timeLocalProj/timeOuterLoop <<
" %\n";
942 cout <<
" Time for local eigensolve = ";
944 cout << timeLocalSolve <<
" s ";
946 cout << 100*timeLocalSolve/timeOuterLoop <<
" %\n";
947 cout <<
" Time for local update = ";
949 cout << timeLocalUpdate <<
" s ";
951 cout << 100*timeLocalUpdate/timeOuterLoop <<
" %\n";
952 cout <<
" Time for residual computations = ";
954 cout << timeResidual <<
" s ";
956 cout << 100*timeResidual/timeOuterLoop <<
" %\n";
957 cout <<
" Time for residuals norm computations = ";
959 cout << timeNorm <<
" s ";
961 cout << 100*timeNorm/timeOuterLoop <<
" %\n";
963 cout <<
" Total time for post-processing = ";
965 cout << timePostProce <<
" s\n";