| 488 | | if (ta == FflasTrans) { |
| 489 | | A21 = A + mr; |
| 490 | | A12 = A + kr*lda; |
| 491 | | A22 = A12 + mr; |
| 492 | | imaxa = kr; |
| 493 | | ldx2 = jmaxa = mr; |
| | 487 | |
| | 488 | if (F.isZero(beta)){ |
| | 489 | size_t x1rd = MAX(nr,kr); |
| | 490 | size_t ldx1; |
| | 491 | if (ta == FflasTrans) { |
| | 492 | A21 = A + mr; |
| | 493 | A12 = A + kr*lda; |
| | 494 | A22 = A12 + mr; |
| | 495 | imaxa = kr; |
| | 496 | jmaxa = mr; |
| | 497 | ldx1 = mr; |
| | 498 | } else { |
| | 499 | A12 = A + kr; |
| | 500 | A21 = A + mr*lda; |
| | 501 | A22 = A21 + kr; |
| | 502 | imaxa = mr; |
| | 503 | jmaxa = kr; |
| | 504 | ldx1 = x1rd; |
| | 505 | } |
| | 506 | if (tb == FflasTrans) { |
| | 507 | B21 = B + kr; |
| | 508 | B12 = B + nr*ldb; |
| | 509 | B22 = B12 + kr; |
| | 510 | imaxb = nr; |
| | 511 | jmaxb = kr; |
| | 512 | ldx2 = kr; |
| | 513 | } else { |
| | 514 | B12 = B + nr; |
| | 515 | B21 = B + kr*ldb; |
| | 516 | B22 = B21 + nr; |
| | 517 | imaxb = kr; |
| | 518 | ldx2 = jmaxb = nr; |
| | 519 | } |
| | 520 | |
| | 521 | |
| | 522 | // Two temporary submatrices are required |
| | 523 | typename Field::Element* X1 = new typename Field::Element[mr*x1rd]; |
| | 524 | typename Field::Element* X2 = new typename Field::Element[kr*nr]; |
| | 525 | |
| | 526 | // T3 = B22 - B12 in X2 |
| | 527 | d12 = B12; d22 = B22; dx2 = X2; |
| | 528 | for (size_t i=0; i < imaxb; ++i, d12+=ldb, d22+=ldb, dx2+=ldx2) { |
| | 529 | for (size_t j=0;j < jmaxb;++j) |
| | 530 | F.sub (*(dx2+j), *(d22 + j), *(d12 + j)); |
| | 531 | } |
| | 532 | |
| | 533 | // S3 = A11 - A21 in X1 |
| | 534 | d11 = A11; d21 = A21; dx1 = X1; |
| | 535 | for (size_t i = 0; i < imaxa; ++i, d11 += lda, d21 += lda, dx1 += ldx1) |
| | 536 | for (size_t j = 0; j < jmaxa; ++j) |
| | 537 | F.sub (*(dx1+j), *(d11 + j), *(d21 + j)); |
| | 538 | |
| | 539 | // P7 = alpha . S3 * T3 in C21 |
| | 540 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X1, ldx1, X2, ldx2, zero, C21, ldc, kmax, w-1, base); |
| | 541 | |
| | 542 | // T1 = B12 - B11 in X2 |
| | 543 | // fgemsub (F, imaxb, jmaxb, X2, ldx2, B12, ldb, B11, ldb); |
| | 544 | |
| | 545 | d11 = B11; d12 = B12; dx2 = X2; |
| | 546 | for (size_t i = 0; i < imaxb; ++i, d11 += ldb, d12 += ldb, dx2 += ldx2) { |
| | 547 | for (size_t j = 0; j < jmaxb; ++j) |
| | 548 | F.sub (*(dx2 + j), *(d12 + j), *(d11 + j)); |
| | 549 | } |
| | 550 | |
| | 551 | // S1 = A21 + A22 in X1 |
| | 552 | |
| | 553 | d21 = A21; d22 = A22; dx1 = X1; |
| | 554 | for (size_t i = 0; i < imaxa; ++i, d21+=lda, d22+=lda, dx1+=ldx1) { |
| | 555 | for (size_t j=0;j < jmaxa;++j) |
| | 556 | F.add(*(dx1+j),* (d21 + j),*(d22 + j)); |
| | 557 | } |
| | 558 | |
| | 559 | // P5 = alpha . S1*T1 in C22 |
| | 560 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X1, ldx1, X2, ldx2, zero, C22, ldc, kmax, w-1, base); |
| | 561 | |
| | 562 | // T2 = B22 - T1 in X2 |
| | 563 | d22 = B22; dx2 = X2; |
| | 564 | for (size_t i = 0; i < imaxb; ++i, d22+=ldb, dx2+=ldx2) { |
| | 565 | for (size_t j = 0; j < jmaxb; ++j) |
| | 566 | F.sub (*(dx2+j), *(d22 + j), *(dx2+j)); |
| | 567 | } |
| | 568 | |
| | 569 | // S2 = S1 - A11 in X1 |
| | 570 | d11 = A11; dx1 = X1; |
| | 571 | for (size_t i = 0; i < imaxa; ++i, d11+=lda, dx1+=ldx1) { |
| | 572 | for (size_t j = 0; j < jmaxa; ++j) |
| | 573 | F.subin (*(dx1+j), *(d11 + j)); |
| | 574 | } |
| | 575 | //write_field(F, cerr<<"S2 = "<<endl, X1, imaxa, jmaxa, ldx1); |
| | 576 | |
| | 577 | // P6 = alpha . S2 * T2 in C12 |
| | 578 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X1, ldx1, X2, ldx2, zero, C12, ldc, kmax, w-1, base); |
| | 579 | |
| | 580 | // S4 = A12 -S2 in X1 |
| | 581 | d12 = A12; dx1 = X1; |
| | 582 | for (size_t i = 0; i < imaxa; ++i, d12 += lda, dx1 += ldx1) { |
| | 583 | for (size_t j = 0; j < jmaxa; ++j) |
| | 584 | F.sub (*(dx1+j), *(d12 + j), *(dx1+j)); |
| | 585 | } |
| | 586 | //write_field(F, cerr<<"S4 = "<<endl, X1, imaxa, jmaxa, ldx1); |
| | 587 | |
| | 588 | // P3 = alpha . S4*B22 in C11 |
| | 589 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X1, ldx1, B22, ldb, zero, C11, ldc, kmax, w-1, base); |
| | 590 | //write_field(F, cerr<<"P3 = "<<endl, C11, mr, nr, ldc); |
| | 591 | |
| | 592 | // P1 = alpha . A11 * B11 in X1 |
| | 593 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, zero, X1, nr, kmax, w-1, base); |
| | 594 | |
| | 595 | |
| | 596 | |
| | 597 | // U2 = P1 + P6 in tmpU2 and |
| | 598 | // U3 = P7 + U2 in tmpU3 and |
| | 599 | // U7 = P5 + U3 in C22 and |
| | 600 | // U4 = P5 + U2 in C12 and |
| | 601 | d12c = C12; dx1=X1; d21c = C21; d22c = C22; |
| | 602 | for (size_t i = 0; i < mr; |
| | 603 | ++i, d12c += ldc, dx1 += nr, d22c+=ldc, d21c += ldc) { |
| | 604 | for (size_t j=0;j < nr;++j) { |
| | 605 | F.addin ( *(d12c + j), *(dx1 + j)); // U2 = P1 + P6 |
| | 606 | F.addin ( *(d21c+j), *(d12c+j)); // U3 = U2 + P7 |
| | 607 | F.addin (*(d12c + j), *(d22c+j)); // U4 = P5 + U2 in C12 |
| | 608 | F.addin (*(d22c + j), *(d21c+j)); // U7 = P5 + U3 in C22 |
| | 609 | } |
| | 610 | } |
| | 611 | |
| | 612 | // U5 = P3 + U4 in C12 |
| | 613 | d12c = C12; d11 = C11; |
| | 614 | for (size_t i = 0; i < mr; ++i, d12c += ldc, d11 += ldc) |
| | 615 | for (size_t j = 0; j < nr; ++j) |
| | 616 | F.addin (*(d12c + j), *(d11 + j)); |
| | 617 | // T4 = T2 - B21 in X2 |
| | 618 | d21 = B21;dx2=X2; |
| | 619 | for (size_t i = 0; i < imaxb; ++i, d21+=ldb, dx2+=ldx2) { |
| | 620 | for (size_t j = 0; j < jmaxb; ++j) |
| | 621 | F.subin (*(dx2+j),* (d21 + j)); |
| | 622 | } |
| | 623 | //write_field(F, cerr<<"T4 = "<<endl, X2, imaxb, jmaxb, ldx2); |
| | 624 | |
| | 625 | // P4 = alpha . A22 * T4 in C11 |
| | 626 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A22, lda, X2, ldx2, zero, C11, ldc, kmax, w-1, base); |
| | 627 | //write_field(F, cerr<<"P4 = "<<endl, C11, mr, nr, ldc); |
| | 628 | |
| | 629 | // U6 = U3 - P4 in C21 |
| | 630 | d21c = C21; d11c = C11; |
| | 631 | for (size_t i = 0; i < mr; ++i, d21c += ldc, d11c += ldc) |
| | 632 | for (size_t j = 0; j < nr; ++j) |
| | 633 | F.subin (*(d21c + j), *(d11c + j)); |
| | 634 | |
| | 635 | // P2 = alpha . A12 * B21 in C11 |
| | 636 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, zero, C11, ldc, kmax,w-1, base); |
| | 637 | |
| | 638 | // U1 = P2 + P1 in C11 |
| | 639 | d11c = C11; dx1 = X1; |
| | 640 | for (size_t i = 0; i < mr; ++i, d11c += ldc, dx1 += nr) |
| | 641 | for (size_t j = 0; j < nr; ++j) |
| | 642 | F.addin (*(d11c + j), *(dx1 + j)); |
| | 643 | |
| | 644 | delete[] X1; |
| | 645 | delete[] X2; |
| | 646 | |
| 529 | | } |
| 530 | | |
| 531 | | // S3 = A11 - A21 in X2 |
| 532 | | d11 = A11; d21 = A21; dx2 = X2; |
| 533 | | for (size_t i = 0; i < imaxa; ++i, d11 += lda, d21 += lda, dx2 += ldx2) |
| 534 | | for (size_t j = 0; j < jmaxa; ++j) |
| 535 | | F.sub (*(dx2+j), *(d11 + j), *(d21 + j)); |
| 536 | | |
| 537 | | if (!F.isZero(beta)) { |
| | 690 | |
| | 691 | // T1 = B12 - B11 in X3 |
| | 692 | d11 = B11; d12 = B12; dx3 = X3; |
| | 693 | for (size_t i = 0; i < imaxb; ++i, d11 += ldb, d12 += ldb, dx3 += ldx3) { |
| | 694 | for (size_t j = 0; j < jmaxb; ++j) |
| | 695 | F.sub (*(dx3 + j), *(d12 + j), *(d11 + j)); |
| | 696 | } |
| | 697 | |
| | 698 | // S1 = A21 + A22 in X2 |
| | 699 | d21 = A21; d22 = A22; dx2 = X2; |
| | 700 | for (size_t i = 0; i < imaxa; ++i, d21+=lda, d22+=lda, dx2+=ldx2) { |
| | 701 | for (size_t j=0;j < jmaxa;++j) |
| | 702 | F.add(*(dx2+j),* (d21 + j),*(d22 + j)); |
| | 703 | } |
| | 704 | |
| | 705 | // P5 = alpha . S1*T1 + beta . C12 in C12 |
| | 706 | //WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, beta, C12, ldc, kmax, w-1,base); |
| | 707 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, zero, X1, nr, kmax, w-1,base); |
| | 708 | |
| | 709 | // C22 = P5 + beta C22 in C22 |
| | 710 | d22c = C22; dx1 = X1; |
| | 711 | for (size_t i = 0; i < mr; ++i, dx1 += nr, d22c += ldc) |
| | 712 | for (size_t j=0;j < nr;++j) { |
| | 713 | F.mulin (*(d22c + j), beta); |
| | 714 | F.addin (*(d22c + j), *(dx1 + j)); |
| | 715 | } |
| | 716 | |
| | 717 | // C12 = P5 + beta C12 in C12 |
| | 718 | dx1 = X1; d12c = C12; |
| | 719 | for (size_t i = 0; i < mr; ++i, d12c += ldc, dx1 += nr) |
| | 720 | for (size_t j=0;j < nr;++j) { |
| | 721 | F.mulin (*(d12c + j), beta); |
| | 722 | F.addin (*(d12c + j), *(dx1 + j)); |
| | 723 | } |
| | 724 | |
| | 725 | // P1 = alpha . A11 * B11 in X1 |
| | 726 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, zero, X1, nr, kmax, w-1,base); |
| | 727 | |
| | 728 | |
| | 729 | // P2 = alpha . A12 * B21 + beta . C11 in C11 |
| | 730 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, beta, C11, ldc, kmax,w-1,base); |
| | 731 | |
| | 732 | // U1 = P2 + P1 in C11 |
| | 733 | d11c = C11; dx1 = X1; |
| | 734 | for (size_t i = 0; i < mr; ++i, d11c += ldc, dx1 += nr) |
| | 735 | for (size_t j = 0; j < nr; ++j) |
| | 736 | F.addin (*(d11c + j), *(dx1 + j)); |
| | 737 | |
| | 738 | // T2 = B22 - T1 in X3 |
| | 739 | d22 = B22; dx3 = X3; |
| | 740 | for (size_t i = 0; i < imaxb; ++i, d22+=ldb, dx3+=ldx3) { |
| | 741 | for (size_t j = 0; j < jmaxb; ++j) |
| | 742 | F.sub (*(dx3+j), *(d22 + j), *(dx3+j)); |
| | 743 | } |
| | 744 | |
| | 745 | // S2 = S1 - A11 in X2 |
| | 746 | d11 = A11; dx2 = X2; |
| | 747 | for (size_t i = 0; i < imaxa; ++i, d11+=lda, dx2+=ldx2) { |
| | 748 | for (size_t j = 0; j < jmaxa; ++j) |
| | 749 | F.subin (*(dx2+j), *(d11 + j)); |
| | 750 | } |
| | 751 | |
| | 752 | // U2 = P6 + P1 = alpha . S2 * T2 + P1 in X1 |
| | 753 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, one, X1, nr, kmax, w-1,base); |
| | 754 | |
| | 755 | |
| | 756 | |
| | 757 | |
| | 758 | // U4 = U2 + P5 in C12 |
| | 759 | d12c = C12; dx1 = X1; |
| | 760 | for (size_t i = 0; i < mr; ++i, d12c += ldc, dx1 += nr) |
| | 761 | for (size_t j=0;j < nr;++j) |
| | 762 | F.addin (*(d12c + j), *(dx1 + j)); |
| | 763 | |
| | 764 | // T4 = T2 - B21 in X3 |
| | 765 | d21 = B21;dx3=X3; |
| | 766 | for (size_t i = 0; i < imaxb; ++i, d21+=ldb, dx3+=ldx3) { |
| | 767 | for (size_t j = 0; j < jmaxb; ++j) |
| | 768 | F.subin (*(dx3+j),* (d21 + j)); |
| | 769 | } |
| | 770 | |
| | 771 | // S4 = A12 -S2 in X2 |
| | 772 | d12 = A12; dx2 = X2; |
| | 773 | for (size_t i = 0; i < imaxa; ++i, d12 += lda, dx2 += ldx2) { |
| | 774 | for (size_t j = 0; j < jmaxa; ++j) |
| | 775 | F.sub (*(dx2+j), *(d12 + j), *(dx2+j)); |
| | 776 | } |
| | 777 | |
| | 778 | // P4 = alpha . A22 * T4 - beta . C21 in C21 |
| | 779 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A22, lda, X3, ldx3, mbeta, C21, ldc, kmax, w-1,base); |
| | 780 | |
| | 781 | // U5 = P3 + U4 = alpha . S4*B22 + U4 in C12 |
| | 782 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, B22, ldb, one, C12, ldc, kmax, w-1,base); |
| | 783 | |
| | 784 | // T3 = B22 - B12 in X3 |
| | 785 | d12 = B12; d22 = B22; dx3 = X3; |
| | 786 | for (size_t i=0; i < imaxb; ++i, d12+=ldb, d22+=ldb, dx3+=ldx3) |
| | 787 | for (size_t j=0;j < jmaxb;++j) |
| | 788 | F.sub (*(dx3+j), *(d22 + j), *(d12 + j)); |
| | 789 | |
| | 790 | // S3 = A11 - A21 in X2 |
| | 791 | d11 = A11; d21 = A21; dx2 = X2; |
| | 792 | for (size_t i = 0; i < imaxa; ++i, d11 += lda, d21 += lda, dx2 += ldx2) |
| | 793 | for (size_t j = 0; j < jmaxa; ++j) |
| | 794 | F.sub (*(dx2+j), *(d11 + j), *(d21 + j)); |
| | 795 | |
| | 796 | // U3 = P7 + U2 = alpha . S3 * T3 + U2 in X1 |
| | 797 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, one, X1, nr, kmax, w-1,base); |
| | 798 | |
| | 799 | // U7 = U3 + C22 in C22 |
| | 800 | d22c = C22; dx1 = X1; d12c = C12; |
| | 801 | for (size_t i = 0; i < mr; ++i, d22c += ldc, dx1 += nr) |
| | 802 | for (size_t j = 0; j < nr; ++j) |
| | 803 | F.addin (*(d22c + j), *(dx1 + j)); |
| | 804 | |
| | 805 | // U6 = U3 - P4 in C21 |
| | 806 | dx1 = X1; d21c = C21; |
| | 807 | for (size_t i = 0; i < mr; ++i, dx1 += nr, d21c += ldc) |
| | 808 | for (size_t j=0;j < nr;++j) |
| | 809 | F.sub (*(d21c + j), *(dx1 + j),* (d21c + j)); |
| | 810 | #else |
| | 811 | // P2 = alpha . A12 * B21 + beta . C11 in C11 |
| | 812 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A12, lda, B21, ldb, beta, C11, ldc, kmax,w-1,base); |
| | 813 | |
| | 814 | // T3 = B22 - B12 in X3 |
| | 815 | d12 = B12; d22 = B22; dx3 = X3; |
| | 816 | for (size_t i=0; i < imaxb; ++i, d12+=ldb, d22+=ldb, dx3+=ldx3) { |
| | 817 | for (size_t j=0;j < jmaxb;++j) |
| | 818 | F.sub (*(dx3+j), *(d22 + j), *(d12 + j)); |
| | 819 | |
| | 820 | } |
| | 821 | |
| | 822 | // S3 = A11 - A21 in X2 |
| | 823 | d11 = A11; d21 = A21; dx2 = X2; |
| | 824 | for (size_t i = 0; i < imaxa; ++i, d11 += lda, d21 += lda, dx2 += ldx2) |
| | 825 | for (size_t j = 0; j < jmaxa; ++j) |
| | 826 | F.sub (*(dx2+j), *(d11 + j), *(d21 + j)); |
| | 827 | |
| | 841 | |
| | 842 | // P7 = alpha . S3 * T3 + beta . C22 in C22 |
| | 843 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, beta, C22, ldc, kmax, w-1,base); |
| | 844 | |
| | 845 | // T1 = B12 - B11 in X3 |
| | 846 | d11 = B11; d12 = B12; dx3 = X3; |
| | 847 | for (size_t i = 0; i < imaxb; ++i, d11 += ldb, d12 += ldb, dx3 += ldx3) { |
| | 848 | for (size_t j = 0; j < jmaxb; ++j) |
| | 849 | F.sub (*(dx3 + j), *(d12 + j), *(d11 + j)); |
| | 850 | } |
| | 851 | |
| | 852 | // S1 = A21 + A22 in X2 |
| | 853 | d21 = A21; d22 = A22; dx2 = X2; |
| | 854 | for (size_t i = 0; i < imaxa; ++i, d21+=lda, d22+=lda, dx2+=ldx2) { |
| | 855 | for (size_t j=0;j < jmaxa;++j) |
| | 856 | F.add(*(dx2+j),* (d21 + j),*(d22 + j)); |
| | 857 | } |
| | 858 | |
| | 859 | // P5 = alpha . S1*T1 + beta . C12 in C12 |
| | 860 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, beta, C12, ldc, kmax, w-1,base); |
| | 861 | |
| | 862 | // T2 = B22 - T1 in X3 |
| | 863 | d22 = B22; dx3 = X3; |
| | 864 | for (size_t i = 0; i < imaxb; ++i, d22+=ldb, dx3+=ldx3) { |
| | 865 | for (size_t j = 0; j < jmaxb; ++j) |
| | 866 | F.sub (*(dx3+j), *(d22 + j), *(dx3+j)); |
| | 867 | } |
| | 868 | |
| | 869 | // S2 = S1 - A11 in X2 |
| | 870 | d11 = A11; dx2 = X2; |
| | 871 | for (size_t i = 0; i < imaxa; ++i, d11+=lda, dx2+=ldx2) { |
| | 872 | for (size_t j = 0; j < jmaxa; ++j) |
| | 873 | F.subin (*(dx2+j), *(d11 + j)); |
| | 874 | } |
| | 875 | |
| | 876 | // P6 = alpha . S2 * T2 in X1 |
| | 877 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, zero, X1, nr, kmax, w-1,base); |
| | 878 | |
| | 879 | // T4 = T2 - B21 in X3 |
| | 880 | d21 = B21;dx3=X3; |
| | 881 | for (size_t i = 0; i < imaxb; ++i, d21+=ldb, dx3+=ldx3) { |
| | 882 | for (size_t j = 0; j < jmaxb; ++j) |
| | 883 | F.subin (*(dx3+j),* (d21 + j)); |
| | 884 | } |
| | 885 | |
| | 886 | // S4 = A12 -S2 in X2 |
| | 887 | d12 = A12; dx2 = X2; |
| | 888 | for (size_t i = 0; i < imaxa; ++i, d12 += lda, dx2 += ldx2) { |
| | 889 | for (size_t j = 0; j < jmaxa; ++j) |
| | 890 | F.sub (*(dx2+j), *(d12 + j), *(dx2+j)); |
| | 891 | } |
| | 892 | |
| | 893 | // P4 = alpha . A22 * T4 - beta . C21 in C21 |
| | 894 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A22, lda, X3, ldx3, mbeta, C21, ldc, kmax, w-1,base); |
| | 895 | |
| | 896 | // P1 = alpha . A11 * B11 in X3 |
| | 897 | WinoMain (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, zero, X3, nr, kmax, w-1,base); |
| | 898 | |
| | 899 | // U1 = P2 + P1 in C11 |
| | 900 | d11c = C11; dx3 = X3; |
| | 901 | for (size_t i = 0; i < mr; ++i, d11c += ldc, dx3 += nr) |
| | 902 | for (size_t j = 0; j < nr; ++j) |
| | 903 | F.addin (*(d11c + j), *(dx3 + j)); |
| | 904 | |
| | 905 | // U2 = P1 + P6 in tmpU2 and |
| | 906 | // U3 = P7 + U2 in tmpU3 and |
| | 907 | // U7 = P5 + U3 in C22 and |
| | 908 | // U4 = P5 + U2 in C12 and |
| | 909 | // U6 = U3 - P4 in C21 and |
| | 910 | typename Field::Element tmpU2, tmpU3; |
| | 911 | d12c = C12; dx1=X1; dx3=X3; d21c = C21; d22c = C22; |
| | 912 | for (size_t i = 0; i < mr; |
| | 913 | ++i, d12c += ldc, dx1 += nr, dx3 += nr, d22c+=ldc, d21c += ldc) { |
| | 914 | for (size_t j=0;j < nr;++j) { |
| | 915 | F.add (tmpU2, *(dx3 + j), *(dx1 + j)); // temporary U2 = P1 + P6 |
| | 916 | F.add (tmpU3, tmpU2, *(d22c + j)); // temporary U3 = U2 + P7 |
| | 917 | F.add (*(d22c + j), *(d12c + j), tmpU3); // U7 = P5 + U3 in C22 |
| | 918 | F.addin (*(d12c + j), tmpU2); // U4 = P5 + U2 in C12 |
| | 919 | F.sub (*(d21c + j), tmpU3, *(d21c + j)); // U6 = U3 - P4 in C21 |
| | 920 | } |
| | 921 | } |
| | 922 | // P3 = alpha . S4*B22 in X1 |
| | 923 | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, B22, ldb, one, C12, ldc, kmax, w-1,base); |
| | 924 | |
| | 925 | // U5 = P3 + U4 in C12 |
| | 926 | // d12c = C12; dx1 = X1; |
| | 927 | // for (size_t i = 0; i < mr; ++i, d12c += ldc, dx1 += nr) |
| | 928 | // for (size_t j = 0; j < nr; ++j) |
| | 929 | // F.addin (*(d12c + j), *(dx1 + j)); |
| | 930 | #endif |
| | 931 | delete[] X1; |
| | 932 | delete[] X2; |
| | 933 | delete[] X3; |
| 552 | | |
| 553 | | // P7 = alpha . S3 * T3 + beta . C22 in C22 |
| 554 | | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, beta, C22, ldc, kmax, w-1,base); |
| 555 | | |
| 556 | | // T1 = B12 - B11 in X3 |
| 557 | | d11 = B11; d12 = B12; dx3 = X3; |
| 558 | | for (size_t i = 0; i < imaxb; ++i, d11 += ldb, d12 += ldb, dx3 += ldx3) { |
| 559 | | for (size_t j = 0; j < jmaxb; ++j) |
| 560 | | F.sub (*(dx3 + j), *(d12 + j), *(d11 + j)); |
| 561 | | } |
| 562 | | |
| 563 | | // S1 = A21 + A22 in X2 |
| 564 | | d21 = A21; d22 = A22; dx2 = X2; |
| 565 | | for (size_t i = 0; i < imaxa; ++i, d21+=lda, d22+=lda, dx2+=ldx2) { |
| 566 | | for (size_t j=0;j < jmaxa;++j) |
| 567 | | F.add(*(dx2+j),* (d21 + j),*(d22 + j)); |
| 568 | | } |
| 569 | | |
| 570 | | // P5 = alpha . S1*T1 + beta . C12 in C12 |
| 571 | | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, beta, C12, ldc, kmax, w-1,base); |
| 572 | | |
| 573 | | // T2 = B22 - T1 in X3 |
| 574 | | d22 = B22; dx3 = X3; |
| 575 | | for (size_t i = 0; i < imaxb; ++i, d22+=ldb, dx3+=ldx3) { |
| 576 | | for (size_t j = 0; j < jmaxb; ++j) |
| 577 | | F.sub (*(dx3+j), *(d22 + j), *(dx3+j)); |
| 578 | | } |
| 579 | | |
| 580 | | // S2 = S1 - A11 in X2 |
| 581 | | d11 = A11; dx2 = X2; |
| 582 | | for (size_t i = 0; i < imaxa; ++i, d11+=lda, dx2+=ldx2) { |
| 583 | | for (size_t j = 0; j < jmaxa; ++j) |
| 584 | | F.subin (*(dx2+j), *(d11 + j)); |
| 585 | | } |
| 586 | | |
| 587 | | // P6 = alpha . S2 * T2 in X1 |
| 588 | | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, X3, ldx3, zero, X1, nr, kmax, w-1,base); |
| 589 | | |
| 590 | | // T4 = T2 - B21 in X3 |
| 591 | | d21 = B21;dx3=X3; |
| 592 | | for (size_t i = 0; i < imaxb; ++i, d21+=ldb, dx3+=ldx3) { |
| 593 | | for (size_t j = 0; j < jmaxb; ++j) |
| 594 | | F.subin (*(dx3+j),* (d21 + j)); |
| 595 | | } |
| 596 | | |
| 597 | | // S4 = A12 -S2 in X2 |
| 598 | | d12 = A12; dx2 = X2; |
| 599 | | for (size_t i = 0; i < imaxa; ++i, d12 += lda, dx2 += ldx2) { |
| 600 | | for (size_t j = 0; j < jmaxa; ++j) |
| 601 | | F.sub (*(dx2+j), *(d12 + j), *(dx2+j)); |
| 602 | | } |
| 603 | | |
| 604 | | // P4 = alpha . A22 * T4 - beta . C21 in C21 |
| 605 | | WinoMain (F, ta, tb, mr, nr, kr, alpha, A22, lda, X3, ldx3, mbeta, C21, ldc, kmax, w-1,base); |
| 606 | | |
| 607 | | // P1 = alpha . A11 * B11 in X3 |
| 608 | | WinoMain (F, ta, tb, mr, nr, kr, alpha, A11, lda, B11, ldb, zero, X3, nr, kmax, w-1,base); |
| 609 | | |
| 610 | | // U1 = P2 + P1 in C11 |
| 611 | | d11c = C11; dx3 = X3; |
| 612 | | for (size_t i = 0; i < mr; ++i, d11c += ldc, dx3 += nr) |
| 613 | | for (size_t j = 0; j < nr; ++j) |
| 614 | | F.addin (*(d11c + j), *(dx3 + j)); |
| 615 | | |
| 616 | | // U2 = P1 + P6 in tmpU2 and |
| 617 | | // U3 = P7 + U2 in tmpU3 and |
| 618 | | // U7 = P5 + U3 in C22 and |
| 619 | | // U4 = P5 + U2 in C12 and |
| 620 | | // U6 = U3 - P4 in C21 and |
| 621 | | d12c = C12; dx1=X1; dx3=X3; d21c = C21; d22c = C22; |
| 622 | | for (size_t i = 0; i < mr; |
| 623 | | ++i, d12c += ldc, dx1 += nr, dx3 += nr, d22c+=ldc, d21c += ldc) { |
| 624 | | for (size_t j=0;j < nr;++j) { |
| 625 | | F.add (tmpU2, *(dx3 + j), *(dx1 + j)); // temporary U2 = P1 + P6 |
| 626 | | F.add (tmpU3, tmpU2, *(d22c + j)); // temporary U3 = U2 + P7 |
| 627 | | F.add (*(d22c + j), *(d12c + j), tmpU3); // U7 = P5 + U3 in C22 |
| 628 | | F.addin (*(d12c + j), tmpU2); // U4 = P5 + U2 in C12 |
| 629 | | F.sub (*(d21c + j), tmpU3, *(d21c + j)); // U6 = U3 - P4 in C21 |
| 630 | | } |
| 631 | | } |
| 632 | | // P3 = alpha . S4*B22 in X1 |
| 633 | | WinoMain (F, ta, tb, mr, nr, kr, alpha, X2, ldx2, B22, ldb, zero, X1, nr, kmax, w-1,base); |
| 634 | | |
| 635 | |   |