| 38 | | |
| 39 | | if ( !(M && N) ) return 0; |
| 40 | | typedef typename Field::Element elt; |
| 41 | | elt mone,zero,one;; |
| 42 | | F.init (one, 1.0); |
| 43 | | F.neg(mone, one); |
| 44 | | F.init (zero, 0.0); |
| 45 | | elt * Aini = A; |
| 46 | | elt * Acurr; |
| 47 | | size_t rowp = 0; |
| 48 | | size_t colp; |
| 49 | | size_t R = 0; |
| 50 | | size_t k = 0; |
| 51 | | size_t delay =0; |
| 52 | | size_t kmax = DotProdBound (F, 0, one) -1; // the max number of delayed operations |
| 53 | | while ((rowp<M) && (k<N)){ |
| 54 | | |
| 55 | | //Find non zero pivot |
| 56 | | colp = k; |
| 57 | | Acurr = Aini; |
| 58 | | while ((F.isZero(*Acurr)) || (F.isZero (F.init (*Acurr, *Acurr)))) |
| 59 | | if (++colp == N){ |
| 60 | | if (rowp==M-1) |
| 61 | | break; |
| 62 | | colp=k; ++rowp; |
| 63 | | Acurr = Aini += lda; |
| 64 | | } |
| 65 | | else |
| 66 | | ++Acurr; |
| 67 | | |
| 68 | | if ((rowp == M-1)&&(colp == N)) |
| 69 | | break; |
| 70 | | R++; |
| 71 | | P[k] = colp; |
| 72 | | Q[k] = rowp; |
| 73 | | |
| 74 | | // Permutation of the pivot column |
| 75 | | fswap (F, M, A+k, lda, A + colp , lda); |
| 76 | | |
| 77 | | //Normalization |
| 78 | | elt invpiv; |
| 79 | | F.init(*Aini,*Aini); |
| 80 | | F.inv (invpiv,*Aini); |
| 81 | | |
| 82 | | for (size_t j=1; j<N-k; ++j) |
| 83 | | if (!F.isZero(*(Aini+j))) |
| 84 | | F.init(*(Aini+j), *(Aini+j)); |
| 85 | | for (size_t i=lda; i<(M-rowp)*lda; i+=lda) |
| 86 | | if (!F.isZero(*(Aini+i))) |
| 87 | | F.init(*(Aini+i), *(Aini+i)); |
| 88 | | |
| 89 | | |
| 90 | | if (Diag == FflasUnit) { |
| | 39 | return callLUdivine_small <AreEqual<typename Field::Element, double>::value> () |
| | 40 | (F, Diag, M, N, A, lda, P, Q, LuTag); |
| | 41 | } |
| | 42 | template<> |
| | 43 | class FFPACK::callLUdivine_small<false>{ |
| | 44 | public: |
| | 45 | template <class Field> |
| | 46 | inline size_t |
| | 47 | operator()( const Field& F, const enum FFLAS_DIAG Diag, |
| | 48 | const size_t M, const size_t N, |
| | 49 | typename Field::Element * A, const size_t lda, size_t*P, |
| | 50 | size_t *Q, const enum FFPACK_LUDIVINE_TAG LuTag){ |
| | 51 | |
| | 52 | if ( !(M && N) ) return 0; |
| | 53 | typedef typename Field::Element elt; |
| | 54 | elt mone,zero,one;; |
| | 55 | F.init (one, 1.0); |
| | 56 | F.neg(mone, one); |
| | 57 | F.init (zero, 0.0); |
| | 58 | elt * Aini = A; |
| | 59 | elt * Acurr; |
| | 60 | size_t rowp = 0; |
| | 61 | size_t colp; |
| | 62 | size_t R = 0; |
| | 63 | size_t k = 0; |
| | 64 | size_t kmax = DotProdBound (F, 0, one) -1; // the max number of delayed operations |
| | 65 | while ((rowp<M) && (k<N)){ |
| | 66 | |
| | 67 | //Find non zero pivot |
| | 68 | colp = k; |
| | 69 | Acurr = Aini; |
| | 70 | while ((F.isZero(*Acurr)) || (F.isZero (F.init (*Acurr, *Acurr)))) |
| | 71 | if (++colp == N){ |
| | 72 | if (rowp==M-1) |
| | 73 | break; |
| | 74 | colp=k; ++rowp; |
| | 75 | Acurr = Aini += lda; |
| | 76 | } |
| | 77 | else |
| | 78 | ++Acurr; |
| | 79 | |
| | 80 | if ((rowp == M-1)&&(colp == N)) |
| | 81 | break; |
| | 82 | R++; |
| | 83 | P[k] = colp; |
| | 84 | Q[k] = rowp; |
| | 85 | |
| | 86 | // Permutation of the pivot column |
| | 87 | fswap (F, M, A+k, lda, A + colp , lda); |
| | 88 | |
| | 89 | //Normalization |
| | 90 | elt invpiv; |
| | 91 | F.init(*Aini,*Aini); |
| | 92 | F.inv (invpiv,*Aini); |
| | 93 | |
| 97 | | F.mulin (*(Aini+i),invpiv); |
| 98 | | |
| 99 | | if (delay++ >= kmax){ // Reduction has to be done |
| 100 | | delay = 0; |
| | 99 | F.init(*(Aini+i), *(Aini+i)); |
| | 100 | |
| | 101 | |
| | 102 | if (Diag == FflasUnit) { |
| | 103 | for (size_t j=1; j<N-k; ++j) |
| | 104 | if (!F.isZero(*(Aini+j))) |
| | 105 | F.mulin (*(Aini+j),invpiv); |
| | 106 | } else |
| | 107 | for (size_t i=lda; i<(M-rowp)*lda; i+=lda) |
| | 108 | if (!F.isZero(*(Aini+i))) |
| | 109 | F.mulin (*(Aini+i),invpiv); |
| | 110 | |
| | 111 | //Elimination |
| | 112 | //Or equivalently, but without delayed ops : |
| | 113 | fger (F, M-rowp-1, N-k-1, mone, Aini+lda, lda, Aini+1, 1, Aini+(lda+1), lda); |
| | 114 | |
| | 115 | Aini += lda+1; ++rowp; ++k; |
| | 116 | } |
| | 117 | |
| | 118 | // Compression the U matrix |
| | 119 | size_t l; |
| | 120 | if (Diag == FflasNonUnit){ |
| | 121 | Aini = A; |
| | 122 | l = N; |
| | 123 | } else { |
| | 124 | Aini = A+1; |
| | 125 | l=N-1; |
| | 126 | } |
| | 127 | for (size_t i=0; i<R; ++i, Aini += lda+1) { |
| | 128 | if (Q[i] > i){ |
| | 129 | fcopy (F, l-i, Aini, 1, Aini+(Q[i]-i)*lda, 1); |
| | 130 | for (size_t j=0; j<l-i; ++j) |
| | 131 | F.assign (*(Aini+(Q[i]-i)*lda+j), zero); |
| | 132 | } |
| | 133 | } |
| | 134 | return R; |
| | 135 | } |
| | 136 | }; |
| | 137 | template<> |
| | 138 | class FFPACK::callLUdivine_small<true>{ |
| | 139 | public: |
| | 140 | template <class Field> |
| | 141 | inline size_t |
| | 142 | operator()( const Field& F, const enum FFLAS_DIAG Diag, |
| | 143 | const size_t M, const size_t N, |
| | 144 | typename Field::Element * A, const size_t lda, size_t*P, |
| | 145 | size_t *Q, const enum FFPACK_LUDIVINE_TAG LuTag){ |
| | 146 | |
| | 147 | if ( !(M && N) ) return 0; |
| | 148 | typedef typename Field::Element elt; |
| | 149 | elt mone,zero,one;; |
| | 150 | F.init (one, 1.0); |
| | 151 | F.neg(mone, one); |
| | 152 | F.init (zero, 0.0); |
| | 153 | elt * Aini = A; |
| | 154 | elt * Acurr; |
| | 155 | size_t rowp = 0; |
| | 156 | size_t colp; |
| | 157 | size_t R = 0; |
| | 158 | size_t k = 0; |
| | 159 | size_t delay =0; |
| | 160 | size_t kmax = DotProdBound (F, 0, one) -1; // the max number of delayed operations |
| | 161 | while ((rowp<M) && (k<N)){ |
| | 162 | |
| | 163 | //Find non zero pivot |
| | 164 | colp = k; |
| | 165 | Acurr = Aini; |
| | 166 | while ((F.isZero(*Acurr)) || (F.isZero (F.init (*Acurr, *Acurr)))) |
| | 167 | if (++colp == N){ |
| | 168 | if (rowp==M-1) |
| | 169 | break; |
| | 170 | colp=k; ++rowp; |
| | 171 | Acurr = Aini += lda; |
| | 172 | } |
| | 173 | else |
| | 174 | ++Acurr; |
| | 175 | |
| | 176 | if ((rowp == M-1)&&(colp == N)) |
| | 177 | break; |
| | 178 | R++; |
| | 179 | P[k] = colp; |
| | 180 | Q[k] = rowp; |
| | 181 | |
| | 182 | // Permutation of the pivot column |
| | 183 | fswap (F, M, A+k, lda, A + colp , lda); |
| | 184 | |
| | 185 | //Normalization |
| | 186 | elt invpiv; |
| | 187 | F.init(*Aini,*Aini); |
| | 188 | F.inv (invpiv,*Aini); |
| | 189 | |
| | 190 | for (size_t j=1; j<N-k; ++j) |
| | 191 | if (!F.isZero(*(Aini+j))) |
| | 192 | F.init(*(Aini+j), *(Aini+j)); |
| | 193 | for (size_t i=lda; i<(M-rowp)*lda; i+=lda) |
| | 194 | if (!F.isZero(*(Aini+i))) |
| | 195 | F.init(*(Aini+i), *(Aini+i)); |
| | 196 | |
| | 197 | |
| | 198 | if (Diag == FflasUnit) { |
| | 199 | for (size_t j=1; j<N-k; ++j) |
| | 200 | if (!F.isZero(*(Aini+j))) |
| | 201 | F.mulin (*(Aini+j),invpiv); |
| | 202 | } else |
| | 203 | for (size_t i=lda; i<(M-rowp)*lda; i+=lda) |
| | 204 | if (!F.isZero(*(Aini+i))) |
| | 205 | F.mulin (*(Aini+i),invpiv); |
| | 206 | |
| | 207 | if (delay++ >= kmax){ // Reduction has to be done |
| | 208 | delay = 0; |
| | 209 | for (size_t i=1; i<M-rowp; ++i) |
| | 210 | for (size_t j=1; j<N-k; ++j) |
| | 211 | F.init( *(Aini+i*lda+j),*(Aini+i*lda+j)); |
| | 212 | } |
| | 213 | //Elimination |
| 103 | | F.init( *(Aini+i*lda+j),*(Aini+i*lda+j)); |
| 104 | | } |
| 105 | | //Elimination |
| 106 | | for (size_t i=1; i<M-rowp; ++i) |
| 107 | | for (size_t j=1; j<N-k; ++j) |
| 108 | | *(Aini+i*lda+j) -= *(Aini+i*lda) * *(Aini+j); |
| 109 | | //Or equivalently, but without delayed ops : |
| 110 | | //fger (F, M-rowp-1, N-k-1, mone, Aini+lda, lda, Aini+1, 1, Aini+(lda+1), lda); |
| 111 | | |
| 112 | | Aini += lda+1; ++rowp; ++k; |
| | 216 | *(Aini+i*lda+j) -= *(Aini+i*lda) * *(Aini+j); |
| | 217 | //Or equivalently, but without delayed ops : |
| | 218 | //fger (F, M-rowp-1, N-k-1, mone, Aini+lda, lda, Aini+1, 1, Aini+(lda+1), lda); |
| | 219 | |
| | 220 | Aini += lda+1; ++rowp; ++k; |
| | 221 | } |
| | 222 | |
| | 223 | // Compression the U matrix |
| | 224 | size_t l; |
| | 225 | if (Diag == FflasNonUnit){ |
| | 226 | Aini = A; |
| | 227 | l = N; |
| | 228 | } else { |
| | 229 | Aini = A+1; |
| | 230 | l=N-1; |
| | 231 | } |
| | 232 | for (size_t i=0; i<R; ++i, Aini += lda+1) { |
| | 233 | if (Q[i] > i){ |
| | 234 | fcopy (F, l-i, Aini, 1, Aini+(Q[i]-i)*lda, 1); |
| | 235 | for (size_t j=0; j<l-i; ++j) |
| | 236 | F.assign (*(Aini+(Q[i]-i)*lda+j), zero); |
| | 237 | } |
| | 238 | } |
| | 239 | return R; |