| 17 | | /* From M Abshoff */ |
| 18 | | #if defined(__sun) |
| 19 | | #define lround(x) my_lround(x) |
| 20 | | static long my_lround(double x) |
| 21 | | { |
| 22 | | return (long) ((x) >= 0 ? (x) + 0.5 : (x) - 0.5); |
| 23 | | } |
| 24 | | #endif |
| 25 | | |
| 26 | | //--------------------------------------------------------------------- |
| 27 | | // DotProdBound : |
| 28 | | // Computes the maximal dimension k so that the product A*B+beta.C over Z, |
| 29 | | // where A is m*k and B is k*n can be performed correctly with w Winograd |
| 30 | | // recursion levels on the number of bits of the floating point mantissa |
| 31 | | //--------------------------------------------------------------------- |
| 32 | | template <class Field> |
| 33 | | inline size_t FFLAS::DotProdBoundCompute (const Field& F, const size_t w, |
| 34 | | const typename Field::Element& beta, |
| 35 | | const FFLAS_BASE base){ |
| | 17 | /** |
| | 18 | * MatMulParameters |
| | 19 | * |
| | 20 | * \brief Computes the threshold parameters for the cascade |
| | 21 | * Matmul algorithm |
| | 22 | * |
| | 23 | * |
| | 24 | * \param F Finite Field/Ring of the computation. |
| | 25 | * \param k Common dimension of A and B, in the product A x B |
| | 26 | * \param bet Computing AB + beta C |
| | 27 | * \param delayedDim Returns the size of blocks that can be multiplied |
| | 28 | * over Z with no overflow |
| | 29 | * \param base Returns the type of BLAS representation to use |
| | 30 | * \param winoRecLevel Returns the number of recursion levels of |
| | 31 | * Strassen-Winograd's algorithm to perform |
| | 32 | * \param winoLevelProvided tells whether the user forced the number of |
| | 33 | * recursive level of Winograd's algorithm |
| | 34 | */ |
| | 35 | template <class Field> |
| | 36 | inline void FFLAS::MatMulParameters (const Field& F, |
| | 37 | const size_t k, |
| | 38 | const typename Field::Element& beta, |
| | 39 | size_t& delayedDim, |
| | 40 | FFLAS_BASE& base, |
| | 41 | size_t& winoRecLevel, |
| | 42 | bool winoLevelProvided) { |
| | 43 | |
| | 44 | // Strategy : determine Winograd's recursion first, then choose appropriate |
| | 45 | // floating point representation, and finally the blocking dimension. |
| | 46 | // Can be improved for some cases. |
| | 47 | |
| | 48 | if (!winoLevelProvided) |
| | 49 | winoRecLevel = WinoSteps (k); |
| | 50 | base = BaseCompute (F, winoRecLevel); |
| | 51 | delayedDim = DotProdBound (F, winoRecLevel, beta, base); |
| | 52 | |
| | 53 | size_t n = k; |
| | 54 | size_t winoDel = winoRecLevel; |
| | 55 | |
| | 56 | // Computes the delayedDim, only depending on the recursive levels |
| | 57 | // that must be performed over Z |
| | 58 | while (winoDel > 0 && delayedDim < n) { |
| | 59 | winoDel--; |
| | 60 | delayedDim = DotProdBound (F, winoDel, beta, base); |
| | 61 | n >>= 1; |
| | 62 | } |
| | 63 | delayedDim = MIN (n, delayedDim); |
| | 64 | } |
| | 65 | |
| | 66 | /** |
| | 67 | * DotProdBound |
| | 68 | * |
| | 69 | * \brief computes the maximal size for delaying the modular reduction |
| | 70 | * in a dotproduct |
| | 71 | * |
| | 72 | * This is the default version assuming a conversion to a positive modular representation |
| | 73 | * |
| | 74 | * \param F Finite Field/Ring of the computation |
| | 75 | * \param winoRecLevel Number of recusrive Strassen-Winograd levels (if any, 0 otherwise) |
| | 76 | * \param beta Computing AB + beta C |
| | 77 | * \param base Type of floating point representation for delayed modular computations |
| | 78 | * |
| | 79 | */ |
| | 80 | template <class Field> |
| | 81 | inline size_t FFLAS::DotProdBound (const Field& F, |
| | 82 | const size_t w, |
| | 83 | const typename Field::Element& beta, |
| | 84 | const FFLAS_BASE base) { |
| 38 | | static FFLAS_INT_TYPE p; |
| | 89 | F.init (mone, -1.0); |
| | 90 | |
| | 91 | unsigned long mantissa = |
| | 92 | (base == FflasDouble) ? DOUBLE_MANTISSA : FLOAT_MANTISSA; |
| | 93 | |
| | 94 | if (p == 0) |
| | 95 | return 1; |
| | 96 | |
| | 97 | double kmax; |
| | 98 | if (w > 0) { |
| | 99 | double c = computeFactor (F,w); |
| | 100 | double d = (double (1ULL << mantissa) /(c*c) + 1); |
| | 101 | if (d < 2) |
| | 102 | return 1; |
| | 103 | kmax = floor (d * (1ULL << w)); |
| | 104 | } else { |
| | 105 | ////// A fixer: (p-1)/2 si balanced |
| | 106 | |
| | 107 | double c = p-1; |
| | 108 | double cplt=0; |
| | 109 | if (!F.isZero (beta)){ |
| | 110 | if (F.isOne (beta) || F.areEqual (beta, mone)) cplt = c; |
| | 111 | else cplt = c*c; |
| | 112 | } |
| | 113 | kmax = floor ( (double ((1ULL << mantissa) - cplt)) / (c*c)); |
| | 114 | if (kmax <= 1) |
| | 115 | return 1; |
| | 116 | } |
| | 117 | //kmax--; // we computed a strict upper bound |
| | 118 | return (size_t) MIN (kmax, 1ULL << 31); |
| | 119 | } |
| | 120 | |
| | 121 | /** |
| | 122 | * Internal function for the bound computation |
| | 123 | * Generic implementation for positive representations |
| | 124 | */ |
| | 125 | template <class Field> |
| | 126 | inline double FFLAS::computeFactor (const Field& F, const size_t w){ |
| | 127 | FFLAS_INT_TYPE p; |
| 40 | | F.init (mone, -1.0); |
| 41 | | size_t kmax; |
| 42 | | |
| 43 | | unsigned long mantissa = (base == FflasDouble)? DOUBLE_MANTISSA : FLOAT_MANTISSA; |
| | 129 | size_t ex=1; |
| | 130 | for (size_t i=0; i < w; ++i) ex *= 3; |
| | 131 | return double(p - 1) * (1 + ex) / 2; |
| | 132 | } |
| | 133 | |
| | 134 | /** |
| | 135 | * WinoSteps |
| | 136 | * |
| | 137 | * \brief Computes the number of recursive levels to perform |
| | 138 | * |
| | 139 | * \param m the common dimension in the product AxB |
| | 140 | * |
| | 141 | */ |
| | 142 | inline size_t FFLAS::WinoSteps (const size_t m) { |
| | 143 | size_t w = 0; |
| | 144 | size_t mt = m; |
| | 145 | while (mt >= WINOTHRESHOLD) {w++; mt >>= 1;} |
| | 146 | return w; |
| | 147 | } |
| | 148 | |
| | 149 | /** |
| | 150 | * BaseCompute |
| | 151 | * |
| | 152 | * \brief Determines the type of floating point representation to convert to, |
| | 153 | * for BLAS computations |
| | 154 | * \param F Finite Field/Ring of the computation |
| | 155 | * \param w Number of recursive levels in Winograd's algorithm |
| | 156 | * |
| | 157 | */ |
| | 158 | template <class Field> |
| | 159 | inline FFLAS::FFLAS_BASE FFLAS::BaseCompute (const Field& F, const size_t w){ |
| 45 | | if (p == 0) |
| 46 | | kmax = 2; |
| 47 | | else |
| 48 | | if (w > 0) { |
| 49 | | size_t ex=1; |
| 50 | | for (size_t i=0; i < w; ++i) ex *= 3; |
| 51 | | //FFLAS_INT_TYPE c = (p-1)*(ex)/2; //bound for a centered representation |
| 52 | | FFLAS_INT_TYPE c; |
| 53 | | #ifndef _LINBOX_LINBOX_CONFIG_H |
| 54 | | if (F.balanced) |
| 55 | | c = (p-1)*(ex)/2; // balanced representation |
| 56 | | else |
| 57 | | #endif |
| 58 | | c = (p-1)*(1+ex)/2; |
| 59 | | kmax = lround(( double(1ULL << mantissa) /double(c*c) + 1)*(1ULL << w)); |
| 60 | | if (kmax == ( 1ULL << w)) |
| 61 | | kmax = 2; |
| 62 | | } |
| 63 | | else{ |
| 64 | | FFLAS_INT_TYPE c = p-1; |
| 65 | | FFLAS_INT_TYPE cplt=0; |
| 66 | | if (!F.isZero (beta)) { |
| 67 | | if (F.isOne (beta) || F.areEqual (beta, mone)) |
| 68 | | cplt = c; |
| 69 | | else cplt = c*c; |
| 70 | | } |
| 71 | | kmax = lround(( double((1ULL << mantissa) - cplt)) /double(c*c)); |
| 72 | | if (kmax < 2) |
| 73 | | kmax = 2; |
| 74 | | } |
| 75 | | //cerr<<"kmax = "<<kmax<<endl; |
| 76 | | return (size_t) MIN(kmax,1ULL<<31); |
| 77 | | } |
| 78 | | |
| 79 | | |
| 80 | | |
| 81 | | template < class Field > |
| 82 | | inline size_t FFLAS::DotProdBound (const Field& F, const size_t w, |
| 83 | | const typename Field::Element& beta, |
| 84 | | const FFLAS_BASE base) |
| 85 | | { |
| 86 | | static Field G = F; |
| 87 | | static FFLAS_INT_TYPE pig; |
| 88 | | G.characteristic(pig); |
| 89 | | FFLAS_INT_TYPE pif; |
| 90 | | F.characteristic(pif); |
| 91 | | static typename Field::Element b = beta; |
| 92 | | static size_t w2 = w; |
| 93 | | static FFLAS_BASE base2 = base; |
| 94 | | static size_t kmax = DotProdBoundCompute (F, w, beta, base); |
| 95 | | if ((b != beta) || (pif != pig) || (w2 != w) || (base2!=base)) { |
| 96 | | G = F; |
| 97 | | w2 = w; |
| 98 | | b = beta; |
| 99 | | base2 = base; |
| 100 | | kmax = DotProdBoundCompute (F, w, beta, base); |
| 101 | | } |
| 102 | | return kmax; |
| 103 | | } |
| 104 | | |
| 105 | | //--------------------------------------------------------------------- |
| 106 | | // TRSMBound |
| 107 | | // Computes nmax s.t. (p-1)/2*(p^{nmax-1} + (p-2)^{nmax-1}) < 2^53 |
| 108 | | //--------------------------------------------------------------------- |
| 109 | | inline size_t bound_compute_double(const FFLAS_INT_TYPE pi) { |
| 110 | | |
| 111 | | FFLAS_INT_TYPE p=pi,p1=1,p2=1; |
| 112 | | size_t nmax = 1; |
| 113 | | FFLAS_INT_TYPE max = ( FFLAS_INT_TYPE( 1ULL<<(DOUBLE_MANTISSA+1) )/(p-1)); |
| | 161 | FFLAS_INT_TYPE pi; |
| | 162 | F.characteristic(pi); |
| | 163 | FFLAS_BASE base; |
| | 164 | switch (w) { |
| | 165 | case 0: base = (pi < FLOAT_DOUBLE_THRESHOLD_0)? FflasFloat : FflasDouble; |
| | 166 | break; |
| | 167 | case 1: base = (pi < FLOAT_DOUBLE_THRESHOLD_1)? FflasFloat : FflasDouble; |
| | 168 | break; |
| | 169 | case 2: base = (pi < FLOAT_DOUBLE_THRESHOLD_2)? FflasFloat : FflasDouble; |
| | 170 | break; |
| | 171 | default: base = FflasDouble; |
| | 172 | break; |
| | 173 | } |
| | 174 | return base; |
| | 175 | } |
| | 176 | |
| | 177 | |
| | 178 | /************************************************************************************* |
| | 179 | * Specializations for ModularPositive and ModularBalanced over double and float |
| | 180 | *************************************************************************************/ |
| | 181 | |
| | 182 | template <class Element> |
| | 183 | inline double computeFactor (const ModularBalanced<Element>& F, const size_t w){ |
| | 184 | FFLAS_INT_TYPE p; |
| | 185 | F.characteristic(p); |
| | 186 | size_t ex=1; |
| | 187 | for (size_t i=0; i < w; ++i) ex *= 3; |
| | 188 | return (p - 1) * ex / 2; |
| | 189 | } |
| | 190 | |
| | 191 | template <> |
| | 192 | inline FFLAS::FFLAS_BASE FFLAS::BaseCompute (const Modular<double>& F, |
| | 193 | const size_t w){ |
| | 194 | return FflasDouble; |
| | 195 | } |
| | 196 | |
| | 197 | template <> |
| | 198 | inline FFLAS::FFLAS_BASE FFLAS::BaseCompute (const Modular<float>& F, |
| | 199 | const size_t w){ |
| | 200 | return FflasFloat; |
| | 201 | } |
| | 202 | |
| | 203 | template <> |
| | 204 | inline FFLAS::FFLAS_BASE FFLAS::BaseCompute (const ModularBalanced<double>& F, |
| | 205 | const size_t w){ |
| | 206 | return FflasDouble; |
| | 207 | } |
| | 208 | |
| | 209 | template <> |
| | 210 | inline FFLAS::FFLAS_BASE FFLAS::BaseCompute (const ModularBalanced<float>& F, |
| | 211 | const size_t w){ |
| | 212 | return FflasFloat; |
| | 213 | } |
| | 214 | |
| | 215 | /** |
| | 216 | * TRSMBound |
| | 217 | * |
| | 218 | * \brief computes the maximal size for delaying the modular reduction |
| | 219 | * in a triangular system resolution |
| | 220 | * |
| | 221 | * This is the default version over an arbitrary field. |
| | 222 | * It is currently never used (the recursive algorithm is run until n=1 in this case) |
| | 223 | * |
| | 224 | * \param F Finite Field/Ring of the computation |
| | 225 | * |
| | 226 | */ |
| | 227 | template <class Field> |
| | 228 | inline size_t FFLAS::TRSMBound (const Field& F) { |
| | 229 | return 1; |
| | 230 | } |
| | 231 | |
| | 232 | /** |
| | 233 | * Specialization for positive modular representation over double |
| | 234 | * Computes nmax s.t. (p-1)/2*(p^{nmax-1} + (p-2)^{nmax-1}) < 2^53 |
| | 235 | * See [Dumas Giorgi Pernet 06, arXiv:cs/0601133] |
| | 236 | */ |
| | 237 | template<> |
| | 238 | inline size_t FFLAS::TRSMBound (const Modular<double>& F){ |
| | 239 | |
| | 240 | FFLAS_INT_TYPE pi; |
| | 241 | F.characteristic(pi); |
| | 242 | unsigned long long p = pi, p1 = 1, p2 = 1; |
| | 243 | size_t nmax = 0; |
| | 244 | unsigned long long max = ( (1ULL << (DOUBLE_MANTISSA + 1) ) / (p - 1)); |
| 122 | | inline size_t bound_compute_double_balanced(const FFLAS_INT_TYPE pi) { |
| 123 | | |
| 124 | | FFLAS_INT_TYPE p=(pi+1)/2,p1=1; |
| 125 | | size_t nmax=0; |
| 126 | | FFLAS_INT_TYPE max = ( FFLAS_INT_TYPE( 1ULL<<(DOUBLE_MANTISSA))/(p-1)); |
| 127 | | while ( (p1) < max ){ |
| 128 | | p1*=p; |
| 129 | | nmax++; |
| 130 | | } |
| 131 | | return nmax; |
| 132 | | } |
| 133 | | inline size_t bound_compute_float(const FFLAS_INT_TYPE pi) { |
| 134 | | |
| 135 | | FFLAS_INT_TYPE p=pi,p1=1,p2=1; |
| 136 | | size_t nmax=1; |
| 137 | | FFLAS_INT_TYPE max = (FFLAS_INT_TYPE( 1ULL<<(FLOAT_MANTISSA+1) )/(p-1)); |
| | 252 | |
| | 253 | |
| | 254 | /** |
| | 255 | * Specialization for positive modular representation over float |
| | 256 | * Computes nmax s.t. (p-1)/2*(p^{nmax-1} + (p-2)^{nmax-1}) < 2^24 |
| | 257 | * See [Dumas Giorgi Pernet 06, arXiv:cs/0601133] |
| | 258 | */ |
| | 259 | template<> |
| | 260 | inline size_t FFLAS::TRSMBound (const Modular<float>& F){ |
| | 261 | |
| | 262 | FFLAS_INT_TYPE pi; |
| | 263 | F.characteristic(pi); |
| | 264 | unsigned long long p = pi, p1 = 1, p2 = 1; |
| | 265 | size_t nmax = 0; |
| | 266 | unsigned long long max = ( (1ULL << (FLOAT_MANTISSA + 1) ) / (p - 1)); |