# source: branches/wezzy/linbox/algorithms/opencl-kernels/kernel_muladd_partial_32_dp.cl @ 4090

Revision 4090, 2.1 KB checked in by wezowicz, 4 years ago (diff) |
---|

Line | |
---|---|

1 | /* |

2 | * kernel_muladd_partial_32_dp.cl |

3 | * |

4 | * Created on: Jul 5, 2011 |

5 | * Author: Matthew Wezowicz |

6 | */ |

7 | |

8 | #define BLOCK_SIZE 16 |

9 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable |

10 | |

11 | __kernel void matrix_mul_kernel(__global double* D, double alpha, __global double* A, __global double* B, |

12 | double beta, __global double* C, int width_A, int width_B, double mod){ |

13 | //Each work-item computes one element of D as alpha*(A*B) + beta*C, reduced with fmod by mod; (bx, by) are the work-group indices of the BLOCK_SIZE x BLOCK_SIZE tile being computed |

14 | int bx = get_group_id(0); |

15 | int by = get_group_id(1); |

16 | |

17 | //Local indices (tx, ty) select this work-item's element within the tile |

18 | int tx = get_local_id(0); |

19 | int ty = get_local_id(1); |

20 | |

21 | //Range of indices into A for the row of tiles this work-group walks (A is indexed row-major with row length width_A) |

22 | int aBegin = width_A * BLOCK_SIZE * by; |

23 | int aEnd = aBegin + width_A - 1; |

24 | int aStep = BLOCK_SIZE; |

25 | |

26 | //Start and step of indices into B for the column of tiles this work-group walks (B is indexed row-major with row length width_B) |

27 | int bBegin = BLOCK_SIZE * bx; |

28 | int bStep = BLOCK_SIZE * width_B; |

29 | |

30 | //Local-memory copies of the current tiles of A and B, shared by the work-group |

31 | __local double As[BLOCK_SIZE][BLOCK_SIZE]; |

32 | __local double Bs[BLOCK_SIZE][BLOCK_SIZE]; |

33 | |

34 | //Running sum for this work-item's element of the product A*B |

35 | double Dsub = 0; |

36 | |

37 | //Counts tiles accumulated since the last fmod; the sum is reduced every 2 tiles (2 * BLOCK_SIZE = 32 multiply-adds) |

38 | int mCount = 0; |

39 | |

40 | //Loop over all the sub-matrices of A and B required to compute |

41 | //the result sub-matrix (NOTE(review): no bounds checks here, so this presumably assumes the dimensions are multiples of BLOCK_SIZE - confirm against the host code) |

42 | for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){ |

43 | //Load the matrices from global memory to local memory |

44 | //Each work-item loads one element of each sub-matrix |

45 | As[ty][tx] = A[a + width_A * ty + tx]; |

46 | Bs[ty][tx] = B[b + width_B * ty + tx]; |

47 | |

48 | //Wait until every work-item in the group has stored its tile elements before any of them are read |

49 | barrier(CLK_LOCAL_MEM_FENCE); |

50 | |

51 | //Accumulate the dot product of row ty of As with column tx of Bs |

52 | for(int i = 0; i < BLOCK_SIZE; i++){ |

53 | Dsub += As[ty][i] * Bs[i][tx]; |

54 | } |

55 | mCount++; |

56 | |

57 | //Reduce with fmod after every 2 tiles (32 multiply-adds); presumably this keeps the partial sum small enough to stay exact in a double - confirm against the host-side bound on mod |

58 | if(mCount == 2){ |

59 | Dsub = fmod(Dsub, mod); |

60 | mCount = 0; |

61 | } |

62 | |

63 | //Wait until every work-item is done reading As and Bs before the next iteration overwrites them |

64 | barrier(CLK_LOCAL_MEM_FENCE); |

65 | } |

66 | |

67 | Dsub = fmod(Dsub, mod); |

68 | |

69 | //Offset of this work-group's tile in C and D (both indexed with row length width_B) |

70 | int d = width_B * BLOCK_SIZE * by + BLOCK_SIZE * bx; |

71 | |

72 | //Scale Dsub by alpha and reduce |

73 | Dsub = alpha * Dsub; |

74 | Dsub = fmod(Dsub, mod); |

75 | |

76 | //Scale Csub by beta and reduce |

77 | double Csub = C[d + ty * width_B + tx]; |

78 | Csub = beta * Csub; |

79 | Csub = fmod(Csub, mod); |

80 | |

81 | //Add Dsub and Csub, then reduce |

82 | Dsub = Dsub + Csub; |

83 | Dsub = fmod(Dsub, mod); |

84 | |

85 | //Write the reduced value to this work-item's element of D |

86 | D[d + ty * width_B + tx] = Dsub; |

87 | } |

**Note:** See TracBrowser for help on using the repository browser.