1 | /* |
---|

2 | * kernel_partial_1024_sp.cl |
---|

3 | * |
---|

4 | * Created on: Jul 13, 2011 |
---|

5 | * Author: Matthew Wezowicz |
---|

6 | */ |
---|

7 | |
---|

8 | #define BLOCK_SIZE 16 |
---|

9 | |
---|

10 | __kernel void matrix_mul_kernel(__global float* D, float alpha, __global float* A, __global float* B, |
---|

11 | float beta, __global float* C, int width_A, int width_B, float mod){ |
---|

12 | //Get Workgroup ID |
---|

13 | int bx = get_group_id(0); |
---|

14 | int by = get_group_id(1); |
---|

15 | |
---|

16 | //Get Local ID |
---|

17 | int tx = get_local_id(0); |
---|

18 | int ty = get_local_id(1); |
---|

19 | |
---|

20 | //Range of indecies for sub-matrix of A |
---|

21 | int aBegin = width_A * BLOCK_SIZE * by; |
---|

22 | int aEnd = aBegin + width_A - 1; |
---|

23 | int aStep = BLOCK_SIZE; |
---|

24 | |
---|

25 | //Range of indecies for sub-matrix of B |
---|

26 | int bBegin = BLOCK_SIZE * bx; |
---|

27 | int bStep = BLOCK_SIZE * width_B; |
---|

28 | |
---|

29 | //Local storage of sub-matrices of A and B |
---|

30 | __local float As[BLOCK_SIZE][BLOCK_SIZE]; |
---|

31 | __local float Bs[BLOCK_SIZE][BLOCK_SIZE]; |
---|

32 | |
---|

33 | //Temporary storage for result |
---|

34 | float Dsub = 0; |
---|

35 | |
---|

36 | //Counter for modulus every 32 iterations |
---|

37 | int mCount = 0; |
---|

38 | |
---|

39 | //Loop over all the sub-matrices of A and B required to compute |
---|

40 | //the result sub-matrix |
---|

41 | for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){ |
---|

42 | //Load the matrices from global memory to local memory |
---|

43 | //Each thread loads one element of each sub-matrix |
---|

44 | As[ty][tx] = A[a + width_A * ty + tx]; |
---|

45 | Bs[ty][tx] = B[b + width_B * ty + tx]; |
---|

46 | |
---|

47 | //Synchronize threads |
---|

48 | barrier(CLK_LOCAL_MEM_FENCE); |
---|

49 | |
---|

50 | //Multiply the two sub-matrices together |
---|

51 | for(int i = 0; i < BLOCK_SIZE; i++){ |
---|

52 | Dsub += As[ty][i] * Bs[i][tx]; |
---|

53 | } |
---|

54 | mCount++; |
---|

55 | |
---|

56 | //fmod every 1024 iterations |
---|

57 | if(mCount == 64){ |
---|

58 | Dsub = fmod(Dsub, mod); |
---|

59 | mCount = 0; |
---|

60 | } |
---|

61 | |
---|

62 | //Synchronize threads |
---|

63 | barrier(CLK_LOCAL_MEM_FENCE); |
---|

64 | } |
---|

65 | |
---|

66 | Dsub = fmod(Dsub, mod); |
---|

67 | |
---|

68 | //Calculates the offset in the result matrix |
---|

69 | int d = width_B * BLOCK_SIZE * by + BLOCK_SIZE * bx; |
---|

70 | |
---|

71 | //Scale Dsub by alpha |
---|

72 | Dsub = alpha * Dsub; |
---|

73 | Dsub = fmod(Dsub, mod); |
---|

74 | |
---|

75 | //Scalse Csub by beta |
---|

76 | float Csub = C[d + ty * width_B + tx]; |
---|

77 | Csub = beta * Csub; |
---|

78 | Csub = fmod(Csub, mod); |
---|

79 | |
---|

80 | //Add Dsub and Dsub |
---|

81 | Dsub = Dsub + Csub; |
---|

82 | Dsub = fmod(Dsub, mod); |
---|

83 | |
---|

84 | //Add the sum to the appropriate spot |
---|

85 | D[d + ty * width_B + tx] = Dsub; |
---|

86 | } |
---|