source: branches/wezzy/linbox/algorithms/opencl-kernels/kernel_muladd_partial_1024_sp.cl @ 4090

Revision 4090, 2.0 KB checked in by wezowicz, 3 years ago (diff)

Fixed memory leak in opencl-domain-memory.inl.
Add OpenCL specialization to muladd() along with 8 more kernels.

Line 
1/*
2 * kernel_partial_1024_sp.cl
3 *
4 *  Created on: Jul 13, 2011
5 *      Author: Matthew Wezowicz
6 */
7
8#define BLOCK_SIZE 16
9
10__kernel void matrix_mul_kernel(__global float* D, float alpha, __global float* A, __global float* B,
11                float beta, __global float* C, int width_A, int width_B, float mod){
12        //Get Workgroup ID
13        int bx = get_group_id(0);
14        int by = get_group_id(1);
15
16        //Get Local ID
17        int tx = get_local_id(0);
18        int ty = get_local_id(1);
19
20        //Range of indecies for sub-matrix of A
21        int aBegin = width_A * BLOCK_SIZE * by;
22        int aEnd = aBegin + width_A - 1;
23        int aStep = BLOCK_SIZE;
24
25        //Range of indecies for sub-matrix of B
26        int bBegin = BLOCK_SIZE * bx;
27        int bStep = BLOCK_SIZE * width_B;
28
29        //Local storage of sub-matrices of A and B
30        __local float As[BLOCK_SIZE][BLOCK_SIZE];
31        __local float Bs[BLOCK_SIZE][BLOCK_SIZE];
32
33        //Temporary storage for result
34        float Dsub = 0;
35
36        //Counter for modulus every 32 iterations
37        int mCount = 0;
38
39        //Loop over all the sub-matrices of A and B required to compute
40        //the result sub-matrix
41        for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){
42                //Load the matrices from global memory to local memory
43                //Each thread loads one element of each sub-matrix
44                As[ty][tx] = A[a + width_A * ty + tx];
45                Bs[ty][tx] = B[b + width_B * ty + tx];
46
47                //Synchronize threads
48                barrier(CLK_LOCAL_MEM_FENCE);
49
50                //Multiply the two sub-matrices together
51                for(int i = 0; i < BLOCK_SIZE; i++){
52                        Dsub += As[ty][i] * Bs[i][tx];
53                }
54                mCount++;
55
56                //fmod every 1024 iterations
57                if(mCount == 64){
58                        Dsub = fmod(Dsub, mod);
59                        mCount = 0;
60                }
61
62                //Synchronize threads
63                barrier(CLK_LOCAL_MEM_FENCE);
64        }
65       
66        Dsub = fmod(Dsub, mod);
67       
68        //Calculates the offset in the result matrix
69        int d = width_B * BLOCK_SIZE * by + BLOCK_SIZE * bx;
70       
71        //Scale Dsub by alpha
72        Dsub = alpha * Dsub;
73        Dsub = fmod(Dsub, mod);
74       
75        //Scalse Csub by beta
76        float Csub = C[d + ty * width_B + tx];
77        Csub = beta * Csub;
78        Csub = fmod(Csub, mod);
79       
80        //Add Dsub and Dsub
81        Dsub = Dsub + Csub;
82        Dsub = fmod(Dsub, mod);
83       
84        //Add the sum to the appropriate spot
85        D[d + ty * width_B + tx] = Dsub;
86}
Note: See TracBrowser for help on using the repository browser.