source: branches/wezzy/linbox/algorithms/opencl-kernels/kernel_muladd_partial_32_dp.cl @ 4090

Revision 4090, 2.1 KB checked in by wezowicz, 3 years ago (diff)

Fixed memory leak in opencl-domain-memory.inl.
Add OpenCL specialization to muladd() along with 8 more kernels.

Line 
/*
 * kernel_muladd_partial_32_dp.cl
 *
 *  Created on: Jul 5, 2011
 *      Author: Matthew Wezowicz
 *
 * Double-precision, tiled modular multiply-add kernel.
 * Each work-item produces one element of D:
 *
 *     P = fmod(sum_k A[row][k] * B[k][col], mod)
 *     D[row][col] = fmod(fmod(alpha * P, mod) + fmod(beta * C[row][col], mod), mod)
 *
 * The running sum is reduced with fmod() after every 32 products
 * rather than after every product.
 */

#define BLOCK_SIZE 16

//Number of BLOCK_SIZE-wide tiles accumulated between two reductions
//(2 tiles * 16 products per tile = 32 products)
#define TILES_PER_REDUCTION 2

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/*
 * D         output matrix, row stride width_B
 * alpha     scalar applied to the reduced product A * B
 * A         left operand, row stride width_A (read only)
 * B         right operand, row stride width_B (read only)
 * beta      scalar applied to C
 * C         addend matrix, row stride width_B (read only)
 * width_A   number of columns of A (also the length of each dot product)
 * width_B   number of columns of B, C and D
 * mod       modulus passed to fmod(); the results carry the sign fmod() gives them
 *
 * No bounds checks are performed.
 * NOTE(review): this presumably requires BLOCK_SIZE x BLOCK_SIZE work-groups
 * and matrix dimensions that are multiples of BLOCK_SIZE -- confirm against
 * the host code that enqueues the kernel.
 */
__kernel void matrix_mul_kernel(__global double* D, double alpha,
                                __global const double* A, __global const double* B,
                                double beta, __global const double* C,
                                int width_A, int width_B, double mod){
    //Work-group coordinates select the output tile
    int bx = get_group_id(0);
    int by = get_group_id(1);

    //Local coordinates select the element inside the tile
    int tx = get_local_id(0);
    int ty = get_local_id(1);

    //Range of indices for the row of tiles of A
    int aBegin = width_A * BLOCK_SIZE * by;
    int aEnd = aBegin + width_A - 1;
    int aStep = BLOCK_SIZE;

    //Start and stride for the column of tiles of B
    int bBegin = BLOCK_SIZE * bx;
    int bStep = BLOCK_SIZE * width_B;

    //Local storage for the current tiles of A and B
    __local double As[BLOCK_SIZE][BLOCK_SIZE];
    __local double Bs[BLOCK_SIZE][BLOCK_SIZE];

    //Running dot product for this work-item's output element
    double Dsub = 0;

    //Tiles accumulated since the last reduction
    int mCount = 0;

    //Walk over every pair of tiles needed for the output tile
    for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){
        //Each work-item loads one element of each tile
        As[ty][tx] = A[a + width_A * ty + tx];
        Bs[ty][tx] = B[b + width_B * ty + tx];

        //Both tiles must be fully loaded before any work-item reads them
        barrier(CLK_LOCAL_MEM_FENCE);

        //Accumulate the partial dot product contributed by this tile pair
        for(int i = 0; i < BLOCK_SIZE; i++){
            Dsub += As[ty][i] * Bs[i][tx];
        }
        mCount++;

        //Reduce after every 32 products
        if(mCount == TILES_PER_REDUCTION){
            Dsub = fmod(Dsub, mod);
            mCount = 0;
        }

        //Do not overwrite the tiles while other work-items still read them
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    //Reduce whatever was accumulated since the last in-loop reduction
    Dsub = fmod(Dsub, mod);

    //Offset of this work-group's tile in C and D
    int d = width_B * BLOCK_SIZE * by + BLOCK_SIZE * bx;

    //Index of this work-item's element in C and D
    int idx = d + ty * width_B + tx;

    //Scale Dsub by alpha
    Dsub = alpha * Dsub;
    Dsub = fmod(Dsub, mod);

    //Scale Csub by beta
    double Csub = C[idx];
    Csub = beta * Csub;
    Csub = fmod(Csub, mod);

    //Add Dsub and Csub
    Dsub = Dsub + Csub;
    Dsub = fmod(Dsub, mod);

    //Store the result in the output matrix
    D[idx] = Dsub;
}
Note: See TracBrowser for help on using the repository browser.