-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMatMul.cu
86 lines (65 loc) · 2.47 KB
/
MatMul.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#include <cuda_runtime.h>
#include <iostream>
// Edge length, in elements, of the square shared-memory tiles used by
// matrixMultiplyTiled. launchMatrixMultiplyTiled also uses it as the edge
// length of each thread block and as the divisor when sizing the grid.
#define TILE_WIDTH 16
// Tiled matrix multiplication kernel: C = A * B.
//
// A, B and C are flat arrays indexed row-major (element (r, c) of A is
// A[r * numAColumns + c], and likewise for B and C).
//
// Launch layout: thread blocks must be TILE_WIDTH x TILE_WIDTH, because
// threadIdx.x / threadIdx.y are used directly as indices into the
// TILE_WIDTH x TILE_WIDTH shared tiles. blockIdx.x walks output columns and
// blockIdx.y walks output rows; each thread produces at most one element of C.
//
// Shared memory: two static TILE_WIDTH x TILE_WIDTH float tiles per block.
//
// Positions that fall outside A or B (partial tiles at the matrix edges) are
// loaded as 0.0f so they do not contribute to the dot product. Threads whose
// (Row, Col) lies outside C still take part in the loads and barriers but do
// not write a result.
__global__ void matrixMultiplyTiled(
    float* A, float* B, float* C,
    int numARows, int numAColumns,
    int numBRows, int numBColumns,
    int numCRows, int numCColumns) {
    __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    // Output element owned by this thread.
    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;

    // Number of tiles needed to cover the shared dimension (ceil-divide);
    // loop-invariant, so computed once.
    const int numTiles = (numAColumns - 1) / TILE_WIDTH + 1;

    float Pvalue = 0.0f;
    for (int p = 0; p < numTiles; ++p) {
        // Column of A / row of B that this thread stages for tile p.
        const int aCol = p * TILE_WIDTH + tx;
        const int bRow = p * TILE_WIDTH + ty;

        // Flat offsets are formed in size_t: row * width exceeds the range of
        // a 32-bit int once a matrix holds more than 2^31 - 1 elements.
        if (Row < numARows && aCol < numAColumns)
            ds_A[ty][tx] = A[static_cast<size_t>(Row) * numAColumns + aCol];
        else
            ds_A[ty][tx] = 0.0f;

        if (bRow < numBRows && Col < numBColumns)
            ds_B[ty][tx] = B[static_cast<size_t>(bRow) * numBColumns + Col];
        else
            ds_B[ty][tx] = 0.0f;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int i = 0; i < TILE_WIDTH; ++i)
            Pvalue += ds_A[ty][i] * ds_B[i][tx];

        // Every thread must finish reading before the next iteration
        // overwrites the tiles.
        __syncthreads();
    }

    if (Row < numCRows && Col < numCColumns)
        C[static_cast<size_t>(Row) * numCColumns + Col] = Pvalue;
}
// Reports a failed CUDA runtime call on stderr as "<context>: <error string>".
// Returns true when `status` is cudaSuccess so calls can be chained with &&.
static bool reportCudaFailure(cudaError_t status, const char* context) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << context << ": " << cudaGetErrorString(status) << std::endl;
    // cudaGetLastError() reads and clears the recorded error, so an error that
    // has already been reported here is not later mistaken for a launch error.
    (void)cudaGetLastError();
    return false;
}

// Launches matrixMultiplyTiled for C = A * B, waits for it to finish, and
// prints the measured kernel time to stdout.
//
// deviceA, deviceB, deviceC: matrix buffers forwarded unchanged to the kernel.
// num*Rows / num*Columns:    dimensions forwarded unchanged to the kernel;
//                            the C dimensions also size the grid.
//
// The kernel and both timing events use stream 0. Timing is best-effort: if
// any event call fails, the failure is reported on stderr, no time is
// printed, and the function still waits for the kernel via
// cudaDeviceSynchronize(). A failed launch is reported on stderr as
// "CUDA Kernel Launch Error: ..." and no time is printed for it.
void launchMatrixMultiplyTiled(
    float* deviceA, float* deviceB, float* deviceC,
    int numARows, int numAColumns,
    int numBRows, int numBColumns,
    int numCRows, int numCColumns) {
    // One TILE_WIDTH x TILE_WIDTH block per output tile; the ceil-divide
    // covers partial tiles along the right and bottom edges of C.
    dim3 gridDim((numCColumns - 1) / TILE_WIDTH + 1, (numCRows - 1) / TILE_WIDTH + 1, 1);
    dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);

    // Create the timing events, remembering which ones actually exist so
    // only those are destroyed at the end.
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    const bool haveStart = reportCudaFailure(cudaEventCreate(&start), "CUDA Event Error (create start)");
    const bool haveStop = reportCudaFailure(cudaEventCreate(&stop), "CUDA Event Error (create stop)");

    // Record the start event
    const bool timingStarted =
        haveStart && haveStop &&
        reportCudaFailure(cudaEventRecord(start, 0), "CUDA Event Error (record start)");

    // Launch the kernel
    matrixMultiplyTiled<<<gridDim, blockDim>>>(deviceA, deviceB, deviceC,
        numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);

    // Check for launch errors immediately, before any other runtime call can
    // interleave, and before a time would be printed for a kernel that never ran.
    cudaError_t err = cudaGetLastError();
    const bool launched = (err == cudaSuccess);
    if (!launched) {
        std::cerr << "CUDA Kernel Launch Error: " << cudaGetErrorString(err) << std::endl;
    }

    // Record the stop event
    const bool timingStopped =
        launched && timingStarted &&
        reportCudaFailure(cudaEventRecord(stop, 0), "CUDA Event Error (record stop)");

    if (timingStopped) {
        // Waiting on the stop event also waits for the kernel; an error
        // raised while the kernel ran is returned by this call.
        float elapsedTime = 0.0f;
        if (reportCudaFailure(cudaEventSynchronize(stop), "CUDA Kernel Execution Error") &&
            reportCudaFailure(cudaEventElapsedTime(&elapsedTime, start, stop),
                              "CUDA Event Error (elapsed time)")) {
            std::cout << "Kernel execution time: " << elapsedTime << " ms" << std::endl;
        }
    } else {
        // No usable stop event: still block until outstanding device work is
        // done so the caller sees the same "returns after completion" behavior.
        reportCudaFailure(cudaDeviceSynchronize(), "CUDA Kernel Execution Error");
    }

    // Cleanup events
    if (haveStart) {
        reportCudaFailure(cudaEventDestroy(start), "CUDA Event Error (destroy start)");
    }
    if (haveStop) {
        reportCudaFailure(cudaEventDestroy(stop), "CUDA Event Error (destroy stop)");
    }
}