From b1b815a41839c2eaa2fb8f598fe05324f1e2f315 Mon Sep 17 00:00:00 2001
From: PatricZhao <patric.zhao@intel.com>
Date: Thu, 28 Oct 2021 15:00:50 +0800
Subject: [PATCH] GEMM with tiling

---
 code/gemm_tile.cpp | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 code/gemm_tile.cpp

diff --git a/code/gemm_tile.cpp b/code/gemm_tile.cpp
new file mode 100644
index 0000000..44c4ebc
--- /dev/null
+++ b/code/gemm_tile.cpp
@@ -0,0 +1,193 @@
+// Patric Zhao: patric.zhao@intel.com
+
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <CL/sycl.hpp>
+
+#define random_float() (rand() / double(RAND_MAX))
+
+using namespace std;
+using namespace sycl;
+
+#define tileY 2
+#define tileX 2
+
+// return execution time in ms
+double gpu_kernel(float *A, float *B, float *C,
+                  int M, int N, int K,
+                  int BLOCK, sycl::queue &q) {
+
+  // define the work-group size and the work-item to C-tile mapping
+  auto grid_rows = M / tileY;
+  auto grid_cols = N / tileX;
+  auto local_ndrange  = range<2>(BLOCK, BLOCK);
+  auto global_ndrange = range<2>(grid_rows, grid_cols);
+
+  double duration = 0.0f;
+
+  auto e = q.submit([&](sycl::handler &h) {
+    h.parallel_for(
+        sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) {
+
+          // each work-item computes one tileY x tileX block of C
+          int row = tileY * index.get_global_id(0);
+          int col = tileX * index.get_global_id(1);
+
+          float sum[tileY][tileX] = {0.0f};
+          float subA[tileY] = {0.0f};
+          float subB[tileX] = {0.0f};
+
+          // core computation
+          for (int k = 0; k < K; k++) {
+
+            // read data into registers
+            for (int m = 0; m < tileY; m++) {
+              subA[m] = A[(row + m) * K + k];
+            }
+
+            for (int p = 0; p < tileX; p++) {
+              subB[p] = B[k * N + col + p];
+            }
+
+            // rank-1 update of the register tile
+            for (int m = 0; m < tileY; m++) {
+              for (int p = 0; p < tileX; p++) {
+                sum[m][p] += subA[m] * subB[p];
+              }
+            }
+
+          } // end of K
+
+          // write results back
+          for (int m = 0; m < tileY; m++) {
+            for (int p = 0; p < tileX; p++) {
+              C[(row + m) * N + col + p] = sum[m][p];
+            }
+          }
+
+        });
+  });
+  e.wait();
+
+  duration += (e.get_profiling_info<info::event_profiling::command_end>() -
+               e.get_profiling_info<info::event_profiling::command_start>()) / 1000.0f / 1000.0f;
+
+  return (duration);
+}
+
+// return execution time in ms
+double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) {
+
+  double duration = 0.0;
+  std::chrono::high_resolution_clock::time_point s, e;
+
+  // single-thread computation on the CPU
+  s = std::chrono::high_resolution_clock::now();
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++) {
+      float sum = 0.0f;
+      for (int k = 0; k < K; k++) {
+        sum += cA[i * K + k] * cB[k * N + j];
+      }
+      cC[i * N + j] = sum;
+    }
+  }
+  e = std::chrono::high_resolution_clock::now();
+  duration = std::chrono::duration<double, std::milli>(e - s).count();
+
+  return (duration);
+}
+
+int verify(float *cpu_res, float *gpu_res, int length) {
+  int err = 0;
+  for (int i = 0; i < length; i++) {
+    if (fabs(cpu_res[i] - gpu_res[i]) > 1e-3) {
+      err++;
+      printf("\n%lf, %lf", cpu_res[i], gpu_res[i]);
+    }
+  }
+  return (err);
+}
+
+int gemm(const int M,
+         const int N,
+         const int K,
+         const int block_size,
+         const int iterations,
+         sycl::queue &q) {
+
+  cout << "Problem size: c(" << M << "," << N << ") ="
+       << " a(" << M << "," << K << ") *"
+       << " b(" << K << "," << N << ")\n";
+
+  auto A = malloc_shared<float>(M * K, q);
+  auto B = malloc_shared<float>(K * N, q);
+  auto C = malloc_shared<float>(M * N, q);
+  auto C_host = malloc_host<float>(M * N, q);
+
+  // initialize A/B/C
+  for (int i = 0; i < M * K; i++) {
+    A[i] = random_float();
+  }
+
+  for (int i = 0; i < K * N; i++) {
+    B[i] = random_float();
+  }
+
+  for (int i = 0; i < M * N; i++) {
+    C[i] = 0.0f;
+    C_host[i] = 0.0f;
+  }
+
+  double flopsPerMatrixMul
+      = 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);
+
+  double duration_gpu = 0.0f;
+  double duration_cpu = 0.0f;
+
+  // GPU computation and timing
+  int warmup = 10;
+  for (int run = 0; run < iterations + warmup; run++) {
+    float duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
+    if (run >= warmup) duration_gpu += duration;
+  }
+  duration_gpu = duration_gpu / iterations;
+
+  // CPU computation and timing
+  warmup = 2;
+  for (int run = 0; run < iterations / 2 + warmup; run++) {
+    float duration = cpu_kernel(A, B, C_host, M, N, K);
+    if (run >= warmup) duration_cpu += duration;
+  }
+  duration_cpu = duration_cpu / (iterations / 2);
+
+  // compare the results of CPU and GPU
+  int errCode = 0;
+  errCode = verify(C_host, C, M * N);
+  if (errCode > 0) printf("\nThere are %d errors\n", errCode);
+
+  printf("\nGEMM size M = %d, N = %d, K = %d", M, N, K);
+  printf("\nWork-Group size = %d * %d, tile_X = %d, tile_Y = %d",
+         block_size, block_size, tileX, tileY);
+  printf("\nPerformance Flops = %lf, \n"
+         "GPU Computation Time = %lf (ms); \n"
+         "CPU Computation Time = %lf (ms); \n",
+         flopsPerMatrixMul, duration_gpu, duration_cpu);
+
+  free(A, q);
+  free(B, q);
+  free(C, q);
+  free(C_host, q);
+
+  return (errCode);
+}
+
+int main() {
+
+  auto propList = cl::sycl::property_list{cl::sycl::property::queue::enable_profiling()};
+  queue my_gpu_queue(cl::sycl::gpu_selector{}, propList);
+
+  int errCode = gemm(512, 512, 512, /* GEMM size: M, N, K */
+                     4,             /* work-group size   */
+                     10,            /* repeat times      */
+                     my_gpu_queue);
+
+  return (errCode);
+}