// Patric Zhao:  patric.zhao@intel.com

#include <chrono>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <CL/sycl.hpp>

#define random_float() (rand() / double(RAND_MAX))

using namespace std;
using namespace sycl;

// tile size computed by each work-item
#define tileY 2
#define tileX 2

// return execution time in milliseconds
double gpu_kernel(float *A, float *B, float *C,
                  int M, int N, int K,
                  int BLOCK, sycl::queue &q) {

  // define the work-group size and the global mapping:
  // each work-item computes a tileY x tileX block of C
  auto grid_rows = M / tileY;
  auto grid_cols = N / tileX;
  auto local_ndrange  = range<2>(BLOCK, BLOCK);
  auto global_ndrange = range<2>(grid_rows, grid_cols);

  double duration = 0.0f;

  auto e = q.submit([&](sycl::handler &h) {
      h.parallel_for(
          sycl::nd_range<2>(global_ndrange, local_ndrange),
          [=](sycl::nd_item<2> index) {
              int row = tileY * index.get_global_id(0);
              int col = tileX * index.get_global_id(1);

              float sum[tileY][tileX] = {0.0f};
              float subA[tileY] = {0.0f};
              float subB[tileX] = {0.0f};

              // core computation
              for (int k = 0; k < K; k++) {

                // read data into registers
                for (int m = 0; m < tileY; m++) {
                  subA[m] = A[(row + m) * K + k];
                }
                for (int p = 0; p < tileX; p++) {
                  subB[p] = B[k * N + p + col];
                }

                // accumulate the tileY x tileX partial products
                for (int m = 0; m < tileY; m++) {
                  for (int p = 0; p < tileX; p++) {
                    sum[m][p] += subA[m] * subB[p];
                  }
                }
              } // end of K loop

              // write results back
              for (int m = 0; m < tileY; m++) {
                for (int p = 0; p < tileX; p++) {
                  C[(row + m) * N + col + p] = sum[m][p];
                }
              }
          });
  });
  e.wait();

  // kernel time in milliseconds, taken from the event profiling info
  duration += (e.get_profiling_info<info::event_profiling::command_end>() -
               e.get_profiling_info<info::event_profiling::command_start>())
              / 1000.0f / 1000.0f;

  return duration;
}

// return execution time in milliseconds
double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) {

  double duration = 0.0;
  std::chrono::high_resolution_clock::time_point s, e;

  // single-threaded computation on the CPU
  s = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < M; i++) {
    for (int j = 0; j < N; j++) {
      float sum = 0.0f;
      for (int k = 0; k < K; k++) {
        sum += cA[i * K + k] * cB[k * N + j];
      }
      cC[i * N + j] = sum;
    }
  }
  e = std::chrono::high_resolution_clock::now();
  duration = std::chrono::duration<double, std::milli>(e - s).count();

  return duration;
}

int verify(float *cpu_res, float *gpu_res, int length) {
  int err = 0;
  for (int i = 0; i < length; i++) {
    if (fabs(cpu_res[i] - gpu_res[i]) > 1e-3) {
      err++;
      printf("\n%lf, %lf", cpu_res[i], gpu_res[i]);
    }
  }
  return err;
}

int gemm(const int M, const int N, const int K,
         const int block_size, const int iterations, sycl::queue &q) {

  cout << "Problem size: c(" << M << "," << N << ") ="
       << " a(" << M << "," << K << ") *"
       << " b(" << K << "," << N << ")\n";

  auto A = malloc_shared<float>(M * K, q);
  auto B = malloc_shared<float>(K * N, q);
  auto C = malloc_shared<float>(M * N, q);
  auto C_host = malloc_host<float>(M * N, q);

  // initialize A/B/C
  for (int i = 0; i < M * K; i++) {
    A[i] = random_float();
  }
  for (int i = 0; i < K * N; i++) {
    B[i] = random_float();
  }
  for (int i = 0; i < M * N; i++) {
    C[i] = 0.0f;
    C_host[i] = 0.0f;
  }

  double flopsPerMatrixMul
      = 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);

  double duration_gpu = 0.0f;
  double duration_cpu = 0.0f;

  // GPU computation and timer
  int warmup = 10;
  for (int run = 0; run < iterations + warmup; run++) {
    double duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
    if (run >= warmup) duration_gpu += duration;
  }
  duration_gpu = duration_gpu / iterations;

  // CPU computation and timer
  warmup = 2;
  for (int run = 0; run < iterations / 2 + warmup; run++) {
    double duration = cpu_kernel(A, B, C_host, M, N, K);
    if (run >= warmup) duration_cpu += duration;
  }
  duration_cpu = duration_cpu / (iterations / 2);

  // compare the results of CPU and GPU
  int errCode = verify(C_host, C, M * N);
  if (errCode > 0) printf("\nThere are %d errors\n", errCode);

  printf("\nGEMM size M = %d, N = %d, K = %d", M, N, K);
  printf("\nWork-Group size = %d * %d, tile_X = %d, tile_Y = %d",
         block_size, block_size, tileX, tileY);
  printf("\nPerformance Flops = %lf, \n"
         "GPU Computation Time = %lf (ms); \n"
         "CPU Computation Time = %lf (ms); \n",
         flopsPerMatrixMul, duration_gpu, duration_cpu);

  free(A, q);
  free(B, q);
  free(C, q);
  free(C_host, q);

  return errCode;
}

int main() {

  auto propList = cl::sycl::property_list{cl::sycl::property::queue::enable_profiling()};
  queue my_gpu_queue(cl::sycl::gpu_selector{}, propList);

  int errCode = gemm(512, 512, 512,   /* GEMM size: M, N, K */
                     4,               /* work-group size    */
                     10,              /* repeat times       */
                     my_gpu_queue);

  return errCode;
}