// Patric Zhao: patric.zhao@intel.com
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <CL/sycl.hpp>

#define random_float() (rand() / double(RAND_MAX))

using namespace std;
using namespace sycl;

#define tileY 2
#define tileX 2
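// Work decomposition for the GPU kernel below: the nd_range launches
// (M / tileY) x (N / tileX) work-items, grouped into BLOCK x BLOCK
// work-groups. Each work-item keeps a tileY x tileX sub-block of C in
// private registers (sum[][]) and reuses the subA/subB values it loads
// on every iteration of the k loop.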
// return execution time
double gpu_kernel(float *A, float *B, float *C,
                  int M, int N, int K,
                  int BLOCK, sycl::queue &q) {

    // define the work-group size and mapping
    auto grid_rows = M / tileY;
    auto grid_cols = N / tileX;
    auto local_ndrange  = range<2>(BLOCK, BLOCK);
    auto global_ndrange = range<2>(grid_rows, grid_cols);

    double duration = 0.0;

    auto e = q.submit([&](sycl::handler &h) {
        h.parallel_for<class k_name_t>(
            sycl::nd_range<2>(global_ndrange, local_ndrange),
            [=](sycl::nd_item<2> index) {
                int row = tileY * index.get_global_id(0);
                int col = tileX * index.get_global_id(1);

                float sum[tileY][tileX] = {0.0f};
                float subA[tileY] = {0.0f};
                float subB[tileX] = {0.0f};

                // core computation
                for (int k = 0; k < K; k++) {

                    // read data to registers
                    for (int m = 0; m < tileY; m++) {
                        subA[m] = A[(row + m) * K + k];
                    }
                    for (int p = 0; p < tileX; p++) {
                        subB[p] = B[k * N + p + col];
                    }

                    for (int m = 0; m < tileY; m++) {
                        for (int p = 0; p < tileX; p++) {
                            sum[m][p] += subA[m] * subB[p];
                        }
                    }
                } // end of K

                // write results back
                for (int m = 0; m < tileY; m++) {
                    for (int p = 0; p < tileX; p++) {
                        C[(row + m) * N + col + p] = sum[m][p];
                    }
                }
            });
    });
    e.wait();

    // event profiling is reported in nanoseconds; convert to milliseconds
    duration += (e.get_profiling_info<info::event_profiling::command_end>() -
                 e.get_profiling_info<info::event_profiling::command_start>()) /
                1000.0f / 1000.0f;

    return (duration);
}
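// Note: the event_profiling queries above are only valid because the queue
// is created with property::queue::enable_profiling (see main() below);
// without that property, calling get_profiling_info() on the event is an error.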
// return execution time
double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) {

    double duration = 0.0;
    std::chrono::high_resolution_clock::time_point s, e;

    // single-threaded computation on the CPU
    s = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < K; k++) {
                sum += cA[i * K + k] * cB[k * N + j];
            }
            cC[i * N + j] = sum;
        }
    }
    e = std::chrono::high_resolution_clock::now();
    duration = std::chrono::duration<float, std::milli>(e - s).count();

    return (duration);
}
int verify(float *cpu_res, float *gpu_res, int length) {
    int err = 0;
    for (int i = 0; i < length; i++) {
        if (fabs(cpu_res[i] - gpu_res[i]) > 1e-3) {
            err++;
            printf("\n%lf, %lf", cpu_res[i], gpu_res[i]);
        }
    }
    return (err);
}
int gemm(const int M,
         const int N,
         const int K,
         const int block_size,
         const int iterations,
         sycl::queue &q) {

    cout << "Problem size: c(" << M << "," << N << ") ="
         << " a(" << M << "," << K << ") *"
         << " b(" << K << "," << N << ")\n";

    auto A = malloc_shared<float>(M * K, q);
    auto B = malloc_shared<float>(K * N, q);
    auto C = malloc_shared<float>(M * N, q);
    auto C_host = malloc_host<float>(M * N, q);

    // initialize A/B/C
    for (int i = 0; i < M * K; i++) {
        A[i] = random_float();
    }
    for (int i = 0; i < K * N; i++) {
        B[i] = random_float();
    }
    for (int i = 0; i < M * N; i++) {
        C[i] = 0.0f;
        C_host[i] = 0.0f;
    }

    double flopsPerMatrixMul
        = 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);

    double duration_gpu = 0.0;
    double duration_cpu = 0.0;

    // GPU computation and timer
    int warmup = 10;
    for (int run = 0; run < iterations + warmup; run++) {
        double duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
        if (run >= warmup) duration_gpu += duration;
    }
    duration_gpu = duration_gpu / iterations;

    // CPU computation and timer
    warmup = 2;
    for (int run = 0; run < iterations / 2 + warmup; run++) {
        double duration = cpu_kernel(A, B, C_host, M, N, K);
        if (run >= warmup) duration_cpu += duration;
    }
    duration_cpu = duration_cpu / (iterations / 2);

    // compare the results of CPU and GPU
    int errCode = 0;
    errCode = verify(C_host, C, M * N);
    if (errCode > 0) printf("\nThere are %d errors\n", errCode);

    printf("\nGEMM size M = %d, N = %d, K = %d", M, N, K);
    printf("\nWork-Group size = %d * %d, tile_X = %d, tile_Y = %d",
           block_size, block_size, tileX, tileY);
    printf("\nPerformance Flops = %lf, \n"
           "GPU Computation Time = %lf (ms); \n"
           "CPU Computation Time = %lf (ms); \n",
           flopsPerMatrixMul, duration_gpu, duration_cpu);

    free(A, q);
    free(B, q);
    free(C, q);
    free(C_host, q);

    return (errCode);
}
int main() {

    auto propList = cl::sycl::property_list{cl::sycl::property::queue::enable_profiling()};
    queue my_gpu_queue(cl::sycl::gpu_selector{}, propList);

    int errCode = gemm(512, 512, 512, /* GEMM sizes: M, N, K */
                       4,             /* work-group size */
                       10,            /* repeat times */
                       my_gpu_queue);

    return (errCode);
}
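The program prints the raw operation count of one multiplication (flopsPerMatrixMul = 2 * M * N * K) next to the averaged GPU and CPU times in milliseconds. A minimal sketch of how to combine those two numbers into a throughput figure; the gflops() helper below is hypothetical and not part of the listing above:

    // Hypothetical helper: combine a flop count with a time in milliseconds
    // to get GFLOP/s, e.g. gflops(flopsPerMatrixMul, duration_gpu).
    static double gflops(double flops, double time_ms) {
        return flops / (time_ms * 1.0e6);  // ms -> s, then scale to 1e9 flops
    }

The sample itself can typically be built with the oneAPI DPC++/C++ compiler (for example, icpx -fsycl) and run on any system whose SYCL runtime exposes a GPU device, since the queue is constructed with gpu_selector.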