You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

157 lines
4.3 KiB

  1. //Patric Zhao: patric.zhao@gmail.com
  2. #include <chrono>
  3. #include <iostream>
  4. #include <CL/sycl.hpp>
  5. #define random_float() (rand() / double(RAND_MAX))
  6. using namespace std;
  7. using namespace sycl;
  8. // return execution time
  9. double gpu_kernel(float *A, float *B, float *C, int M, int N, int K, int block_size, sycl::queue &q) {
  10. // define the workgroup size and mapping
  11. auto grid_rows = (M + block_size - 1) / block_size * block_size;
  12. auto grid_cols = (N + block_size - 1) / block_size * block_size;
  13. auto local_ndrange = range<2>(block_size, block_size);
  14. auto global_ndrange = range<2>(grid_rows, grid_cols);
  15. double duration = 0.0f;
  16. auto e = q.submit([&](sycl::handler &h) {
  17. h.parallel_for<class k_name_t>(
  18. sycl::nd_range<2>(global_ndrange, local_ndrange), [=](sycl::nd_item<2> index) {
  19. int row = index.get_global_id(0);
  20. int col = index.get_global_id(1);
  21. float sum = 0.0f;
  22. for (int i = 0; i < K; i++) {
  23. sum += A[row * K + i] * B[i * N + col];
  24. }
  25. C[row * N + col] = sum;
  26. });
  27. });
  28. e.wait();
  29. duration += (e.get_profiling_info<info::event_profiling::command_end>() -
  30. e.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
  31. return(duration);
  32. }
  // Single-threaded reference matrix multiplication on the host.
  //
  // Computes cC = cA * cB where cA is M x K, cB is K x N and cC is M x N, all
  // stored row-major. Returns the wall-clock time of the computation in
  // milliseconds.
  double cpu_kernel(float *cA, float *cB, float *cC, int M, int N, int K) {
    using timer = std::chrono::high_resolution_clock;

    const timer::time_point started = timer::now();
    for (int row = 0; row < M; ++row) {
      const float *a_row = cA + row * K;
      float *c_row = cC + row * N;
      for (int col = 0; col < N; ++col) {
        float acc = 0.0f;
        for (int inner = 0; inner < K; ++inner) {
          acc += a_row[inner] * cB[inner * N + col];
        }
        c_row[col] = acc;
      }
    }
    const timer::time_point finished = timer::now();

    const std::chrono::duration<float, std::milli> elapsed(finished - started);
    return elapsed.count();
  }
  // Compares two result arrays element by element and returns the number of
  // mismatching elements.
  //
  // An element mismatches when the absolute difference exceeds 1e-3 or when
  // the difference is not a number (e.g. one side is NaN). Each mismatching
  // pair is printed to stdout as "cpu, gpu".
  int verify(float *cpu_res, float *gpu_res, int length){
    const double tolerance = 1e-3;
    int err = 0;
    for(int i = 0; i < length; i++) {
      const float diff = fabs(cpu_res[i] - gpu_res[i]);
      // Written as !(diff <= tolerance) rather than diff > tolerance so that a
      // NaN difference, for which every ordered comparison is false, is
      // counted as an error instead of silently passing.
      if( !(diff <= tolerance) ) {
        err++;
        printf("\n%lf, %lf", cpu_res[i], gpu_res[i]);
      }
    }
    return(err);
  }
  62. int gemm(const int M,
  63. const int N,
  64. const int K,
  65. const int block_size,
  66. const int iterations,
  67. sycl::queue &q) {
  68. cout << "Problem size: c(" << M << "," << N << ") ="
  69. << " a(" << M << "," << K << ") *"
  70. << " b(" << K << "," << N << ")\n";
  71. auto A = malloc_shared<float>(M * K, q);
  72. auto B = malloc_shared<float>(K * N, q);
  73. auto C = malloc_shared<float>(M * N, q);
  74. auto C_host = malloc_host<float>(M * N, q);
  75. // init the A/B/C
  76. for(int i=0; i < M * K; i++) {
  77. A[i] = random_float();
  78. }
  79. for(int i=0; i < K * N; i++) {
  80. B[i] = random_float();
  81. }
  82. for(int i=0; i < M * N; i++) {
  83. C[i] = 0.0f;
  84. C_host[i] = 0.0f;
  85. }
  86. double flopsPerMatrixMul
  87. = 2.0 * static_cast<double>(M) * static_cast<double>(N) * static_cast<double>(K);
  88. double duration_gpu = 0.0f;
  89. double duration_cpu = 0.0f;
  90. // GPU compuation and timer
  91. int warmup = 10;
  92. for (int run = 0; run < iterations + warmup; run++) {
  93. float duration = gpu_kernel(A, B, C, M, N, K, block_size, q);
  94. if(run >= warmup) duration_gpu += duration;
  95. }
  96. duration_gpu = duration_gpu / iterations;
  97. // CPU compuation and timer
  98. warmup = 2;
  99. for(int run = 0; run < iterations/2 + warmup; run++) {
  100. float duration = cpu_kernel(A, B, C_host, M, N, K);
  101. if(run >= warmup) duration_cpu += duration;
  102. }
  103. duration_cpu = duration_cpu / iterations/2;
  104. // Compare the resutls of CPU and GPU
  105. int errCode = 0;
  106. errCode = verify(C_host, C, M*N);
  107. if(errCode > 0) printf("\nThere are %d errors\n", errCode);
  108. printf("\nPerformance Flops = %lf, \n"
  109. "GPU Computation Time = %lf (ms); \n"
  110. "CPU Computaiton Time = %lf (ms); \n",
  111. flopsPerMatrixMul, duration_gpu, duration_cpu);
  112. free(A, q);
  113. free(B, q);
  114. free(C, q);
  115. free(C_host, q);
  116. return(errCode);
  117. }
  118. int main() {
  119. auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
  120. queue my_gpu_queue( cl::sycl::gpu_selector{} , propList);
  121. int errCode = gemm(1024, 1024, 1024, 4, 10, my_gpu_queue);
  122. return(errCode);
  123. }