You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

139 lines
4.7 KiB

// patric zhao, patric.zhao@intel.com
// show SLM usage by Finite Difference Approximating Derivatives (fdad)
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;
// Uniform pseudo-random double in [0, 1]; rand() is never seeded here,
// so runs are repeatable.
#define random_float() (rand() / double(RAND_MAX))
// Work-group size for both kernels; N-2 is a multiple of BLOCK, so the
// nd_range divides evenly.
#define BLOCK 256
// Set to 1 to copy GPU results back each iteration and verify them.
#define CheckResult 0
// Total input samples; the two extra elements are the right-hand stencil
// halo so exactly N-2 central differences can be computed.
constexpr int64_t N = 256 * 256 * 256 + 2;
// Grid spacing h used in the difference quotient (f(x+2h)-f(x)) / (2h).
constexpr float delta = 0.001f;
  11. void verify(float *gpu, float *cpu, int N) {
  12. int error = 0;
  13. for(int i = 0; i < N; i++) {
  14. if(std::fabs(gpu[i] - cpu[i]) > 10e-3) {
  15. printf("\nError at %d GPU = %f, CPU = %f\n", i, gpu[i], cpu[i]);
  16. error++;
  17. }
  18. if(error > 20) break;
  19. }
  20. return;
  21. }
  22. int main() {
  23. // Enable queue profiling
  24. auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
  25. queue my_gpu_queue(gpu_selector{}, propList);
  26. std::cout << "Selected GPU device: " <<
  27. my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
  28. float *input = malloc_host<float>(N, my_gpu_queue);
  29. float *output_P_cpu = malloc_host<float>(N-2, my_gpu_queue);
  30. float *input_Q = malloc_device<float>(N, my_gpu_queue);
  31. float *output_P = malloc_device<float>(N-2, my_gpu_queue);
  32. float *output_P_gpu = malloc_host<float>(N-2, my_gpu_queue);
  33. // Init CPU data
  34. for(int64_t i = 0; i < N; i++) {
  35. input[i] = random_float();
  36. }
  37. // CPU compuatation
  38. printf("\n Start Computation, Number of Elems = %ld \n", N);
  39. for(int64_t i = 0; i < N-2; i++) {
  40. output_P_cpu[i] = (input[i+2] - input[i]) / (2.0f * delta);
  41. }
  42. float duration_gpu_a = 0.0;
  43. float duration_gpu_b = 0.0;
  44. // Copy from host(CPU) to device(GPU)
  45. my_gpu_queue.memcpy(input_Q, input, N * sizeof(float)).wait();
  46. int warmup = 10;
  47. int iteration = 50;
  48. for(int i = 0; i < iteration + warmup; i++) {
  49. // read/write global memory directly
  50. auto event1 = my_gpu_queue.submit([&](handler& h) {
  51. h.parallel_for(nd_range<1>{N-2, BLOCK}, [=](nd_item<1> item) {
  52. auto global_id = item.get_global_id(0);
  53. output_P[global_id] = (input_Q[global_id +2] - input_Q[global_id]) / (2.0f * delta);
  54. });
  55. });
  56. // wait the computation done
  57. my_gpu_queue.wait();
  58. if (i >= warmup) {
  59. duration_gpu_a +=
  60. (event1.get_profiling_info<info::event_profiling::command_end>() -
  61. event1.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
  62. }
  63. if (CheckResult) {
  64. my_gpu_queue.memcpy(output_P_gpu, output_P, (N - 2) * sizeof(float)).wait();
  65. verify(output_P_gpu, output_P_gpu, N);
  66. }
  67. // read data to SLM and then computaiton w/ SLM read
  68. // finally write back to global memory
  69. auto event2 = my_gpu_queue.submit([&](handler& h) {
  70. // Define SLM size per work-group
  71. sycl::accessor<float, 1, sycl::access::mode::read_write,
  72. sycl::access::target::local>
  73. slm_buffer(BLOCK + 2, h);
  74. h.parallel_for(nd_range<1>(N-2, BLOCK), [=](nd_item<1> item) {
  75. auto local_id = item.get_local_id(0);
  76. auto global_id = item.get_global_id(0);
  77. slm_buffer[local_id] = input_Q[global_id];
  78. if(local_id == BLOCK-1) {
  79. slm_buffer[BLOCK ] = input_Q[global_id +1];
  80. slm_buffer[BLOCK+1] = input_Q[global_id +2];
  81. }
  82. item.barrier(sycl::access::fence_space::local_space);
  83. output_P[global_id] = (slm_buffer[local_id +2] - slm_buffer[local_id]) / (2.0f * delta);
  84. });
  85. });
  86. my_gpu_queue.wait();
  87. if (i >= warmup) {
  88. duration_gpu_b +=
  89. (event2.get_profiling_info<info::event_profiling::command_end>() -
  90. event2.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
  91. }
  92. if (CheckResult) {
  93. my_gpu_queue.memcpy(output_P_gpu, output_P, (N - 2) * sizeof(float)).wait();
  94. verify(output_P_gpu, output_P_gpu, N);
  95. }
  96. }
  97. printf("\n GPU Computation, GPU Time w/o SLM = %lf \n", duration_gpu_a / iteration);
  98. printf("\n GPU Computation, GPU Time w/ SLM = %lf \n", duration_gpu_b / iteration);
  99. printf("\nTask Done!\n");
  100. free(input_Q, my_gpu_queue);
  101. free(output_P, my_gpu_queue);
  102. free(output_P_cpu, my_gpu_queue);
  103. free(output_P_gpu, my_gpu_queue);
  104. free(input, my_gpu_queue);
  105. return 0;
  106. }