// You can not select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
//
// 103 lines
// 3.4 KiB
//
// 3 years ago
// 3 years ago
  // Patric Zhao: patric.zhao@intel.com
  #include <chrono>
  #include <cstdint>
  #include <cstdio>
  #include <iostream>

  #include <CL/sycl.hpp>
  4. using namespace sycl;
  5. constexpr int64_t N = 10000000;
  6. int main() {
  7. // Enable queue profiling
  8. auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
  9. queue my_gpu_queue(gpu_selector{}, propList);
  10. std::cout << "Selected GPU device: " <<
  11. my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
  12. int *host_mem = malloc_host<int>(N, my_gpu_queue);
  13. int *cpu_mem = malloc_host<int>(N, my_gpu_queue);
  14. int *device_mem = malloc_device<int>(N, my_gpu_queue);
  15. // Init CPU data
  16. for(int64_t i = 0; i < N; i++) {
  17. host_mem[i] = i % 6666;
  18. }
  19. float duration_cpu = 0.0;
  20. float duration_gpu_a = 0.0;
  21. float duration_gpu_b = 0.0;
  22. float duration_gpu_c = 0.0;
  23. std::chrono::high_resolution_clock::time_point s, e;
  24. std::chrono::high_resolution_clock::time_point s_a, e_a;
  25. std::chrono::high_resolution_clock::time_point s_b, e_b;
  26. std::chrono::high_resolution_clock::time_point s_c, e_c;
  27. // CPU computation
  28. printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
  29. s = std::chrono::high_resolution_clock::now();
  30. // CPU code here
  31. for(int64_t i = 0; i < N; i++) {
  32. cpu_mem[i] = host_mem[i] * 2;
  33. }
  34. e = std::chrono::high_resolution_clock::now();
  35. duration_cpu = std::chrono::duration<float, std::milli>(e - s).count();
  36. printf("\n End CPU Computation, Time = %lf \n", duration_cpu);
  37. // warmup
  38. /*********************************************************************/
  39. my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
  40. my_gpu_queue.submit([&](handler& h) {
  41. // Parallel Computation
  42. h.parallel_for(range{N}, [=](id<1> item) {
  43. device_mem[item] *= 2;
  44. });
  45. });
  46. my_gpu_queue.wait();
  47. /*********************************************************************/
  48. s_c = std::chrono::high_resolution_clock::now();
  49. // Copy from host(CPU) to device(GPU)
  50. my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
  51. s_b = std::chrono::high_resolution_clock::now();
  52. s_a = std::chrono::high_resolution_clock::now();
  53. // submit the content to the queue for execution
  54. auto event = my_gpu_queue.submit([&](handler& h) {
  55. // Parallel Computation
  56. h.parallel_for(range{N}, [=](id<1> item) {
  57. device_mem[item] *= 2;
  58. });
  59. });
  60. // wait the computation done
  61. my_gpu_queue.wait();
  62. e_b = std::chrono::high_resolution_clock::now();
  63. duration_gpu_b = std::chrono::duration<float, std::milli>(e_b - s_b).count();
  64. duration_gpu_a =
  65. (event.get_profiling_info<info::event_profiling::command_end>() -
  66. event.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
  67. // Copy back from GPU to CPU
  68. my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
  69. e_c = std::chrono::high_resolution_clock::now();
  70. duration_gpu_c = std::chrono::duration<float, std::milli>(e_c - s_c).count();
  71. printf("\n GPU Computation, GPU Time A = %lf \n", duration_gpu_a);
  72. printf("\n GPU Computation, GPU Time B = %lf \n", duration_gpu_b);
  73. printf("\n GPU Computation, GPU Time C = %lf \n", duration_gpu_c);
  74. printf("\nTask Done!\n");
  75. return 0;
  76. }