// Patric Zhao: patric.zhao@intel.com
#include <CL/sycl.hpp>
#include <chrono>
#include <cstdio>
#include <iostream>

using namespace sycl;

constexpr int64_t N = 10000000;

int main() {
    // Enable queue profiling
    auto propList = cl::sycl::property_list{cl::sycl::property::queue::enable_profiling()};
    queue my_gpu_queue(gpu_selector{}, propList);

    std::cout << "Selected GPU device: "
              << my_gpu_queue.get_device().get_info<info::device::name>() << "\n";

    int *host_mem   = malloc_host<int>(N, my_gpu_queue);
    int *cpu_mem    = malloc_host<int>(N, my_gpu_queue);
    int *device_mem = malloc_device<int>(N, my_gpu_queue);

    // Init CPU data
    for (int64_t i = 0; i < N; i++) {
        host_mem[i] = i % 6666;
    }

    // Durations are measured in milliseconds (std::milli below)
    float duration_cpu   = 0.0;
    float duration_gpu   = 0.0;
    float duration_total = 0.0;

    std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
    std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
    std::chrono::high_resolution_clock::time_point s_t, e_t;

    // Warmup: copy data to the device and run the kernel once so that
    // first-run overhead is excluded from the timed section
    /*********************************************************************/
    my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();

    my_gpu_queue.submit([&](handler& h) {
        // Parallel computation
        h.parallel_for(range{N}, [=](id<1> item) {
            device_mem[item] *= 2;
        });
    });
    my_gpu_queue.wait();
    /*********************************************************************/

    // CPU computation
    printf("\n Start CPU Computation, Number of Elems = %ld \n", N);

    s_t   = std::chrono::high_resolution_clock::now();
    s_cpu = std::chrono::high_resolution_clock::now();

    // CPU code here
    for (int64_t i = 0; i < N; i++) {
        cpu_mem[i] = host_mem[i] * 2;
    }

    e_cpu = std::chrono::high_resolution_clock::now();
    duration_cpu = std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();

    s_gpu = std::chrono::high_resolution_clock::now();

    // Submit the work to the queue for execution
    auto event = my_gpu_queue.submit([&](handler& h) {
        // Parallel computation
        h.parallel_for(range{N}, [=](id<1> item) {
            device_mem[item] *= 2;
        });
    });

    // Wait until the computation is done
    my_gpu_queue.wait();

    e_gpu = std::chrono::high_resolution_clock::now();
    e_t   = std::chrono::high_resolution_clock::now();
    duration_gpu   = std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
    duration_total = std::chrono::duration<float, std::milli>(e_t - s_t).count();

    // Copy the result back from GPU to CPU
    my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();

    printf("\n CPU Computation, Time = %lf \n", duration_cpu);
    printf("\n GPU Computation, Time = %lf \n", duration_gpu);
    printf("\n Total Computation, Time = %lf \n", duration_total);

    printf("\nTask Done!\n");

    free(cpu_mem, my_gpu_queue);
    free(host_mem, my_gpu_queue);
    free(device_mem, my_gpu_queue);

    return 0;
}