You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

101 lines
3.1 KiB

2 years ago
2 years ago
  1. // Patric Zhao, patric.zhao@intel.com
  #include <chrono>
  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>
  #include <iostream>

  #include <CL/sycl.hpp>

  using namespace sycl;
  5. constexpr int64_t N = 10000000;
  6. int main() {
  7. // Enable queue profiling
  8. queue my_gpu_queue(gpu_selector{});
  9. std::cout << "Selected GPU device: " <<
  10. my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
  11. int *cpu_out = (int*)malloc(N * sizeof(int));
  12. int *host_mem = malloc_host<int>(N, my_gpu_queue);
  13. int *device_mem = malloc_device<int>(N, my_gpu_queue);
  14. // Init CPU data
  15. for(int64_t i = 0; i < N; i++) {
  16. host_mem[i] = i % 6666;
  17. cpu_out[i] = i % 6666;
  18. }
  19. float duration_cpu = 0.0;
  20. float duration_gpu = 0.0;
  21. float duration_total = 0.0;
  22. std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
  23. std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
  24. std::chrono::high_resolution_clock::time_point s_t, e_t;
  25. // warmup
  26. /*********************************************************************/
  27. for(int64_t i = 0; i < N; i++) {
  28. cpu_out[i] = cpu_out[i] * 2;
  29. }
  30. my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
  31. my_gpu_queue.submit([&](handler& h) {
  32. // Parallel Computation
  33. h.parallel_for(range{N}, [=](id<1> item) {
  34. device_mem[item] *= 2;
  35. });
  36. });
  37. my_gpu_queue.wait();
  38. /*********************************************************************/
  39. printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
  40. s_t = std::chrono::high_resolution_clock::now();
  41. // GPU Computation
  42. // submit the content to the queue for execution
  43. s_gpu = std::chrono::high_resolution_clock::now();
  44. auto event = my_gpu_queue.submit([&](handler& h) {
  45. // Parallel Computation
  46. h.parallel_for(range{N}, [=](id<1> item) {
  47. device_mem[item] *= 2;
  48. });
  49. });
  50. // CPU computation
  51. s_cpu = std::chrono::high_resolution_clock::now();
  52. for(int64_t i = 0; i < N; i++) {
  53. cpu_out[i] *= 2;
  54. }
  55. e_cpu = std::chrono::high_resolution_clock::now();
  56. // Testing overlapping between CPU and GPU
  57. // Delay the wait() after CPU computation
  58. event.wait();
  59. e_gpu = std::chrono::high_resolution_clock::now();
  60. e_t = std::chrono::high_resolution_clock::now();
  61. duration_cpu = std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();
  62. duration_gpu = std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
  63. duration_total = std::chrono::duration<float, std::milli>(e_t - s_t).count();
  64. // Copy back from GPU to CPU
  65. my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
  66. printf("\n CPU Computation, Time = %lf \n", duration_cpu);
  67. printf("\n GPU Computation, Time = %lf \n", duration_gpu);
  68. printf("\n Total Computation, TIme = %lf \n", duration_total);
  69. free(cpu_out);
  70. free(host_mem, my_gpu_queue);
  71. free(device_mem, my_gpu_queue);
  72. printf("\nTask Done!\n");
  73. return 0;
  74. }