You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

92 lines
3.0 KiB

3 years ago
3 years ago
3 years ago
  // Copyright (C) 2020 Intel Corporation
  // Patric Zhao: patric.zhao@intel.com
  #include <CL/sycl.hpp>

  #include <chrono>
  #include <cstdint>
  #include <cstdio>
  #include <iostream>
  #include <memory>
  5. using namespace sycl;
  6. constexpr int64_t N = 10000000;
  7. int main() {
  8. // Enable queue profiling
  9. auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
  10. queue my_gpu_queue(gpu_selector{}, propList);
  11. std::cout << "Selected GPU device: " <<
  12. my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
  13. int *host_mem = malloc_host<int>(N, my_gpu_queue);
  14. int *cpu_mem = malloc_host<int>(N, my_gpu_queue);
  15. int *device_mem = malloc_device<int>(N, my_gpu_queue);
  16. // Init CPU data
  17. for(int64_t i = 0; i < N; i++) {
  18. host_mem[i] = i % 6666;
  19. }
  20. float duration_cpu = 0.0;
  21. float duration_gpu = 0.0;
  22. float duration_total = 0.0;
  23. std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
  24. std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
  25. std::chrono::high_resolution_clock::time_point s_t, e_t;
  26. // warmup
  27. /*********************************************************************/
  28. my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
  29. my_gpu_queue.submit([&](handler& h) {
  30. // Parallel Computation
  31. h.parallel_for(range{N}, [=](id<1> item) {
  32. device_mem[item] *= 2;
  33. });
  34. });
  35. my_gpu_queue.wait();
  36. /*********************************************************************/
  37. // CPU computation
  38. printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
  39. s_t = std::chrono::high_resolution_clock::now();
  40. s_cpu = std::chrono::high_resolution_clock::now();
  41. // CPU code here
  42. for(int64_t i = 0; i < N; i++) {
  43. cpu_mem[i] = host_mem[i] * 2;
  44. }
  45. e_cpu = std::chrono::high_resolution_clock::now();
  46. duration_cpu = std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();
  47. s_gpu = std::chrono::high_resolution_clock::now();
  48. // submit the content to the queue for execution
  49. auto event = my_gpu_queue.submit([&](handler& h) {
  50. // Parallel Computation
  51. h.parallel_for(range{N}, [=](id<1> item) {
  52. device_mem[item] *= 2;
  53. });
  54. });
  55. // wait the computation done
  56. my_gpu_queue.wait();
  57. e_gpu = std::chrono::high_resolution_clock::now();
  58. e_t = std::chrono::high_resolution_clock::now();
  59. duration_gpu = std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
  60. duration_total = std::chrono::duration<float, std::milli>(e_t - s_t).count();
  61. // Copy back from GPU to CPU
  62. my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
  63. printf("\n CPU Computation, Time = %lf \n", duration_cpu);
  64. printf("\n GPU Computation, Time = %lf \n", duration_gpu);
  65. printf("\n Total Computation, TIme = %lf \n", duration_total);
  66. printf("\nTask Done!\n");
  67. return 0;
  68. }