// Patric Zhao: patric.zhao@intel.com
#include <chrono>
#include <iostream>
#include <cstdio>
#include <CL/sycl.hpp>

using namespace sycl;

constexpr int64_t N = 10000000;

int main() {

  // Enable queue profiling
  auto propList = cl::sycl::property_list{cl::sycl::property::queue::enable_profiling()};
  queue my_gpu_queue(gpu_selector{}, propList);

  std::cout << "Selected GPU device: "
            << my_gpu_queue.get_device().get_info<info::device::name>() << "\n";

  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
  int *cpu_mem    = malloc_host<int>(N, my_gpu_queue);
  int *device_mem = malloc_device<int>(N, my_gpu_queue);

  // Init CPU data
  for (int64_t i = 0; i < N; i++) {
    host_mem[i] = i % 6666;
  }

  float duration_cpu   = 0.0;
  float duration_gpu_a = 0.0;
  float duration_gpu_b = 0.0;
  float duration_gpu_c = 0.0;

  std::chrono::high_resolution_clock::time_point s, e;
  std::chrono::high_resolution_clock::time_point s_a, e_a;
  std::chrono::high_resolution_clock::time_point s_b, e_b;
  std::chrono::high_resolution_clock::time_point s_c, e_c;

  // CPU computation
  printf("\n Start CPU Computation, Number of Elems = %ld \n", N);

  s = std::chrono::high_resolution_clock::now();
  // CPU code here
  for (int64_t i = 0; i < N; i++) {
    cpu_mem[i] = host_mem[i] * 2;
  }
  e = std::chrono::high_resolution_clock::now();
  duration_cpu = std::chrono::duration<float, std::milli>(e - s).count();
  printf("\n End CPU Computation, Time = %lf \n", duration_cpu);

  // warmup
  /*********************************************************************/
  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
  my_gpu_queue.submit([&](handler& h) {
    // Parallel Computation
    h.parallel_for(range{N}, [=](id<1> item) {
      device_mem[item] *= 2;
    });
  });
  my_gpu_queue.wait();
  /*********************************************************************/

  s_c = std::chrono::high_resolution_clock::now();
  // Copy from host(CPU) to device(GPU)
  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();

  s_b = std::chrono::high_resolution_clock::now();
  s_a = std::chrono::high_resolution_clock::now();

  // submit the content to the queue for execution
  auto event = my_gpu_queue.submit([&](handler& h) {
    // Parallel Computation
    h.parallel_for(range{N}, [=](id<1> item) {
      device_mem[item] *= 2;
    });
  });

  // wait until the computation is done
  my_gpu_queue.wait();
  e_b = std::chrono::high_resolution_clock::now();
  duration_gpu_b = std::chrono::duration<float, std::milli>(e_b - s_b).count();

  // GPU Time A: pure kernel execution time from the event profiling info,
  // converted from nanoseconds to milliseconds
  duration_gpu_a =
      (event.get_profiling_info<info::event_profiling::command_end>() -
       event.get_profiling_info<info::event_profiling::command_start>()) / 1000.0f / 1000.0f;

  // Copy back from GPU to CPU
  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
  e_c = std::chrono::high_resolution_clock::now();
  duration_gpu_c = std::chrono::duration<float, std::milli>(e_c - s_c).count();

  printf("\n GPU Computation, GPU Time A = %lf \n", duration_gpu_a);
  printf("\n GPU Computation, GPU Time B = %lf \n", duration_gpu_b);
  printf("\n GPU Computation, GPU Time C = %lf \n", duration_gpu_c);

  printf("\nTask Done!\n");

  return 0;
}