// Patric Zhao, patric.zhao@intel.com #include #include using namespace sycl; constexpr int64_t N = 10000000; int main() { // Enable queue profiling queue my_gpu_queue(gpu_selector{}); std::cout << "Selected GPU device: " << my_gpu_queue.get_device().get_info() << "\n"; int *cpu_out = (int*)malloc(N * sizeof(int)); int *host_mem = malloc_host(N, my_gpu_queue); int *device_mem = malloc_device(N, my_gpu_queue); // Init CPU data for(int64_t i = 0; i < N; i++) { host_mem[i] = i % 6666; cpu_out[i] = i % 6666; } float duration_cpu = 0.0; float duration_gpu = 0.0; float duration_total = 0.0; std::chrono::high_resolution_clock::time_point s_cpu, e_cpu; std::chrono::high_resolution_clock::time_point s_gpu, e_gpu; std::chrono::high_resolution_clock::time_point s_t, e_t; // warmup /*********************************************************************/ for(int64_t i = 0; i < N; i++) { cpu_out[i] = cpu_out[i] * 2; } my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait(); my_gpu_queue.submit([&](handler& h) { // Parallel Computation h.parallel_for(range{N}, [=](id<1> item) { device_mem[item] *= 2; }); }); my_gpu_queue.wait(); /*********************************************************************/ printf("\n Start CPU Computation, Number of Elems = %ld \n", N); s_t = std::chrono::high_resolution_clock::now(); // GPU Computation // submit the content to the queue for execution s_gpu = std::chrono::high_resolution_clock::now(); auto event = my_gpu_queue.submit([&](handler& h) { // Parallel Computation h.parallel_for(range{N}, [=](id<1> item) { device_mem[item] *= 2; }); }); // CPU computation s_cpu = std::chrono::high_resolution_clock::now(); for(int64_t i = 0; i < N; i++) { cpu_out[i] *= 2; } e_cpu = std::chrono::high_resolution_clock::now(); // Testing overlapping between CPU and GPU // Delay the wait() after CPU computation event.wait(); e_gpu = std::chrono::high_resolution_clock::now(); e_t = std::chrono::high_resolution_clock::now(); duration_cpu = std::chrono::duration(e_cpu - s_cpu).count(); duration_gpu = std::chrono::duration(e_gpu - s_gpu).count(); duration_total = std::chrono::duration(e_t - s_t).count(); // Copy back from GPU to CPU my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait(); printf("\n CPU Computation, Time = %lf \n", duration_cpu); printf("\n GPU Computation, Time = %lf \n", duration_gpu); printf("\n Total Computation, TIme = %lf \n", duration_total); free(cpu_out); free(host_mem, my_gpu_queue); free(device_mem, my_gpu_queue); printf("\nTask Done!\n"); return 0; }