diff --git a/code/async.cpp b/code/async.cpp new file mode 100644 index 0000000..31ef769 --- /dev/null +++ b/code/async.cpp @@ -0,0 +1,103 @@ + // Copyright (C) 2020 Intel Corporation + + // SPDX-License-Identifier: MIT + + #include + #include + using namespace sycl; + + constexpr int64_t N = 10000000; + + int main() { + + // Enable queue profiling + queue my_gpu_queue(gpu_selector{}, cl::sycl::property::queue::in_order()); + + std::cout << "Selected GPU device: " << + my_gpu_queue.get_device().get_info() << "\n"; + + + int *cpu_out = (int*)malloc(N * sizeof(int)); + int *host_mem = malloc_host(N, my_gpu_queue); + int *device_mem = malloc_device(N, my_gpu_queue); + + // Init CPU data + for(int64_t i = 0; i < N; i++) { + host_mem[i] = i % 6666; + cpu_out[i] = i % 6666; + } + + float duration_cpu = 0.0; + float duration_gpu = 0.0; + float duration_total = 0.0; + + std::chrono::high_resolution_clock::time_point s_cpu, e_cpu; + std::chrono::high_resolution_clock::time_point s_gpu, e_gpu; + std::chrono::high_resolution_clock::time_point s_t, e_t; + + // warmup + /*********************************************************************/ + for(int64_t i = 0; i < N; i++) { + cpu_out[i] = cpu_out[i] * 2; + } + + my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait(); + my_gpu_queue.submit([&](handler& h) { + + // Parallel Computation + h.parallel_for(range{N}, [=](id<1> item) { + device_mem[item] *= 2; + }); + + }); + my_gpu_queue.wait(); + /*********************************************************************/ + + printf("\n Start CPU Computation, Number of Elems = %ld \n", N); + + s_t = std::chrono::high_resolution_clock::now(); + + // GPU Computation + // submit the content to the queue for execution + s_gpu = std::chrono::high_resolution_clock::now(); + auto event = my_gpu_queue.submit([&](handler& h) { + // Parallel Computation + h.parallel_for(range{N}, [=](id<1> item) { + device_mem[item] *= 2; + }); + }); + + // CPU computation + s_cpu = std::chrono::high_resolution_clock::now(); + for(int64_t i = 0; i < N; i++) { + cpu_out[i] *= 2; + } + e_cpu = std::chrono::high_resolution_clock::now(); + + // Testing overlapping between CPU and GPU + // Delay the wait() after CPU computation + event.wait(); + e_gpu = std::chrono::high_resolution_clock::now(); + + e_t = std::chrono::high_resolution_clock::now(); + + duration_cpu = std::chrono::duration(e_cpu - s_cpu).count(); + duration_gpu = std::chrono::duration(e_gpu - s_gpu).count(); + duration_total = std::chrono::duration(e_t - s_t).count(); + + // Copy back from GPU to CPU + my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait(); + + printf("\n CPU Computation, Time = %lf \n", duration_cpu); + printf("\n GPU Computation, Time = %lf \n", duration_gpu); + printf("\n Total Computation, TIme = %lf \n", duration_total); + + free(cpu_out); + free(host_mem, my_gpu_queue); + free(device_mem, my_gpu_queue); + + printf("\nTask Done!\n"); + + return 0; + } +