// Patric Zhao, patric.zhao@intel.com
|
|
|
|
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <CL/sycl.hpp>
|
|
using namespace sycl;
|
|
|
|
// Number of int elements in every buffer of this benchmark (ten million).
constexpr int64_t N = 10'000'000;
|
|
|
|
int main() {
|
|
|
|
// Enable queue profiling
|
|
queue my_gpu_queue(gpu_selector{});
|
|
|
|
std::cout << "Selected GPU device: " <<
|
|
my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
|
|
|
|
|
|
int *cpu_out = (int*)malloc(N * sizeof(int));
|
|
int *host_mem = malloc_host<int>(N, my_gpu_queue);
|
|
int *device_mem = malloc_device<int>(N, my_gpu_queue);
|
|
|
|
// Init CPU data
|
|
for(int64_t i = 0; i < N; i++) {
|
|
host_mem[i] = i % 6666;
|
|
cpu_out[i] = i % 6666;
|
|
}
|
|
|
|
float duration_cpu = 0.0;
|
|
float duration_gpu = 0.0;
|
|
float duration_total = 0.0;
|
|
|
|
std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
|
|
std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
|
|
std::chrono::high_resolution_clock::time_point s_t, e_t;
|
|
|
|
// warmup
|
|
/*********************************************************************/
|
|
for(int64_t i = 0; i < N; i++) {
|
|
cpu_out[i] = cpu_out[i] * 2;
|
|
}
|
|
|
|
my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
|
|
my_gpu_queue.submit([&](handler& h) {
|
|
|
|
// Parallel Computation
|
|
h.parallel_for(range{N}, [=](id<1> item) {
|
|
device_mem[item] *= 2;
|
|
});
|
|
|
|
});
|
|
my_gpu_queue.wait();
|
|
/*********************************************************************/
|
|
|
|
printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
|
|
|
|
s_t = std::chrono::high_resolution_clock::now();
|
|
|
|
// GPU Computation
|
|
// submit the content to the queue for execution
|
|
s_gpu = std::chrono::high_resolution_clock::now();
|
|
auto event = my_gpu_queue.submit([&](handler& h) {
|
|
// Parallel Computation
|
|
h.parallel_for(range{N}, [=](id<1> item) {
|
|
device_mem[item] *= 2;
|
|
});
|
|
});
|
|
|
|
// CPU computation
|
|
s_cpu = std::chrono::high_resolution_clock::now();
|
|
for(int64_t i = 0; i < N; i++) {
|
|
cpu_out[i] *= 2;
|
|
}
|
|
e_cpu = std::chrono::high_resolution_clock::now();
|
|
|
|
// Testing overlapping between CPU and GPU
|
|
// Delay the wait() after CPU computation
|
|
event.wait();
|
|
e_gpu = std::chrono::high_resolution_clock::now();
|
|
|
|
e_t = std::chrono::high_resolution_clock::now();
|
|
|
|
duration_cpu = std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();
|
|
duration_gpu = std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
|
|
duration_total = std::chrono::duration<float, std::milli>(e_t - s_t).count();
|
|
|
|
// Copy back from GPU to CPU
|
|
my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
|
|
|
|
printf("\n CPU Computation, Time = %lf \n", duration_cpu);
|
|
printf("\n GPU Computation, Time = %lf \n", duration_gpu);
|
|
printf("\n Total Computation, TIme = %lf \n", duration_total);
|
|
|
|
free(cpu_out);
|
|
free(host_mem, my_gpu_queue);
|
|
free(device_mem, my_gpu_queue);
|
|
|
|
printf("\nTask Done!\n");
|
|
|
|
return 0;
|
|
}
|
|
|