// Copyright (C) 2020 Intel Corporation
|
|
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <memory>

#include <CL/sycl.hpp>
|
|
using namespace sycl;
|
|
|
|
// Number of int elements allocated and processed by the CPU and GPU paths in main().
constexpr int64_t N = 10000000;
|
|
|
|
int main() {
|
|
|
|
// Enable queue profiling
|
|
auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
|
|
queue my_gpu_queue(gpu_selector{}, propList);
|
|
|
|
std::cout << "Selected GPU device: " <<
|
|
my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
|
|
|
|
|
|
|
|
int *host_mem = malloc_host<int>(N, my_gpu_queue);
|
|
int *cpu_mem = malloc_host<int>(N, my_gpu_queue);
|
|
int *device_mem = malloc_device<int>(N, my_gpu_queue);
|
|
|
|
// Init CPU data
|
|
for(int64_t i = 0; i < N; i++) {
|
|
host_mem[i] = i % 6666;
|
|
}
|
|
|
|
float duration_cpu = 0.0;
|
|
float duration_gpu_a = 0.0;
|
|
float duration_gpu_b = 0.0;
|
|
float duration_gpu_c = 0.0;
|
|
|
|
std::chrono::high_resolution_clock::time_point s, e;
|
|
std::chrono::high_resolution_clock::time_point s_a, e_a;
|
|
std::chrono::high_resolution_clock::time_point s_b, e_b;
|
|
std::chrono::high_resolution_clock::time_point s_c, e_c;
|
|
|
|
// CPU computation
|
|
printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
|
|
|
|
s = std::chrono::high_resolution_clock::now();
|
|
// CPU code here
|
|
for(int64_t i = 0; i < N; i++) {
|
|
cpu_mem[i] = host_mem[i] * 2;
|
|
}
|
|
e = std::chrono::high_resolution_clock::now();
|
|
duration_cpu = std::chrono::duration<float, std::milli>(e - s).count();
|
|
printf("\n End CPU Computation, Time = %lf \n", duration_cpu);
|
|
|
|
|
|
// warmup
|
|
/*********************************************************************/
|
|
my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
|
|
my_gpu_queue.submit([&](handler& h) {
|
|
|
|
// Parallel Computation
|
|
h.parallel_for(range{N}, [=](id<1> item) {
|
|
device_mem[item] *= 2;
|
|
});
|
|
|
|
});
|
|
my_gpu_queue.wait();
|
|
/*********************************************************************/
|
|
|
|
s_c = std::chrono::high_resolution_clock::now();
|
|
// Copy from host(CPU) to device(GPU)
|
|
my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
|
|
|
|
s_b = std::chrono::high_resolution_clock::now();
|
|
s_a = std::chrono::high_resolution_clock::now();
|
|
// submit the content to the queue for execution
|
|
auto event = my_gpu_queue.submit([&](handler& h) {
|
|
|
|
// Parallel Computation
|
|
h.parallel_for(range{N}, [=](id<1> item) {
|
|
device_mem[item] *= 2;
|
|
});
|
|
|
|
});
|
|
// wait the computation done
|
|
my_gpu_queue.wait();
|
|
e_b = std::chrono::high_resolution_clock::now();
|
|
duration_gpu_b = std::chrono::duration<float, std::milli>(e_b - s_b).count();
|
|
|
|
duration_gpu_a =
|
|
(event.get_profiling_info<info::event_profiling::command_end>() -
|
|
event.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
|
|
|
|
// Copy back from GPU to CPU
|
|
my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
|
|
e_c = std::chrono::high_resolution_clock::now();
|
|
duration_gpu_c = std::chrono::duration<float, std::milli>(e_c - s_c).count();
|
|
|
|
printf("\n GPU Computation, GPU Time A = %lf \n", duration_gpu_a);
|
|
printf("\n GPU Computation, GPU Time B = %lf \n", duration_gpu_b);
|
|
printf("\n GPU Computation, GPU Time C = %lf \n", duration_gpu_c);
|
|
|
|
printf("\nTask Done!\n");
|
|
|
|
return 0;
|
|
}
|
|
|