Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

101 рядки
3.1 KiB

// Patric Zhao, patric.zhao@intel.com
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <CL/sycl.hpp>
using namespace sycl;
// Number of int elements in every array used by main(): it sizes the host and
// device allocations, bounds the CPU loops, and is the GPU kernel range.
constexpr int64_t N = 10000000;
// Demonstrates overlapping host (CPU) work with an asynchronously submitted
// GPU kernel. Both sides double every element of an N-element int array; the
// wait on the kernel's event is deliberately deferred until the CPU loop has
// finished, so the two computations can run concurrently.
//
// Prints three host-clock timings in milliseconds (CPU loop, GPU submit-to-
// completion, and the total span). Returns 0 on success and 1 if any of the
// three buffers cannot be allocated.
int main() {
    using clock = std::chrono::high_resolution_clock;
    using millis = std::chrono::duration<float, std::milli>;

    // NOTE: the queue is created without any profiling property; every timing
    // below is taken with the host clock, not with SYCL event profiling.
    queue my_gpu_queue(gpu_selector{});
    std::cout << "Selected GPU device: "
              << my_gpu_queue.get_device().get_info<info::device::name>() << "\n";

    int *cpu_out = static_cast<int *>(std::malloc(N * sizeof(int)));
    int *host_mem = malloc_host<int>(N, my_gpu_queue);
    int *device_mem = malloc_device<int>(N, my_gpu_queue);

    // All three allocators report failure by returning a null pointer; bail
    // out before anything dereferences one, releasing whatever did succeed.
    if (cpu_out == nullptr || host_mem == nullptr || device_mem == nullptr) {
        std::cerr << "Memory allocation failed\n";
        std::free(cpu_out);
        if (host_mem != nullptr) {
            free(host_mem, my_gpu_queue);
        }
        if (device_mem != nullptr) {
            free(device_mem, my_gpu_queue);
        }
        return 1;
    }

    // Init CPU data: the same pattern goes into the CPU-only array and into
    // the host staging buffer that is copied to the device below.
    for (int64_t i = 0; i < N; i++) {
        const int value = static_cast<int>(i % 6666);
        host_mem[i] = value;
        cpu_out[i] = value;
    }

    // warmup
    /*********************************************************************/
    // One untimed pass on each side (plus the host-to-device upload) before
    // the measured run.
    for (int64_t i = 0; i < N; i++) {
        cpu_out[i] = cpu_out[i] * 2;
    }
    my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
    my_gpu_queue.submit([&](handler& h) {
        // Parallel Computation
        h.parallel_for(range{N}, [=](id<1> item) {
            device_mem[item] *= 2;
        });
    });
    my_gpu_queue.wait();
    /*********************************************************************/

    // N is int64_t; cast to long long so the argument matches %lld on every
    // platform (%ld is wrong where long is 32 bits wide).
    std::printf("\n Start CPU Computation, Number of Elems = %lld \n",
                static_cast<long long>(N));

    const auto s_t = clock::now();

    // GPU Computation
    // submit the content to the queue for execution; the returned event is
    // waited on only after the CPU loop below.
    const auto s_gpu = clock::now();
    auto event = my_gpu_queue.submit([&](handler& h) {
        // Parallel Computation
        h.parallel_for(range{N}, [=](id<1> item) {
            device_mem[item] *= 2;
        });
    });

    // CPU computation
    const auto s_cpu = clock::now();
    for (int64_t i = 0; i < N; i++) {
        cpu_out[i] *= 2;
    }
    const auto e_cpu = clock::now();

    // Testing overlapping between CPU and GPU
    // Delay the wait() after CPU computation. As a consequence the GPU
    // interval below spans submission through completion and therefore also
    // covers the time spent in the CPU loop.
    event.wait();
    const auto e_gpu = clock::now();
    const auto e_t = clock::now();

    const float duration_cpu = millis(e_cpu - s_cpu).count();
    const float duration_gpu = millis(e_gpu - s_gpu).count();
    const float duration_total = millis(e_t - s_t).count();

    // Copy back from GPU to CPU
    my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();

    std::printf("\n CPU Computation, Time = %lf \n", duration_cpu);
    std::printf("\n GPU Computation, Time = %lf \n", duration_gpu);
    std::printf("\n Total Computation, TIme = %lf \n", duration_total);

    std::free(cpu_out);
    free(host_mem, my_gpu_queue);
    free(device_mem, my_gpu_queue);

    std::printf("\nTask Done!\n");
    return 0;
}