// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: MIT

#include <CL/sycl.hpp>
#include <chrono>
#include <cstdio>
#include <iostream>

using namespace sycl;

constexpr int64_t N = 10000000;

int main() {
  // Create an in-order queue on the GPU device
  queue my_gpu_queue(gpu_selector{}, property::queue::in_order());
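  // With an in-order queue, commands execute in submission order, so the
  // memcpy and kernels below need no explicit dependency handling.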

  std::cout << "Selected GPU device: " <<
    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";

  int *cpu_out = (int*)malloc(N * sizeof(int));
  int *host_mem = malloc_host<int>(N, my_gpu_queue);
  int *device_mem = malloc_device<int>(N, my_gpu_queue);
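  // cpu_out is an ordinary heap buffer used only by the CPU loop, host_mem is
  // USM host memory accessible from host and device, and device_mem is USM
  // device memory that is reached only through kernels and explicit memcpy.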

  // Init CPU data
  for(int64_t i = 0; i < N; i++) {
    host_mem[i] = i % 6666;
    cpu_out[i] = i % 6666;
  }

  float duration_cpu = 0.0;
  float duration_gpu = 0.0;
  float duration_total = 0.0;

  std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
  std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
  std::chrono::high_resolution_clock::time_point s_t, e_t;

  // warmup
  /*********************************************************************/
  for(int64_t i = 0; i < N; i++) {
    cpu_out[i] = cpu_out[i] * 2;
  }

  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();

  my_gpu_queue.submit([&](handler& h) {
    // Parallel Computation
    h.parallel_for(range{N}, [=](id<1> item) {
      device_mem[item] *= 2;
    });
  });

  my_gpu_queue.wait();
  /*********************************************************************/
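  // The untimed warmup above absorbs one-time costs such as lazy runtime
  // initialization and (typically) JIT compilation of the kernel, so they do
  // not distort the timed measurements below.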

  printf("\n Start CPU Computation, Number of Elems = %ld \n", N);

  s_t = std::chrono::high_resolution_clock::now();

  // GPU Computation
  // Submit the kernel to the queue; submission is asynchronous
  s_gpu = std::chrono::high_resolution_clock::now();
  auto event = my_gpu_queue.submit([&](handler& h) {
    // Parallel Computation
    h.parallel_for(range{N}, [=](id<1> item) {
      device_mem[item] *= 2;
    });
  });
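  // submit() returns as soon as the command group is enqueued, so the kernel
  // runs on the GPU while the host thread continues with the CPU loop below.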

  // CPU computation
  s_cpu = std::chrono::high_resolution_clock::now();
  for(int64_t i = 0; i < N; i++) {
    cpu_out[i] *= 2;
  }
  e_cpu = std::chrono::high_resolution_clock::now();

  // To test CPU/GPU overlap, the wait() on the GPU event is delayed until
  // after the CPU computation has finished
  event.wait();
  e_gpu = std::chrono::high_resolution_clock::now();

  e_t = std::chrono::high_resolution_clock::now();

  duration_cpu = std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();
  duration_gpu = std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
  duration_total = std::chrono::duration<float, std::milli>(e_t - s_t).count();
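  // Since e_gpu is taken only after event.wait(), which runs after the CPU
  // loop, duration_gpu covers the whole overlapped window; good overlap shows
  // up as duration_total staying close to the larger of the two computations
  // rather than to their sum.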

  // Copy back from GPU to CPU
  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
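
  // Optional sanity check: both paths start from i % 6666 and are doubled the
  // same number of times (once in the warmup, once in the timed section), so
  // cpu_out and host_mem should now match element-wise.
  bool results_match = true;
  for(int64_t i = 0; i < N; i++) {
    if (host_mem[i] != cpu_out[i]) { results_match = false; break; }
  }
  printf("\n Result check: %s \n", results_match ? "PASS" : "FAIL");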

  printf("\n CPU Computation, Time = %lf ms \n", duration_cpu);
  printf("\n GPU Computation, Time = %lf ms \n", duration_gpu);
  printf("\n Total Computation, Time = %lf ms \n", duration_total);

  free(cpu_out);
  free(host_mem, my_gpu_queue);
  free(device_mem, my_gpu_queue);

  printf("\nTask Done!\n");

  return 0;
}