From 4943e036cefcb8b6ac8a68683425f47b0dab74dd Mon Sep 17 00:00:00 2001
From: PatricZhao
Date: Fri, 22 Oct 2021 19:07:15 +0800
Subject: [PATCH] Add example

---
 code/async.cpp            |  94 +++++++++++++++++++++++++++++++++++++++
 code/basic_parafor.cpp    |  52 +++++++++++++++++++++
 code/data_movement_ex.cpp |  43 ++++++++++++++++++
 code/gpu_selector.cpp     |  17 +++++++
 code/timer.cpp            | 106 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 312 insertions(+)
 create mode 100644 code/async.cpp
 create mode 100644 code/basic_parafor.cpp
 create mode 100644 code/data_movement_ex.cpp
 create mode 100644 code/gpu_selector.cpp
 create mode 100644 code/timer.cpp

diff --git a/code/async.cpp b/code/async.cpp
new file mode 100644
index 0000000..154893f
--- /dev/null
+++ b/code/async.cpp
@@ -0,0 +1,94 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <chrono>  // host-side wall-clock timing below
+using namespace sycl;
+
+constexpr int64_t N = 10000000;
+
+int main() {
+
+  // Enable queue profiling
+  auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
+  queue my_gpu_queue(gpu_selector{}, propList);
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+
+  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
+  int *cpu_mem    = malloc_host<int>(N, my_gpu_queue);
+  int *device_mem = malloc_device<int>(N, my_gpu_queue);
+
+  // Init CPU data
+  for(int64_t i = 0; i < N; i++) {
+    host_mem[i] = i % 6666;
+  }
+
+  float duration_cpu = 0.0;
+  float duration_gpu = 0.0;
+  float duration_total = 0.0;
+
+  std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
+  std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
+  std::chrono::high_resolution_clock::time_point s_t, e_t;
+
+  // warmup
+  /*********************************************************************/
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+  my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+  my_gpu_queue.wait();
+  /*********************************************************************/
+
+  // CPU computation
+  printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
+
+  s_t = std::chrono::high_resolution_clock::now();
+  s_cpu = std::chrono::high_resolution_clock::now();
+  // CPU code here
+  for(int64_t i = 0; i < N; i++) {
+    cpu_mem[i] = host_mem[i] * 2;
+  }
+  e_cpu = std::chrono::high_resolution_clock::now();
+  duration_cpu = std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();
+
+
+  s_gpu = std::chrono::high_resolution_clock::now();
+  // submit the content to the queue for execution
+  auto event = my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+  // wait the computation done
+  my_gpu_queue.wait();
+  e_gpu = std::chrono::high_resolution_clock::now();
+  e_t = std::chrono::high_resolution_clock::now();
+  duration_gpu = std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
+  duration_total = std::chrono::duration<float, std::milli>(e_t - s_t).count();
+
+  // Copy back from GPU to CPU
+  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
+
+  printf("\n CPU Computation, Time = %lf \n", duration_cpu);
+  printf("\n GPU Computation, Time = %lf \n", duration_gpu);
+  printf("\n Total Computation, Time = %lf \n", duration_total);
+
+  printf("\nTask Done!\n");
+
+  return 0;
+}
+
diff --git a/code/basic_parafor.cpp b/code/basic_parafor.cpp
new file mode 100644
index 0000000..7f8dde2
--- /dev/null
+++ b/code/basic_parafor.cpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+using namespace sycl;
+
+constexpr int N = 10;
+
+int main() {
+  queue my_gpu_queue( gpu_selector{} );
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
+  int *device_mem = malloc_device<int>(N, my_gpu_queue);
+
+  // Init CPU data
+  for(int i = 0; i < N; i++) {
+    host_mem[i] = i;
+  }
+
+  // Copy from host(CPU) to device(GPU)
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+
+  // submit the content to the queue for execution
+  my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+
+  // wait the computation done
+  my_gpu_queue.wait();
+
+  // Copy back from GPU to CPU
+  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
+
+  printf("\nData Result\n");
+  for(int i = 0; i < N; i++) {
+    printf("%d, ", host_mem[i]);
+  }
+  printf("\nTask Done!\n");
+
+  return 0;
+}
+
diff --git a/code/data_movement_ex.cpp b/code/data_movement_ex.cpp
new file mode 100644
index 0000000..86c5256
--- /dev/null
+++ b/code/data_movement_ex.cpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+using namespace sycl;
+
+constexpr int N = 10;
+
+int main() {
+  queue my_gpu_queue( gpu_selector{} );
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
+  int *device_mem = malloc_device<int>(N, my_gpu_queue);
+
+  // Init CPU data
+  for(int i = 0; i < N; i++) {
+    host_mem[i] = i;
+  }
+
+  // Copy from host(CPU) to device(GPU)
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+
+  // do some works on GPU
+  // ......
+  //
+
+  // Copy back from GPU to CPU
+  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
+
+  printf("\nData Result\n");
+  for(int i = 0; i < N; i++) {
+    printf("%d, ", host_mem[i]);
+  }
+  printf("\nTask Done!\n");
+
+  return 0;
+}
+
diff --git a/code/gpu_selector.cpp b/code/gpu_selector.cpp
new file mode 100644
index 0000000..1378854
--- /dev/null
+++ b/code/gpu_selector.cpp
@@ -0,0 +1,17 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+using namespace sycl;
+
+int main() {
+  queue my_gpu_queue( gpu_selector{} );
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+  return 0;
+}
+
diff --git a/code/timer.cpp b/code/timer.cpp
new file mode 100644
index 0000000..81b4416
--- /dev/null
+++ b/code/timer.cpp
@@ -0,0 +1,106 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <chrono>  // host-side wall-clock timing below
+using namespace sycl;
+
+constexpr int64_t N = 10000000;
+
+int main() {
+
+  // Enable queue profiling
+  auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
+  queue my_gpu_queue(gpu_selector{}, propList);
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+
+
+  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
+  int *cpu_mem    = malloc_host<int>(N, my_gpu_queue);
+  int *device_mem = malloc_device<int>(N, my_gpu_queue);
+
+  // Init CPU data
+  for(int64_t i = 0; i < N; i++) {
+    host_mem[i] = i % 6666;
+  }
+
+  float duration_cpu = 0.0;
+  float duration_gpu_a = 0.0;
+  float duration_gpu_b = 0.0;
+  float duration_gpu_c = 0.0;
+
+  std::chrono::high_resolution_clock::time_point s, e;
+  std::chrono::high_resolution_clock::time_point s_a, e_a;
+  std::chrono::high_resolution_clock::time_point s_b, e_b;
+  std::chrono::high_resolution_clock::time_point s_c, e_c;
+
+  // CPU computation
+  printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
+
+  s = std::chrono::high_resolution_clock::now();
+  // CPU code here
+  for(int64_t i = 0; i < N; i++) {
+    cpu_mem[i] = host_mem[i] * 2;
+  }
+  e = std::chrono::high_resolution_clock::now();
+  duration_cpu = std::chrono::duration<float, std::milli>(e - s).count();
+  printf("\n End CPU Computation, Time = %lf \n", duration_cpu);
+
+
+  // warmup
+  /*********************************************************************/
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+  my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+  my_gpu_queue.wait();
+  /*********************************************************************/
+
+  s_c = std::chrono::high_resolution_clock::now();
+  // Copy from host(CPU) to device(GPU)
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+
+  s_b = std::chrono::high_resolution_clock::now();
+  s_a = std::chrono::high_resolution_clock::now();
+  // submit the content to the queue for execution
+  auto event = my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+  // wait the computation done
+  my_gpu_queue.wait();
+  e_b = std::chrono::high_resolution_clock::now();
+  duration_gpu_b = std::chrono::duration<float, std::milli>(e_b - s_b).count();
+
+  duration_gpu_a =
+      (event.get_profiling_info<info::event_profiling::command_end>() -
+       event.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;  // ns -> ms
+
+  // Copy back from GPU to CPU
+  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
+  e_c = std::chrono::high_resolution_clock::now();
+  duration_gpu_c = std::chrono::duration<float, std::milli>(e_c - s_c).count();
+
+  printf("\n GPU Computation, GPU Time A = %lf \n", duration_gpu_a);
+  printf("\n GPU Computation, GPU Time B = %lf \n", duration_gpu_b);
+  printf("\n GPU Computation, GPU Time C = %lf \n", duration_gpu_c);
+
+  printf("\nTask Done!\n");
+
+  return 0;
+}