From 4943e036cefcb8b6ac8a68683425f47b0dab74dd Mon Sep 17 00:00:00 2001
From: PatricZhao
Date: Fri, 22 Oct 2021 19:07:15 +0800
Subject: [PATCH] Add example

---
 code/async.cpp            |  94 +++++++++++++++++++++++++++++++++++++++
 code/basic_parafor.cpp    |  52 +++++++++++++++++++++
 code/data_movement_ex.cpp |  43 ++++++++++++++++++
 code/gpu_selector.cpp     |  17 +++++++
 code/timer.cpp            | 106 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 312 insertions(+)
 create mode 100644 code/async.cpp
 create mode 100644 code/basic_parafor.cpp
 create mode 100644 code/data_movement_ex.cpp
 create mode 100644 code/gpu_selector.cpp
 create mode 100644 code/timer.cpp

diff --git a/code/async.cpp b/code/async.cpp
new file mode 100644
index 0000000..154893f
--- /dev/null
+++ b/code/async.cpp
@@ -0,0 +1,94 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <chrono>  // host-side wall-clock timing below
+using namespace sycl;
+
+constexpr int64_t N = 10000000;
+
+int main() {
+
+  // Enable queue profiling
+  auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
+  queue my_gpu_queue(gpu_selector{}, propList);
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+
+  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
+  int *cpu_mem    = malloc_host<int>(N, my_gpu_queue);
+  int *device_mem = malloc_device<int>(N, my_gpu_queue);
+
+  // Init CPU data
+  for(int64_t i = 0; i < N; i++) {
+    host_mem[i] = i % 6666;
+  }
+
+  float duration_cpu = 0.0;
+  float duration_gpu = 0.0;
+  float duration_total = 0.0;
+
+  std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
+  std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
+  std::chrono::high_resolution_clock::time_point s_t, e_t;
+
+  // warmup
+  /*********************************************************************/
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+  my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+  my_gpu_queue.wait();
+  /*********************************************************************/
+
+  // CPU computation
+  printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
+
+  s_t = std::chrono::high_resolution_clock::now();
+  s_cpu = std::chrono::high_resolution_clock::now();
+  // CPU code here
+  for(int64_t i = 0; i < N; i++) {
+    cpu_mem[i] = host_mem[i] * 2;
+  }
+  e_cpu = std::chrono::high_resolution_clock::now();
+  duration_cpu = std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();
+
+
+  s_gpu = std::chrono::high_resolution_clock::now();
+  // submit the content to the queue for execution
+  auto event = my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+  // wait the computation done
+  my_gpu_queue.wait();
+  e_gpu = std::chrono::high_resolution_clock::now();
+  e_t = std::chrono::high_resolution_clock::now();
+  duration_gpu = std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
+  duration_total = std::chrono::duration<float, std::milli>(e_t - s_t).count();
+
+  // Copy back from GPU to CPU
+  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
+
+  printf("\n CPU Computation, Time = %lf \n", duration_cpu);
+  printf("\n GPU Computation, Time = %lf \n", duration_gpu);
+  printf("\n Total Computation, Time = %lf \n", duration_total);
+
+  printf("\nTask Done!\n");
+
+  return 0;
+}
+
diff --git a/code/basic_parafor.cpp b/code/basic_parafor.cpp
new file mode 100644
index 0000000..7f8dde2
--- /dev/null
+++ b/code/basic_parafor.cpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+using namespace sycl;
+
+constexpr int N = 10;
+
+int main() {
+  queue my_gpu_queue( gpu_selector{} );
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
+  int *device_mem = malloc_device<int>(N, my_gpu_queue);
+
+  // Init CPU data
+  for(int i = 0; i < N; i++) {
+    host_mem[i] = i;
+  }
+
+  // Copy from host(CPU) to device(GPU)
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+
+  // submit the content to the queue for execution
+  my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+
+  // wait the computation done
+  my_gpu_queue.wait();
+
+  // Copy back from GPU to CPU
+  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
+
+  printf("\nData Result\n");
+  for(int i = 0; i < N; i++) {
+    printf("%d, ", host_mem[i]);
+  }
+  printf("\nTask Done!\n");
+
+  return 0;
+}
+
diff --git a/code/data_movement_ex.cpp b/code/data_movement_ex.cpp
new file mode 100644
index 0000000..86c5256
--- /dev/null
+++ b/code/data_movement_ex.cpp
@@ -0,0 +1,43 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+using namespace sycl;
+
+constexpr int N = 10;
+
+int main() {
+  queue my_gpu_queue( gpu_selector{} );
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
+  int *device_mem = malloc_device<int>(N, my_gpu_queue);
+
+  // Init CPU data
+  for(int i = 0; i < N; i++) {
+    host_mem[i] = i;
+  }
+
+  // Copy from host(CPU) to device(GPU)
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+
+  // do some works on GPU
+  // ......
+  //
+
+  // Copy back from GPU to CPU
+  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
+
+  printf("\nData Result\n");
+  for(int i = 0; i < N; i++) {
+    printf("%d, ", host_mem[i]);
+  }
+  printf("\nTask Done!\n");
+
+  return 0;
+}
+
diff --git a/code/gpu_selector.cpp b/code/gpu_selector.cpp
new file mode 100644
index 0000000..1378854
--- /dev/null
+++ b/code/gpu_selector.cpp
@@ -0,0 +1,17 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+using namespace sycl;
+
+int main() {
+  queue my_gpu_queue( gpu_selector{} );
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+  return 0;
+}
+
diff --git a/code/timer.cpp b/code/timer.cpp
new file mode 100644
index 0000000..81b4416
--- /dev/null
+++ b/code/timer.cpp
@@ -0,0 +1,106 @@
+// Copyright (C) 2020 Intel Corporation
+
+// SPDX-License-Identifier: MIT
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <chrono>  // host-side wall-clock timing below
+using namespace sycl;
+
+constexpr int64_t N = 10000000;
+
+int main() {
+
+  // Enable queue profiling
+  auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
+  queue my_gpu_queue(gpu_selector{}, propList);
+
+  std::cout << "Selected GPU device: " <<
+    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+
+
+  int *host_mem   = malloc_host<int>(N, my_gpu_queue);
+  int *cpu_mem    = malloc_host<int>(N, my_gpu_queue);
+  int *device_mem = malloc_device<int>(N, my_gpu_queue);
+
+  // Init CPU data
+  for(int64_t i = 0; i < N; i++) {
+    host_mem[i] = i % 6666;
+  }
+
+  float duration_cpu = 0.0;
+  float duration_gpu_a = 0.0;
+  float duration_gpu_b = 0.0;
+  float duration_gpu_c = 0.0;
+
+  std::chrono::high_resolution_clock::time_point s, e;
+  std::chrono::high_resolution_clock::time_point s_a, e_a;
+  std::chrono::high_resolution_clock::time_point s_b, e_b;
+  std::chrono::high_resolution_clock::time_point s_c, e_c;
+
+  // CPU computation
+  printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
+
+  s = std::chrono::high_resolution_clock::now();
+  // CPU code here
+  for(int64_t i = 0; i < N; i++) {
+    cpu_mem[i] = host_mem[i] * 2;
+  }
+  e = std::chrono::high_resolution_clock::now();
+  duration_cpu = std::chrono::duration<float, std::milli>(e - s).count();
+  printf("\n End CPU Computation, Time = %lf \n", duration_cpu);
+
+
+  // warmup
+  /*********************************************************************/
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+  my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+  my_gpu_queue.wait();
+  /*********************************************************************/
+
+  s_c = std::chrono::high_resolution_clock::now();
+  // Copy from host(CPU) to device(GPU)
+  my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
+
+  s_b = std::chrono::high_resolution_clock::now();
+  s_a = std::chrono::high_resolution_clock::now();
+  // submit the content to the queue for execution
+  auto event = my_gpu_queue.submit([&](handler& h) {
+
+    // Parallel Computation
+    h.parallel_for(range{N}, [=](id<1> item) {
+      device_mem[item] *= 2;
+    });
+
+  });
+  // wait the computation done
+  my_gpu_queue.wait();
+  e_b = std::chrono::high_resolution_clock::now();
+  duration_gpu_b = std::chrono::duration<float, std::milli>(e_b - s_b).count();
+
+  duration_gpu_a =
+      (event.get_profiling_info<info::event_profiling::command_end>() -
+       event.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;  // ns -> ms
+
+  // Copy back from GPU to CPU
+  my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
+  e_c = std::chrono::high_resolution_clock::now();
+  duration_gpu_c = std::chrono::duration<float, std::milli>(e_c - s_c).count();
+
+  printf("\n GPU Computation, GPU Time A = %lf \n", duration_gpu_a);
+  printf("\n GPU Computation, GPU Time B = %lf \n", duration_gpu_b);
+  printf("\n GPU Computation, GPU Time C = %lf \n", duration_gpu_c);
+
+  printf("\nTask Done!\n");
+
+  return 0;
+}