Add example

3 years ago · 4943e036ce
--- a/code/async.cpp
+++ b/code/async.cpp
@ -0,0 +1,93 @@
   // Copyright (C) 2020 Intel Corporation
   
   // SPDX-License-Identifier: MIT
   
   #include <CL/sycl.hpp>
   #include <iostream>
   using namespace sycl;
   
   constexpr int64_t N = 10000000;
   
   int main() {
     
     // Enable queue profiling  
     auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
     queue my_gpu_queue(gpu_selector{}, propList);
   
     std::cout << "Selected GPU device: " <<
       my_gpu_queue.get_device().get_info<info::device::name>() << "\n";

   
     int *host_mem   = malloc_host<int>(N, my_gpu_queue);
     int *cpu_mem   = malloc_host<int>(N, my_gpu_queue);
     int *device_mem = malloc_device<int>(N, my_gpu_queue); 
   
     // Init CPU data
     for(int64_t i = 0; i < N; i++) {
        host_mem[i] = i % 6666;
     }

     float duration_cpu = 0.0;
     float duration_gpu = 0.0;
     float duration_total = 0.0;

     std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
     std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
     std::chrono::high_resolution_clock::time_point s_t, e_t;

      // warmup
     /*********************************************************************/
      my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
      my_gpu_queue.submit([&](handler& h) {

       // Parallel Computation
       h.parallel_for(range{N}, [=](id<1> item) {
         device_mem[item] *= 2;
       });

      });
      my_gpu_queue.wait();
     /*********************************************************************/

     // CPU computation
     printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
     
     s_t = std::chrono::high_resolution_clock::now();
     s_cpu = std::chrono::high_resolution_clock::now();
     // CPU code here
     for(int64_t i = 0; i < N; i++) {
         cpu_mem[i] = host_mem[i] * 2;
     }
     e_cpu = std::chrono::high_resolution_clock::now();
     duration_cpu =  std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();
   

     s_gpu = std::chrono::high_resolution_clock::now();
     // submit the content to the queue for execution
     auto event = my_gpu_queue.submit([&](handler& h) {
       
       // Parallel Computation      
       h.parallel_for(range{N}, [=](id<1> item) {
         device_mem[item] *= 2;
       });

     });
     // wait the computation done
     my_gpu_queue.wait();
     e_gpu = std::chrono::high_resolution_clock::now();
     e_t = std::chrono::high_resolution_clock::now();
     duration_gpu =  std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
     duration_total =  std::chrono::duration<float, std::milli>(e_t - s_t).count();

     // Copy back from GPU to CPU
     my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();

     printf("\n CPU Computation,   Time = %lf \n", duration_cpu);
     printf("\n GPU Computation,   Time = %lf \n", duration_gpu);
     printf("\n Total Computation, TIme = %lf \n", duration_total);

     printf("\nTask Done!\n");
   
     return 0;
   }

--- a/code/basic_parafor.cpp
+++ b/code/basic_parafor.cpp
@ -0,0 +1,52 @@
   // Copyright (C) 2020 Intel Corporation
   
   // SPDX-License-Identifier: MIT
   
   #include <CL/sycl.hpp>
   #include <iostream>
   using namespace sycl;
   
   constexpr int N = 10;
   
   int main() {
     queue my_gpu_queue( gpu_selector{} );
   
     std::cout << "Selected GPU device: " <<
       my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
   
     int *host_mem   = malloc_host<int>(N, my_gpu_queue);
     int *device_mem = malloc_device<int>(N, my_gpu_queue); 
   
     // Init CPU data
     for(int i = 0; i < N; i++) {
        host_mem[i] = i;
     }
   
     // Copy from host(CPU) to device(GPU)
     my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
   
     // submit the content to the queue for execution
     my_gpu_queue.submit([&](handler& h) {
       
       // Parallel Computation      
       h.parallel_for(range{N}, [=](id<1> item) {
         device_mem[item] *= 2;
       });

     });

     // wait the computation done
     my_gpu_queue.wait();
   
     // Copy back from GPU to CPU
     my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();

     printf("\nData Result\n");
     for(int i = 0; i < N; i++) {
        printf("%d, ", host_mem[i]);
     }
     printf("\nTask Done!\n");
   
     return 0;
   }

--- a/code/data_movement_ex.cpp
+++ b/code/data_movement_ex.cpp
@ -0,0 +1,43 @@
   // Copyright (C) 2020 Intel Corporation
   
   // SPDX-License-Identifier: MIT
   
   #include <CL/sycl.hpp>
   #include <iostream>
   using namespace sycl;
   
   constexpr int N = 10;
   
   int main() {
     queue my_gpu_queue( gpu_selector{} );
   
     std::cout << "Selected GPU device: " <<
       my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
   
     int *host_mem   = malloc_host<int>(N, my_gpu_queue);
     int *device_mem = malloc_device<int>(N, my_gpu_queue); 
   
     // Init CPU data
     for(int i = 0; i < N; i++) {
        host_mem[i] = i;
     }
   
     // Copy from host(CPU) to device(GPU)
     my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
   
     // do some works on GPU
     // ......
     //
   
     // Copy back from GPU to CPU
     my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();

     printf("\nData Result\n");
     for(int i = 0; i < N; i++) {
        printf("%d, ", host_mem[i]);
     }
     printf("\nTask Done!\n");
   
     return 0;
   }

--- a/code/gpu_selector.cpp
+++ b/code/gpu_selector.cpp
@ -0,0 +1,17 @@
 // Copyright (C) 2020 Intel Corporation

 // SPDX-License-Identifier: MIT

 #include <CL/sycl.hpp>
 #include <iostream>
 using namespace sycl;

 int main() {
  queue my_gpu_queue( gpu_selector{} );

  std::cout << "Selected GPU device: " <<
    my_gpu_queue.get_device().get_info<info::device::name>() << "\n";

  return 0;
 }

--- a/code/timer.cpp
+++ b/code/timer.cpp
@ -0,0 +1,105 @@
   // Copyright (C) 2020 Intel Corporation
   
   // SPDX-License-Identifier: MIT
   
   #include <CL/sycl.hpp>
   #include <iostream>
   using namespace sycl;
   
   constexpr int64_t N = 10000000;
   
   int main() {
     
     // Enable queue profiling  
     auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
     queue my_gpu_queue(gpu_selector{}, propList);
   
     std::cout << "Selected GPU device: " <<
       my_gpu_queue.get_device().get_info<info::device::name>() << "\n";


   
     int *host_mem   = malloc_host<int>(N, my_gpu_queue);
     int *cpu_mem   = malloc_host<int>(N, my_gpu_queue);
     int *device_mem = malloc_device<int>(N, my_gpu_queue); 
   
     // Init CPU data
     for(int64_t i = 0; i < N; i++) {
        host_mem[i] = i % 6666;
     }

     float duration_cpu = 0.0;
     float duration_gpu_a = 0.0;
     float duration_gpu_b = 0.0;
     float duration_gpu_c = 0.0;

     std::chrono::high_resolution_clock::time_point s, e;
     std::chrono::high_resolution_clock::time_point s_a, e_a;
     std::chrono::high_resolution_clock::time_point s_b, e_b;
     std::chrono::high_resolution_clock::time_point s_c, e_c;

     // CPU computation
     printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
     
     s = std::chrono::high_resolution_clock::now();
     // CPU code here
     for(int64_t i = 0; i < N; i++) {
         cpu_mem[i] = host_mem[i] * 2;
     }
     e = std::chrono::high_resolution_clock::now();
     duration_cpu =  std::chrono::duration<float, std::milli>(e - s).count();
     printf("\n End CPU Computation, Time = %lf \n", duration_cpu);
   

     // warmup
     /*********************************************************************/
      my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
      my_gpu_queue.submit([&](handler& h) {

       // Parallel Computation
       h.parallel_for(range{N}, [=](id<1> item) {
         device_mem[item] *= 2;
       });

      });
      my_gpu_queue.wait();
     /*********************************************************************/
   
     s_c = std::chrono::high_resolution_clock::now();
     // Copy from host(CPU) to device(GPU)
     my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();

     s_b = std::chrono::high_resolution_clock::now();
     s_a = std::chrono::high_resolution_clock::now();
     // submit the content to the queue for execution
     auto event = my_gpu_queue.submit([&](handler& h) {
       
       // Parallel Computation      
       h.parallel_for(range{N}, [=](id<1> item) {
         device_mem[item] *= 2;
       });

     });
     // wait the computation done
     my_gpu_queue.wait();
     e_b = std::chrono::high_resolution_clock::now();
     duration_gpu_b =  std::chrono::duration<float, std::milli>(e_b - s_b).count();

     duration_gpu_a =
      (event.get_profiling_info<info::event_profiling::command_end>() -
      event.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
   
     // Copy back from GPU to CPU
     my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
     e_c = std::chrono::high_resolution_clock::now();
     duration_gpu_c =  std::chrono::duration<float, std::milli>(e_c - s_c).count();

     printf("\n GPU Computation, GPU Time A = %lf \n", duration_gpu_a);
     printf("\n GPU Computation, GPU Time B = %lf \n", duration_gpu_b);
     printf("\n GPU Computation, GPU Time C = %lf \n", duration_gpu_c);

     printf("\nTask Done!\n");
   
     return 0;
   }