From 9cc667cf92285dfc6f95f9e0eab9f756d595d95b Mon Sep 17 00:00:00 2001
From: PatricZhao <patric.zhao@intel.com>
Date: Sat, 6 Nov 2021 20:15:10 +0800
Subject: [PATCH] show how to use SLM

---
 code/fdad.cpp | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 code/fdad.cpp
diff --git a/code/fdad.cpp b/code/fdad.cpp
new file mode 100644
index 0000000..1c023d1
--- /dev/null
+++ b/code/fdad.cpp
@@ -0,0 +1,139 @@
+   // patric zhao, patric.zhao@intel.com
+   // show SLM usage by Finite Difference Approximating Derivatives (fdad)  
+   #include <CL/sycl.hpp>
+   #include <iostream>
+   using namespace sycl;
+
+   #define random_float() (rand() / double(RAND_MAX))
+   #define BLOCK 256 
+   #define CheckResult 0
+   
+   constexpr int64_t N = 256 * 256 * 256 + 2;
+   constexpr float delta = 0.001f;
+
+   void verify(float *gpu, float *cpu, int N) {
+       int error = 0;
+       for(int i = 0; i < N; i++) {
+           if(std::fabs(gpu[i] - cpu[i]) > 10e-3) {
+               printf("\nError at %d GPU = %f, CPU = %f\n", i, gpu[i], cpu[i]);
+               error++;
+           }
+           if(error > 20) break;
+       }
+       return;
+   }
+   
+   int main() {
+     
+     // Enable queue profiling  
+     auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
+     queue my_gpu_queue(gpu_selector{}, propList);
+   
+     std::cout << "Selected GPU device: " <<
+       my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
+
+     float *input        = malloc_host<float>(N, my_gpu_queue);
+     float *output_P_cpu = malloc_host<float>(N-2, my_gpu_queue);
+
+     float *input_Q = malloc_device<float>(N, my_gpu_queue); 
+     float *output_P = malloc_device<float>(N-2, my_gpu_queue); 
+
+     float *output_P_gpu   = malloc_host<float>(N-2, my_gpu_queue);
+   
+     // Init CPU data
+     for(int64_t i = 0; i < N; i++) {
+        input[i] = random_float();
+     }
+
+     // CPU compuatation
+     printf("\n Start Computation, Number of Elems = %ld \n", N);
+     for(int64_t i = 0; i < N-2; i++) {
+         output_P_cpu[i] = (input[i+2] - input[i]) / (2.0f * delta);
+     }
+
+     float duration_gpu_a = 0.0;
+     float duration_gpu_b = 0.0;
+     
+     // Copy from host(CPU) to device(GPU)
+     my_gpu_queue.memcpy(input_Q, input, N * sizeof(float)).wait();
+
+     int warmup = 10;
+     int iteration = 50;
+     for(int i = 0; i < iteration + warmup; i++) {
+
+         // read/write global memory directly
+         auto event1 = my_gpu_queue.submit([&](handler& h) {
+             h.parallel_for(nd_range<1>{N-2, BLOCK}, [=](nd_item<1> item) {
+                  auto global_id = item.get_global_id(0);
+                  output_P[global_id] = (input_Q[global_id +2] - input_Q[global_id]) / (2.0f * delta);
+             });
+         });
+         // wait the computation done
+         my_gpu_queue.wait();
+
+         if (i >= warmup) {
+             duration_gpu_a +=
+              (event1.get_profiling_info<info::event_profiling::command_end>() -
+              event1.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
+         }
+   
+         if (CheckResult) {
+             my_gpu_queue.memcpy(output_P_gpu, output_P, (N - 2) * sizeof(float)).wait();
+             verify(output_P_gpu, output_P_gpu, N);
+         }
+
+         // read data to SLM and then computaiton w/ SLM read
+         // finally write back to global memory
+         auto event2 = my_gpu_queue.submit([&](handler& h) {
+
+           // Define SLM size per work-group      
+           sycl::accessor<float, 1, sycl::access::mode::read_write, 
+                                    sycl::access::target::local>
+                                    slm_buffer(BLOCK + 2, h);
+
+
+           h.parallel_for(nd_range<1>(N-2, BLOCK), [=](nd_item<1> item) {
+
+                auto local_id  = item.get_local_id(0);
+                auto global_id = item.get_global_id(0);
+
+                slm_buffer[local_id] = input_Q[global_id];
+                if(local_id == BLOCK-1) {
+                    slm_buffer[BLOCK  ] = input_Q[global_id +1];
+                    slm_buffer[BLOCK+1] = input_Q[global_id +2];
+                }
+                item.barrier(sycl::access::fence_space::local_space);
+
+                output_P[global_id] = (slm_buffer[local_id +2] - slm_buffer[local_id]) / (2.0f * delta);
+           });
+
+         });
+         my_gpu_queue.wait();
+
+         if (i >= warmup) {
+             duration_gpu_b +=
+             (event2.get_profiling_info<info::event_profiling::command_end>() -
+             event2.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
+         }
+
+         if (CheckResult) {
+             my_gpu_queue.memcpy(output_P_gpu, output_P, (N - 2) * sizeof(float)).wait();
+             verify(output_P_gpu, output_P_gpu, N);
+         }
+
+     }
+
+     printf("\n GPU Computation, GPU Time w/o SLM = %lf \n", duration_gpu_a / iteration);
+     printf("\n GPU Computation, GPU Time w/  SLM = %lf \n", duration_gpu_b / iteration);
+
+     printf("\nTask Done!\n");
+   
+     free(input_Q, my_gpu_queue);
+     free(output_P, my_gpu_queue);
+     free(output_P_cpu, my_gpu_queue);
+     free(output_P_gpu, my_gpu_queue);
+     free(input, my_gpu_queue);
+
+     return 0;
+   }
+