From 9cc667cf92285dfc6f95f9e0eab9f756d595d95b Mon Sep 17 00:00:00 2001 From: PatricZhao Date: Sat, 6 Nov 2021 20:15:10 +0800 Subject: [PATCH] show how to use SLM --- code/fdad.cpp | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 code/fdad.cpp diff --git a/code/fdad.cpp b/code/fdad.cpp new file mode 100644 index 0000000..1c023d1 --- /dev/null +++ b/code/fdad.cpp @@ -0,0 +1,139 @@ + // patric zhao, patric.zhao@intel.com + // show SLM usage by Finite Difference Approximating Derivatives (fdad) + #include + #include + using namespace sycl; + + #define random_float() (rand() / double(RAND_MAX)) + #define BLOCK 256 + #define CheckResult 0 + + constexpr int64_t N = 256 * 256 * 256 + 2; + constexpr float delta = 0.001f; + + void verify(float *gpu, float *cpu, int N) { + int error = 0; + for(int i = 0; i < N; i++) { + if(std::fabs(gpu[i] - cpu[i]) > 10e-3) { + printf("\nError at %d GPU = %f, CPU = %f\n", i, gpu[i], cpu[i]); + error++; + } + if(error > 20) break; + } + return; + } + + int main() { + + // Enable queue profiling + auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()}; + queue my_gpu_queue(gpu_selector{}, propList); + + std::cout << "Selected GPU device: " << + my_gpu_queue.get_device().get_info() << "\n"; + + float *input = malloc_host(N, my_gpu_queue); + float *output_P_cpu = malloc_host(N-2, my_gpu_queue); + + float *input_Q = malloc_device(N, my_gpu_queue); + float *output_P = malloc_device(N-2, my_gpu_queue); + + float *output_P_gpu = malloc_host(N-2, my_gpu_queue); + + // Init CPU data + for(int64_t i = 0; i < N; i++) { + input[i] = random_float(); + } + + // CPU compuatation + printf("\n Start Computation, Number of Elems = %ld \n", N); + for(int64_t i = 0; i < N-2; i++) { + output_P_cpu[i] = (input[i+2] - input[i]) / (2.0f * delta); + } + + float duration_gpu_a = 0.0; + float duration_gpu_b = 0.0; + + // Copy from host(CPU) to device(GPU) + my_gpu_queue.memcpy(input_Q, input, N * sizeof(float)).wait(); + + int warmup = 10; + int iteration = 50; + for(int i = 0; i < iteration + warmup; i++) { + + // read/write global memory directly + auto event1 = my_gpu_queue.submit([&](handler& h) { + h.parallel_for(nd_range<1>{N-2, BLOCK}, [=](nd_item<1> item) { + auto global_id = item.get_global_id(0); + output_P[global_id] = (input_Q[global_id +2] - input_Q[global_id]) / (2.0f * delta); + }); + }); + // wait the computation done + my_gpu_queue.wait(); + + if (i >= warmup) { + duration_gpu_a += + (event1.get_profiling_info() - + event1.get_profiling_info()) /1000.0f/1000.0f; + } + + if (CheckResult) { + my_gpu_queue.memcpy(output_P_gpu, output_P, (N - 2) * sizeof(float)).wait(); + verify(output_P_gpu, output_P_gpu, N); + } + + // read data to SLM and then computaiton w/ SLM read + // finally write back to global memory + auto event2 = my_gpu_queue.submit([&](handler& h) { + + // Define SLM size per work-group + sycl::accessor + slm_buffer(BLOCK + 2, h); + + + h.parallel_for(nd_range<1>(N-2, BLOCK), [=](nd_item<1> item) { + + auto local_id = item.get_local_id(0); + auto global_id = item.get_global_id(0); + + slm_buffer[local_id] = input_Q[global_id]; + if(local_id == BLOCK-1) { + slm_buffer[BLOCK ] = input_Q[global_id +1]; + slm_buffer[BLOCK+1] = input_Q[global_id +2]; + } + item.barrier(sycl::access::fence_space::local_space); + + output_P[global_id] = (slm_buffer[local_id +2] - slm_buffer[local_id]) / (2.0f * delta); + }); + + }); + my_gpu_queue.wait(); + + if (i >= warmup) { + duration_gpu_b += + (event2.get_profiling_info() - + event2.get_profiling_info()) /1000.0f/1000.0f; + } + + if (CheckResult) { + my_gpu_queue.memcpy(output_P_gpu, output_P, (N - 2) * sizeof(float)).wait(); + verify(output_P_gpu, output_P_gpu, N); + } + + } + + printf("\n GPU Computation, GPU Time w/o SLM = %lf \n", duration_gpu_a / iteration); + printf("\n GPU Computation, GPU Time w/ SLM = %lf \n", duration_gpu_b / iteration); + + printf("\nTask Done!\n"); + + free(input_Q, my_gpu_queue); + free(output_P, my_gpu_queue); + free(output_P_cpu, my_gpu_queue); + free(output_P_gpu, my_gpu_queue); + free(input, my_gpu_queue); + + return 0; + } +