PatricZhao преди 2 години
committed by GitHub
родител
ревизия
4943e036ce
No known key found for this signature in database GPG ключ ID: 4AEE18F83AFDEB23
променени са 5 файла, в които са добавени 310 реда и са изтрити 0 реда
  1. +93
    -0
      code/async.cpp
  2. +52
    -0
      code/basic_parafor.cpp
  3. +43
    -0
      code/data_movement_ex.cpp
  4. +17
    -0
      code/gpu_selector.cpp
  5. +105
    -0
      code/timer.cpp

+ 93
- 0
code/async.cpp Целия файл

@ -0,0 +1,93 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: MIT
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;
constexpr int64_t N = 10000000;
int main() {
// Enable queue profiling
auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
queue my_gpu_queue(gpu_selector{}, propList);
std::cout << "Selected GPU device: " <<
my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
int *host_mem = malloc_host<int>(N, my_gpu_queue);
int *cpu_mem = malloc_host<int>(N, my_gpu_queue);
int *device_mem = malloc_device<int>(N, my_gpu_queue);
// Init CPU data
for(int64_t i = 0; i < N; i++) {
host_mem[i] = i % 6666;
}
float duration_cpu = 0.0;
float duration_gpu = 0.0;
float duration_total = 0.0;
std::chrono::high_resolution_clock::time_point s_cpu, e_cpu;
std::chrono::high_resolution_clock::time_point s_gpu, e_gpu;
std::chrono::high_resolution_clock::time_point s_t, e_t;
// warmup
/*********************************************************************/
my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
my_gpu_queue.submit([&](handler& h) {
// Parallel Computation
h.parallel_for(range{N}, [=](id<1> item) {
device_mem[item] *= 2;
});
});
my_gpu_queue.wait();
/*********************************************************************/
// CPU computation
printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
s_t = std::chrono::high_resolution_clock::now();
s_cpu = std::chrono::high_resolution_clock::now();
// CPU code here
for(int64_t i = 0; i < N; i++) {
cpu_mem[i] = host_mem[i] * 2;
}
e_cpu = std::chrono::high_resolution_clock::now();
duration_cpu = std::chrono::duration<float, std::milli>(e_cpu - s_cpu).count();
s_gpu = std::chrono::high_resolution_clock::now();
// submit the content to the queue for execution
auto event = my_gpu_queue.submit([&](handler& h) {
// Parallel Computation
h.parallel_for(range{N}, [=](id<1> item) {
device_mem[item] *= 2;
});
});
// wait the computation done
my_gpu_queue.wait();
e_gpu = std::chrono::high_resolution_clock::now();
e_t = std::chrono::high_resolution_clock::now();
duration_gpu = std::chrono::duration<float, std::milli>(e_gpu - s_gpu).count();
duration_total = std::chrono::duration<float, std::milli>(e_t - s_t).count();
// Copy back from GPU to CPU
my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
printf("\n CPU Computation, Time = %lf \n", duration_cpu);
printf("\n GPU Computation, Time = %lf \n", duration_gpu);
printf("\n Total Computation, TIme = %lf \n", duration_total);
printf("\nTask Done!\n");
return 0;
}

+ 52
- 0
code/basic_parafor.cpp Целия файл

@ -0,0 +1,52 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: MIT
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;
constexpr int N = 10;
int main() {
queue my_gpu_queue( gpu_selector{} );
std::cout << "Selected GPU device: " <<
my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
int *host_mem = malloc_host<int>(N, my_gpu_queue);
int *device_mem = malloc_device<int>(N, my_gpu_queue);
// Init CPU data
for(int i = 0; i < N; i++) {
host_mem[i] = i;
}
// Copy from host(CPU) to device(GPU)
my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
// submit the content to the queue for execution
my_gpu_queue.submit([&](handler& h) {
// Parallel Computation
h.parallel_for(range{N}, [=](id<1> item) {
device_mem[item] *= 2;
});
});
// wait the computation done
my_gpu_queue.wait();
// Copy back from GPU to CPU
my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
printf("\nData Result\n");
for(int i = 0; i < N; i++) {
printf("%d, ", host_mem[i]);
}
printf("\nTask Done!\n");
return 0;
}

+ 43
- 0
code/data_movement_ex.cpp Целия файл

@ -0,0 +1,43 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: MIT
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;
constexpr int N = 10;
int main() {
queue my_gpu_queue( gpu_selector{} );
std::cout << "Selected GPU device: " <<
my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
int *host_mem = malloc_host<int>(N, my_gpu_queue);
int *device_mem = malloc_device<int>(N, my_gpu_queue);
// Init CPU data
for(int i = 0; i < N; i++) {
host_mem[i] = i;
}
// Copy from host(CPU) to device(GPU)
my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
// do some works on GPU
// ......
//
// Copy back from GPU to CPU
my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
printf("\nData Result\n");
for(int i = 0; i < N; i++) {
printf("%d, ", host_mem[i]);
}
printf("\nTask Done!\n");
return 0;
}

+ 17
- 0
code/gpu_selector.cpp Целия файл

@ -0,0 +1,17 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: MIT
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;
int main() {
queue my_gpu_queue( gpu_selector{} );
std::cout << "Selected GPU device: " <<
my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
return 0;
}

+ 105
- 0
code/timer.cpp Целия файл

@ -0,0 +1,105 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: MIT
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;
constexpr int64_t N = 10000000;
int main() {
// Enable queue profiling
auto propList = cl::sycl::property_list {cl::sycl::property::queue::enable_profiling()};
queue my_gpu_queue(gpu_selector{}, propList);
std::cout << "Selected GPU device: " <<
my_gpu_queue.get_device().get_info<info::device::name>() << "\n";
int *host_mem = malloc_host<int>(N, my_gpu_queue);
int *cpu_mem = malloc_host<int>(N, my_gpu_queue);
int *device_mem = malloc_device<int>(N, my_gpu_queue);
// Init CPU data
for(int64_t i = 0; i < N; i++) {
host_mem[i] = i % 6666;
}
float duration_cpu = 0.0;
float duration_gpu_a = 0.0;
float duration_gpu_b = 0.0;
float duration_gpu_c = 0.0;
std::chrono::high_resolution_clock::time_point s, e;
std::chrono::high_resolution_clock::time_point s_a, e_a;
std::chrono::high_resolution_clock::time_point s_b, e_b;
std::chrono::high_resolution_clock::time_point s_c, e_c;
// CPU computation
printf("\n Start CPU Computation, Number of Elems = %ld \n", N);
s = std::chrono::high_resolution_clock::now();
// CPU code here
for(int64_t i = 0; i < N; i++) {
cpu_mem[i] = host_mem[i] * 2;
}
e = std::chrono::high_resolution_clock::now();
duration_cpu = std::chrono::duration<float, std::milli>(e - s).count();
printf("\n End CPU Computation, Time = %lf \n", duration_cpu);
// warmup
/*********************************************************************/
my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
my_gpu_queue.submit([&](handler& h) {
// Parallel Computation
h.parallel_for(range{N}, [=](id<1> item) {
device_mem[item] *= 2;
});
});
my_gpu_queue.wait();
/*********************************************************************/
s_c = std::chrono::high_resolution_clock::now();
// Copy from host(CPU) to device(GPU)
my_gpu_queue.memcpy(device_mem, host_mem, N * sizeof(int)).wait();
s_b = std::chrono::high_resolution_clock::now();
s_a = std::chrono::high_resolution_clock::now();
// submit the content to the queue for execution
auto event = my_gpu_queue.submit([&](handler& h) {
// Parallel Computation
h.parallel_for(range{N}, [=](id<1> item) {
device_mem[item] *= 2;
});
});
// wait the computation done
my_gpu_queue.wait();
e_b = std::chrono::high_resolution_clock::now();
duration_gpu_b = std::chrono::duration<float, std::milli>(e_b - s_b).count();
duration_gpu_a =
(event.get_profiling_info<info::event_profiling::command_end>() -
event.get_profiling_info<info::event_profiling::command_start>()) /1000.0f/1000.0f;
// Copy back from GPU to CPU
my_gpu_queue.memcpy(host_mem, device_mem, N * sizeof(int)).wait();
e_c = std::chrono::high_resolution_clock::now();
duration_gpu_c = std::chrono::duration<float, std::milli>(e_c - s_c).count();
printf("\n GPU Computation, GPU Time A = %lf \n", duration_gpu_a);
printf("\n GPU Computation, GPU Time B = %lf \n", duration_gpu_b);
printf("\n GPU Computation, GPU Time C = %lf \n", duration_gpu_c);
printf("\nTask Done!\n");
return 0;
}

Зареждане…
Отказ
Запис