- 01
- 02
- 03
- 04
- 05
- 06
- 07
- 08
- 09
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
- 89
- 90
- 91
#include <algorithm>
#include <exception>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <random>
#include <chrono>
#define __CL_ENABLE_EXCEPTIONS
#include "cl.hpp"
using namespace std::string_literals;
int main(int argc, char * argv[]) {
size_t data_len = (1024 * 1024 * strtoul(argv[1], nullptr, 10)) / sizeof(uint32_t),
out_len = strtoul(argv[2], nullptr, 10),
iter = strtoul(argv[3], nullptr, 10),
block_size = strtoul(argv[4], nullptr, 10);
std::string src = R"(
typedef unsigned int uint32_t;
__kernel void bench(global const uint32_t * data, global uint32_t * out) {
uint32_t res = 0, id = get_global_id(0), next = id;
for(uint32_t i = 0; i < )"s + std::to_string(iter) + R"(; ++i) {
for(uint32_t j = 0; j < )" + std::to_string(block_size / sizeof(uint32_t)) + R"(; ++j)
res ^= data[next + j];
next = data[next];
}
out[id] = res;
}
)"s;
cl::Program::Sources sources = {{src.data(), src.size()}};
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
std::vector<uint32_t> data(data_len);
std::vector<uint32_t> out(out_len);
std::generate(std::begin(data), std::end(data), [=,gen = std::mt19937{}]() mutable {
return gen() % (data_len - (block_size / sizeof(uint32_t)));
});
for(auto & platform : platforms) {
std::cout << "Using platform: " << platform.getInfo<CL_PLATFORM_NAME>() << "\n";
std::vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
for(auto & device : devices) {
try {
std::cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>() << "\n";
cl::Context ctx({device});
cl::Program program(ctx, sources);
program.build({device});
cl::Buffer data_buffer(ctx, CL_MEM_READ_WRITE, data.size() * sizeof(uint32_t));
cl::Buffer out_buffer(ctx, CL_MEM_READ_WRITE, out.size() * sizeof(uint32_t));
cl::CommandQueue queue(ctx, device);
queue.enqueueWriteBuffer(data_buffer, CL_TRUE, 0, data.size() * sizeof(uint32_t), data.data());
cl::make_kernel<cl::Buffer &, cl::Buffer &> bench(program, "bench");
cl::EnqueueArgs eargs(queue,cl::NullRange,cl::NDRange(out.size()),cl::NullRange);
auto start = std::chrono::high_resolution_clock::now();
bench(eargs, data_buffer, out_buffer).wait();
auto time = std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::high_resolution_clock::now() - start).count();
size_t ops = out_len * iter;
size_t total_tp = ops * block_size;
double miops = (ops / time) / (1000 * 1000);
double tpgbps = (total_tp / time) / (1024 * 1024 * 1024);
fprintf(stderr, "Result: %.2fMIOPS, %.2fGB/s, %.2fsec\n", miops, tpgbps, time);
queue.enqueueReadBuffer(out_buffer, CL_TRUE, 0, out.size() * sizeof(uint32_t), out.data());
} catch(cl::Error e) {
std::cout << e.what() << " : " << e.err() << std::endl;
std::terminate();
}
}
}
}