25#include <ql/errors.hpp>
27#include <boost/algorithm/string/join.hpp>
28#include <boost/timer/timer.hpp>
35#define ORE_OPENCL_MAX_N_DEV_INFO 1024U
36#define ORE_OPENCL_MAX_N_DEV_INFO_LARGE 65536U
37#define ORE_OPENCL_MAX_BUILD_LOG 65536U
38#define ORE_OPENCL_MAX_BUILD_LOG_LOGFILE 1024U
42#ifdef ORE_ENABLE_OPENCL
44std::string errorText(cl_int err) {
49 return "CL_DEVICE_NOT_FOUND";
51 return "CL_DEVICE_NOT_AVAILABLE";
53 return "CL_COMPILER_NOT_AVAILABLE";
55 return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
57 return "CL_OUT_OF_RESOURCES";
59 return "CL_OUT_OF_HOST_MEMORY";
61 return "CL_PROFILING_INFO_NOT_AVAILABLE";
63 return "CL_MEM_COPY_OVERLAP";
65 return "CL_IMAGE_FORMAT_MISMATCH";
67 return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
69 return "CL_BUILD_PROGRAM_FAILURE";
71 return "CL_MAP_FAILURE";
73 return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
75 return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
77 return "CL_COMPILE_PROGRAM_FAILURE";
79 return "CL_LINKER_NOT_AVAILABLE";
81 return "CL_LINK_PROGRAM_FAILURE";
83 return "CL_DEVICE_PARTITION_FAILED";
85 return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
87 return "CL_INVALID_VALUE";
89 return "CL_INVALID_DEVICE_TYPE";
91 return "CL_INVALID_PLATFORM";
93 return "CL_INVALID_DEVICE";
95 return "CL_INVALID_CONTEXT";
97 return "CL_INVALID_QUEUE_PROPERTIES";
99 return "CL_INVALID_COMMAND_QUEUE";
101 return "CL_INVALID_HOST_PTR";
103 return "CL_INVALID_MEM_OBJECT";
105 return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
107 return "CL_INVALID_IMAGE_SIZE";
109 return "CL_INVALID_SAMPLER";
111 return "CL_INVALID_BINARY";
113 return "CL_INVALID_BUILD_OPTIONS";
115 return "CL_INVALID_PROGRAM";
117 return "CL_INVALID_PROGRAM_EXECUTABLE";
119 return "CL_INVALID_KERNEL_NAME";
121 return "CL_INVALID_KERNEL_DEFINITION";
123 return "CL_INVALID_KERNEL";
125 return "CL_INVALID_ARG_INDEX";
127 return "CL_INVALID_ARG_VALUE";
129 return "CL_INVALID_ARG_SIZE";
131 return "CL_INVALID_KERNEL_ARGS";
133 return "CL_INVALID_WORK_DIMENSION";
135 return "CL_INVALID_WORK_GROUP_SIZE";
137 return "CL_INVALID_WORK_ITEM_SIZE";
139 return "CL_INVALID_GLOBAL_OFFSET";
141 return "CL_INVALID_EVENT_WAIT_LIST";
143 return "CL_INVALID_EVENT";
145 return "CL_INVALID_OPERATION";
147 return "CL_INVALID_GL_OBJECT";
149 return "CL_INVALID_BUFFER_SIZE";
151 return "CL_INVALID_MIP_LEVEL";
153 return "CL_INVALID_GLOBAL_WORK_SIZE";
155 return "CL_INVALID_PROPERTY";
157 return "CL_INVALID_IMAGE_DESCRIPTOR";
159 return "CL_INVALID_COMPILER_OPTIONS";
161 return "CL_INVALID_LINKER_OPTIONS";
163 return "CL_INVALID_DEVICE_PARTITION_COUNT";
165 return "unknown cl error code " + std::to_string(err);
169void errorCallback(
const char* errinfo,
const void* private_info,
size_t cb,
void* user_data) {
170 printf(
"Callback from OpenCL context: errinfo = '%s'\n", errinfo);
175class OpenClContext :
public ComputeContext {
177 OpenClContext(cl_device_id* device, cl_context* context,
178 const std::vector<std::pair<std::string, std::string>>& deviceInfo,
179 const bool supportsDoublePrecision);
180 ~OpenClContext()
override final;
181 void init()
override final;
183 std::pair<std::size_t, bool> initiateCalculation(
const std::size_t n,
const std::size_t
id = 0,
184 const std::size_t version = 0,
185 const Settings settings = {})
override final;
186 void disposeCalculation(
const std::size_t n)
override final;
187 std::size_t createInputVariable(
double v)
override final;
188 std::size_t createInputVariable(
double* v)
override final;
189 std::vector<std::vector<std::size_t>> createInputVariates(
const std::size_t dim,
190 const std::size_t
steps)
override final;
191 std::size_t applyOperation(
const std::size_t randomVariableOpCode,
192 const std::vector<std::size_t>& args)
override final;
193 void freeVariable(
const std::size_t
id)
override final;
194 void declareOutputVariable(
const std::size_t
id)
override final;
195 void finalizeCalculation(std::vector<double*>& output)
override final;
197 std::vector<std::pair<std::string, std::string>> deviceInfo()
const override;
198 bool supportsDoublePrecision()
const override;
199 const DebugInfo& debugInfo()
const override final;
204 std::optional<std::size_t> lhs_local_id;
206 std::set<std::size_t> rhs_local_id;
209 std::size_t generateResultId();
210 std::pair<std::vector<std::string>, std::set<std::size_t>> getArgString(
const std::vector<std::size_t>& args)
const;
211 std::size_t valuesBufferId(
const std::size_t i)
const;
212 void startNewSsaPart();
213 std::string generateSsaCode(
const std::vector<ssa_entry>& ssa)
const;
215 void updateVariatesPool();
217 void runHealthChecks();
218 std::string runHealthCheckProgram(
const std::string& source,
const std::string& kernelName);
220 static void releaseMem(cl_mem& m,
const std::string& desc);
221 static void releaseKernel(cl_kernel& k,
const std::string& desc);
222 static void releaseKernel(std::vector<cl_kernel>& ks,
const std::string& desc);
223 static void releaseProgram(cl_program& p,
const std::string& desc);
225 enum class ComputeState { idle, createInput, createVariates, calc, declareOutput };
227 bool initialized_ =
false;
228 cl_device_id* device_;
229 cl_context* context_;
230 cl_command_queue queue_;
233 std::vector<std::pair<std::string, std::string>> deviceInfo_;
234 bool supportsDoublePrecision_;
237 ComputeContext::DebugInfo debugInfo_;
241 std::vector<std::size_t> size_;
242 std::vector<bool> disposed_;
243 std::vector<bool> hasKernel_;
244 std::vector<std::size_t> version_;
245 std::vector<cl_program> program_;
246 std::vector<std::vector<cl_kernel>> kernel_;
247 std::vector<std::vector<std::vector<std::vector<std::size_t>>>> conditionalExpectationVarIds_;
248 std::vector<std::size_t> inputBufferSize_;
249 std::vector<std::size_t> nOutputVars_;
250 std::vector<std::vector<std::size_t>> nVars_;
251 std::vector<std::size_t> nVariates_;
255 std::size_t variatesPoolSize_ = 0;
256 cl_mem variatesPool_;
257 cl_mem variatesMtStateBuffer_;
258 cl_program variatesProgram_;
259 cl_kernel variatesKernelSeedInit_;
260 cl_kernel variatesKernelTwist_;
261 cl_kernel variatesKernelGenerate_;
265 std::size_t currentId_ = 0;
266 ComputeState currentState_ = ComputeState::idle;
267 std::size_t nVarsTmp_;
268 std::size_t nVariatesTmp_;
270 std::set<std::string> currentConditionalExpectationArgs_;
273 std::vector<std::size_t> inputVarOffset_;
274 std::vector<bool> inputVarIsScalar_;
275 std::vector<float> inputVarValues32_;
276 std::vector<double> inputVarValues64_;
279 std::vector<std::size_t> freedVariables_;
280 std::vector<std::size_t> outputVariables_;
283 std::vector<std::vector<ssa_entry>> currentSsa_;
288cl_uint OpenClFramework::nPlatforms_ = 0;
299 boost::unique_lock<boost::shared_mutex> lock(
mutex_);
309 for (std::size_t p = 0; p < nPlatforms_; ++p) {
316 for (std::size_t d = 0; d < nDevices_[p]; ++d) {
319 cl_device_fp_config doubleFpConfig;
327 deviceInfo_[p][d].push_back(std::make_pair(
"device_name", std::string(deviceName)));
328 deviceInfo_[p][d].push_back(std::make_pair(
"driver_version", std::string(driverVersion)));
329 deviceInfo_[p][d].push_back(std::make_pair(
"device_version", std::string(deviceVersion)));
330 deviceInfo_[p][d].push_back(std::make_pair(
"device_extensions", std::string(deviceExtensions)));
336 clGetDeviceInfo(devices_[p][d], CL_DEVICE_DOUBLE_FP_CONFIG,
sizeof(cl_device_fp_config), &doubleFpConfig,
339 "device_double_fp_config",
340 ((doubleFpConfig & CL_FP_DENORM) ? std::string(
"Denorm,") : std::string()) +
341 ((doubleFpConfig & CL_FP_INF_NAN) ? std::string(
"InfNan,") : std::string()) +
342 ((doubleFpConfig & CL_FP_ROUND_TO_NEAREST) ? std::string(
"RoundNearest,") : std::string()) +
343 ((doubleFpConfig & CL_FP_ROUND_TO_ZERO) ? std::string(
"RoundZero,") : std::string()) +
344 ((doubleFpConfig & CL_FP_FMA) ? std::string(
"FMA,") : std::string()) +
345 ((doubleFpConfig & CL_FP_SOFT_FLOAT) ? std::string(
"SoftFloat,") : std::string())));
348 deviceInfo_[p][d].push_back(std::make_pair(
"device_double_fp_config",
"not provided before opencl 1.2"));
350 supportsDoublePrecision || std::string(deviceExtensions).find(
"cl_khr_fp64");
356 context_[p][d] = clCreateContext(NULL, 1, &devices_[p][d], &errorCallback, NULL, &err);
357 QL_REQUIRE(err == CL_SUCCESS,
358 "OpenClFramework::OpenClContext(): error during clCreateContext(): " << errorText(err));
365 for (std::size_t p = 0; p < nPlatforms_; ++p) {
366 for (std::size_t d = 0; d < nDevices_[p]; ++d) {
379OpenClContext::OpenClContext(cl_device_id* device, cl_context* context,
380 const std::vector<std::pair<std::string, std::string>>& deviceInfo,
381 const bool supportsDoublePrecision)
382 : initialized_(
false), device_(device), context_(context), deviceInfo_(deviceInfo),
383 supportsDoublePrecision_(supportsDoublePrecision) {}
385OpenClContext::~OpenClContext() {
387 if (variatesPoolSize_ > 0) {
388 releaseMem(variatesPool_,
"variates pool");
389 releaseMem(variatesMtStateBuffer_,
"variates state buffer");
390 releaseKernel(variatesKernelSeedInit_,
"variates seed init");
391 releaseKernel(variatesKernelTwist_,
"variates twist");
392 releaseKernel(variatesKernelGenerate_,
"variates generate");
393 releaseProgram(variatesProgram_,
"variates");
396 for (Size i = 0; i < kernel_.size(); ++i) {
397 if (disposed_[i] || !hasKernel_[i])
399 releaseKernel(kernel_[i],
"ore kernel");
402 for (Size i = 0; i < program_.size(); ++i) {
403 if (disposed_[i] || !hasKernel_[i])
405 releaseProgram(program_[i],
"ore program");
409 if (err = clReleaseCommandQueue(queue_); err != CL_SUCCESS) {
410 std::cerr <<
"OpenClFramework: error during clReleaseCommandQueue: " + errorText(err) << std::endl;
415void OpenClContext::releaseMem(cl_mem& m,
const std::string& description) {
417 if (err = clReleaseMemObject(m); err != CL_SUCCESS) {
418 std::cerr <<
"OpenClContext: error during clReleaseMemObject '" << description <<
"': " + errorText(err)
423void OpenClContext::releaseKernel(cl_kernel& k,
const std::string& description) {
425 if (err = clReleaseKernel(k); err != CL_SUCCESS) {
426 std::cerr <<
"OpenClContext: error during clReleaseKernel '" << description <<
"': " + errorText(err)
431void OpenClContext::releaseKernel(std::vector<cl_kernel>& ks,
const std::string& description) {
433 releaseKernel(k, description);
436void OpenClContext::releaseProgram(cl_program& p,
const std::string& description) {
438 if (err = clReleaseProgram(p); err != CL_SUCCESS) {
439 std::cerr <<
"OpenClContext: error during clReleaseProgram '" << description <<
"': " + errorText(err)
444std::string OpenClContext::runHealthCheckProgram(
const std::string& source,
const std::string& kernelName) {
447 std::vector<cl_program> p;
448 std::vector<cl_kernel> k;
449 std::vector<cl_mem> m;
452 OpenClContext::releaseProgram(pgm,
"health check");
454 OpenClContext::releaseKernel(krn,
"health check");
456 OpenClContext::releaseMem(mem,
"health check");
460 const char* programPtr = source.c_str();
464 cl_program program = clCreateProgramWithSource(*context_, 1, &programPtr, NULL, &err);
465 if (err != CL_SUCCESS) {
466 return errorText(err);
468 cleanup.p.push_back(program);
470 err = clBuildProgram(program, 1, device_, NULL, NULL, NULL);
471 if (err != CL_SUCCESS) {
472 return errorText(err);
475 cl_kernel kernel = clCreateKernel(program, kernelName.c_str(), &err);
476 if (err != CL_SUCCESS) {
477 return errorText(err);
479 cleanup.k.push_back(kernel);
481 cl_mem resultBuffer = clCreateBuffer(*context_, CL_MEM_READ_WRITE,
sizeof(cl_ulong), NULL, &err);
482 if (err != CL_SUCCESS) {
483 return errorText(err);
485 cleanup.m.push_back(resultBuffer);
487 err = clSetKernelArg(kernel, 0,
sizeof(cl_mem), &resultBuffer);
490 constexpr std::size_t sizeOne = 1;
491 err = clEnqueueNDRangeKernel(queue_, kernel, 1, NULL, &sizeOne, NULL, 0, NULL, &runEvent);
492 if (err != CL_SUCCESS) {
493 return errorText(err);
497 err = clEnqueueReadBuffer(queue_, resultBuffer, CL_TRUE, 0,
sizeof(cl_ulong), &result, 1, &runEvent, NULL);
498 if (err != CL_SUCCESS) {
499 return errorText(err);
502 return std::to_string(result);
505void OpenClContext::runHealthChecks() {
506 deviceInfo_.push_back(std::make_pair(
"host_sizeof(cl_uint)", std::to_string(
sizeof(cl_uint))));
507 deviceInfo_.push_back(std::make_pair(
"host_sizeof(cl_ulong)", std::to_string(
sizeof(cl_ulong))));
508 deviceInfo_.push_back(std::make_pair(
"host_sizeof(cl_float)", std::to_string(
sizeof(cl_float))));
509 deviceInfo_.push_back(std::make_pair(
"host_sizeof(cl_double)", std::to_string(
sizeof(cl_double))));
511 std::string kernelGetUintSize =
512 "__kernel void ore_get_uint_size(__global ulong* result) { result[0] = sizeof(uint); }";
513 std::string kernelGetUlongSize =
514 "__kernel void ore_get_ulong_size(__global ulong* result) { result[0] = sizeof(ulong); }";
515 std::string kernelGetFloatSize =
516 "__kernel void ore_get_float_size(__global ulong* result) { result[0] = sizeof(float); }";
517 std::string kernelGetDoubleSize =
518 "__kernel void ore_get_double_size(__global ulong* result) { result[0] = sizeof(double); }";
520 deviceInfo_.push_back(
521 std::make_pair(
"device_sizeof(uint)", runHealthCheckProgram(kernelGetUintSize,
"ore_get_uint_size")));
522 deviceInfo_.push_back(
523 std::make_pair(
"device_sizeof(ulong)", runHealthCheckProgram(kernelGetUlongSize,
"ore_get_ulong_size")));
524 deviceInfo_.push_back(
525 std::make_pair(
"device_sizeof(float)", runHealthCheckProgram(kernelGetFloatSize,
"ore_get_float_size")));
526 deviceInfo_.push_back(
527 std::make_pair(
"device_sizeof(double)", runHealthCheckProgram(kernelGetDoubleSize,
"ore_get_double_size")));
530void OpenClContext::init() {
536 debugInfo_.numberOfOperations = 0;
537 debugInfo_.nanoSecondsDataCopy = 0;
538 debugInfo_.nanoSecondsProgramBuild = 0;
539 debugInfo_.nanoSecondsCalculation = 0;
543 queue_ = clCreateCommandQueueWithProperties(*context_, *device_, NULL, &err);
546 queue_ = clCreateCommandQueue(*context_, *device_, 0, &err);
548 QL_REQUIRE(err == CL_SUCCESS,
549 "OpenClFramework::OpenClContext(): error during clCreateCommandQueue(): " << errorText(err));
556void OpenClContext::disposeCalculation(
const std::size_t
id) {
557 QL_REQUIRE(!disposed_[
id - 1],
"OpenClContext::disposeCalculation(): id " <<
id <<
" was already disposed.");
558 disposed_[
id - 1] =
true;
559 if (hasKernel_[
id - 1]) {
560 releaseKernel(kernel_[
id - 1],
"kernel id " + std::to_string(
id) +
" (during dispose())");
561 releaseProgram(program_[
id - 1],
"program id " + std::to_string(
id) +
" (during dispose())");
565std::pair<std::size_t, bool> OpenClContext::initiateCalculation(
const std::size_t n,
const std::size_t
id,
566 const std::size_t version,
const Settings settings) {
567 QL_REQUIRE(n > 0,
"OpenClContext::initiateCalculation(): n must not be zero");
569 bool newCalc =
false;
570 settings_ = settings;
577 disposed_.push_back(
false);
578 hasKernel_.push_back(
false);
579 version_.push_back(version);
580 program_.push_back(cl_program());
581 kernel_.push_back(std::vector<cl_kernel>());
582 conditionalExpectationVarIds_.push_back(std::vector<std::vector<std::vector<std::size_t>>>(1));
583 inputBufferSize_.push_back(0);
584 nOutputVars_.push_back(0);
585 nVars_.push_back(std::vector<std::size_t>());
586 nVariates_.push_back(0);
588 currentId_ = hasKernel_.size();
595 QL_REQUIRE(
id <= hasKernel_.size(),
596 "OpenClContext::initiateCalculation(): id (" <<
id <<
") invalid, got 1..." << hasKernel_.size());
597 QL_REQUIRE(size_[
id - 1] == n,
"OpenClContext::initiateCalculation(): size ("
598 << size_[
id - 1] <<
") for id " <<
id <<
" does not match current size ("
600 QL_REQUIRE(!disposed_[
id - 1],
"OpenClContext::initiateCalculation(): id ("
601 <<
id <<
") was already disposed, it can not be used any more.");
603 if (version != version_[
id - 1]) {
604 hasKernel_[
id - 1] =
false;
605 version_[
id - 1] = version;
606 nVars_[
id - 1].clear();
607 nVariates_[
id - 1] = 0;
608 releaseKernel(kernel_[
id - 1],
609 "kernel id " + std::to_string(
id) +
" (during initiateCalculation, old version: " +
610 std::to_string(version_[
id - 1]) +
", new version:" + std::to_string(version) +
")");
611 conditionalExpectationVarIds_[
id - 1] = std::vector<std::vector<std::vector<std::size_t>>>(1);
612 releaseProgram(program_[
id - 1],
613 "program id " + std::to_string(
id) +
" (during initiateCalculation, old version: " +
614 std::to_string(version_[
id - 1]) +
", new version:" + std::to_string(version) +
")");
626 inputVarOffset_.clear();
627 inputVarIsScalar_.clear();
628 inputVarValues32_.clear();
629 inputVarValues64_.clear();
632 freedVariables_.clear();
633 outputVariables_.clear();
634 currentSsa_ = std::vector<std::vector<ssa_entry>>(1);
635 currentConditionalExpectationArgs_.clear();
640 currentState_ = ComputeState::createInput;
644 return std::make_pair(currentId_, newCalc);
647std::size_t OpenClContext::createInputVariable(
double v) {
648 QL_REQUIRE(currentState_ == ComputeState::createInput,
649 "OpenClContext::createInputVariable(): not in state createInput (" <<
static_cast<int>(currentState_)
651 std::size_t nextOffset = 0;
652 if (!inputVarOffset_.empty()) {
653 nextOffset = inputVarOffset_.back() + (inputVarIsScalar_.back() ? 1 : size_[currentId_ - 1]);
655 inputVarOffset_.push_back(nextOffset);
656 inputVarIsScalar_.push_back(
true);
657 if (settings_.useDoublePrecision) {
658 inputVarValues64_.push_back(v);
661 inputVarValues32_.push_back((
float)std::max(std::min(v, (
double)std::numeric_limits<float>::max()),
662 -(
double)std::numeric_limits<float>::max()));
667std::size_t OpenClContext::createInputVariable(
double* v) {
668 QL_REQUIRE(currentState_ == ComputeState::createInput,
669 "OpenClContext::createInputVariable(): not in state createInput (" <<
static_cast<int>(currentState_)
671 std::size_t nextOffset = 0;
672 if (!inputVarOffset_.empty()) {
673 nextOffset = inputVarOffset_.back() + (inputVarIsScalar_.back() ? 1 : size_[currentId_ - 1]);
675 inputVarOffset_.push_back(nextOffset);
676 inputVarIsScalar_.push_back(
false);
677 for (std::size_t i = 0; i < size_[currentId_ - 1]; ++i) {
678 if (settings_.useDoublePrecision) {
679 inputVarValues64_.push_back(v[i]);
681 inputVarValues32_.push_back((
float)std::max(std::min(v[i], (
double)std::numeric_limits<float>::max()),
682 -(
double)std::numeric_limits<float>::max()));
688void OpenClContext::updateVariatesPool() {
689 QL_REQUIRE(nVariatesTmp_ > 0,
"OpenClContext::updateVariatesPool(): internal error, got nVariatesTmp_ == 0.");
691 constexpr std::size_t size_one = 1;
692 constexpr std::size_t mt_N = 624;
694 std::size_t fpSize = settings_.useDoublePrecision ?
sizeof(double) :
sizeof(
float);
696 QL_REQUIRE(!settings_.useDoublePrecision || supportsDoublePrecision(),
697 "OpenClContext::updateVariatesPool(): double precision is configured for this calculation, but not "
698 "supported by the device. Switch to single precision or use an appropriate device.");
701 if (variatesPoolSize_ == 0) {
705 std::string fpTypeStr = settings_.useDoublePrecision ?
"double" :
"float";
706 std::string fpSuffix = settings_.useDoublePrecision ?
"" :
"f";
707 std::string fpMaxValue = settings_.useDoublePrecision ?
"0x1.fffffffffffffp1023" :
"0x1.fffffep127f";
711 std::string sourceInvCumN = fpTypeStr +
" ore_invCumN(const uint x0);\n" +
712 fpTypeStr +
" ore_invCumN(const uint x0) {\n"
713 " const " + fpTypeStr +
" a1_ = -3.969683028665376e+01" + fpSuffix +
";\n"
714 " const " + fpTypeStr +
" a2_ = 2.209460984245205e+02" + fpSuffix +
";\n"
715 " const " + fpTypeStr +
" a3_ = -2.759285104469687e+02" + fpSuffix +
";\n"
716 " const " + fpTypeStr +
" a4_ = 1.383577518672690e+02" + fpSuffix +
";\n"
717 " const " + fpTypeStr +
" a5_ = -3.066479806614716e+01" + fpSuffix +
";\n"
718 " const " + fpTypeStr +
" a6_ = 2.506628277459239e+00" + fpSuffix +
";\n"
719 " const " + fpTypeStr +
" b1_ = -5.447609879822406e+01" + fpSuffix +
";\n"
720 " const " + fpTypeStr +
" b2_ = 1.615858368580409e+02" + fpSuffix +
";\n"
721 " const " + fpTypeStr +
" b3_ = -1.556989798598866e+02" + fpSuffix +
";\n"
722 " const " + fpTypeStr +
" b4_ = 6.680131188771972e+01" + fpSuffix +
";\n"
723 " const " + fpTypeStr +
" b5_ = -1.328068155288572e+01" + fpSuffix +
";\n"
724 " const " + fpTypeStr +
" c1_ = -7.784894002430293e-03" + fpSuffix +
";\n"
725 " const " + fpTypeStr +
" c2_ = -3.223964580411365e-01" + fpSuffix +
";\n"
726 " const " + fpTypeStr +
" c3_ = -2.400758277161838e+00" + fpSuffix +
";\n"
727 " const " + fpTypeStr +
" c4_ = -2.549732539343734e+00" + fpSuffix +
";\n"
728 " const " + fpTypeStr +
" c5_ = 4.374664141464968e+00" + fpSuffix +
";\n"
729 " const " + fpTypeStr +
" c6_ = 2.938163982698783e+00" + fpSuffix +
";\n"
730 " const " + fpTypeStr +
" d1_ = 7.784695709041462e-03" + fpSuffix +
";\n"
731 " const " + fpTypeStr +
" d2_ = 3.224671290700398e-01" + fpSuffix +
";\n"
732 " const " + fpTypeStr +
" d3_ = 2.445134137142996e+00" + fpSuffix +
";\n"
733 " const " + fpTypeStr +
" d4_ = 3.754408661907416e+00" + fpSuffix +
";\n"
734 " const " + fpTypeStr +
" x_low_ = 0.02425" + fpSuffix +
";\n"
735 " const " + fpTypeStr +
" x_high_ = 1.0" + fpSuffix +
" - x_low_;\n"
736 " const " + fpTypeStr +
" x = ((" + fpTypeStr +
")x0 + 0.5" + fpSuffix +
") / 4294967296.0"+ fpSuffix +
";\n"
737 " if (x < x_low_ || x_high_ < x) {\n"
738 " if (x0 == UINT_MAX) {\n"
739 " return " + fpMaxValue +
";\n"
740 " } else if(x0 == 0) {\n"
741 " return -" + fpMaxValue +
";\n"
743 " " + fpTypeStr +
" z;\n"
744 " if (x < x_low_) {\n"
745 " z = sqrt(-2.0" + fpSuffix +
" * log(x));\n"
746 " z = (((((c1_ * z + c2_) * z + c3_) * z + c4_) * z + c5_) * z + c6_) /\n"
747 " ((((d1_ * z + d2_) * z + d3_) * z + d4_) * z + 1.0" + fpSuffix +
");\n"
749 " z = sqrt(-2.0f * log(1.0f - x));\n"
750 " z = -(((((c1_ * z + c2_) * z + c3_) * z + c4_) * z + c5_) * z + c6_) /\n"
751 " ((((d1_ * z + d2_) * z + d3_) * z + d4_) * z + 1.0" + fpSuffix +
");\n"
755 " " + fpTypeStr +
" z = x - 0.5" + fpSuffix +
";\n"
756 " " + fpTypeStr +
" r = z * z;\n"
757 " z = (((((a1_ * r + a2_) * r + a3_) * r + a4_) * r + a5_) * r + a6_) * z /\n"
758 " (((((b1_ * r + b2_) * r + b3_) * r + b4_) * r + b5_) * r + 1.0" + fpSuffix +
");\n"
765 std::string kernelSourceSeedInit =
"__kernel void ore_seedInitialization(const ulong s, __global ulong* mt) {\n"
766 " const ulong N = 624;\n"
767 " mt[0]= s & 0xffffffffUL;\n"
768 " for (ulong mti=1; mti<N; ++mti) {\n"
769 " mt[mti] = (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);\n"
770 " mt[mti] &= 0xffffffffUL;\n"
774 std::string kernelSourceTwist =
"__kernel void ore_twist(__global ulong* mt) {\n"
775 " const ulong N = 624;\n"
776 " const ulong M = 397;\n"
777 " const ulong MATRIX_A = 0x9908b0dfUL;\n"
778 " const ulong UPPER_MASK=0x80000000UL;\n"
779 " const ulong LOWER_MASK=0x7fffffffUL;\n"
780 " const ulong mag01[2]={0x0UL, MATRIX_A};\n"
783 " for (kk=0;kk<N-M;++kk) {\n"
784 " y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);\n"
785 " mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];\n"
787 " for (;kk<N-1;kk++) {\n"
788 " y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);\n"
789 " mt[kk] = mt[(kk+M)-N] ^ (y >> 1) ^ mag01[y & 0x1UL];\n"
791 " y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);\n"
792 " mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];\n"
795 std::string kernelSourceGenerate =
796 "__kernel void ore_generate(const ulong offset, __global ulong* mt, __global " + fpTypeStr +
"* output) {\n"
797 " ulong mti = get_global_id(0);\n"
798 " ulong y = mt[mti];\n"
800 " y ^= (y << 7) & 0x9d2c5680UL;\n"
801 " y ^= (y << 15) & 0xefc60000UL;\n"
803 " output[offset + mti] = ore_invCumN((uint)y);\n"
807 std::string programSource = sourceInvCumN + kernelSourceSeedInit + kernelSourceTwist + kernelSourceGenerate;
811 const char* programSourcePtr = programSource.c_str();
813 variatesProgram_ = clCreateProgramWithSource(*context_, 1, &programSourcePtr, NULL, &err);
814 QL_REQUIRE(err == CL_SUCCESS,
815 "OpenClContext::updateVariatesPool(): error creating program: " << errorText(err));
816 err = clBuildProgram(variatesProgram_, 1, device_, NULL, NULL, NULL);
817 if (err != CL_SUCCESS) {
819 clGetProgramBuildInfo(variatesProgram_, *device_, CL_PROGRAM_BUILD_LOG,
821 QL_FAIL(
"OpenClContext::updateVariatesPool(): error during program build: "
825 variatesKernelSeedInit_ = clCreateKernel(variatesProgram_,
"ore_seedInitialization", &err);
826 QL_REQUIRE(err == CL_SUCCESS,
827 "OpenClContext::updateVariatesPool(): error creating kernel seedInit: " << errorText(err));
829 variatesKernelTwist_ = clCreateKernel(variatesProgram_,
"ore_twist", &err);
830 QL_REQUIRE(err == CL_SUCCESS,
831 "OpenClContext::updateVariatesPool(): error creating kernel twist: " << errorText(err));
833 variatesKernelGenerate_ = clCreateKernel(variatesProgram_,
"ore_generate", &err);
834 QL_REQUIRE(err == CL_SUCCESS,
835 "OpenClContext::updateVariatesPool(): error creating kernel generate: " << errorText(err));
837 variatesMtStateBuffer_ = clCreateBuffer(*context_, CL_MEM_READ_WRITE,
sizeof(cl_ulong) * mt_N, NULL, &err);
838 QL_REQUIRE(err == CL_SUCCESS,
839 "OpenClContext::updateVariatesPool(): error creating mt state buffer: " << errorText(err));
841 cl_ulong tmpSeed = (cl_ulong)settings_.rngSeed;
842 err = clSetKernelArg(variatesKernelSeedInit_, 0,
sizeof(cl_ulong), &tmpSeed);
843 err |= clSetKernelArg(variatesKernelSeedInit_, 1,
sizeof(cl_mem), &variatesMtStateBuffer_);
844 QL_REQUIRE(err == CL_SUCCESS,
845 "OpenClContext::updateVariatesPool(): error setting kernel args seed init: " << errorText(err));
847 err = clEnqueueNDRangeKernel(queue_, variatesKernelSeedInit_, 1, NULL, &size_one, NULL, 0, NULL, &initEvent);
848 QL_REQUIRE(err == CL_SUCCESS,
849 "OpenClContext::updateVariatesPool(): error running kernel seed init: " << errorText(err));
854 if (variatesPoolSize_ >= nVariatesTmp_ * size_[currentId_ - 1]) {
855 if (variatesPoolSize_ == 0)
856 clWaitForEvents(1, &initEvent);
862 Size alignedSize = 624 * (nVariatesTmp_ * size_[currentId_ - 1] / 624 +
863 (nVariatesTmp_ * size_[currentId_ - 1] % 624 == 0 ? 0 : 1));
867 cl_mem oldBuffer = variatesPool_;
869 struct OldBufferReleaser {
870 OldBufferReleaser(cl_mem b,
bool active) : b(b), active(active) {}
871 ~OldBufferReleaser() {
873 OpenClContext::releaseMem(b,
"expired variates buffer");
877 } oldBufferReleaser(oldBuffer, variatesPoolSize_ > 0);
879 variatesPool_ = clCreateBuffer(*context_, CL_MEM_READ_WRITE, fpSize * alignedSize, NULL, &err);
880 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::updateVariatesPool(): error creating variates buffer with size "
881 << fpSize * alignedSize <<
" bytes: " << errorText(err));
883 if (variatesPoolSize_ > 0) {
884 err = clEnqueueCopyBuffer(queue_, oldBuffer, variatesPool_, 0, 0, fpSize * variatesPoolSize_, 0, NULL,
886 QL_REQUIRE(err == CL_SUCCESS,
887 "OpenClContext::updateVariatesPool(): error copying existing variates buffer to new buffer: "
893 std::size_t currentPoolSize;
894 cl_event generateEvent;
895 bool haveGenerated =
false;
896 for (currentPoolSize = variatesPoolSize_; currentPoolSize < nVariatesTmp_ * size_[currentId_ - 1];
897 currentPoolSize += mt_N) {
898 err = clSetKernelArg(variatesKernelTwist_, 0,
sizeof(cl_mem), &variatesMtStateBuffer_);
899 QL_REQUIRE(err == CL_SUCCESS,
900 "OpenClContext::updateVariatesPool(): error setting args for kernel twist: " << errorText(err));
902 err = clEnqueueNDRangeKernel(
903 queue_, variatesKernelTwist_, 1, NULL, &size_one, NULL, variatesPoolSize_ == 0 || haveGenerated ? 1 : 0,
904 variatesPoolSize_ == 0 ? &initEvent : (haveGenerated ? &generateEvent : NULL), &twistEvent);
905 QL_REQUIRE(err == CL_SUCCESS,
906 "OpenClContext::updateVariatesPool(): error running kernel twist: " << errorText(err));
908 err = clSetKernelArg(variatesKernelGenerate_, 0,
sizeof(cl_ulong), ¤tPoolSize);
909 err |= clSetKernelArg(variatesKernelGenerate_, 1,
sizeof(cl_mem), &variatesMtStateBuffer_);
910 err |= clSetKernelArg(variatesKernelGenerate_, 2,
sizeof(cl_mem), &variatesPool_);
911 QL_REQUIRE(err == CL_SUCCESS,
912 "OpenClContext::updateVariatesPool(): error settings args for kernel generate: " << errorText(err));
913 err = clEnqueueNDRangeKernel(queue_, variatesKernelGenerate_, 1, NULL, &mt_N, NULL, 1, &twistEvent,
915 QL_REQUIRE(err == CL_SUCCESS,
916 "OpenClContext::updateVariatesPool(): error running kernel generate: " << errorText(err));
917 haveGenerated =
true;
922 std::vector<cl_event> waitList;
923 if (variatesPoolSize_ > 0)
924 waitList.push_back(copyEvent);
926 waitList.push_back(generateEvent);
927 if (!waitList.empty())
928 clWaitForEvents(waitList.size(), &waitList[0]);
932 QL_REQUIRE(currentPoolSize == alignedSize,
"OpenClContext::updateVariatesPool(): internal error, currentPoolSize = "
933 << currentPoolSize <<
" does not match alignedSize " << alignedSize);
934 variatesPoolSize_ = currentPoolSize;
937std::vector<std::vector<std::size_t>> OpenClContext::createInputVariates(
const std::size_t dim,
938 const std::size_t
steps) {
939 QL_REQUIRE(currentState_ == ComputeState::createInput || currentState_ == ComputeState::createVariates,
940 "OpenClContext::createInputVariates(): not in state createInput or createVariates ("
941 <<
static_cast<int>(currentState_) <<
")");
942 QL_REQUIRE(currentId_ > 0,
"OpenClContext::applyOperation(): current id is not set");
943 QL_REQUIRE(!hasKernel_[currentId_ - 1],
"OpenClContext::createInputVariates(): id ("
944 << currentId_ <<
") in version " << version_[currentId_ - 1]
945 <<
" has a kernel already, input variates can not be regenerated.");
946 currentState_ = ComputeState::createVariates;
947 std::vector<std::vector<std::size_t>> resultIds(dim, std::vector<std::size_t>(
steps));
948 for (std::size_t j = 0; j <
steps; ++j) {
949 for (std::size_t i = 0; i < dim; ++i) {
950 resultIds[i][j] = nVarsTmp_++;
953 nVariatesTmp_ += dim *
steps;
954 updateVariatesPool();
958std::size_t OpenClContext::generateResultId() {
959 std::size_t resultId;
960 if (!freedVariables_.empty()) {
961 resultId = freedVariables_.back();
962 freedVariables_.pop_back();
964 resultId = nVarsTmp_++;
969std::pair<std::vector<std::string>, std::set<std::size_t>>
970OpenClContext::getArgString(
const std::vector<std::size_t>& args)
const {
971 std::vector<std::string> argStr(args.size());
972 std::set<std::size_t> localIds;
973 for (std::size_t i = 0; i < args.size(); ++i) {
974 if (args[i] < inputVarOffset_.size()) {
975 argStr[i] =
"input[" + std::to_string(inputVarOffset_[args[i]]) +
"U" +
976 (inputVarIsScalar_[args[i]] ?
"]" :
" + i]");
977 }
else if (args[i] < inputVarOffset_.size() + nVariatesTmp_) {
978 argStr[i] =
"rn[" + std::to_string((args[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) +
"U + i]";
980 argStr[i] =
"v" + std::to_string(args[i]);
981 localIds.insert(args[i]);
984 return std::make_pair(argStr, localIds);
987void OpenClContext::startNewSsaPart() {
988 currentSsa_.push_back(std::vector<ssa_entry>());
989 conditionalExpectationVarIds_[currentId_ - 1].push_back(std::vector<std::vector<std::size_t>>());
990 nVars_[currentId_ - 1].push_back(nVarsTmp_);
993std::size_t OpenClContext::applyOperation(
const std::size_t randomVariableOpCode,
994 const std::vector<std::size_t>& args) {
995 QL_REQUIRE(currentState_ == ComputeState::createInput || currentState_ == ComputeState::createVariates ||
996 currentState_ == ComputeState::calc,
997 "OpenClContext::applyOperation(): not in state createInput or calc (" <<
static_cast<int>(currentState_)
999 currentState_ = ComputeState::calc;
1000 QL_REQUIRE(currentId_ > 0,
"OpenClContext::applyOperation(): current id is not set");
1001 QL_REQUIRE(!hasKernel_[currentId_ - 1],
"OpenClContext::applyOperation(): id (" << currentId_ <<
") in version "
1002 << version_[currentId_ - 1]
1003 <<
" has a kernel already.");
1004 std::string fpTypeStr = settings_.useDoublePrecision ?
"double" :
"float";
1006 auto [argStr, argLocalIds] = getArgString(args);
1010 if (std::find_if(argStr.begin(), argStr.end(), [
this](
const std::string& a) {
1011 return currentConditionalExpectationArgs_.find(a) != currentConditionalExpectationArgs_.end();
1012 }) != argStr.end()) {
1014 currentConditionalExpectationArgs_.clear();
1017 std::vector<std::size_t> argIds;
1018 std::size_t regressandId;
1019 for (std::size_t i = 0; i < argStr.size() + 1; ++i) {
1020 auto resultId = generateResultId();
1022 currentSsa_.back().push_back(
1023 {std::string(
"v") + std::to_string(resultId), resultId, i == 0 ?
"0.0" : argStr[i - 1], argLocalIds});
1024 argIds.push_back(resultId);
1026 regressandId = resultId;
1027 currentConditionalExpectationArgs_.insert(
"v" + std::to_string(resultId));
1031 conditionalExpectationVarIds_[currentId_ - 1].back().push_back(argIds);
1033 return regressandId;
1039 auto resultId = generateResultId();
1041 switch (randomVariableOpCode) {
1046 currentSsa_.back().push_back(
1047 {
"v" + std::to_string(resultId), resultId, argStr[0] +
" + " + argStr[1], argLocalIds});
1051 currentSsa_.back().push_back(
1052 {
"v" + std::to_string(resultId), resultId, argStr[0] +
" - " + argStr[1], argLocalIds});
1056 currentSsa_.back().push_back({
"v" + std::to_string(resultId), resultId,
"-" + argStr[0], argLocalIds});
1060 currentSsa_.back().push_back(
1061 {
"v" + std::to_string(resultId), resultId, argStr[0] +
" * " + argStr[1], argLocalIds});
1065 currentSsa_.back().push_back(
1066 {
"v" + std::to_string(resultId), resultId, argStr[0] +
" / " + argStr[1], argLocalIds});
1070 currentSsa_.back().push_back({
"v" + std::to_string(resultId), resultId,
1071 "ore_indicatorEq(" + argStr[0] +
"," + argStr[1] +
")", argLocalIds});
1075 currentSsa_.back().push_back({
"v" + std::to_string(resultId), resultId,
1076 "ore_indicatorGt(" + argStr[0] +
"," + argStr[1] +
")", argLocalIds});
1080 currentSsa_.back().push_back({
"v" + std::to_string(resultId), resultId,
1081 "ore_indicatorGeq(" + argStr[0] +
"," + argStr[1] +
")", argLocalIds});
1085 currentSsa_.back().push_back(
1086 {
"v" + std::to_string(resultId), resultId,
"fmin(" + argStr[0] +
"," + argStr[1] +
")", argLocalIds});
1090 currentSsa_.back().push_back(
1091 {
"v" + std::to_string(resultId), resultId,
"fmax(" + argStr[0] +
"," + argStr[1] +
")", argLocalIds});
1095 currentSsa_.back().push_back(
1096 {
"v" + std::to_string(resultId), resultId,
"fabs(" + argStr[0] +
")", argLocalIds});
1100 currentSsa_.back().push_back(
1101 {
"v" + std::to_string(resultId), resultId,
"exp(" + argStr[0] +
")", argLocalIds});
1105 currentSsa_.back().push_back(
1106 {
"v" + std::to_string(resultId), resultId,
"sqrt(" + argStr[0] +
")", argLocalIds});
1110 currentSsa_.back().push_back(
1111 {
"v" + std::to_string(resultId), resultId,
"log(" + argStr[0] +
")", argLocalIds});
1115 currentSsa_.back().push_back(
1116 {
"v" + std::to_string(resultId), resultId,
"pow(" + argStr[0] +
"," + argStr[1] +
")", argLocalIds});
1126 currentSsa_.back().push_back(
1127 {
"v" + std::to_string(resultId), resultId,
"ore_normalPdf(" + argStr[0] +
")", argLocalIds});
1131 QL_FAIL(
"OpenClContext::executeKernel(): no implementation for op code "
1137 if (settings_.debug)
1138 debugInfo_.numberOfOperations += 1 * size_[currentId_ - 1];
1144void OpenClContext::freeVariable(
const std::size_t
id) {
1145 QL_REQUIRE(currentState_ == ComputeState::calc,
1146 "OpenClContext::free(): not in state calc (" <<
static_cast<int>(currentState_) <<
")");
1147 QL_REQUIRE(currentId_ > 0,
"OpenClContext::freeVariable(): current id is not set");
1148 QL_REQUIRE(!hasKernel_[currentId_ - 1],
"OpenClContext::freeVariable(): id ("
1149 << currentId_ <<
") in version " << version_[currentId_ - 1]
1150 <<
" has a kernel already, variables can not be freed.");
1154 if (
id < inputVarOffset_.size() + nVariatesTmp_)
1157 freedVariables_.push_back(
id);
1160void OpenClContext::declareOutputVariable(
const std::size_t
id) {
1161 QL_REQUIRE(currentState_ != ComputeState::idle,
"OpenClContext::declareOutputVariable(): state is idle");
1162 QL_REQUIRE(currentId_ > 0,
"OpenClContext::declareOutputVariable(): current id not set");
1163 QL_REQUIRE(!hasKernel_[currentId_ - 1],
"OpenClContext::declareOutputVariable(): id ("
1164 << currentId_ <<
") in version " << version_[currentId_ - 1]
1165 <<
" has a kernel already, output variables can not be declared.");
1166 currentState_ = ComputeState::declareOutput;
1167 outputVariables_.push_back(
id);
1168 nOutputVars_[currentId_ - 1]++;
1172 if (currentConditionalExpectationArgs_.find(
"v" + std::to_string(
id)) != currentConditionalExpectationArgs_.end()) {
1174 currentConditionalExpectationArgs_.clear();
1178std::size_t OpenClContext::valuesBufferId(
const std::size_t i)
const {
1179 return i - inputVarOffset_.size() - nVariates_[currentId_ - 1];
1182std::string OpenClContext::generateSsaCode(
const std::vector<ssa_entry>& ssa)
const {
1183 std::set<std::size_t> localVars;
1184 for (
auto const& s : ssa) {
1185 localVars.insert(s.rhs_local_id.begin(), s.rhs_local_id.end());
1188 std::string fpTypeStr = settings_.useDoublePrecision ?
"double" :
"float";
1191 std::set<std::size_t> hasDeclaration;
1192 for (
auto const& s : ssa) {
1193 if (s.lhs_local_id) {
1194 if (localVars.find(*s.lhs_local_id) == localVars.end())
1196 if (hasDeclaration.find(*s.lhs_local_id) == hasDeclaration.end()) {
1197 result += fpTypeStr +
" ";
1198 hasDeclaration.insert(*s.lhs_local_id);
1201 result += s.lhs_str +
"=" + s.rhs_str +
";\n";
1206void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
1210 *currentState = ComputeState::idle;
1212 clReleaseMemObject(m);
1214 ComputeState* currentState;
1215 std::vector<cl_mem> mem;
1218 guard.currentState = ¤tState_;
1220 QL_REQUIRE(currentId_ > 0,
"OpenClContext::finalizeCalculation(): current id is not set");
1221 QL_REQUIRE(output.size() == nOutputVars_[currentId_ - 1],
1222 "OpenClContext::finalizeCalculation(): output size ("
1223 << output.size() <<
") inconsistent to kernel output size (" << nOutputVars_[currentId_ - 1] <<
")");
1224 QL_REQUIRE(!settings_.useDoublePrecision || supportsDoublePrecision(),
1225 "OpenClContext::finalizeCalculation(): double precision is configured for this calculation, but not "
1226 "supported by the device. Switch to single precision or use an appropriate device.");
1230 if (nVars_[currentId_ - 1].size() < currentSsa_.size())
1231 nVars_[currentId_ - 1].push_back(nVarsTmp_);
1233 if (nVariates_[currentId_ - 1] == 0)
1234 nVariates_[currentId_ - 1] = nVariatesTmp_;
1236 boost::timer::cpu_timer timer;
1237 boost::timer::nanosecond_type timerBase;
1241 std::size_t fpSize = settings_.useDoublePrecision ?
sizeof(double) :
sizeof(
float);
1243 if (settings_.debug) {
1244 timerBase = timer.elapsed().wall;
1247 std::size_t inputBufferSize = 0;
1248 if (!inputVarOffset_.empty())
1249 inputBufferSize = inputVarOffset_.back() + (inputVarIsScalar_.back() ? 1 : size_[currentId_ - 1]);
1252 if (inputBufferSize > 0) {
1253 inputBuffer = clCreateBuffer(*context_, CL_MEM_READ_WRITE, fpSize * inputBufferSize, NULL, &err);
1254 guard.mem.push_back(inputBuffer);
1255 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::finalizeCalculation(): creating input buffer of size "
1256 << inputBufferSize <<
" fails: " << errorText(err));
1259 std::size_t valuesBufferSize = 0;
1260 cl_mem valuesBuffer;
1261 if (currentSsa_.size() > 1) {
1262 valuesBufferSize = (nVars_[currentId_ - 1].back() - inputVarOffset_.size() - nVariates_[currentId_ - 1]) *
1263 size_[currentId_ - 1];
1264 if (valuesBufferSize > 0) {
1265 valuesBuffer = clCreateBuffer(*context_, CL_MEM_READ_WRITE, fpSize * valuesBufferSize, NULL, &err);
1266 guard.mem.push_back(valuesBuffer);
1267 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::finalizeCalculation(): creating values buffer of size "
1268 << valuesBufferSize <<
" fails: " << errorText(err));
1272 std::size_t outputBufferSize = nOutputVars_[currentId_ - 1] * size_[currentId_ - 1];
1273 cl_mem outputBuffer;
1274 if (outputBufferSize > 0) {
1275 outputBuffer = clCreateBuffer(*context_, CL_MEM_READ_WRITE, fpSize * outputBufferSize, NULL, &err);
1276 guard.mem.push_back(outputBuffer);
1277 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::finalizeCalculation(): creating output buffer of size "
1278 << outputBufferSize <<
" fails: " << errorText(err));
1281 if (settings_.debug) {
1282 debugInfo_.nanoSecondsDataCopy += timer.elapsed().wall - timerBase;
1287 if (!hasKernel_[currentId_ - 1]) {
1288 std::string fpTypeStr = settings_.useDoublePrecision ?
"double" :
"float";
1289 std::string fpEpsStr = settings_.useDoublePrecision ?
"0x1.0p-52" :
"0x1.0p-23f";
1290 std::string fpSuffix = settings_.useDoublePrecision ? std::string() :
"f";
1295 std::string kernelSource =
1296 "bool ore_closeEnough(const " + fpTypeStr +
" x, const " + fpTypeStr +
" y);\n"
1297 "bool ore_closeEnough(const " + fpTypeStr +
" x, const " + fpTypeStr +
" y) {\n"
1298 " const " + fpTypeStr +
" tol = 42.0" + fpSuffix +
" * " + fpEpsStr +
";\n"
1299 " " + fpTypeStr +
" diff = fabs(x - y);\n"
1300 " if (x == 0.0" + fpSuffix +
" || y == 0.0" + fpSuffix +
")\n"
1301 " return diff < tol * tol;\n"
1302 " return diff <= tol * fabs(x) || diff <= tol * fabs(y);\n"
1304 fpTypeStr +
" ore_indicatorEq(const " + fpTypeStr +
" x, const " + fpTypeStr +
" y);\n" +
1305 fpTypeStr +
" ore_indicatorEq(const " + fpTypeStr +
" x, const " + fpTypeStr +
" y) "
1306 "{ return ore_closeEnough(x, y) ? 1.0" + fpSuffix +
" : 0.0" + fpSuffix +
"; }\n\n" +
1307 fpTypeStr +
" ore_indicatorGt(const " + fpTypeStr +
" x, const " + fpTypeStr +
" y);\n" +
1308 fpTypeStr +
" ore_indicatorGt(const " + fpTypeStr +
" x, const " + fpTypeStr +
" y) " +
1309 "{ return x > y && !ore_closeEnough(x, y); }\n\n" +
1310 fpTypeStr +
" ore_indicatorGeq(const " + fpTypeStr +
" x, const " + fpTypeStr +
" y);\n" +
1311 fpTypeStr +
" ore_indicatorGeq(const " + fpTypeStr +
" x, const " + fpTypeStr +
" y) { return x > y || ore_closeEnough(x, y); }\n\n" +
1312 fpTypeStr +
" ore_normalCdf(const " + fpTypeStr +
" x);\n" +
1313 fpTypeStr +
" ore_normalCdf(const " + fpTypeStr +
" x) {\n return 0.0" + fpSuffix +
";}\n" +
1314 fpTypeStr +
" ore_normalPdf(const " + fpTypeStr +
" x);\n" +
1315 fpTypeStr +
" ore_normalPdf(const " + fpTypeStr +
" x) {\n" +
1316 " " + fpTypeStr +
" exponent = -(x*x)/2.0" + fpSuffix +
";\n" +
1317 " return exponent <= -690.0" + fpSuffix +
" ? 0.0 : exp(exponent) * 0.3989422804014327" + fpSuffix +
";\n"
1322 std::string kernelNameStem =
1323 "ore_kernel_" + std::to_string(currentId_) +
"_" + std::to_string(version_[currentId_ - 1]) +
"_";
1325 for (std::size_t part = 0; part < currentSsa_.size(); ++part) {
1327 bool initFromValues = part > 0;
1328 bool cacheToValues = currentSsa_.size() > 1 && part < currentSsa_.size() - 1;
1329 bool generateOutputValues = part == currentSsa_.size() - 1;
1331 std::string kernelName = kernelNameStem + std::to_string(part);
1333 std::vector<std::string> inputArgs;
1334 if (inputBufferSize > 0)
1335 inputArgs.push_back(
"__global " + fpTypeStr +
"* input");
1336 if (nVariates_[currentId_ - 1] > 0)
1337 inputArgs.push_back(
"__global " + fpTypeStr +
"* rn");
1338 if (valuesBufferSize > 0 && (initFromValues || cacheToValues))
1339 inputArgs.push_back(
"__global " + fpTypeStr +
"* values");
1340 if (outputBufferSize > 0 && generateOutputValues)
1341 inputArgs.push_back(
"__global " + fpTypeStr +
"* output");
1343 kernelSource +=
"__kernel void " + kernelName +
"(" + boost::join(inputArgs,
",") +
1345 "unsigned int i = get_global_id(0);\n"
1347 std::to_string(size_[currentId_ - 1]) +
"U) {\n";
1349 std::vector<ssa_entry> ssa;
1351 if (initFromValues) {
1352 for (std::size_t i = inputVarOffset_.size() + nVariates_[currentId_ - 1];
1353 i < nVars_[currentId_ - 1][part]; ++i) {
1355 {std::string(
"v") + std::to_string(i),
1357 std::string(
"values[") + std::to_string(valuesBufferId(i) * size_[currentId_ - 1]) +
"U + i]",
1362 ssa.insert(ssa.end(), currentSsa_[part].begin(), currentSsa_[part].end());
1364 if (cacheToValues) {
1365 for (std::size_t i = inputVarOffset_.size() + nVariates_[currentId_ - 1];
1366 i < nVars_[currentId_ - 1][part]; ++i) {
1367 ssa.push_back({
"values[" + std::to_string(valuesBufferId(i) * size_[currentId_ - 1]) +
"U + i]",
1369 "v" + std::to_string(i),
1374 if (generateOutputValues) {
1375 for (std::size_t i = 0; i < nOutputVars_[currentId_ - 1]; ++i) {
1376 std::size_t offset = i * size_[currentId_ - 1];
1378 std::set<std::size_t> rhsLocalIds;
1379 if (outputVariables_[i] < inputVarOffset_.size()) {
1380 output =
"input[" + std::to_string(inputVarOffset_[outputVariables_[i]]) +
"U" +
1381 (inputVarIsScalar_[outputVariables_[i]] ?
"]" :
" + i] ");
1382 }
else if (outputVariables_[i] < inputVarOffset_.size() + nVariates_[currentId_ - 1]) {
1385 std::to_string((outputVariables_[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) +
1388 output =
"v" + std::to_string(outputVariables_[i]);
1389 rhsLocalIds.insert(outputVariables_[i]);
1391 ssa.push_back({
"output[" + std::to_string(offset) +
"U + i]", std::nullopt, output, rhsLocalIds});
1395 kernelSource += generateSsaCode(ssa);
1396 kernelSource +=
"}}\n";
1402 if (settings_.debug) {
1403 timerBase = timer.elapsed().wall;
1407 const char* kernelSourcePtr = kernelSource.c_str();
1408 program_[currentId_ - 1] = clCreateProgramWithSource(*context_, 1, &kernelSourcePtr, NULL, &err);
1409 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::finalizeCalculation(): error during clCreateProgramWithSource(): "
1411 err = clBuildProgram(program_[currentId_ - 1], 1, device_, NULL, NULL, NULL);
1412 if (err != CL_SUCCESS) {
1414 clGetProgramBuildInfo(program_[currentId_ - 1], *device_, CL_PROGRAM_BUILD_LOG,
1416 QL_FAIL(
"OpenClContext::finalizeCalculation(): error during program build for kernel '"
1417 << kernelNameStem <<
"*': " << errorText(err) <<
": "
1421 for (std::size_t part = 0; part < currentSsa_.size(); ++part) {
1422 std::string kernelName = kernelNameStem + std::to_string(part);
1423 kernel_[currentId_ - 1].push_back(clCreateKernel(program_[currentId_ - 1], kernelName.c_str(), &err));
1424 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::finalizeCalculation(): error during clCreateKernel(), part #"
1425 << part <<
": " << errorText(err));
1428 hasKernel_[currentId_ - 1] =
true;
1429 inputBufferSize_[currentId_ - 1] = inputBufferSize;
1431 if (settings_.debug) {
1432 debugInfo_.nanoSecondsProgramBuild += timer.elapsed().wall - timerBase;
1435 QL_REQUIRE(inputBufferSize == inputBufferSize_[currentId_ - 1],
1436 "OpenClContext::finalizeCalculation(): input buffer size ("
1437 << inputBufferSize <<
") inconsistent to kernel input buffer size ("
1438 << inputBufferSize_[currentId_ - 1] <<
")");
1443 if (settings_.debug) {
1444 timerBase = timer.elapsed().wall;
1447 cl_event inputBufferEvent;
1448 if (inputBufferSize > 0) {
1449 err = clEnqueueWriteBuffer(queue_, inputBuffer, CL_FALSE, 0, fpSize * inputBufferSize,
1450 settings_.useDoublePrecision ? (
void*)&inputVarValues64_[0]
1451 : (
void*)&inputVarValues32_[0],
1452 0, NULL, &inputBufferEvent);
1453 QL_REQUIRE(err == CL_SUCCESS,
1454 "OpenClContext::finalizeCalculation(): writing to input buffer fails: " << errorText(err));
1457 if (settings_.debug) {
1458 err = clFinish(queue_);
1459 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::clFinish(): error in debug mode: " << errorText(err));
1460 debugInfo_.nanoSecondsDataCopy += timer.elapsed().wall - timerBase;
1463 std::vector<cl_event> runWaitEvents;
1464 if (inputBufferSize > 0)
1465 runWaitEvents.push_back(inputBufferEvent);
1467 std::vector<double> values(valuesBufferSize);
1468 std::vector<float> valuesFloat;
1469 if (!settings_.useDoublePrecision) {
1470 valuesFloat.resize(values.size());
1473 for (std::size_t part = 0; part < kernel_[currentId_ - 1].size(); ++part) {
1474 bool initFromValues = part > 0;
1475 bool cacheToValues = currentSsa_.size() > 1 && part < currentSsa_.size() - 1;
1476 bool generateOutputValues = part == currentSsa_.size() - 1;
1480 std::size_t kidx = 0;
1482 if (inputBufferSize > 0) {
1483 err |= clSetKernelArg(kernel_[currentId_ - 1][part], kidx++,
sizeof(cl_mem), &inputBuffer);
1485 if (nVariates_[currentId_ - 1] > 0) {
1486 err |= clSetKernelArg(kernel_[currentId_ - 1][part], kidx++,
sizeof(cl_mem), &variatesPool_);
1488 if (valuesBufferSize > 0 && (initFromValues || cacheToValues)) {
1489 err |= clSetKernelArg(kernel_[currentId_ - 1][part], kidx++,
sizeof(cl_mem), &valuesBuffer);
1491 if (outputBufferSize > 0 && generateOutputValues) {
1492 err |= clSetKernelArg(kernel_[currentId_ - 1][part], kidx++,
sizeof(cl_mem), &outputBuffer);
1494 QL_REQUIRE(err == CL_SUCCESS,
1495 "OpenClContext::finalizeCalculation(): set kernel args fails: " << errorText(err));
1499 if (settings_.debug) {
1500 err = clFinish(queue_);
1501 timerBase = timer.elapsed().wall;
1505 err = clEnqueueNDRangeKernel(queue_, kernel_[currentId_ - 1][part], 1, NULL, &size_[currentId_ - 1], NULL,
1506 runWaitEvents.size(), runWaitEvents.empty() ? NULL : &runWaitEvents[0], &runEvent);
1507 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::finalizeCalculation(): enqueue kernel fails: " << errorText(err));
1508 runWaitEvents.push_back(runEvent);
1512 if (kernel_[currentId_ - 1].size() > 1 && part < kernel_[currentId_ - 1].size() - 1) {
1518 clEnqueueReadBuffer(queue_, valuesBuffer, CL_FALSE, 0, fpSize * valuesBufferSize,
1519 settings_.useDoublePrecision ? (
void*)&values[0] : (
void*)&valuesFloat[0],
1520 runWaitEvents.size(), runWaitEvents.empty() ? NULL : &runWaitEvents[0], &readEvent);
1521 QL_REQUIRE(err == CL_SUCCESS,
1522 "OpenClContext::finalizeCalculation(): enqueue read values buffer fails: " << errorText(err));
1524 err = clWaitForEvents(1, &readEvent);
1528 "OpenClContext::finalizeCalculation(): wait for read values buffer event fails: " << errorText(err));
1530 if (!settings_.useDoublePrecision) {
1531 std::copy(valuesFloat.begin(), valuesFloat.end(), values.begin());
1534 for (
auto const& v : conditionalExpectationVarIds_[currentId_ - 1][part]) {
1538 QL_REQUIRE(v.size() >= 4,
1539 "OpenClContext::finalizeCalculation(): expected at least 4 varIds (3 args and 1 result) for "
1540 "conditional expectation, got "
1542 RandomVariable regressand(size_[currentId_ - 1], &values[valuesBufferId(v[1]) * size_[currentId_ - 1]]);
1544 RandomVariable(size_[currentId_ - 1], &values[valuesBufferId(v[2]) * size_[currentId_ - 1]]),
1545 RandomVariable(size_[currentId_ - 1], 1.0));
1546 std::vector<RandomVariable> regressor(v.size() - 3);
1547 for (std::size_t i = 3; i < v.size(); ++i) {
1549 RandomVariable(size_[currentId_ - 1], &values[valuesBufferId(v[i]) * size_[currentId_ - 1]]);
1555 QuantLib::LsmBasisSystem::Monomial, regressand.size()),
1561 std::copy(ce.data(), ce.data() + ce.size(), &values[valuesBufferId(v[0]) * size_[currentId_ - 1]]);
1564 if (!settings_.useDoublePrecision) {
1565 std::copy(values.begin(), values.end(), valuesFloat.begin());
1570 runWaitEvents.push_back(cl_event());
1571 err = clEnqueueWriteBuffer(queue_, valuesBuffer, CL_FALSE, 0, fpSize * valuesBufferSize,
1572 settings_.useDoublePrecision ? (
void*)&values[0] : (
void*)&valuesFloat[0], 0,
1573 NULL, &runWaitEvents.back());
1574 QL_REQUIRE(err == CL_SUCCESS,
1575 "OpenClContext::finalizeCalculation(): write values buffer fails: " << errorText(err));
1579 if (settings_.debug) {
1580 err = clFinish(queue_);
1581 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::clFinish(): error in debug mode: " << errorText(err));
1582 debugInfo_.nanoSecondsCalculation += timer.elapsed().wall - timerBase;
1587 if (settings_.debug) {
1588 timerBase = timer.elapsed().wall;
1593 std::vector<cl_event> outputBufferEvents;
1594 if (outputBufferSize > 0) {
1595 std::vector<std::vector<float>> outputFloat;
1596 if (!settings_.useDoublePrecision) {
1597 outputFloat.resize(output.size(), std::vector<float>(size_[currentId_ - 1]));
1599 for (std::size_t i = 0; i < output.size(); ++i) {
1600 outputBufferEvents.push_back(cl_event());
1601 err = clEnqueueReadBuffer(
1602 queue_, outputBuffer, CL_FALSE, fpSize * i * size_[currentId_ - 1], fpSize * size_[currentId_ - 1],
1603 settings_.useDoublePrecision ? (
void*)&output[i][0] : (
void*)&outputFloat[i][0], runWaitEvents.size(),
1604 runWaitEvents.empty() ? NULL : &runWaitEvents[0], &outputBufferEvents.back());
1605 QL_REQUIRE(err == CL_SUCCESS,
1606 "OpenClContext::finalizeCalculation(): writing to output buffer fails: " << errorText(err));
1608 err = clWaitForEvents(outputBufferEvents.size(), outputBufferEvents.empty() ?
nullptr : &outputBufferEvents[0]);
1611 "OpenClContext::finalizeCalculation(): wait for output buffer events to finish fails: " << errorText(err));
1612 if (!settings_.useDoublePrecision) {
1613 for (std::size_t i = 0; i < output.size(); ++i) {
1614 std::copy(outputFloat[i].begin(), outputFloat[i].end(), output[i]);
1619 if (settings_.debug) {
1620 err = clFinish(queue_);
1621 QL_REQUIRE(err == CL_SUCCESS,
"OpenClContext::clFinish(): error in debug mode: " << errorText(err));
1622 debugInfo_.nanoSecondsDataCopy += timer.elapsed().wall - timerBase;
1626const ComputeContext::DebugInfo& OpenClContext::debugInfo()
const {
return debugInfo_; }
1628std::vector<std::pair<std::string, std::string>> OpenClContext::deviceInfo()
const {
return deviceInfo_; }
1629bool OpenClContext::supportsDoublePrecision()
const {
return supportsDoublePrecision_; }
1633#ifndef ORE_ENABLE_OPENCL
1639 std::set<std::string> tmp;
1650 QL_FAIL(
"OpenClFrameWork::getContext(): device '"
1651 << deviceName <<
"' not found. Available devices: " << boost::join(
getAvailableDevices(),
","));
std::map< std::string, ComputeContext * > contexts_
static std::string platformName_[4U]
static std::vector< std::pair< std::string, std::string > > deviceInfo_[4U][8U]
~OpenClFramework() override final
static std::string deviceName_[4U][8U]
static bool supportsDoublePrecision_[4U][8U]
static boost::shared_mutex mutex_
ComputeContext * getContext(const std::string &deviceName) override final
std::set< std::string > getAvailableDevices() const override final
std::vector< std::function< RandomVariable(const std::vector< const RandomVariable * > &)> > multiPathBasisSystem(Size dim, Size order, QuantLib::LsmBasisSystem::PolynomialType type, Size basisSystemSizeBound)
Filter close_enough(const RandomVariable &x, const RandomVariable &y)
std::vector< const RandomVariable * > vec2vecptr(const std::vector< RandomVariable > &values)
RandomVariable conditionalExpectation(const std::vector< const RandomVariable * > ®ressor, const std::vector< std::function< RandomVariable(const std::vector< const RandomVariable * > &)> > &basisFn, const Array &coefficients)
std::vector< std::string > getRandomVariableOpLabels()
#define ORE_OPENCL_MAX_N_DEV_INFO_LARGE
#define ORE_OPENCL_MAX_BUILD_LOG_LOGFILE
#define ORE_OPENCL_MAX_BUILD_LOG
#define ORE_OPENCL_MAX_N_DEV_INFO
opencl compute env implementation
#define ORE_OPENCL_MAX_N_PLATFORMS
#define ORE_OPENCL_MAX_N_DEVICES
static constexpr std::size_t Sqrt
static constexpr std::size_t IndicatorEq
static constexpr std::size_t Max
static constexpr std::size_t Add
static constexpr std::size_t Mult
static constexpr std::size_t IndicatorGeq
static constexpr std::size_t Log
static constexpr std::size_t Pow
static constexpr std::size_t Min
static constexpr std::size_t Negative
static constexpr std::size_t Subtract
static constexpr std::size_t NormalPdf
static constexpr std::size_t Abs
static constexpr std::size_t None
static constexpr std::size_t ConditionalExpectation
static constexpr std::size_t IndicatorGt
static constexpr std::size_t Div
static constexpr std::size_t Exp
std::vector< Size > steps