Fully annotated reference manual - version 1.8.12
openclenvironment.cpp
1/*
2 Copyright (C) 2023 Quaternion Risk Management Ltd
3 All rights reserved.
4
5 This file is part of ORE, a free-software/open-source library
6 for transparent pricing and risk analysis - http://opensourcerisk.org
7
8 ORE is free software: you can redistribute it and/or modify it
9 under the terms of the Modified BSD License. You should have received a
10 copy of the license along with this program.
11 The license is also available online at <http://opensourcerisk.org>
12
13 This program is distributed on the basis that it will form a useful
14 contribution to risk analytics and model standardisation, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the license for more details.
17*/
18
22
23#include <qle/math/randomvariable_io.hpp> // just for debugging!
24
25#include <ql/errors.hpp>
26
27#include <boost/algorithm/string/join.hpp>
28#include <boost/timer/timer.hpp>
29
30#include <chrono>
31#include <iostream>
32#include <optional>
33#include <thread>
34
35#define ORE_OPENCL_MAX_N_DEV_INFO 1024U
36#define ORE_OPENCL_MAX_N_DEV_INFO_LARGE 65536U
37#define ORE_OPENCL_MAX_BUILD_LOG 65536U
38#define ORE_OPENCL_MAX_BUILD_LOG_LOGFILE 1024U
39
40namespace QuantExt {
41
42#ifdef ORE_ENABLE_OPENCL
43namespace {
44std::string errorText(cl_int err) {
45 switch (err) {
46 case 0:
47 return "CL_SUCCESS";
48 case -1:
49 return "CL_DEVICE_NOT_FOUND";
50 case -2:
51 return "CL_DEVICE_NOT_AVAILABLE";
52 case -3:
53 return "CL_COMPILER_NOT_AVAILABLE";
54 case -4:
55 return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
56 case -5:
57 return "CL_OUT_OF_RESOURCES";
58 case -6:
59 return "CL_OUT_OF_HOST_MEMORY";
60 case -7:
61 return "CL_PROFILING_INFO_NOT_AVAILABLE";
62 case -8:
63 return "CL_MEM_COPY_OVERLAP";
64 case -9:
65 return "CL_IMAGE_FORMAT_MISMATCH";
66 case -10:
67 return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
68 case -11:
69 return "CL_BUILD_PROGRAM_FAILURE";
70 case -12:
71 return "CL_MAP_FAILURE";
72 case -13:
73 return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
74 case -14:
75 return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
76 case -15:
77 return "CL_COMPILE_PROGRAM_FAILURE";
78 case -16:
79 return "CL_LINKER_NOT_AVAILABLE";
80 case -17:
81 return "CL_LINK_PROGRAM_FAILURE";
82 case -18:
83 return "CL_DEVICE_PARTITION_FAILED";
84 case -19:
85 return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
86 case -30:
87 return "CL_INVALID_VALUE";
88 case -31:
89 return "CL_INVALID_DEVICE_TYPE";
90 case -32:
91 return "CL_INVALID_PLATFORM";
92 case -33:
93 return "CL_INVALID_DEVICE";
94 case -34:
95 return "CL_INVALID_CONTEXT";
96 case -35:
97 return "CL_INVALID_QUEUE_PROPERTIES";
98 case -36:
99 return "CL_INVALID_COMMAND_QUEUE";
100 case -37:
101 return "CL_INVALID_HOST_PTR";
102 case -38:
103 return "CL_INVALID_MEM_OBJECT";
104 case -39:
105 return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
106 case -40:
107 return "CL_INVALID_IMAGE_SIZE";
108 case -41:
109 return "CL_INVALID_SAMPLER";
110 case -42:
111 return "CL_INVALID_BINARY";
112 case -43:
113 return "CL_INVALID_BUILD_OPTIONS";
114 case -44:
115 return "CL_INVALID_PROGRAM";
116 case -45:
117 return "CL_INVALID_PROGRAM_EXECUTABLE";
118 case -46:
119 return "CL_INVALID_KERNEL_NAME";
120 case -47:
121 return "CL_INVALID_KERNEL_DEFINITION";
122 case -48:
123 return "CL_INVALID_KERNEL";
124 case -49:
125 return "CL_INVALID_ARG_INDEX";
126 case -50:
127 return "CL_INVALID_ARG_VALUE";
128 case -51:
129 return "CL_INVALID_ARG_SIZE";
130 case -52:
131 return "CL_INVALID_KERNEL_ARGS";
132 case -53:
133 return "CL_INVALID_WORK_DIMENSION";
134 case -54:
135 return "CL_INVALID_WORK_GROUP_SIZE";
136 case -55:
137 return "CL_INVALID_WORK_ITEM_SIZE";
138 case -56:
139 return "CL_INVALID_GLOBAL_OFFSET";
140 case -57:
141 return "CL_INVALID_EVENT_WAIT_LIST";
142 case -58:
143 return "CL_INVALID_EVENT";
144 case -59:
145 return "CL_INVALID_OPERATION";
146 case -60:
147 return "CL_INVALID_GL_OBJECT";
148 case -61:
149 return "CL_INVALID_BUFFER_SIZE";
150 case -62:
151 return "CL_INVALID_MIP_LEVEL";
152 case -63:
153 return "CL_INVALID_GLOBAL_WORK_SIZE";
154 case -64:
155 return "CL_INVALID_PROPERTY";
156 case -65:
157 return "CL_INVALID_IMAGE_DESCRIPTOR";
158 case -66:
159 return "CL_INVALID_COMPILER_OPTIONS";
160 case -67:
161 return "CL_INVALID_LINKER_OPTIONS";
162 case -68:
163 return "CL_INVALID_DEVICE_PARTITION_COUNT";
164 default:
165 return "unknown cl error code " + std::to_string(err);
166 }
167}
168
169void errorCallback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
170 printf("Callback from OpenCL context: errinfo = '%s'\n", errinfo);
171}
172
173} // namespace
174
175class OpenClContext : public ComputeContext {
176public:
177 OpenClContext(cl_device_id* device, cl_context* context,
178 const std::vector<std::pair<std::string, std::string>>& deviceInfo,
179 const bool supportsDoublePrecision);
180 ~OpenClContext() override final;
181 void init() override final;
182
183 std::pair<std::size_t, bool> initiateCalculation(const std::size_t n, const std::size_t id = 0,
184 const std::size_t version = 0,
185 const Settings settings = {}) override final;
186 void disposeCalculation(const std::size_t n) override final;
187 std::size_t createInputVariable(double v) override final;
188 std::size_t createInputVariable(double* v) override final;
189 std::vector<std::vector<std::size_t>> createInputVariates(const std::size_t dim,
190 const std::size_t steps) override final;
191 std::size_t applyOperation(const std::size_t randomVariableOpCode,
192 const std::vector<std::size_t>& args) override final;
193 void freeVariable(const std::size_t id) override final;
194 void declareOutputVariable(const std::size_t id) override final;
195 void finalizeCalculation(std::vector<double*>& output) override final;
196
197 std::vector<std::pair<std::string, std::string>> deviceInfo() const override;
198 bool supportsDoublePrecision() const override;
199 const DebugInfo& debugInfo() const override final;
200
201private:
202 struct ssa_entry {
203 std::string lhs_str;
204 std::optional<std::size_t> lhs_local_id;
205 std::string rhs_str;
206 std::set<std::size_t> rhs_local_id;
207 };
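    // Annotation: an ssa_entry is one assignment of the generated kernel in single-static-assignment
    // style. lhs_local_id is set when the left hand side is a local variable "v<id>" (it is empty for
    // writes into the values/output buffers), and rhs_local_id collects the local variable ids read on
    // the right hand side so that generateSsaCode() can drop assignments whose result is never used.
    // An illustrative (not actual) entry for "v12 = v7 + input[3U + i]" would be
    // { "v12", 12, "v7 + input[3U + i]", {7} }.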
208
209 std::size_t generateResultId();
210 std::pair<std::vector<std::string>, std::set<std::size_t>> getArgString(const std::vector<std::size_t>& args) const;
211 std::size_t valuesBufferId(const std::size_t i) const;
212 void startNewSsaPart();
213 std::string generateSsaCode(const std::vector<ssa_entry>& ssa) const;
214
215 void updateVariatesPool();
216
217 void runHealthChecks();
218 std::string runHealthCheckProgram(const std::string& source, const std::string& kernelName);
219
220 static void releaseMem(cl_mem& m, const std::string& desc);
221 static void releaseKernel(cl_kernel& k, const std::string& desc);
222 static void releaseKernel(std::vector<cl_kernel>& ks, const std::string& desc);
223 static void releaseProgram(cl_program& p, const std::string& desc);
224
225 enum class ComputeState { idle, createInput, createVariates, calc, declareOutput };
226
227 bool initialized_ = false;
228 cl_device_id* device_; // passed from framework
229 cl_context* context_; // passed from framework
 230 cl_command_queue queue_; // constructed per OpenClContext
231
232 // set once in the ctor
233 std::vector<std::pair<std::string, std::string>> deviceInfo_;
234 bool supportsDoublePrecision_;
235
236 // will be accumulated over all calcs
237 ComputeContext::DebugInfo debugInfo_;
238
239 // 1a vectors per current calc id
240
241 std::vector<std::size_t> size_;
242 std::vector<bool> disposed_;
243 std::vector<bool> hasKernel_;
244 std::vector<std::size_t> version_;
245 std::vector<cl_program> program_;
246 std::vector<std::vector<cl_kernel>> kernel_;
247 std::vector<std::vector<std::vector<std::vector<std::size_t>>>> conditionalExpectationVarIds_;
248 std::vector<std::size_t> inputBufferSize_;
249 std::vector<std::size_t> nOutputVars_;
250 std::vector<std::vector<std::size_t>> nVars_;
251 std::vector<std::size_t> nVariates_;
252
253 // 1b variates (shared pool of mersenne twister based normal variates)
254
255 std::size_t variatesPoolSize_ = 0; // count of single random numbers
256 cl_mem variatesPool_;
257 cl_mem variatesMtStateBuffer_;
258 cl_program variatesProgram_;
259 cl_kernel variatesKernelSeedInit_;
260 cl_kernel variatesKernelTwist_;
261 cl_kernel variatesKernelGenerate_;
262
 263 // 2 current calc
264
265 std::size_t currentId_ = 0;
266 ComputeState currentState_ = ComputeState::idle;
267 std::size_t nVarsTmp_;
268 std::size_t nVariatesTmp_;
269 Settings settings_;
270 std::set<std::string> currentConditionalExpectationArgs_;
271
272 // 2a indexed by var id
273 std::vector<std::size_t> inputVarOffset_;
274 std::vector<bool> inputVarIsScalar_;
275 std::vector<float> inputVarValues32_;
276 std::vector<double> inputVarValues64_;
277
278 // 2b collection of variable ids
279 std::vector<std::size_t> freedVariables_;
280 std::vector<std::size_t> outputVariables_;
281
282 // 2d kernel ssa
283 std::vector<std::vector<ssa_entry>> currentSsa_;
284};
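// Annotation: a calculation on this context follows the generic ComputeContext protocol implemented
// below. A minimal sketch of hypothetical host code (how the context is obtained and the concrete
// opcodes are assumptions for illustration, not taken from the ORE test suite):
//
//     ComputeContext& ctx = ...;                               // e.g. an OpenClContext from the framework
//     auto [id, newCalc] = ctx.initiateCalculation(65536);     // 65536 samples per variable
//     std::vector<double> spot(65536, 100.0);
//     std::size_t s = ctx.createInputVariable(spot.data());    // vector input
//     std::size_t k = ctx.createInputVariable(95.0);           // scalar input
//     std::size_t zero = ctx.createInputVariable(0.0);
//     auto z = ctx.createInputVariates(1, 1);                  // 1 dimension, 1 step
//     std::size_t shocked = ctx.applyOperation(RandomVariableOpCode::Add, {s, z[0][0]});
//     std::size_t diff = ctx.applyOperation(RandomVariableOpCode::Subtract, {shocked, k});
//     std::size_t payoff = ctx.applyOperation(RandomVariableOpCode::Max, {diff, zero});
//     ctx.declareOutputVariable(payoff);
//     std::vector<double> result(65536);
//     std::vector<double*> output(1, result.data());
//     ctx.finalizeCalculation(output);                         // builds the kernel (first call) and runs it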
285
 287boost::shared_mutex OpenClFramework::mutex_;
 288cl_uint OpenClFramework::nPlatforms_ = 0;
 289std::string OpenClFramework::platformName_[ORE_OPENCL_MAX_N_PLATFORMS];
 290std::string OpenClFramework::deviceName_[ORE_OPENCL_MAX_N_PLATFORMS][ORE_OPENCL_MAX_N_DEVICES];
 291cl_uint OpenClFramework::nDevices_[ORE_OPENCL_MAX_N_PLATFORMS];
 292cl_device_id OpenClFramework::devices_[ORE_OPENCL_MAX_N_PLATFORMS][ORE_OPENCL_MAX_N_DEVICES];
 293cl_context OpenClFramework::context_[ORE_OPENCL_MAX_N_PLATFORMS][ORE_OPENCL_MAX_N_DEVICES];
 294std::vector<std::pair<std::string, std::string>> OpenClFramework::deviceInfo_[ORE_OPENCL_MAX_N_PLATFORMS]
 295                                                                             [ORE_OPENCL_MAX_N_DEVICES];
 296bool OpenClFramework::supportsDoublePrecision_[ORE_OPENCL_MAX_N_PLATFORMS][ORE_OPENCL_MAX_N_DEVICES];
 297
 298void OpenClFramework::init() {
299 boost::unique_lock<boost::shared_mutex> lock(mutex_);
300
301 if (initialized_)
302 return;
303
304 initialized_ = true;
305
306 cl_platform_id platforms[ORE_OPENCL_MAX_N_PLATFORMS];
307 clGetPlatformIDs(ORE_OPENCL_MAX_N_PLATFORMS, platforms, &nPlatforms_);
308
309 for (std::size_t p = 0; p < nPlatforms_; ++p) {
310 char platformName[ORE_OPENCL_MAX_N_DEV_INFO];
311 clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, ORE_OPENCL_MAX_N_DEV_INFO, platformName, NULL);
312 clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, ORE_OPENCL_MAX_N_DEVICES, devices_[p], &nDevices_[p]);
313
314 platformName_[p] = std::string(platformName);
315
316 for (std::size_t d = 0; d < nDevices_[p]; ++d) {
317 char deviceName[ORE_OPENCL_MAX_N_DEV_INFO], driverVersion[ORE_OPENCL_MAX_N_DEV_INFO],
 318 deviceVersion[ORE_OPENCL_MAX_N_DEV_INFO], deviceExtensions[ORE_OPENCL_MAX_N_DEV_INFO_LARGE];
319 cl_device_fp_config doubleFpConfig;
320
321 clGetDeviceInfo(devices_[p][d], CL_DEVICE_NAME, ORE_OPENCL_MAX_N_DEV_INFO, &deviceName, NULL);
322 clGetDeviceInfo(devices_[p][d], CL_DRIVER_VERSION, ORE_OPENCL_MAX_N_DEV_INFO, &driverVersion, NULL);
323 clGetDeviceInfo(devices_[p][d], CL_DEVICE_VERSION, ORE_OPENCL_MAX_N_DEV_INFO, &deviceVersion, NULL);
324 clGetDeviceInfo(devices_[p][d], CL_DEVICE_EXTENSIONS, ORE_OPENCL_MAX_N_DEV_INFO_LARGE, &deviceExtensions,
325 NULL);
326
327 deviceInfo_[p][d].push_back(std::make_pair("device_name", std::string(deviceName)));
328 deviceInfo_[p][d].push_back(std::make_pair("driver_version", std::string(driverVersion)));
329 deviceInfo_[p][d].push_back(std::make_pair("device_version", std::string(deviceVersion)));
330 deviceInfo_[p][d].push_back(std::make_pair("device_extensions", std::string(deviceExtensions)));
331
332 deviceName_[p][d] = std::string(deviceName);
333
334 supportsDoublePrecision_[p][d] = false;
335#if CL_VERSION_1_2
336 clGetDeviceInfo(devices_[p][d], CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cl_device_fp_config), &doubleFpConfig,
337 NULL);
338 deviceInfo_[p][d].push_back(std::make_pair(
339 "device_double_fp_config",
340 ((doubleFpConfig & CL_FP_DENORM) ? std::string("Denorm,") : std::string()) +
341 ((doubleFpConfig & CL_FP_INF_NAN) ? std::string("InfNan,") : std::string()) +
342 ((doubleFpConfig & CL_FP_ROUND_TO_NEAREST) ? std::string("RoundNearest,") : std::string()) +
343 ((doubleFpConfig & CL_FP_ROUND_TO_ZERO) ? std::string("RoundZero,") : std::string()) +
344 ((doubleFpConfig & CL_FP_FMA) ? std::string("FMA,") : std::string()) +
345 ((doubleFpConfig & CL_FP_SOFT_FLOAT) ? std::string("SoftFloat,") : std::string())));
346 supportsDoublePrecision_[p][d] = supportsDoublePrecision_[p][d] || (doubleFpConfig != 0);
347#else
348 deviceInfo_[p][d].push_back(std::make_pair("device_double_fp_config", "not provided before opencl 1.2"));
 349 supportsDoublePrecision_[p][d] = supportsDoublePrecision_[p][d] ||
 350 std::string(deviceExtensions).find("cl_khr_fp64") != std::string::npos;
351#endif
352
353 // create context, this is static and will be deleted at program end (no clReleaseContext() necessary ?)
354
355 cl_int err;
356 context_[p][d] = clCreateContext(NULL, 1, &devices_[p][d], &errorCallback, NULL, &err);
357 QL_REQUIRE(err == CL_SUCCESS,
358 "OpenClFramework::OpenClContext(): error during clCreateContext(): " << errorText(err));
359 }
360 }
361}
362
 363OpenClFramework::OpenClFramework() {
 364 init();
365 for (std::size_t p = 0; p < nPlatforms_; ++p) {
366 for (std::size_t d = 0; d < nDevices_[p]; ++d) {
367 contexts_["OpenCL/" + platformName_[p] + "/" + deviceName_[p][d]] =
368 new OpenClContext(&devices_[p][d], &context_[p][d], deviceInfo_[p][d], supportsDoublePrecision_[p][d]);
369 }
370 }
371}
372
 373OpenClFramework::~OpenClFramework() {
 374 for (auto& [_, c] : contexts_) {
375 delete c;
376 }
377}
378
379OpenClContext::OpenClContext(cl_device_id* device, cl_context* context,
380 const std::vector<std::pair<std::string, std::string>>& deviceInfo,
381 const bool supportsDoublePrecision)
382 : initialized_(false), device_(device), context_(context), deviceInfo_(deviceInfo),
383 supportsDoublePrecision_(supportsDoublePrecision) {}
384
385OpenClContext::~OpenClContext() {
386 if (initialized_) {
387 if (variatesPoolSize_ > 0) {
388 releaseMem(variatesPool_, "variates pool");
389 releaseMem(variatesMtStateBuffer_, "variates state buffer");
390 releaseKernel(variatesKernelSeedInit_, "variates seed init");
391 releaseKernel(variatesKernelTwist_, "variates twist");
392 releaseKernel(variatesKernelGenerate_, "variates generate");
393 releaseProgram(variatesProgram_, "variates");
394 }
395
396 for (Size i = 0; i < kernel_.size(); ++i) {
397 if (disposed_[i] || !hasKernel_[i])
398 continue;
399 releaseKernel(kernel_[i], "ore kernel");
400 }
401
402 for (Size i = 0; i < program_.size(); ++i) {
403 if (disposed_[i] || !hasKernel_[i])
404 continue;
405 releaseProgram(program_[i], "ore program");
406 }
407
408 cl_int err;
409 if (err = clReleaseCommandQueue(queue_); err != CL_SUCCESS) {
410 std::cerr << "OpenClFramework: error during clReleaseCommandQueue: " + errorText(err) << std::endl;
411 }
412 }
413}
414
415void OpenClContext::releaseMem(cl_mem& m, const std::string& description) {
416 cl_int err;
417 if (err = clReleaseMemObject(m); err != CL_SUCCESS) {
418 std::cerr << "OpenClContext: error during clReleaseMemObject '" << description << "': " + errorText(err)
419 << std::endl;
420 }
421}
422
423void OpenClContext::releaseKernel(cl_kernel& k, const std::string& description) {
424 cl_int err;
425 if (err = clReleaseKernel(k); err != CL_SUCCESS) {
426 std::cerr << "OpenClContext: error during clReleaseKernel '" << description << "': " + errorText(err)
427 << std::endl;
428 }
429}
430
431void OpenClContext::releaseKernel(std::vector<cl_kernel>& ks, const std::string& description) {
432 for (auto& k : ks)
433 releaseKernel(k, description);
434}
435
436void OpenClContext::releaseProgram(cl_program& p, const std::string& description) {
437 cl_int err;
438 if (err = clReleaseProgram(p); err != CL_SUCCESS) {
439 std::cerr << "OpenClContext: error during clReleaseProgram '" << description << "': " + errorText(err)
440 << std::endl;
441 }
442}
443
444std::string OpenClContext::runHealthCheckProgram(const std::string& source, const std::string& kernelName) {
445
446 struct CleanUp {
447 std::vector<cl_program> p;
448 std::vector<cl_kernel> k;
449 std::vector<cl_mem> m;
450 ~CleanUp() {
451 for (auto& pgm : p)
452 OpenClContext::releaseProgram(pgm, "health check");
453 for (auto& krn : k)
454 OpenClContext::releaseKernel(krn, "health check");
455 for (auto& mem : m)
456 OpenClContext::releaseMem(mem, "health check");
457 }
458 } cleanup;
459
460 const char* programPtr = source.c_str();
461
462 cl_int err;
463
464 cl_program program = clCreateProgramWithSource(*context_, 1, &programPtr, NULL, &err);
465 if (err != CL_SUCCESS) {
466 return errorText(err);
467 }
468 cleanup.p.push_back(program);
469
470 err = clBuildProgram(program, 1, device_, NULL, NULL, NULL);
471 if (err != CL_SUCCESS) {
472 return errorText(err);
473 }
474
475 cl_kernel kernel = clCreateKernel(program, kernelName.c_str(), &err);
476 if (err != CL_SUCCESS) {
477 return errorText(err);
478 }
479 cleanup.k.push_back(kernel);
480
481 cl_mem resultBuffer = clCreateBuffer(*context_, CL_MEM_READ_WRITE, sizeof(cl_ulong), NULL, &err);
482 if (err != CL_SUCCESS) {
483 return errorText(err);
484 }
485 cleanup.m.push_back(resultBuffer);
486
487 err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &resultBuffer);
488
489 cl_event runEvent;
490 constexpr std::size_t sizeOne = 1;
491 err = clEnqueueNDRangeKernel(queue_, kernel, 1, NULL, &sizeOne, NULL, 0, NULL, &runEvent);
492 if (err != CL_SUCCESS) {
493 return errorText(err);
494 }
495
496 cl_ulong result;
497 err = clEnqueueReadBuffer(queue_, resultBuffer, CL_TRUE, 0, sizeof(cl_ulong), &result, 1, &runEvent, NULL);
498 if (err != CL_SUCCESS) {
499 return errorText(err);
500 }
501
502 return std::to_string(result);
503}
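// Annotation: runHealthCheckProgram() compiles the given one-kernel source, runs it with a single work
// item and returns the cl_ulong it writes back (or the OpenCL error text of the first failing step).
// runHealthChecks() below uses it to record the device-side sizeof(uint/ulong/float/double) next to the
// host-side sizes, so a host/device type-size mismatch is visible directly in deviceInfo().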
504
505void OpenClContext::runHealthChecks() {
506 deviceInfo_.push_back(std::make_pair("host_sizeof(cl_uint)", std::to_string(sizeof(cl_uint))));
507 deviceInfo_.push_back(std::make_pair("host_sizeof(cl_ulong)", std::to_string(sizeof(cl_ulong))));
508 deviceInfo_.push_back(std::make_pair("host_sizeof(cl_float)", std::to_string(sizeof(cl_float))));
509 deviceInfo_.push_back(std::make_pair("host_sizeof(cl_double)", std::to_string(sizeof(cl_double))));
510
511 std::string kernelGetUintSize =
512 "__kernel void ore_get_uint_size(__global ulong* result) { result[0] = sizeof(uint); }";
513 std::string kernelGetUlongSize =
514 "__kernel void ore_get_ulong_size(__global ulong* result) { result[0] = sizeof(ulong); }";
515 std::string kernelGetFloatSize =
516 "__kernel void ore_get_float_size(__global ulong* result) { result[0] = sizeof(float); }";
517 std::string kernelGetDoubleSize =
518 "__kernel void ore_get_double_size(__global ulong* result) { result[0] = sizeof(double); }";
519
520 deviceInfo_.push_back(
521 std::make_pair("device_sizeof(uint)", runHealthCheckProgram(kernelGetUintSize, "ore_get_uint_size")));
522 deviceInfo_.push_back(
523 std::make_pair("device_sizeof(ulong)", runHealthCheckProgram(kernelGetUlongSize, "ore_get_ulong_size")));
524 deviceInfo_.push_back(
525 std::make_pair("device_sizeof(float)", runHealthCheckProgram(kernelGetFloatSize, "ore_get_float_size")));
526 deviceInfo_.push_back(
527 std::make_pair("device_sizeof(double)", runHealthCheckProgram(kernelGetDoubleSize, "ore_get_double_size")));
528}
529
530void OpenClContext::init() {
531
532 if (initialized_) {
533 return;
534 }
535
536 debugInfo_.numberOfOperations = 0;
537 debugInfo_.nanoSecondsDataCopy = 0;
538 debugInfo_.nanoSecondsProgramBuild = 0;
539 debugInfo_.nanoSecondsCalculation = 0;
540
541 cl_int err;
542#if CL_VERSION_2_0
543 queue_ = clCreateCommandQueueWithProperties(*context_, *device_, NULL, &err);
544#else
545 // deprecated in cl version 2_0
546 queue_ = clCreateCommandQueue(*context_, *device_, 0, &err);
547#endif
548 QL_REQUIRE(err == CL_SUCCESS,
549 "OpenClFramework::OpenClContext(): error during clCreateCommandQueue(): " << errorText(err));
550
551 initialized_ = true;
552
553 runHealthChecks();
554}
555
556void OpenClContext::disposeCalculation(const std::size_t id) {
557 QL_REQUIRE(!disposed_[id - 1], "OpenClContext::disposeCalculation(): id " << id << " was already disposed.");
558 disposed_[id - 1] = true;
559 if (hasKernel_[id - 1]) {
560 releaseKernel(kernel_[id - 1], "kernel id " + std::to_string(id) + " (during dispose())");
561 releaseProgram(program_[id - 1], "program id " + std::to_string(id) + " (during dispose())");
562 }
563}
564
565std::pair<std::size_t, bool> OpenClContext::initiateCalculation(const std::size_t n, const std::size_t id,
566 const std::size_t version, const Settings settings) {
567 QL_REQUIRE(n > 0, "OpenClContext::initiateCalculation(): n must not be zero");
568
569 bool newCalc = false;
570 settings_ = settings;
571
572 if (id == 0) {
573
 574 // initiate new calculation
575
576 size_.push_back(n);
577 disposed_.push_back(false);
578 hasKernel_.push_back(false);
579 version_.push_back(version);
580 program_.push_back(cl_program());
581 kernel_.push_back(std::vector<cl_kernel>());
582 conditionalExpectationVarIds_.push_back(std::vector<std::vector<std::vector<std::size_t>>>(1));
583 inputBufferSize_.push_back(0);
584 nOutputVars_.push_back(0);
585 nVars_.push_back(std::vector<std::size_t>());
586 nVariates_.push_back(0);
587
588 currentId_ = hasKernel_.size();
589 newCalc = true;
590
591 } else {
592
593 // initiate calculation on existing id
594
595 QL_REQUIRE(id <= hasKernel_.size(),
596 "OpenClContext::initiateCalculation(): id (" << id << ") invalid, got 1..." << hasKernel_.size());
597 QL_REQUIRE(size_[id - 1] == n, "OpenClContext::initiateCalculation(): size ("
598 << size_[id - 1] << ") for id " << id << " does not match current size ("
599 << n << ")");
600 QL_REQUIRE(!disposed_[id - 1], "OpenClContext::initiateCalculation(): id ("
601 << id << ") was already disposed, it can not be used any more.");
602
603 if (version != version_[id - 1]) {
604 hasKernel_[id - 1] = false;
605 version_[id - 1] = version;
606 nVars_[id - 1].clear();
607 nVariates_[id - 1] = 0;
608 releaseKernel(kernel_[id - 1],
609 "kernel id " + std::to_string(id) + " (during initiateCalculation, old version: " +
610 std::to_string(version_[id - 1]) + ", new version:" + std::to_string(version) + ")");
611 conditionalExpectationVarIds_[id - 1] = std::vector<std::vector<std::vector<std::size_t>>>(1);
612 releaseProgram(program_[id - 1],
613 "program id " + std::to_string(id) + " (during initiateCalculation, old version: " +
614 std::to_string(version_[id - 1]) + ", new version:" + std::to_string(version) + ")");
615 newCalc = true;
616 }
617
618 currentId_ = id;
619 }
620
621 // reset variable info
622
623 nVarsTmp_ = 0;
624 nVariatesTmp_ = 0;
625
626 inputVarOffset_.clear();
627 inputVarIsScalar_.clear();
628 inputVarValues32_.clear();
629 inputVarValues64_.clear();
630
631 if (newCalc) {
632 freedVariables_.clear();
633 outputVariables_.clear();
634 currentSsa_ = std::vector<std::vector<ssa_entry>>(1);
635 currentConditionalExpectationArgs_.clear();
636 }
637
638 // set state
639
640 currentState_ = ComputeState::createInput;
641
642 // return calc id
643
644 return std::make_pair(currentId_, newCalc);
645}
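// Annotation: calculation ids returned by initiateCalculation() are 1-based indices into the per-id
// vectors (size_, hasKernel_, program_, kernel_, ...), hence the id - 1 accesses throughout this file.
// Re-initiating with an existing id and unchanged version reuses the compiled kernel; a changed version
// releases the old kernel and program and triggers a rebuild in finalizeCalculation().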
646
647std::size_t OpenClContext::createInputVariable(double v) {
648 QL_REQUIRE(currentState_ == ComputeState::createInput,
649 "OpenClContext::createInputVariable(): not in state createInput (" << static_cast<int>(currentState_)
650 << ")");
651 std::size_t nextOffset = 0;
652 if (!inputVarOffset_.empty()) {
653 nextOffset = inputVarOffset_.back() + (inputVarIsScalar_.back() ? 1 : size_[currentId_ - 1]);
654 }
655 inputVarOffset_.push_back(nextOffset);
656 inputVarIsScalar_.push_back(true);
657 if (settings_.useDoublePrecision) {
658 inputVarValues64_.push_back(v);
659 } else {
660 // ensure that v falls into the single precision range
661 inputVarValues32_.push_back((float)std::max(std::min(v, (double)std::numeric_limits<float>::max()),
662 -(double)std::numeric_limits<float>::max()));
663 }
664 return nVarsTmp_++;
665}
666
667std::size_t OpenClContext::createInputVariable(double* v) {
668 QL_REQUIRE(currentState_ == ComputeState::createInput,
669 "OpenClContext::createInputVariable(): not in state createInput (" << static_cast<int>(currentState_)
670 << ")");
671 std::size_t nextOffset = 0;
672 if (!inputVarOffset_.empty()) {
673 nextOffset = inputVarOffset_.back() + (inputVarIsScalar_.back() ? 1 : size_[currentId_ - 1]);
674 }
675 inputVarOffset_.push_back(nextOffset);
676 inputVarIsScalar_.push_back(false);
677 for (std::size_t i = 0; i < size_[currentId_ - 1]; ++i) {
678 if (settings_.useDoublePrecision) {
679 inputVarValues64_.push_back(v[i]);
680 } else {
681 inputVarValues32_.push_back((float)std::max(std::min(v[i], (double)std::numeric_limits<float>::max()),
682 -(double)std::numeric_limits<float>::max()));
683 }
684 }
685 return nVarsTmp_++;
686}
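// Annotation: input variables are packed back to back into one flat input buffer; a scalar occupies one
// slot, a vector occupies size_[currentId_ - 1] consecutive slots, and inputVarOffset_ records where
// each variable starts. Illustrative layout for n = 4 samples and the sequence scalar, vector, scalar:
// offsets 0, 1, 5 and an input buffer of 6 values. getArgString() later renders variable accesses as
// "input[<offset>U]" for scalars and "input[<offset>U + i]" for vectors.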
687
688void OpenClContext::updateVariatesPool() {
689 QL_REQUIRE(nVariatesTmp_ > 0, "OpenClContext::updateVariatesPool(): internal error, got nVariatesTmp_ == 0.");
690
691 constexpr std::size_t size_one = 1; // constant 1
692 constexpr std::size_t mt_N = 624; // mersenne twister N
693
694 std::size_t fpSize = settings_.useDoublePrecision ? sizeof(double) : sizeof(float);
695
696 QL_REQUIRE(!settings_.useDoublePrecision || supportsDoublePrecision(),
697 "OpenClContext::updateVariatesPool(): double precision is configured for this calculation, but not "
698 "supported by the device. Switch to single precision or use an appropriate device.");
699
700 cl_event initEvent;
701 if (variatesPoolSize_ == 0) {
702
703 // build the kernels to fill the variates pool
704
705 std::string fpTypeStr = settings_.useDoublePrecision ? "double" : "float";
706 std::string fpSuffix = settings_.useDoublePrecision ? "" : "f";
707 std::string fpMaxValue = settings_.useDoublePrecision ? "0x1.fffffffffffffp1023" : "0x1.fffffep127f";
708
709 // clang-format off
 710 // ported from QuantLib::InverseCumulativeNormal
711 std::string sourceInvCumN = fpTypeStr + " ore_invCumN(const uint x0);\n" +
712 fpTypeStr + " ore_invCumN(const uint x0) {\n"
713 " const " + fpTypeStr + " a1_ = -3.969683028665376e+01" + fpSuffix + ";\n"
714 " const " + fpTypeStr + " a2_ = 2.209460984245205e+02" + fpSuffix + ";\n"
715 " const " + fpTypeStr + " a3_ = -2.759285104469687e+02" + fpSuffix + ";\n"
716 " const " + fpTypeStr + " a4_ = 1.383577518672690e+02" + fpSuffix + ";\n"
717 " const " + fpTypeStr + " a5_ = -3.066479806614716e+01" + fpSuffix + ";\n"
718 " const " + fpTypeStr + " a6_ = 2.506628277459239e+00" + fpSuffix + ";\n"
719 " const " + fpTypeStr + " b1_ = -5.447609879822406e+01" + fpSuffix + ";\n"
720 " const " + fpTypeStr + " b2_ = 1.615858368580409e+02" + fpSuffix + ";\n"
721 " const " + fpTypeStr + " b3_ = -1.556989798598866e+02" + fpSuffix + ";\n"
722 " const " + fpTypeStr + " b4_ = 6.680131188771972e+01" + fpSuffix + ";\n"
723 " const " + fpTypeStr + " b5_ = -1.328068155288572e+01" + fpSuffix + ";\n"
724 " const " + fpTypeStr + " c1_ = -7.784894002430293e-03" + fpSuffix + ";\n"
725 " const " + fpTypeStr + " c2_ = -3.223964580411365e-01" + fpSuffix + ";\n"
726 " const " + fpTypeStr + " c3_ = -2.400758277161838e+00" + fpSuffix + ";\n"
727 " const " + fpTypeStr + " c4_ = -2.549732539343734e+00" + fpSuffix + ";\n"
728 " const " + fpTypeStr + " c5_ = 4.374664141464968e+00" + fpSuffix + ";\n"
729 " const " + fpTypeStr + " c6_ = 2.938163982698783e+00" + fpSuffix + ";\n"
730 " const " + fpTypeStr + " d1_ = 7.784695709041462e-03" + fpSuffix + ";\n"
731 " const " + fpTypeStr + " d2_ = 3.224671290700398e-01" + fpSuffix + ";\n"
732 " const " + fpTypeStr + " d3_ = 2.445134137142996e+00" + fpSuffix + ";\n"
733 " const " + fpTypeStr + " d4_ = 3.754408661907416e+00" + fpSuffix + ";\n"
734 " const " + fpTypeStr + " x_low_ = 0.02425" + fpSuffix + ";\n"
735 " const " + fpTypeStr + " x_high_ = 1.0" + fpSuffix + " - x_low_;\n"
736 " const " + fpTypeStr + " x = ((" + fpTypeStr + ")x0 + 0.5" + fpSuffix + ") / 4294967296.0"+ fpSuffix + ";\n"
737 " if (x < x_low_ || x_high_ < x) {\n"
738 " if (x0 == UINT_MAX) {\n"
739 " return " + fpMaxValue + ";\n"
740 " } else if(x0 == 0) {\n"
741 " return -" + fpMaxValue + ";\n"
742 " }\n"
743 " " + fpTypeStr + " z;\n"
744 " if (x < x_low_) {\n"
745 " z = sqrt(-2.0" + fpSuffix + " * log(x));\n"
746 " z = (((((c1_ * z + c2_) * z + c3_) * z + c4_) * z + c5_) * z + c6_) /\n"
747 " ((((d1_ * z + d2_) * z + d3_) * z + d4_) * z + 1.0" + fpSuffix + ");\n"
748 " } else {\n"
749 " z = sqrt(-2.0f * log(1.0f - x));\n"
750 " z = -(((((c1_ * z + c2_) * z + c3_) * z + c4_) * z + c5_) * z + c6_) /\n"
751 " ((((d1_ * z + d2_) * z + d3_) * z + d4_) * z + 1.0" + fpSuffix + ");\n"
752 " }\n"
753 " return z;\n"
754 " } else {\n"
755 " " + fpTypeStr + " z = x - 0.5" + fpSuffix + ";\n"
756 " " + fpTypeStr + " r = z * z;\n"
757 " z = (((((a1_ * r + a2_) * r + a3_) * r + a4_) * r + a5_) * r + a6_) * z /\n"
758 " (((((b1_ * r + b2_) * r + b3_) * r + b4_) * r + b5_) * r + 1.0" + fpSuffix +");\n"
759 " return z;\n"
760 " }\n"
761 "}\n\n";
762
763 // from QuantLib::MersenneTwisterUniformRng
764
765 std::string kernelSourceSeedInit = "__kernel void ore_seedInitialization(const ulong s, __global ulong* mt) {\n"
766 " const ulong N = 624;\n"
767 " mt[0]= s & 0xffffffffUL;\n"
768 " for (ulong mti=1; mti<N; ++mti) {\n"
769 " mt[mti] = (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);\n"
770 " mt[mti] &= 0xffffffffUL;\n"
771 " }\n"
772 "}\n\n";
773
774 std::string kernelSourceTwist = "__kernel void ore_twist(__global ulong* mt) {\n"
775 " const ulong N = 624;\n"
776 " const ulong M = 397;\n"
777 " const ulong MATRIX_A = 0x9908b0dfUL;\n"
778 " const ulong UPPER_MASK=0x80000000UL;\n"
779 " const ulong LOWER_MASK=0x7fffffffUL;\n"
780 " const ulong mag01[2]={0x0UL, MATRIX_A};\n"
781 " ulong kk;\n"
782 " ulong y;\n"
783 " for (kk=0;kk<N-M;++kk) {\n"
784 " y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);\n"
785 " mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];\n"
786 " }\n"
787 " for (;kk<N-1;kk++) {\n"
788 " y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);\n"
789 " mt[kk] = mt[(kk+M)-N] ^ (y >> 1) ^ mag01[y & 0x1UL];\n"
790 " }\n"
791 " y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);\n"
792 " mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];\n"
793 "}\n\n";
794
795 std::string kernelSourceGenerate =
796 "__kernel void ore_generate(const ulong offset, __global ulong* mt, __global " + fpTypeStr + "* output) {\n"
797 " ulong mti = get_global_id(0);\n"
798 " ulong y = mt[mti];\n"
799 " y ^= (y >> 11);\n"
800 " y ^= (y << 7) & 0x9d2c5680UL;\n"
801 " y ^= (y << 15) & 0xefc60000UL;\n"
802 " y ^= (y >> 18);\n"
803 " output[offset + mti] = ore_invCumN((uint)y);\n"
804 "}\n\n";
805 // clang-format on
806
807 std::string programSource = sourceInvCumN + kernelSourceSeedInit + kernelSourceTwist + kernelSourceGenerate;
808
809 // std::cerr << "generated variates program:\n" + programSource << std::endl;
810
811 const char* programSourcePtr = programSource.c_str();
812 cl_int err;
813 variatesProgram_ = clCreateProgramWithSource(*context_, 1, &programSourcePtr, NULL, &err);
814 QL_REQUIRE(err == CL_SUCCESS,
815 "OpenClContext::updateVariatesPool(): error creating program: " << errorText(err));
816 err = clBuildProgram(variatesProgram_, 1, device_, NULL, NULL, NULL);
817 if (err != CL_SUCCESS) {
818 char buffer[ORE_OPENCL_MAX_BUILD_LOG];
819 clGetProgramBuildInfo(variatesProgram_, *device_, CL_PROGRAM_BUILD_LOG,
820 ORE_OPENCL_MAX_BUILD_LOG * sizeof(char), buffer, NULL);
821 QL_FAIL("OpenClContext::updateVariatesPool(): error during program build: "
 822 << errorText(err) << ": " << std::string(buffer).substr(0, ORE_OPENCL_MAX_BUILD_LOG_LOGFILE));
823 }
824
825 variatesKernelSeedInit_ = clCreateKernel(variatesProgram_, "ore_seedInitialization", &err);
826 QL_REQUIRE(err == CL_SUCCESS,
827 "OpenClContext::updateVariatesPool(): error creating kernel seedInit: " << errorText(err));
828
829 variatesKernelTwist_ = clCreateKernel(variatesProgram_, "ore_twist", &err);
830 QL_REQUIRE(err == CL_SUCCESS,
831 "OpenClContext::updateVariatesPool(): error creating kernel twist: " << errorText(err));
832
833 variatesKernelGenerate_ = clCreateKernel(variatesProgram_, "ore_generate", &err);
834 QL_REQUIRE(err == CL_SUCCESS,
835 "OpenClContext::updateVariatesPool(): error creating kernel generate: " << errorText(err));
836
837 variatesMtStateBuffer_ = clCreateBuffer(*context_, CL_MEM_READ_WRITE, sizeof(cl_ulong) * mt_N, NULL, &err);
838 QL_REQUIRE(err == CL_SUCCESS,
839 "OpenClContext::updateVariatesPool(): error creating mt state buffer: " << errorText(err));
840
841 cl_ulong tmpSeed = (cl_ulong)settings_.rngSeed;
842 err = clSetKernelArg(variatesKernelSeedInit_, 0, sizeof(cl_ulong), &tmpSeed);
843 err |= clSetKernelArg(variatesKernelSeedInit_, 1, sizeof(cl_mem), &variatesMtStateBuffer_);
844 QL_REQUIRE(err == CL_SUCCESS,
845 "OpenClContext::updateVariatesPool(): error setting kernel args seed init: " << errorText(err));
846
847 err = clEnqueueNDRangeKernel(queue_, variatesKernelSeedInit_, 1, NULL, &size_one, NULL, 0, NULL, &initEvent);
848 QL_REQUIRE(err == CL_SUCCESS,
849 "OpenClContext::updateVariatesPool(): error running kernel seed init: " << errorText(err));
850 }
851
852 // if the variates pool is big enough, we exit early
853
854 if (variatesPoolSize_ >= nVariatesTmp_ * size_[currentId_ - 1]) {
855 if (variatesPoolSize_ == 0)
856 clWaitForEvents(1, &initEvent);
857 return;
858 }
859
860 // create new buffer to hold the variates and copy the current buffer contents to the new buffer
861
862 Size alignedSize = 624 * (nVariatesTmp_ * size_[currentId_ - 1] / 624 +
863 (nVariatesTmp_ * size_[currentId_ - 1] % 624 == 0 ? 0 : 1));
864
865 cl_int err;
866
867 cl_mem oldBuffer = variatesPool_;
868
869 struct OldBufferReleaser {
870 OldBufferReleaser(cl_mem b, bool active) : b(b), active(active) {}
871 ~OldBufferReleaser() {
872 if (active)
873 OpenClContext::releaseMem(b, "expired variates buffer");
874 }
875 cl_mem b;
876 bool active;
877 } oldBufferReleaser(oldBuffer, variatesPoolSize_ > 0);
878
879 variatesPool_ = clCreateBuffer(*context_, CL_MEM_READ_WRITE, fpSize * alignedSize, NULL, &err);
880 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::updateVariatesPool(): error creating variates buffer with size "
881 << fpSize * alignedSize << " bytes: " << errorText(err));
882 cl_event copyEvent;
883 if (variatesPoolSize_ > 0) {
884 err = clEnqueueCopyBuffer(queue_, oldBuffer, variatesPool_, 0, 0, fpSize * variatesPoolSize_, 0, NULL,
885 &copyEvent);
886 QL_REQUIRE(err == CL_SUCCESS,
887 "OpenClContext::updateVariatesPool(): error copying existing variates buffer to new buffer: "
888 << errorText(err));
889 }
890
891 // fill in the new variates
892
893 std::size_t currentPoolSize;
894 cl_event generateEvent;
895 bool haveGenerated = false;
896 for (currentPoolSize = variatesPoolSize_; currentPoolSize < nVariatesTmp_ * size_[currentId_ - 1];
897 currentPoolSize += mt_N) {
898 err = clSetKernelArg(variatesKernelTwist_, 0, sizeof(cl_mem), &variatesMtStateBuffer_);
899 QL_REQUIRE(err == CL_SUCCESS,
900 "OpenClContext::updateVariatesPool(): error setting args for kernel twist: " << errorText(err));
901 cl_event twistEvent;
902 err = clEnqueueNDRangeKernel(
903 queue_, variatesKernelTwist_, 1, NULL, &size_one, NULL, variatesPoolSize_ == 0 || haveGenerated ? 1 : 0,
904 variatesPoolSize_ == 0 ? &initEvent : (haveGenerated ? &generateEvent : NULL), &twistEvent);
905 QL_REQUIRE(err == CL_SUCCESS,
906 "OpenClContext::updateVariatesPool(): error running kernel twist: " << errorText(err));
907
908 err = clSetKernelArg(variatesKernelGenerate_, 0, sizeof(cl_ulong), &currentPoolSize);
909 err |= clSetKernelArg(variatesKernelGenerate_, 1, sizeof(cl_mem), &variatesMtStateBuffer_);
910 err |= clSetKernelArg(variatesKernelGenerate_, 2, sizeof(cl_mem), &variatesPool_);
911 QL_REQUIRE(err == CL_SUCCESS,
912 "OpenClContext::updateVariatesPool(): error settings args for kernel generate: " << errorText(err));
913 err = clEnqueueNDRangeKernel(queue_, variatesKernelGenerate_, 1, NULL, &mt_N, NULL, 1, &twistEvent,
914 &generateEvent);
915 QL_REQUIRE(err == CL_SUCCESS,
916 "OpenClContext::updateVariatesPool(): error running kernel generate: " << errorText(err));
917 haveGenerated = true;
918 }
919
920 // wait for events to finish
921
922 std::vector<cl_event> waitList;
923 if (variatesPoolSize_ > 0)
924 waitList.push_back(copyEvent);
925 if (haveGenerated)
926 waitList.push_back(generateEvent);
927 if (!waitList.empty())
928 clWaitForEvents(waitList.size(), &waitList[0]);
929
930 // update current variates pool size
931
932 QL_REQUIRE(currentPoolSize == alignedSize, "OpenClContext::updateVariatesPool(): internal error, currentPoolSize = "
933 << currentPoolSize << " does not match alignedSize " << alignedSize);
934 variatesPoolSize_ = currentPoolSize;
935}
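// Annotation: the variates pool grows in multiples of the Mersenne Twister state size N = 624. The seed
// kernel initialises the state once; each twist + generate pass then appends exactly 624 standard normal
// numbers (one per work item). For example, a requirement of nVariatesTmp_ * size_ = 1000 numbers is
// rounded up to alignedSize = 624 * 2 = 1248, i.e. two twist/generate passes.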
936
937std::vector<std::vector<std::size_t>> OpenClContext::createInputVariates(const std::size_t dim,
938 const std::size_t steps) {
939 QL_REQUIRE(currentState_ == ComputeState::createInput || currentState_ == ComputeState::createVariates,
940 "OpenClContext::createInputVariates(): not in state createInput or createVariates ("
941 << static_cast<int>(currentState_) << ")");
 942 QL_REQUIRE(currentId_ > 0, "OpenClContext::createInputVariates(): current id is not set");
943 QL_REQUIRE(!hasKernel_[currentId_ - 1], "OpenClContext::createInputVariates(): id ("
944 << currentId_ << ") in version " << version_[currentId_ - 1]
945 << " has a kernel already, input variates can not be regenerated.");
946 currentState_ = ComputeState::createVariates;
947 std::vector<std::vector<std::size_t>> resultIds(dim, std::vector<std::size_t>(steps));
948 for (std::size_t j = 0; j < steps; ++j) {
949 for (std::size_t i = 0; i < dim; ++i) {
950 resultIds[i][j] = nVarsTmp_++;
951 }
952 }
953 nVariatesTmp_ += dim * steps;
954 updateVariatesPool();
955 return resultIds;
956}
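// Annotation: variate ids directly follow the input variable ids; resultIds[i][j] holds the id of
// dimension i at step j, and ids are assigned step by step (the j loop is outermost), so the ids of one
// step's dimensions are contiguous in the shared pool. getArgString() maps such an id to
// "rn[<k * size>U + i]" where k is the id's position within the variates block.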
957
958std::size_t OpenClContext::generateResultId() {
959 std::size_t resultId;
960 if (!freedVariables_.empty()) {
961 resultId = freedVariables_.back();
962 freedVariables_.pop_back();
963 } else {
964 resultId = nVarsTmp_++;
965 }
966 return resultId;
967}
968
969std::pair<std::vector<std::string>, std::set<std::size_t>>
970OpenClContext::getArgString(const std::vector<std::size_t>& args) const {
971 std::vector<std::string> argStr(args.size());
972 std::set<std::size_t> localIds;
973 for (std::size_t i = 0; i < args.size(); ++i) {
974 if (args[i] < inputVarOffset_.size()) {
975 argStr[i] = "input[" + std::to_string(inputVarOffset_[args[i]]) + "U" +
976 (inputVarIsScalar_[args[i]] ? "]" : " + i]");
977 } else if (args[i] < inputVarOffset_.size() + nVariatesTmp_) {
978 argStr[i] = "rn[" + std::to_string((args[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) + "U + i]";
979 } else {
980 argStr[i] = "v" + std::to_string(args[i]);
981 localIds.insert(args[i]);
982 }
983 }
984 return std::make_pair(argStr, localIds);
985}
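// Annotation: getArgString() translates abstract variable ids into kernel expressions: ids below
// inputVarOffset_.size() read from the input buffer, the following nVariatesTmp_ ids read from the
// random number pool "rn", and any higher id becomes a local variable "v<id>" whose id is also returned
// so callers can record the dependency in ssa_entry::rhs_local_id.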
986
987void OpenClContext::startNewSsaPart() {
988 currentSsa_.push_back(std::vector<ssa_entry>());
989 conditionalExpectationVarIds_[currentId_ - 1].push_back(std::vector<std::vector<std::size_t>>());
990 nVars_[currentId_ - 1].push_back(nVarsTmp_);
991}
992
993std::size_t OpenClContext::applyOperation(const std::size_t randomVariableOpCode,
994 const std::vector<std::size_t>& args) {
995 QL_REQUIRE(currentState_ == ComputeState::createInput || currentState_ == ComputeState::createVariates ||
996 currentState_ == ComputeState::calc,
997 "OpenClContext::applyOperation(): not in state createInput or calc (" << static_cast<int>(currentState_)
998 << ")");
999 currentState_ = ComputeState::calc;
1000 QL_REQUIRE(currentId_ > 0, "OpenClContext::applyOperation(): current id is not set");
1001 QL_REQUIRE(!hasKernel_[currentId_ - 1], "OpenClContext::applyOperation(): id (" << currentId_ << ") in version "
1002 << version_[currentId_ - 1]
1003 << " has a kernel already.");
1004 std::string fpTypeStr = settings_.useDoublePrecision ? "double" : "float";
1005
1006 auto [argStr, argLocalIds] = getArgString(args);
1007
1008 if (randomVariableOpCode == RandomVariableOpCode::ConditionalExpectation) {
1009
1010 if (std::find_if(argStr.begin(), argStr.end(), [this](const std::string& a) {
1011 return currentConditionalExpectationArgs_.find(a) != currentConditionalExpectationArgs_.end();
1012 }) != argStr.end()) {
1013 startNewSsaPart();
1014 currentConditionalExpectationArgs_.clear();
1015 }
1016
1017 std::vector<std::size_t> argIds;
1018 std::size_t regressandId;
1019 for (std::size_t i = 0; i < argStr.size() + 1; ++i) {
1020 auto resultId = generateResultId();
1021 // no harm in pushing all arg local ids here for each entry
1022 currentSsa_.back().push_back(
1023 {std::string("v") + std::to_string(resultId), resultId, i == 0 ? "0.0" : argStr[i - 1], argLocalIds});
1024 argIds.push_back(resultId);
1025 if (i == 0) {
1026 regressandId = resultId;
1027 currentConditionalExpectationArgs_.insert("v" + std::to_string(resultId));
1028 }
1029 }
1030
1031 conditionalExpectationVarIds_[currentId_ - 1].back().push_back(argIds);
1032
1033 return regressandId;
1034
1035 } else {
1036
 1037 // op code is everything but conditional expectation (i.e. a pathwise operation)
1038
1039 auto resultId = generateResultId();
1040
 1041 switch (randomVariableOpCode) {
 1042 case RandomVariableOpCode::None: {
1043 break;
1044 }
 1045 case RandomVariableOpCode::Add: {
 1046 currentSsa_.back().push_back(
1047 {"v" + std::to_string(resultId), resultId, argStr[0] + " + " + argStr[1], argLocalIds});
1048 break;
1049 }
 1050 case RandomVariableOpCode::Subtract: {
 1051 currentSsa_.back().push_back(
1052 {"v" + std::to_string(resultId), resultId, argStr[0] + " - " + argStr[1], argLocalIds});
1053 break;
1054 }
 1055 case RandomVariableOpCode::Negative: {
 1056 currentSsa_.back().push_back({"v" + std::to_string(resultId), resultId, "-" + argStr[0], argLocalIds});
1057 break;
1058 }
 1059 case RandomVariableOpCode::Mult: {
 1060 currentSsa_.back().push_back(
1061 {"v" + std::to_string(resultId), resultId, argStr[0] + " * " + argStr[1], argLocalIds});
1062 break;
1063 }
 1064 case RandomVariableOpCode::Div: {
 1065 currentSsa_.back().push_back(
1066 {"v" + std::to_string(resultId), resultId, argStr[0] + " / " + argStr[1], argLocalIds});
1067 break;
1068 }
 1069 case RandomVariableOpCode::IndicatorEq: {
 1070 currentSsa_.back().push_back({"v" + std::to_string(resultId), resultId,
1071 "ore_indicatorEq(" + argStr[0] + "," + argStr[1] + ")", argLocalIds});
1072 break;
1073 }
 1074 case RandomVariableOpCode::IndicatorGt: {
 1075 currentSsa_.back().push_back({"v" + std::to_string(resultId), resultId,
1076 "ore_indicatorGt(" + argStr[0] + "," + argStr[1] + ")", argLocalIds});
1077 break;
1078 }
 1079 case RandomVariableOpCode::IndicatorGeq: {
 1080 currentSsa_.back().push_back({"v" + std::to_string(resultId), resultId,
1081 "ore_indicatorGeq(" + argStr[0] + "," + argStr[1] + ")", argLocalIds});
1082 break;
1083 }
 1084 case RandomVariableOpCode::Min: {
 1085 currentSsa_.back().push_back(
1086 {"v" + std::to_string(resultId), resultId, "fmin(" + argStr[0] + "," + argStr[1] + ")", argLocalIds});
1087 break;
1088 }
 1089 case RandomVariableOpCode::Max: {
 1090 currentSsa_.back().push_back(
1091 {"v" + std::to_string(resultId), resultId, "fmax(" + argStr[0] + "," + argStr[1] + ")", argLocalIds});
1092 break;
1093 }
 1094 case RandomVariableOpCode::Abs: {
 1095 currentSsa_.back().push_back(
1096 {"v" + std::to_string(resultId), resultId, "fabs(" + argStr[0] + ")", argLocalIds});
1097 break;
1098 }
 1099 case RandomVariableOpCode::Exp: {
 1100 currentSsa_.back().push_back(
1101 {"v" + std::to_string(resultId), resultId, "exp(" + argStr[0] + ")", argLocalIds});
1102 break;
1103 }
 1104 case RandomVariableOpCode::Sqrt: {
 1105 currentSsa_.back().push_back(
1106 {"v" + std::to_string(resultId), resultId, "sqrt(" + argStr[0] + ")", argLocalIds});
1107 break;
1108 }
 1109 case RandomVariableOpCode::Log: {
 1110 currentSsa_.back().push_back(
1111 {"v" + std::to_string(resultId), resultId, "log(" + argStr[0] + ")", argLocalIds});
1112 break;
1113 }
 1114 case RandomVariableOpCode::Pow: {
 1115 currentSsa_.back().push_back(
1116 {"v" + std::to_string(resultId), resultId, "pow(" + argStr[0] + "," + argStr[1] + ")", argLocalIds});
1117 break;
1118 }
 1119 // TODO add this to the kernel code below before activating it here
1120 // case RandomVariableOpCode::NormalCdf: {
1121 // currentSsa_.back().push_back(
1122 // std::make_tuple(true, "v" + std::to_string(resultId), "ore_normalCdf(" + argStr[0] + ")"));
1123 // break;
1124 // }
 1125 case RandomVariableOpCode::NormalPdf: {
 1126 currentSsa_.back().push_back(
1127 {"v" + std::to_string(resultId), resultId, "ore_normalPdf(" + argStr[0] + ")", argLocalIds});
1128 break;
1129 }
1130 default: {
 1131 QL_FAIL("OpenClContext::applyOperation(): no implementation for op code "
1132 << randomVariableOpCode << " (" << getRandomVariableOpLabels()[randomVariableOpCode]
1133 << ") provided.");
1134 }
1135 } // switch random var op code
1136
1137 if (settings_.debug)
1138 debugInfo_.numberOfOperations += 1 * size_[currentId_ - 1];
1139
1140 return resultId;
1141 }
1142}
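// Annotation: ConditionalExpectation is the one operation that is not evaluated pathwise inside the
// kernel. applyOperation() reserves a result variable initialised to 0.0, stores its id together with
// copies of the argument ids in conditionalExpectationVarIds_, and returns the result id. As soon as a
// later conditional expectation consumes such a result, or the result is declared as an output, a new
// SSA part (and hence a separate kernel) is started, so the host can compute the regression between
// kernel launches (see finalizeCalculation()).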
1143
1144void OpenClContext::freeVariable(const std::size_t id) {
1145 QL_REQUIRE(currentState_ == ComputeState::calc,
1146 "OpenClContext::free(): not in state calc (" << static_cast<int>(currentState_) << ")");
1147 QL_REQUIRE(currentId_ > 0, "OpenClContext::freeVariable(): current id is not set");
1148 QL_REQUIRE(!hasKernel_[currentId_ - 1], "OpenClContext::freeVariable(): id ("
1149 << currentId_ << ") in version " << version_[currentId_ - 1]
1150 << " has a kernel already, variables can not be freed.");
1151
1152 // we do not free input variables, only variables that were added during the calc
1153
1154 if (id < inputVarOffset_.size() + nVariatesTmp_)
1155 return;
1156
1157 freedVariables_.push_back(id);
1158}
1159
1160void OpenClContext::declareOutputVariable(const std::size_t id) {
1161 QL_REQUIRE(currentState_ != ComputeState::idle, "OpenClContext::declareOutputVariable(): state is idle");
1162 QL_REQUIRE(currentId_ > 0, "OpenClContext::declareOutputVariable(): current id not set");
1163 QL_REQUIRE(!hasKernel_[currentId_ - 1], "OpenClContext::declareOutputVariable(): id ("
1164 << currentId_ << ") in version " << version_[currentId_ - 1]
1165 << " has a kernel already, output variables can not be declared.");
1166 currentState_ = ComputeState::declareOutput;
1167 outputVariables_.push_back(id);
1168 nOutputVars_[currentId_ - 1]++;
1169
1170 // if we declare a conditional expectation in the current ssa part as output, we need to create a new ssa part
1171
1172 if (currentConditionalExpectationArgs_.find("v" + std::to_string(id)) != currentConditionalExpectationArgs_.end()) {
1173 startNewSsaPart();
1174 currentConditionalExpectationArgs_.clear();
1175 }
1176}
1177
1178std::size_t OpenClContext::valuesBufferId(const std::size_t i) const {
1179 return i - inputVarOffset_.size() - nVariates_[currentId_ - 1];
1180}
1181
1182std::string OpenClContext::generateSsaCode(const std::vector<ssa_entry>& ssa) const {
1183 std::set<std::size_t> localVars;
1184 for (auto const& s : ssa) {
1185 localVars.insert(s.rhs_local_id.begin(), s.rhs_local_id.end());
1186 }
1187
1188 std::string fpTypeStr = settings_.useDoublePrecision ? "double" : "float";
1189
1190 std::string result;
1191 std::set<std::size_t> hasDeclaration;
1192 for (auto const& s : ssa) {
1193 if (s.lhs_local_id) {
1194 if (localVars.find(*s.lhs_local_id) == localVars.end())
1195 continue;
1196 if (hasDeclaration.find(*s.lhs_local_id) == hasDeclaration.end()) {
1197 result += fpTypeStr + " ";
1198 hasDeclaration.insert(*s.lhs_local_id);
1199 }
1200 }
1201 result += s.lhs_str + "=" + s.rhs_str + ";\n";
1202 }
1203 return result;
1204}
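// Annotation: generateSsaCode() performs a simple dead-code elimination: an assignment to a local
// variable "v<id>" is only emitted (and the variable only declared) if the id occurs on the right hand
// side of some entry of the part; writes without an lhs_local_id (values/output buffer stores) are
// always emitted. Illustrative (not actual) output for a single-precision part with two live entries:
//     float v3=input[0U] * rn[0U + i];
//     output[0U + i]=v3;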
1205
1206void OpenClContext::finalizeCalculation(std::vector<double*>& output) {
1207 struct exitGuard {
1208 exitGuard() {}
1209 ~exitGuard() {
1210 *currentState = ComputeState::idle;
1211 for (auto& m : mem)
1212 clReleaseMemObject(m);
1213 }
1214 ComputeState* currentState;
1215 std::vector<cl_mem> mem;
1216 } guard;
1217
1218 guard.currentState = &currentState_;
1219
1220 QL_REQUIRE(currentId_ > 0, "OpenClContext::finalizeCalculation(): current id is not set");
1221 QL_REQUIRE(output.size() == nOutputVars_[currentId_ - 1],
1222 "OpenClContext::finalizeCalculation(): output size ("
1223 << output.size() << ") inconsistent to kernel output size (" << nOutputVars_[currentId_ - 1] << ")");
1224 QL_REQUIRE(!settings_.useDoublePrecision || supportsDoublePrecision(),
1225 "OpenClContext::finalizeCalculation(): double precision is configured for this calculation, but not "
1226 "supported by the device. Switch to single precision or use an appropriate device.");
1227
1228 // update nVars_, nVariates_ for this calculation if it was a new calculation
1229
1230 if (nVars_[currentId_ - 1].size() < currentSsa_.size())
1231 nVars_[currentId_ - 1].push_back(nVarsTmp_);
1232
1233 if (nVariates_[currentId_ - 1] == 0)
1234 nVariates_[currentId_ - 1] = nVariatesTmp_;
1235
1236 boost::timer::cpu_timer timer;
1237 boost::timer::nanosecond_type timerBase;
1238
1239 // create input, values and output buffers
1240
1241 std::size_t fpSize = settings_.useDoublePrecision ? sizeof(double) : sizeof(float);
1242
1243 if (settings_.debug) {
1244 timerBase = timer.elapsed().wall;
1245 }
1246
1247 std::size_t inputBufferSize = 0;
1248 if (!inputVarOffset_.empty())
1249 inputBufferSize = inputVarOffset_.back() + (inputVarIsScalar_.back() ? 1 : size_[currentId_ - 1]);
1250 cl_int err;
1251 cl_mem inputBuffer;
1252 if (inputBufferSize > 0) {
1253 inputBuffer = clCreateBuffer(*context_, CL_MEM_READ_WRITE, fpSize * inputBufferSize, NULL, &err);
1254 guard.mem.push_back(inputBuffer);
1255 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::finalizeCalculation(): creating input buffer of size "
1256 << inputBufferSize << " fails: " << errorText(err));
1257 }
1258
1259 std::size_t valuesBufferSize = 0;
1260 cl_mem valuesBuffer;
1261 if (currentSsa_.size() > 1) {
1262 valuesBufferSize = (nVars_[currentId_ - 1].back() - inputVarOffset_.size() - nVariates_[currentId_ - 1]) *
1263 size_[currentId_ - 1];
1264 if (valuesBufferSize > 0) {
1265 valuesBuffer = clCreateBuffer(*context_, CL_MEM_READ_WRITE, fpSize * valuesBufferSize, NULL, &err);
1266 guard.mem.push_back(valuesBuffer);
1267 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::finalizeCalculation(): creating values buffer of size "
1268 << valuesBufferSize << " fails: " << errorText(err));
1269 }
1270 }
1271
1272 std::size_t outputBufferSize = nOutputVars_[currentId_ - 1] * size_[currentId_ - 1];
1273 cl_mem outputBuffer;
1274 if (outputBufferSize > 0) {
1275 outputBuffer = clCreateBuffer(*context_, CL_MEM_READ_WRITE, fpSize * outputBufferSize, NULL, &err);
1276 guard.mem.push_back(outputBuffer);
1277 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::finalizeCalculation(): creating output buffer of size "
1278 << outputBufferSize << " fails: " << errorText(err));
1279 }
1280
1281 if (settings_.debug) {
1282 debugInfo_.nanoSecondsDataCopy += timer.elapsed().wall - timerBase;
1283 }
1284
1285 // build kernel if necessary
1286
1287 if (!hasKernel_[currentId_ - 1]) {
1288 std::string fpTypeStr = settings_.useDoublePrecision ? "double" : "float";
1289 std::string fpEpsStr = settings_.useDoublePrecision ? "0x1.0p-52" : "0x1.0p-23f";
1290 std::string fpSuffix = settings_.useDoublePrecision ? std::string() : "f";
1291
1292 // TODO ore_normalCdf
1293
1294 // clang-format off
1295 std::string kernelSource =
1296 "bool ore_closeEnough(const " + fpTypeStr + " x, const " + fpTypeStr + " y);\n"
1297 "bool ore_closeEnough(const " + fpTypeStr + " x, const " + fpTypeStr + " y) {\n"
1298 " const " + fpTypeStr + " tol = 42.0" + fpSuffix + " * " + fpEpsStr + ";\n"
1299 " " + fpTypeStr + " diff = fabs(x - y);\n"
1300 " if (x == 0.0" + fpSuffix + " || y == 0.0" + fpSuffix + ")\n"
1301 " return diff < tol * tol;\n"
1302 " return diff <= tol * fabs(x) || diff <= tol * fabs(y);\n"
1303 "}\n" +
1304 fpTypeStr + " ore_indicatorEq(const " + fpTypeStr + " x, const " + fpTypeStr + " y);\n" +
1305 fpTypeStr + " ore_indicatorEq(const " + fpTypeStr + " x, const " + fpTypeStr + " y) "
1306 "{ return ore_closeEnough(x, y) ? 1.0" + fpSuffix + " : 0.0" + fpSuffix +"; }\n\n" +
1307 fpTypeStr + " ore_indicatorGt(const " + fpTypeStr + " x, const " + fpTypeStr + " y);\n" +
1308 fpTypeStr + " ore_indicatorGt(const " + fpTypeStr + " x, const " + fpTypeStr + " y) " +
1309 "{ return x > y && !ore_closeEnough(x, y); }\n\n" +
1310 fpTypeStr + " ore_indicatorGeq(const " + fpTypeStr + " x, const " + fpTypeStr + " y);\n" +
1311 fpTypeStr + " ore_indicatorGeq(const " + fpTypeStr + " x, const " + fpTypeStr + " y) { return x > y || ore_closeEnough(x, y); }\n\n" +
1312 fpTypeStr + " ore_normalCdf(const " + fpTypeStr + " x);\n" +
1313 fpTypeStr + " ore_normalCdf(const " + fpTypeStr + " x) {\n return 0.0" + fpSuffix + ";}\n" +
1314 fpTypeStr + " ore_normalPdf(const " + fpTypeStr + " x);\n" +
1315 fpTypeStr + " ore_normalPdf(const " + fpTypeStr + " x) {\n" +
1316 " " + fpTypeStr + " exponent = -(x*x)/2.0" + fpSuffix + ";\n" +
1317 " return exponent <= -690.0" + fpSuffix + " ? 0.0 : exp(exponent) * 0.3989422804014327" + fpSuffix + ";\n"
1318 "}\n";
1319
1320 // clang-format on
1321
1322 std::string kernelNameStem =
1323 "ore_kernel_" + std::to_string(currentId_) + "_" + std::to_string(version_[currentId_ - 1]) + "_";
1324
1325 for (std::size_t part = 0; part < currentSsa_.size(); ++part) {
1326
1327 bool initFromValues = part > 0;
1328 bool cacheToValues = currentSsa_.size() > 1 && part < currentSsa_.size() - 1;
1329 bool generateOutputValues = part == currentSsa_.size() - 1;
1330
1331 std::string kernelName = kernelNameStem + std::to_string(part);
1332
1333 std::vector<std::string> inputArgs;
1334 if (inputBufferSize > 0)
1335 inputArgs.push_back("__global " + fpTypeStr + "* input");
1336 if (nVariates_[currentId_ - 1] > 0)
1337 inputArgs.push_back("__global " + fpTypeStr + "* rn");
1338 if (valuesBufferSize > 0 && (initFromValues || cacheToValues))
1339 inputArgs.push_back("__global " + fpTypeStr + "* values");
1340 if (outputBufferSize > 0 && generateOutputValues)
1341 inputArgs.push_back("__global " + fpTypeStr + "* output");
1342
1343 kernelSource += "__kernel void " + kernelName + "(" + boost::join(inputArgs, ",") +
1344 ") {\n"
1345 "unsigned int i = get_global_id(0);\n"
1346 "if(i < " +
1347 std::to_string(size_[currentId_ - 1]) + "U) {\n";
1348
1349 std::vector<ssa_entry> ssa;
1350
1351 if (initFromValues) {
1352 for (std::size_t i = inputVarOffset_.size() + nVariates_[currentId_ - 1];
1353 i < nVars_[currentId_ - 1][part]; ++i) {
1354 ssa.push_back(
1355 {std::string("v") + std::to_string(i),
1356 i,
1357 std::string("values[") + std::to_string(valuesBufferId(i) * size_[currentId_ - 1]) + "U + i]",
1358 {}});
1359 }
1360 }
1361
1362 ssa.insert(ssa.end(), currentSsa_[part].begin(), currentSsa_[part].end());
1363
1364 if (cacheToValues) {
1365 for (std::size_t i = inputVarOffset_.size() + nVariates_[currentId_ - 1];
1366 i < nVars_[currentId_ - 1][part]; ++i) {
1367 ssa.push_back({"values[" + std::to_string(valuesBufferId(i) * size_[currentId_ - 1]) + "U + i]",
1368 std::nullopt,
1369 "v" + std::to_string(i),
1370 {i}});
1371 }
1372 }
1373
1374 if (generateOutputValues) {
1375 for (std::size_t i = 0; i < nOutputVars_[currentId_ - 1]; ++i) {
1376 std::size_t offset = i * size_[currentId_ - 1];
1377 std::string output;
1378 std::set<std::size_t> rhsLocalIds;
1379 if (outputVariables_[i] < inputVarOffset_.size()) {
1380 output = "input[" + std::to_string(inputVarOffset_[outputVariables_[i]]) + "U" +
1381 (inputVarIsScalar_[outputVariables_[i]] ? "]" : " + i] ");
1382 } else if (outputVariables_[i] < inputVarOffset_.size() + nVariates_[currentId_ - 1]) {
1383 output =
1384 "rn[" +
1385 std::to_string((outputVariables_[i] - inputVarOffset_.size()) * size_[currentId_ - 1]) +
1386 "U + i]";
1387 } else {
1388 output = "v" + std::to_string(outputVariables_[i]);
1389 rhsLocalIds.insert(outputVariables_[i]);
1390 }
1391 ssa.push_back({"output[" + std::to_string(offset) + "U + i]", std::nullopt, output, rhsLocalIds});
1392 }
1393 }
1394
1395 kernelSource += generateSsaCode(ssa);
1396 kernelSource += "}}\n";
1397
1398 } // for part
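        // Annotation: one kernel is generated per SSA part. The first part reads the input and rn
        // buffers and, if more parts follow, caches live local variables into the values buffer
        // (cacheToValues); later parts restore them (initFromValues); only the last part writes the
        // output buffer. Conditional expectations are computed on the host between the launches of
        // consecutive parts.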
1399
1400 // std::cerr << "generated kernel: \n" + kernelSource + "\n";
1401
1402 if (settings_.debug) {
1403 timerBase = timer.elapsed().wall;
1404 }
1405
1406 cl_int err;
1407 const char* kernelSourcePtr = kernelSource.c_str();
1408 program_[currentId_ - 1] = clCreateProgramWithSource(*context_, 1, &kernelSourcePtr, NULL, &err);
1409 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::finalizeCalculation(): error during clCreateProgramWithSource(): "
1410 << errorText(err));
1411 err = clBuildProgram(program_[currentId_ - 1], 1, device_, NULL, NULL, NULL);
1412 if (err != CL_SUCCESS) {
1413 char buffer[ORE_OPENCL_MAX_BUILD_LOG];
1414 clGetProgramBuildInfo(program_[currentId_ - 1], *device_, CL_PROGRAM_BUILD_LOG,
1415 ORE_OPENCL_MAX_BUILD_LOG * sizeof(char), buffer, NULL);
1416 QL_FAIL("OpenClContext::finalizeCalculation(): error during program build for kernel '"
1417 << kernelNameStem << "*': " << errorText(err) << ": "
 1418 << std::string(buffer).substr(0, ORE_OPENCL_MAX_BUILD_LOG_LOGFILE));
1419 }
1420
1421 for (std::size_t part = 0; part < currentSsa_.size(); ++part) {
1422 std::string kernelName = kernelNameStem + std::to_string(part);
1423 kernel_[currentId_ - 1].push_back(clCreateKernel(program_[currentId_ - 1], kernelName.c_str(), &err));
1424 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::finalizeCalculation(): error during clCreateKernel(), part #"
1425 << part << ": " << errorText(err));
1426 }
1427
1428 hasKernel_[currentId_ - 1] = true;
1429 inputBufferSize_[currentId_ - 1] = inputBufferSize;
1430
1431 if (settings_.debug) {
1432 debugInfo_.nanoSecondsProgramBuild += timer.elapsed().wall - timerBase;
1433 }
1434 } else {
1435 QL_REQUIRE(inputBufferSize == inputBufferSize_[currentId_ - 1],
1436 "OpenClContext::finalizeCalculation(): input buffer size ("
1437 << inputBufferSize << ") inconsistent to kernel input buffer size ("
1438 << inputBufferSize_[currentId_ - 1] << ")");
1439 }
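    // if this calculation id already has a compiled kernel (presumably from an earlier call
    // with the same id), it is reused; the else-branch above only checks that the input
    // buffer layout is consistent with the cached kernel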
1440
1441 // write input data to input buffer (asynchronously)
1442
1443 if (settings_.debug) {
1444 timerBase = timer.elapsed().wall;
1445 }
1446
1447 cl_event inputBufferEvent;
1448 if (inputBufferSize > 0) {
1449 err = clEnqueueWriteBuffer(queue_, inputBuffer, CL_FALSE, 0, fpSize * inputBufferSize,
1450 settings_.useDoublePrecision ? (void*)&inputVarValues64_[0]
1451 : (void*)&inputVarValues32_[0],
1452 0, NULL, &inputBufferEvent);
1453 QL_REQUIRE(err == CL_SUCCESS,
1454 "OpenClContext::finalizeCalculation(): writing to input buffer fails: " << errorText(err));
1455 }
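    // note: the write above is enqueued non-blocking (CL_FALSE); inputBufferEvent is added
    // to runWaitEvents below so that the kernel launches only start once the input data
    // has actually been transferred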
1456
1457 if (settings_.debug) {
1458 err = clFinish(queue_);
1459 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::clFinish(): error in debug mode: " << errorText(err));
1460 debugInfo_.nanoSecondsDataCopy += timer.elapsed().wall - timerBase;
1461 }
1462
1463 std::vector<cl_event> runWaitEvents;
1464 if (inputBufferSize > 0)
1465 runWaitEvents.push_back(inputBufferEvent);
1466
1467 std::vector<double> values(valuesBufferSize);
1468 std::vector<float> valuesFloat;
1469 if (!settings_.useDoublePrecision) {
1470 valuesFloat.resize(values.size());
1471 }
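    // host-side staging copy of the device values buffer, used for the conditional
    // expectation pass between kernel parts; when the device runs in single precision,
    // a float mirror is kept and converted to / from double around the host computation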
1472
1473 for (std::size_t part = 0; part < kernel_[currentId_ - 1].size(); ++part) {
1474 bool initFromValues = part > 0;
1475 bool cacheToValues = currentSsa_.size() > 1 && part < currentSsa_.size() - 1;
1476 bool generateOutputValues = part == currentSsa_.size() - 1;
1477
1478 // set kernel args
1479
1480 std::size_t kidx = 0;
1481 err = 0;
1482 if (inputBufferSize > 0) {
1483 err |= clSetKernelArg(kernel_[currentId_ - 1][part], kidx++, sizeof(cl_mem), &inputBuffer);
1484 }
1485 if (nVariates_[currentId_ - 1] > 0) {
1486 err |= clSetKernelArg(kernel_[currentId_ - 1][part], kidx++, sizeof(cl_mem), &variatesPool_);
1487 }
1488 if (valuesBufferSize > 0 && (initFromValues || cacheToValues)) {
1489 err |= clSetKernelArg(kernel_[currentId_ - 1][part], kidx++, sizeof(cl_mem), &valuesBuffer);
1490 }
1491 if (outputBufferSize > 0 && generateOutputValues) {
1492 err |= clSetKernelArg(kernel_[currentId_ - 1][part], kidx++, sizeof(cl_mem), &outputBuffer);
1493 }
1494 QL_REQUIRE(err == CL_SUCCESS,
1495 "OpenClContext::finalizeCalculation(): set kernel args fails: " << errorText(err));
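        // the arguments are set in the fixed order input buffer, variates pool, values
        // cache, output buffer, with each one skipped under the same conditions as during
        // kernel source generation, so the indices should line up with the parameter list
        // of the generated kernel for this part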
1496
1497 // execute kernel
1498
1499 if (settings_.debug) {
1500 err = clFinish(queue_);
1501 timerBase = timer.elapsed().wall;
1502 }
1503
1504 cl_event runEvent;
1505 err = clEnqueueNDRangeKernel(queue_, kernel_[currentId_ - 1][part], 1, NULL, &size_[currentId_ - 1], NULL,
1506 runWaitEvents.size(), runWaitEvents.empty() ? NULL : &runWaitEvents[0], &runEvent);
1507 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::finalizeCalculation(): enqueue kernel fails: " << errorText(err));
1508 runWaitEvents.push_back(runEvent);
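        // the kernel is launched as a 1-D NDRange with one work item per sample (global
        // size = size_[currentId_ - 1]); the local work group size is left to the OpenCL
        // runtime (NULL), and runEvent is chained into runWaitEvents so that later reads
        // and parts are ordered after this launch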
1509
1510 // calculate conditional expectations, this is the variant where we do this on the host
1511
1512 if (kernel_[currentId_ - 1].size() > 1 && part < kernel_[currentId_ - 1].size() - 1) {
1513
1514 // copy values from device to host
1515
1516 cl_event readEvent;
1517 err =
1518 clEnqueueReadBuffer(queue_, valuesBuffer, CL_FALSE, 0, fpSize * valuesBufferSize,
1519 settings_.useDoublePrecision ? (void*)&values[0] : (void*)&valuesFloat[0],
1520 runWaitEvents.size(), runWaitEvents.empty() ? NULL : &runWaitEvents[0], &readEvent);
1521 QL_REQUIRE(err == CL_SUCCESS,
1522 "OpenClContext::finalizeCalculation(): enqueue read values buffer fails: " << errorText(err));
1523
1524 err = clWaitForEvents(1, &readEvent);
1525
1526 QL_REQUIRE(
1527 err == CL_SUCCESS,
1528 "OpenClContext::finalizeCalculation(): wait for read values buffer event fails: " << errorText(err));
1529
1530 if (!settings_.useDoublePrecision) {
1531 std::copy(valuesFloat.begin(), valuesFloat.end(), values.begin());
1532 }
1533
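            // layout of each id vector v (as consumed below): v[0] receives the result,
            // v[1] is the regressand, v[2] holds the filter flag (paths where it is close
            // to 1 enter the regression), v[3..] are the regressors; the regression uses a
            // monomial basis of order settings_.regressionOrder. E.g. a conditional
            // expectation E[Y | X1, X2] restricted to flagged paths would arrive here as
            // {result, Y, flag, X1, X2}.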
1534 for (auto const& v : conditionalExpectationVarIds_[currentId_ - 1][part]) {
1535
1536 // calculate conditional expectation value
1537
1538 QL_REQUIRE(v.size() >= 4,
1539 "OpenClContext::finalizeCalculation(): expected at least 4 varIds (3 args and 1 result) for "
1540 "conditional expectation, got "
1541 << v.size());
1542 RandomVariable regressand(size_[currentId_ - 1], &values[valuesBufferId(v[1]) * size_[currentId_ - 1]]);
1543 Filter filter = close_enough(
1544 RandomVariable(size_[currentId_ - 1], &values[valuesBufferId(v[2]) * size_[currentId_ - 1]]),
1545 RandomVariable(size_[currentId_ - 1], 1.0));
1546 std::vector<RandomVariable> regressor(v.size() - 3);
1547 for (std::size_t i = 3; i < v.size(); ++i) {
1548 regressor[i - 3] =
1549 RandomVariable(size_[currentId_ - 1], &values[valuesBufferId(v[i]) * size_[currentId_ - 1]]);
1550 }
1551
1552 auto ce =
1553 conditionalExpectation(regressand, vec2vecptr(regressor),
1554 multiPathBasisSystem(regressor.size(), settings_.regressionOrder,
1555 QuantLib::LsmBasisSystem::Monomial, regressand.size()),
1556 filter);
1557
1558 // overwrite the value
1559
1560 ce.expand();
1561 std::copy(ce.data(), ce.data() + ce.size(), &values[valuesBufferId(v[0]) * size_[currentId_ - 1]]);
1562 }
1563
1564 if (!settings_.useDoublePrecision) {
1565 std::copy(values.begin(), values.end(), valuesFloat.begin());
1566 }
1567
1568 // copy values from host to device
1569
1570 runWaitEvents.push_back(cl_event());
1571 err = clEnqueueWriteBuffer(queue_, valuesBuffer, CL_FALSE, 0, fpSize * valuesBufferSize,
1572 settings_.useDoublePrecision ? (void*)&values[0] : (void*)&valuesFloat[0], 0,
1573 NULL, &runWaitEvents.back());
1574 QL_REQUIRE(err == CL_SUCCESS,
1575 "OpenClContext::finalizeCalculation(): write values buffer fails: " << errorText(err));
1576
1577        } // if not last part (compute and write back conditional expectation values)
1578
1579 if (settings_.debug) {
1580 err = clFinish(queue_);
1581 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::clFinish(): error in debug mode: " << errorText(err));
1582 debugInfo_.nanoSecondsCalculation += timer.elapsed().wall - timerBase;
1583 }
1584
1585 } // for part (execute kernel part)
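    // note: in non-debug runs the parts are only ordered via the OpenCL events collected in
    // runWaitEvents (plus the host-side conditional expectation step in between); in debug
    // mode each part is additionally synchronised with clFinish so that the per-phase
    // timings accumulated in debugInfo_ are meaningful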
1586
1587 if (settings_.debug) {
1588 timerBase = timer.elapsed().wall;
1589 }
1590
1591 // copy the results (asynchronously)
1592
1593 std::vector<cl_event> outputBufferEvents;
1594 if (outputBufferSize > 0) {
1595 std::vector<std::vector<float>> outputFloat;
1596 if (!settings_.useDoublePrecision) {
1597 outputFloat.resize(output.size(), std::vector<float>(size_[currentId_ - 1]));
1598 }
1599 for (std::size_t i = 0; i < output.size(); ++i) {
1600 outputBufferEvents.push_back(cl_event());
1601 err = clEnqueueReadBuffer(
1602 queue_, outputBuffer, CL_FALSE, fpSize * i * size_[currentId_ - 1], fpSize * size_[currentId_ - 1],
1603 settings_.useDoublePrecision ? (void*)&output[i][0] : (void*)&outputFloat[i][0], runWaitEvents.size(),
1604 runWaitEvents.empty() ? NULL : &runWaitEvents[0], &outputBufferEvents.back());
1605 QL_REQUIRE(err == CL_SUCCESS,
1606                       "OpenClContext::finalizeCalculation(): reading from output buffer fails: " << errorText(err));
1607 }
1608 err = clWaitForEvents(outputBufferEvents.size(), outputBufferEvents.empty() ? nullptr : &outputBufferEvents[0]);
1609 QL_REQUIRE(
1610 err == CL_SUCCESS,
1611 "OpenClContext::finalizeCalculation(): wait for output buffer events to finish fails: " << errorText(err));
1612 if (!settings_.useDoublePrecision) {
1613 for (std::size_t i = 0; i < output.size(); ++i) {
1614 std::copy(outputFloat[i].begin(), outputFloat[i].end(), output[i]);
1615 }
1616 }
1617 }
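    // note: one non-blocking read per output variable, each at its row offset
    // fpSize * i * size_[currentId_ - 1] within the output buffer; all reads wait on the
    // kernel events and are then waited for together before the (optional) float-to-double
    // widening into the caller's output arrays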
1618
1619 if (settings_.debug) {
1620 err = clFinish(queue_);
1621 QL_REQUIRE(err == CL_SUCCESS, "OpenClContext::clFinish(): error in debug mode: " << errorText(err));
1622 debugInfo_.nanoSecondsDataCopy += timer.elapsed().wall - timerBase;
1623 }
1624}
1625
1626const ComputeContext::DebugInfo& OpenClContext::debugInfo() const { return debugInfo_; }
1627
1628std::vector<std::pair<std::string, std::string>> OpenClContext::deviceInfo() const { return deviceInfo_; }
1629bool OpenClContext::supportsDoublePrecision() const { return supportsDoublePrecision_; }
1630
1631#endif
1632
1633#ifndef ORE_ENABLE_OPENCL
1636#endif
1637
1638std::set<std::string> OpenClFramework::getAvailableDevices() const {
1639 std::set<std::string> tmp;
1640 for (auto const& [name, _] : contexts_)
1641 tmp.insert(name);
1642 return tmp;
1643}
1644
1645ComputeContext* OpenClFramework::getContext(const std::string& deviceName) {
1646 auto c = contexts_.find(deviceName);
1647 if (c != contexts_.end()) {
1648 return c->second;
1649 }
1650    QL_FAIL("OpenClFramework::getContext(): device '"
1651 << deviceName << "' not found. Available devices: " << boost::join(getAvailableDevices(), ","));
1652}
1653} // namespace QuantExt