45 constexpr const int runs = 2;
46 cudaEvent_t events[runs + 1];
49 for (
int i = 0; i < runs + 1; ++i) {
50 OPM_GPU_SAFE_CALL(cudaEventCreate(&events[i]));
54 float bestTime = std::numeric_limits<float>::max();
55 int bestBlockSize = -1;
59 for (
int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
62 OPM_GPU_SAFE_CALL(cudaEventRecord(events[0]));
63 for (
int i = 0; i < runs; ++i) {
65 OPM_GPU_SAFE_CALL(cudaEventRecord(events[i + 1]));
69 OPM_GPU_SAFE_CALL(cudaEventSynchronize(events[runs]));
72 if (cudaSuccess == cudaGetLastError()) {
74 for (
int i = 0; i < runs; ++i) {
75 float candidateBlockSizeTime;
76 OPM_GPU_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i], events[i + 1]));
77 if (candidateBlockSizeTime < bestTime) {
78 bestTime = candidateBlockSizeTime;
79 bestBlockSize = thrBlockSize;
86 fmt::format(
"{}: Tuned Blocksize: {} (fastest runtime: {}).", descriptionOfFunction, bestBlockSize, bestTime));