CandidateVectorSearch 1.7.2
Searching for peptide candidates using sparse matrix + matrix/vector multiplication.
Loading...
Searching...
No Matches
findTopCandidatesCudaInt.cpp
Go to the documentation of this file.
1// dllmain.cpp : Defines the entry point for the DLL application.
#include "pch.h"
#include <cuda_runtime_api.h>
#include <cusparse.h>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <stdexcept>
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
11
// Library version reported on stdout by every search entry point.
const int versionMajor = 1;
const int versionMinor = 5;
const int versionFix = 1;

// This translation unit builds the exporting DLL; consumers of the header
// (without METHOD_EXPORTS) get dllimport instead.
#define METHOD_EXPORTS
#ifdef METHOD_EXPORTS
#define EXPORT __declspec(dllexport)
#else
#define EXPORT __declspec(dllimport)
#endif

// Report the source line of a failing CUDA / cuSPARSE call. Note the return
// value of the check functions is deliberately ignored; errors are printed
// and execution continues.
#define CHECK_CUDA(status) { checkCuda(status, __LINE__); }
#define CHECK_CUSPARSE(status) { checkCusparse(status, __LINE__); }

const int MASS_RANGE = 5000; // Encoding values up to 5000 m/z
const int MASS_MULTIPLIER = 100; // Encoding values with 0.01 precision
const int ENCODING_SIZE = MASS_RANGE * MASS_MULTIPLIER; // The total length of an encoding vector
const float ROUNDING_ACCURACY = (float) INT8_MAX / 7.0f; // Rounding precision for converting gaussian f32 to i8
// NOTE(review): despite the name, this value is 1/sqrt(2*pi) (~0.398942),
// which is the correct normalizer for the normal PDF used in normpdf below.
const double ONE_OVER_SQRT_PI = 0.39894228040143267793994605993438;
31
32extern "C" {
33 EXPORT int* findTopCandidatesCuda(int*, int*,
34 int*, int*,
35 int, int,
36 int, int,
37 int, float,
38 bool, bool,
39 int);
40
41 EXPORT int* findTopCandidatesCudaInt(int*, int*,
42 int*, int*,
43 int, int,
44 int, int,
45 int, float,
46 bool, bool,
47 int);
48
50 int*, int*,
51 int, int,
52 int, int,
53 int, float,
54 bool, bool,
55 int,
56 int);
57
59 int*, int*,
60 int, int,
61 int, int,
62 int, float,
63 bool, bool,
64 int,
65 int);
66
67 EXPORT int releaseMemoryCuda(int*);
68}
69
// Internal math helpers (defined near the bottom of this file).
float squared(float);
float normpdf(float, float, float);
int getRowIdx(int*, int, int);

// Status helpers backing the CHECK_CUDA / CHECK_CUSPARSE macros.
int checkCuda(cudaError_t, int line);
int checkCusparse(cusparseStatus_t, int line);
76
77int checkCuda(cudaError_t status, int line)
78{
79 if (status != cudaSuccess) {
80 printf("CUDA API failed at line %d with error: %s (%d)\n",
81 line, cudaGetErrorString(status), status);
82 return EXIT_FAILURE;
83 }
84 return EXIT_SUCCESS;
85}
86
87int checkCusparse(cusparseStatus_t status, int line)
88{
89 if (status != CUSPARSE_STATUS_SUCCESS) {
90 printf("CUSPARSE API failed at line %d with error: %s (%d)\n",
91 line, cusparseGetErrorString(status), status);
92 return EXIT_FAILURE;
93 }
94 return EXIT_SUCCESS;
95}
96
114int* findTopCandidatesCuda(int* csrRowoffsets, int* csrColIdx,
115 int* spectraValues, int* spectraIdx,
116 int csrRowoffsetsLength, int csrNNZ,
117 int sVLength, int sILength,
118 int n, float tolerance,
119 bool normalize, bool gaussianTol,
120 int verbose) {
121
122 if (n >= csrRowoffsetsLength) {
123 throw std::invalid_argument("Cannot return more hits than number of candidates!");
124 }
125
126 std::cout << "Running CUDA f32 vector search version " << versionMajor << "." << versionMinor << "." << versionFix << std::endl;
127
128 float t = round(tolerance * MASS_MULTIPLIER);
129 int* result = new int[sILength * n];
130 float* csrValues = new float[csrNNZ];
131 float* MVresult = new float[csrRowoffsetsLength - 1] {0.0};
132
133 // create csrValues
134 for (int i = 0; i < csrRowoffsetsLength - 1; ++i) {
135 int startIter = csrRowoffsets[i];
136 int endIter = csrRowoffsets[i + 1];
137 int nrNonZero = endIter - startIter;
138 float val = normalize ? 1.0 / (float) nrNonZero : 1.0;
139 for (int j = startIter; j < endIter; ++j) {
140 csrValues[j] = val;
141 }
142 }
143
144 // cusparse spmv variables
145 float alpha = 1.0;
146 float beta = 0.0;
147
148 // device memory management
149 int* dM_csrRowoffsets;
150 int* dM_csrColIdx;
151 float* dM_csrValues;
152 float* dV;
153 float* dMVresult;
154
155 CHECK_CUDA(cudaMalloc((void**) &dM_csrRowoffsets, csrRowoffsetsLength * sizeof(int)));
156 CHECK_CUDA(cudaMalloc((void**) &dM_csrColIdx, csrNNZ * sizeof(int)));
157 CHECK_CUDA(cudaMalloc((void**) &dM_csrValues, csrNNZ * sizeof(float)));
158 CHECK_CUDA(cudaMalloc((void**) &dV, ENCODING_SIZE * sizeof(float)));
159 CHECK_CUDA(cudaMalloc((void**) &dMVresult, (csrRowoffsetsLength - 1) * sizeof(float)));
160
161 // device memory copy
162 CHECK_CUDA(cudaMemcpy(dM_csrRowoffsets, csrRowoffsets, csrRowoffsetsLength * sizeof(int), cudaMemcpyHostToDevice));
163 CHECK_CUDA(cudaMemcpy(dM_csrColIdx, csrColIdx, csrNNZ * sizeof(int), cudaMemcpyHostToDevice));
164 CHECK_CUDA(cudaMemcpy(dM_csrValues, csrValues, csrNNZ * sizeof(float), cudaMemcpyHostToDevice));
165 CHECK_CUDA(cudaMemcpy(dMVresult, MVresult, (csrRowoffsetsLength - 1) * sizeof(float), cudaMemcpyHostToDevice));
166
167 // device setup cusparse
168 cusparseHandle_t handle = NULL;
169 cusparseSpMatDescr_t mat;
170 cusparseDnVecDescr_t vec;
171 cusparseDnVecDescr_t res;
172 void* dBuffer = NULL;
173 size_t bufferSize = 0;
174
175 CHECK_CUSPARSE(cusparseCreate(&handle));
176 CHECK_CUSPARSE(cusparseCreateCsr(&mat, csrRowoffsetsLength - 1, ENCODING_SIZE, csrNNZ,
177 dM_csrRowoffsets, dM_csrColIdx, dM_csrValues,
178 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
179 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
180 CHECK_CUSPARSE(cusparseCreateDnVec(&res, csrRowoffsetsLength - 1, dMVresult, CUDA_R_32F));
181
182 // device iterative spmv
183 for (int i = 0; i < sILength; ++i) {
184
185 float* V = new float[ENCODING_SIZE] {0.0};
186
187 // host spectrum encoding
188 int startIter = spectraIdx[i];
189 int endIter = i + 1 == sILength ? sVLength : spectraIdx[i + 1];
190 for (int j = startIter; j < endIter; ++j) {
191 auto currentPeak = spectraValues[j];
192 auto minPeak = currentPeak - t > 0 ? currentPeak - t : 0;
193 auto maxPeak = currentPeak + t < ENCODING_SIZE ? currentPeak + t : ENCODING_SIZE - 1;
194
195 for (int k = minPeak; k <= maxPeak; ++k) {
196 float currentVal = V[k];
197 float newVal = gaussianTol ? normpdf((float) k, (float) currentPeak, (float) (t / 3.0)) : 1.0;
198 V[k] = max(currentVal, newVal);
199 }
200 }
201
202 // device spmv
203 CHECK_CUDA(cudaMemcpy(dV, V, ENCODING_SIZE * sizeof(float), cudaMemcpyHostToDevice));
204 CHECK_CUSPARSE(cusparseCreateDnVec(&vec, ENCODING_SIZE, dV, CUDA_R_32F));
205 CHECK_CUSPARSE(cusparseSpMV_bufferSize(handle,
206 CUSPARSE_OPERATION_NON_TRANSPOSE,
207 &alpha, mat, vec, &beta, res,
208 CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize));
209 CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
210 CHECK_CUSPARSE(cusparseSpMV(handle,
211 CUSPARSE_OPERATION_NON_TRANSPOSE,
212 &alpha, mat, vec, &beta, res,
213 CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer));
214
215 CHECK_CUDA(cudaMemcpy(MVresult, dMVresult, (csrRowoffsetsLength - 1) * sizeof(float), cudaMemcpyDeviceToHost));
216 CHECK_CUSPARSE(cusparseDestroyDnVec(vec));
217 delete[] V;
218
219 // host result max
220 auto* idx = new int[csrRowoffsetsLength - 1];
221 std::iota(idx, idx + csrRowoffsetsLength - 1, 0);
222 std::sort(idx, idx + csrRowoffsetsLength - 1, [&](int i, int j) {return MVresult[i] > MVresult[j];});
223
224 for (int j = 0; j < n; ++j) {
225 result[i * n + j] = idx[j];
226 }
227
228 delete[] idx;
229
230 if (verbose != 0 && (i + 1) % verbose == 0) {
231 std::cout << "Searched " << i + 1 << " spectra in total..." << std::endl;
232 }
233 }
234
235 // device destroy descriptors
236 CHECK_CUSPARSE(cusparseDestroySpMat(mat));
237 CHECK_CUSPARSE(cusparseDestroyDnVec(res));
238 CHECK_CUSPARSE(cusparseDestroy(handle));
239
240 // device memory deallocation
241 CHECK_CUDA(cudaFree(dBuffer));
242 CHECK_CUDA(cudaFree(dM_csrRowoffsets));
243 CHECK_CUDA(cudaFree(dM_csrColIdx));
244 CHECK_CUDA(cudaFree(dM_csrValues));
245 CHECK_CUDA(cudaFree(dV));
246 CHECK_CUDA(cudaFree(dMVresult));
247
248 // host memory deallocation
249 delete[] MVresult;
250 delete[] csrValues;
251
252 return result;
253}
254
272int* findTopCandidatesCudaInt(int* csrRowoffsets, int* csrColIdx,
273 int* spectraValues, int* spectraIdx,
274 int csrRowoffsetsLength, int csrNNZ,
275 int sVLength, int sILength,
276 int n, float tolerance,
277 bool normalize, bool gaussianTol,
278 int verbose) {
279
280 if (n >= csrRowoffsetsLength) {
281 throw std::invalid_argument("Cannot return more hits than number of candidates!");
282 }
283
284 if (tolerance < 0.01f) {
285 throw std::invalid_argument("Tolerance must not be smaller than 0.01 for i8 operations!");
286 }
287
288 std::cout << "Running CUDA i8 vector search version " << versionMajor << "." << versionMinor << "." << versionFix << std::endl;
289
290 float t = round(tolerance * MASS_MULTIPLIER);
291 int* result = new int[sILength * n];
292 int8_t* csrValues = new int8_t[csrNNZ];
293 int* MVresult = new int[csrRowoffsetsLength - 1] {0};
294
295 // create csrValues
296 for (int i = 0; i < csrRowoffsetsLength - 1; ++i) {
297 int startIter = csrRowoffsets[i];
298 int endIter = csrRowoffsets[i + 1];
299 int nrNonZero = endIter - startIter;
300 int8_t val = normalize ? (int8_t) round(INT8_MAX / (float) nrNonZero) : 1i8;
301 for (int j = startIter; j < endIter; ++j) {
302 csrValues[j] = val;
303 }
304 }
305
306 // cusparse spmv variables
307 int8_t alpha = 1i8;
308 int8_t beta = 0i8;
309
310 // device memory management
311 int* dM_csrRowoffsets;
312 int* dM_csrColIdx;
313 int8_t* dM_csrValues;
314 int8_t* dV;
315 int* dMVresult;
316
317 CHECK_CUDA(cudaMalloc((void**) &dM_csrRowoffsets, csrRowoffsetsLength * sizeof(int)));
318 CHECK_CUDA(cudaMalloc((void**) &dM_csrColIdx, csrNNZ * sizeof(int)));
319 CHECK_CUDA(cudaMalloc((void**) &dM_csrValues, csrNNZ * sizeof(int8_t)));
320 CHECK_CUDA(cudaMalloc((void**) &dV, ENCODING_SIZE * sizeof(int8_t)));
321 CHECK_CUDA(cudaMalloc((void**) &dMVresult, (csrRowoffsetsLength - 1) * sizeof(int)));
322
323 // device memory copy
324 CHECK_CUDA(cudaMemcpy(dM_csrRowoffsets, csrRowoffsets, csrRowoffsetsLength * sizeof(int), cudaMemcpyHostToDevice));
325 CHECK_CUDA(cudaMemcpy(dM_csrColIdx, csrColIdx, csrNNZ * sizeof(int), cudaMemcpyHostToDevice));
326 CHECK_CUDA(cudaMemcpy(dM_csrValues, csrValues, csrNNZ * sizeof(int8_t), cudaMemcpyHostToDevice));
327 CHECK_CUDA(cudaMemcpy(dMVresult, MVresult, (csrRowoffsetsLength - 1) * sizeof(int), cudaMemcpyHostToDevice));
328
329 // device setup cusparse
330 cusparseHandle_t handle = NULL;
331 cusparseSpMatDescr_t mat;
332 cusparseDnVecDescr_t vec;
333 cusparseDnVecDescr_t res;
334 void* dBuffer = NULL;
335 size_t bufferSize = 0;
336
337 CHECK_CUSPARSE(cusparseCreate(&handle));
338 CHECK_CUSPARSE(cusparseCreateCsr(&mat, csrRowoffsetsLength - 1, ENCODING_SIZE, csrNNZ,
339 dM_csrRowoffsets, dM_csrColIdx, dM_csrValues,
340 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
341 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_8I));
342 CHECK_CUSPARSE(cusparseCreateDnVec(&res, csrRowoffsetsLength - 1, dMVresult, CUDA_R_32I));
343
344 // device iterative spmv
345 for (int i = 0; i < sILength; ++i) {
346
347 int8_t* V = new int8_t[ENCODING_SIZE] {0i8};
348
349 // host spectrum encoding
350 int startIter = spectraIdx[i];
351 int endIter = i + 1 == sILength ? sVLength : spectraIdx[i + 1];
352 for (int j = startIter; j < endIter; ++j) {
353 auto currentPeak = spectraValues[j];
354 auto minPeak = currentPeak - t > 0 ? currentPeak - t : 0;
355 auto maxPeak = currentPeak + t < ENCODING_SIZE ? currentPeak + t : ENCODING_SIZE - 1;
356
357 for (int k = minPeak; k <= maxPeak; ++k) {
358 int8_t currentVal = V[k];
359 int8_t newVal = gaussianTol ? (int8_t) round(normpdf((float) k, (float) currentPeak, (float) (t / 3.0)) * ROUNDING_ACCURACY) : 1i8;
360 V[k] = max(currentVal, newVal);
361 }
362 }
363
364 // device spmv
365 CHECK_CUDA(cudaMemcpy(dV, V, ENCODING_SIZE * sizeof(int8_t), cudaMemcpyHostToDevice));
366 CHECK_CUSPARSE(cusparseCreateDnVec(&vec, ENCODING_SIZE, dV, CUDA_R_8I));
367 CHECK_CUSPARSE(cusparseSpMV_bufferSize(handle,
368 CUSPARSE_OPERATION_NON_TRANSPOSE,
369 &alpha, mat, vec, &beta, res,
370 CUDA_R_32I, CUSPARSE_SPMV_CSR_ALG2, &bufferSize));
371 CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
372 CHECK_CUSPARSE(cusparseSpMV(handle,
373 CUSPARSE_OPERATION_NON_TRANSPOSE,
374 &alpha, mat, vec, &beta, res,
375 CUDA_R_32I, CUSPARSE_SPMV_CSR_ALG2, dBuffer));
376
377 CHECK_CUDA(cudaMemcpy(MVresult, dMVresult, (csrRowoffsetsLength - 1) * sizeof(int), cudaMemcpyDeviceToHost));
378 CHECK_CUSPARSE(cusparseDestroyDnVec(vec));
379 delete[] V;
380
381 // host result max
382 auto* idx = new int[csrRowoffsetsLength - 1];
383 std::iota(idx, idx + csrRowoffsetsLength - 1, 0);
384 std::sort(idx, idx + csrRowoffsetsLength - 1, [&](int i, int j) {return MVresult[i] > MVresult[j];});
385
386 for (int j = 0; j < n; ++j) {
387 result[i * n + j] = idx[j];
388 }
389
390 delete[] idx;
391
392 if (verbose != 0 && (i + 1) % verbose == 0) {
393 std::cout << "Searched " << i + 1 << " spectra in total..." << std::endl;
394 }
395 }
396
397 // device destroy descriptors
398 CHECK_CUSPARSE(cusparseDestroySpMat(mat));
399 CHECK_CUSPARSE(cusparseDestroyDnVec(res));
400 CHECK_CUSPARSE(cusparseDestroy(handle));
401
402 // device memory deallocation
403 CHECK_CUDA(cudaFree(dBuffer));
404 CHECK_CUDA(cudaFree(dM_csrRowoffsets));
405 CHECK_CUDA(cudaFree(dM_csrColIdx));
406 CHECK_CUDA(cudaFree(dM_csrValues));
407 CHECK_CUDA(cudaFree(dV));
408 CHECK_CUDA(cudaFree(dMVresult));
409
410 // host memory deallocation
411 delete[] MVresult;
412 delete[] csrValues;
413
414 return result;
415}
416
435int* findTopCandidatesCudaBatched(int* csrRowoffsets, int* csrColIdx,
436 int* spectraValues, int* spectraIdx,
437 int csrRowoffsetsLength, int csrNNZ,
438 int sVLength, int sILength,
439 int n, float tolerance,
440 bool normalize, bool gaussianTol,
441 int batchSize,
442 int verbose) {
443
444 if (n >= csrRowoffsetsLength) {
445 throw std::invalid_argument("Cannot return more hits than number of candidates!");
446 }
447
448 std::cout << "Running CUDA f32 sparse matrix search version " << versionMajor << "." << versionMinor << "." << versionFix << std::endl;
449
450 float t = round(tolerance * MASS_MULTIPLIER);
451 int* result = new int[sILength * n];
452 float* csrValues = new float[csrNNZ];
453
454 // create csrValues
455 for (int i = 0; i < csrRowoffsetsLength - 1; ++i) {
456 int startIter = csrRowoffsets[i];
457 int endIter = csrRowoffsets[i + 1];
458 int nrNonZero = endIter - startIter;
459 float val = normalize ? 1.0 / (float) nrNonZero : 1.0;
460 for (int j = startIter; j < endIter; ++j) {
461 csrValues[j] = val;
462 }
463 }
464
465 // cusparse spgemM variables
466 float alpha = 1.0;
467 float beta = 0.0;
468
469 // device memory management
470 int* dm_csrRowoffsets;
471 int* dm_csrColIdx;
472 float* dm_csrValues;
473 int* dM_csrRowoffsets;
474 int* dM_csrColIdx;
475 float* dM_csrValues;
476 int* dspgemM_csrRowoffsets;
477 int* dspgemM_csrColIdx;
478 float* dspgemM_csrValues;
479
480 // device buffer managment
481 void* dBuffer1 = NULL;
482 void* dBuffer2 = NULL;
483 size_t bufferSize1 = 0;
484 size_t bufferSize2 = 0;
485
486 // device m memory management
487 CHECK_CUDA(cudaMalloc((void**) &dm_csrRowoffsets, csrRowoffsetsLength * sizeof(int)));
488 CHECK_CUDA(cudaMalloc((void**) &dm_csrColIdx, csrNNZ * sizeof(int)));
489 CHECK_CUDA(cudaMalloc((void**) &dm_csrValues, csrNNZ * sizeof(float)));
490
491 // device m memory copy
492 CHECK_CUDA(cudaMemcpy(dm_csrRowoffsets, csrRowoffsets, csrRowoffsetsLength * sizeof(int), cudaMemcpyHostToDevice));
493 CHECK_CUDA(cudaMemcpy(dm_csrColIdx, csrColIdx, csrNNZ * sizeof(int), cudaMemcpyHostToDevice));
494 CHECK_CUDA(cudaMemcpy(dm_csrValues, csrValues, csrNNZ * sizeof(float), cudaMemcpyHostToDevice));
495
496 // device setup cusparse m
497 cusparseHandle_t handle = NULL;
498 cusparseSpMatDescr_t mat;
499
500 CHECK_CUSPARSE(cusparseCreate(&handle));
501 CHECK_CUSPARSE(cusparseCreateCsr(&mat, csrRowoffsetsLength - 1, ENCODING_SIZE, csrNNZ,
502 dm_csrRowoffsets, dm_csrColIdx, dm_csrValues,
503 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
504 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
505
506 // device allocate csr rowoffsets of spgemM result
507 CHECK_CUDA(cudaMalloc((void**) &dspgemM_csrRowoffsets, csrRowoffsetsLength * sizeof(int)));
508
509 for (int i = 0; i < sILength; i += batchSize) {
510
511 // calculate csr representation of spectrum matrix
512 std::vector<std::vector<float>> vectorStorage;
513 std::vector<int> M_csrRowoffsets;
514 std::vector<int> M_csrColIdx;
515 std::vector<float> M_csrValues;
516
517 M_csrRowoffsets.push_back(0);
518
519 for (int s = 0; s < batchSize; ++s) {
520
521 std::vector<float> v(ENCODING_SIZE, 0.0);
522
523 if (i + s < sILength) {
524
525 int startIter = spectraIdx[i + s];
526 int endIter = i + s + 1 == sILength ? sVLength : spectraIdx[i + s + 1];
527
528 for (int j = startIter; j < endIter; ++j) {
529 auto currentPeak = spectraValues[j];
530 auto minPeak = currentPeak - t > 0 ? currentPeak - t : 0;
531 auto maxPeak = currentPeak + t < ENCODING_SIZE ? currentPeak + t : ENCODING_SIZE - 1;
532
533 for (int k = minPeak; k <= maxPeak; ++k) {
534 float currentVal = v[k];
535 float newVal = gaussianTol ? normpdf((float) k, (float) currentPeak, (float) (t / 3.0)) : 1.0;
536 v[k] = max(currentVal, newVal);
537 }
538 }
539 }
540
541 vectorStorage.push_back(v);
542 }
543
544 int M_csrNNZ = 0;
545 for (int j = 0; j < ENCODING_SIZE; ++j) {
546 for (int s = 0; s < batchSize; ++s) {
547 if (vectorStorage[s][j] != 0.0) {
548 M_csrColIdx.push_back(s);
549 M_csrValues.push_back(vectorStorage[s][j]);
550 ++M_csrNNZ;
551 }
552 }
553 M_csrRowoffsets.push_back(M_csrNNZ);
554 }
555
556 vectorStorage.clear();
557
558 // device M memory management
559 CHECK_CUDA(cudaMalloc((void**) &dM_csrRowoffsets, (ENCODING_SIZE + 1) * sizeof(int)));
560 CHECK_CUDA(cudaMalloc((void**) &dM_csrColIdx, M_csrNNZ * sizeof(int)));
561 CHECK_CUDA(cudaMalloc((void**) &dM_csrValues, M_csrNNZ * sizeof(float)));
562
563 // device m memory copy
564 CHECK_CUDA(cudaMemcpy(dM_csrRowoffsets, M_csrRowoffsets.data(), (ENCODING_SIZE + 1) * sizeof(int), cudaMemcpyHostToDevice));
565 CHECK_CUDA(cudaMemcpy(dM_csrColIdx, M_csrColIdx.data(), M_csrNNZ * sizeof(int), cudaMemcpyHostToDevice));
566 CHECK_CUDA(cudaMemcpy(dM_csrValues, M_csrValues.data(), M_csrNNZ * sizeof(float), cudaMemcpyHostToDevice));
567
568 // device setup cusparse M
569 cusparseSpMatDescr_t Mat;
570
571 CHECK_CUSPARSE(cusparseCreateCsr(&Mat, ENCODING_SIZE, batchSize, M_csrNNZ,
572 dM_csrRowoffsets, dM_csrColIdx, dM_csrValues,
573 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
574 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
575
576 // device setup cusparse spgemM result
577 cusparseSpMatDescr_t spgemM;
578
579 CHECK_CUSPARSE(cusparseCreateCsr(&spgemM, csrRowoffsetsLength - 1, batchSize, 0,
580 NULL, NULL, NULL,
581 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
582 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
583
584 // device spgemM computation
585 cusparseSpGEMMDescr_t spgemmDesc;
586 CHECK_CUSPARSE(cusparseSpGEMM_createDescr(&spgemmDesc));
587 CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
588 &alpha, mat, Mat, &beta, spgemM,
589 CUDA_R_32F, CUSPARSE_SPGEMM_ALG1,
590 spgemmDesc, &bufferSize1, NULL));
591 CHECK_CUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
592 CHECK_CUSPARSE(cusparseSpGEMM_workEstimation(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
593 &alpha, mat, Mat, &beta, spgemM,
594 CUDA_R_32F, CUSPARSE_SPGEMM_ALG1,
595 spgemmDesc, &bufferSize1, dBuffer1));
596 CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
597 &alpha, mat, Mat, &beta, spgemM,
598 CUDA_R_32F, CUSPARSE_SPGEMM_ALG1,
599 spgemmDesc, &bufferSize2, NULL));
600 CHECK_CUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
601 CHECK_CUSPARSE(cusparseSpGEMM_compute(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
602 &alpha, mat, Mat, &beta, spgemM,
603 CUDA_R_32F, CUSPARSE_SPGEMM_ALG1,
604 spgemmDesc, &bufferSize2, dBuffer2));
605
606 // device get spgemM result
607 int64_t spgemM_rows, spgemM_cols, spgemM_nnz;
608 CHECK_CUSPARSE(cusparseSpMatGetSize(spgemM, &spgemM_rows, &spgemM_cols, &spgemM_nnz));
609 CHECK_CUDA(cudaMalloc((void**) &dspgemM_csrColIdx, spgemM_nnz * sizeof(int)));
610 CHECK_CUDA(cudaMalloc((void**) &dspgemM_csrValues, spgemM_nnz * sizeof(float)));
611 CHECK_CUSPARSE(cusparseCsrSetPointers(spgemM, dspgemM_csrRowoffsets, dspgemM_csrColIdx, dspgemM_csrValues));
612 CHECK_CUSPARSE(cusparseSpGEMM_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
613 &alpha, mat, Mat, &beta, spgemM,
614 CUDA_R_32F, CUSPARSE_SPGEMM_ALG1, spgemmDesc));
615
616 // host setup spgemM result
617 int* spgemM_csrRowoffsets = new int[csrRowoffsetsLength];
618 int* spgemM_csrColIdx = new int[spgemM_nnz];
619 float* spgemM_csrValues = new float[spgemM_nnz];
620 CHECK_CUDA(cudaMemcpy(spgemM_csrRowoffsets, dspgemM_csrRowoffsets, csrRowoffsetsLength * sizeof(int), cudaMemcpyDeviceToHost));
621 CHECK_CUDA(cudaMemcpy(spgemM_csrColIdx, dspgemM_csrColIdx, spgemM_nnz * sizeof(int), cudaMemcpyDeviceToHost));
622 CHECK_CUDA(cudaMemcpy(spgemM_csrValues, dspgemM_csrValues, spgemM_nnz * sizeof(float), cudaMemcpyDeviceToHost));
623
624 // host result max
625 for (int s = 0; s < batchSize; ++s) {
626
627 if (i + s >= sILength) {
628 break;
629 }
630
631 std::vector<int> rowIdx;
632 std::vector<float> rowValues;
633 for (int j = 0; j < spgemM_nnz; ++j) {
634 if (spgemM_csrColIdx[j] == s) {
635 rowIdx.push_back(getRowIdx(spgemM_csrRowoffsets, csrRowoffsetsLength, j));
636 rowValues.push_back(spgemM_csrValues[j]);
637 }
638 }
639
640 // need to create an idx vector because we can't sort rowIdx directly
641 //std::sort(rowIdx.data(), rowIdx.data() + rowIdx.size(), [&](int i, int j) {return rowValues[i] > rowValues[j];});
642 std::vector<int> idx(rowIdx.size());
643 std::iota(idx.begin(), idx.end(), 0);
644 std::sort(idx.begin(), idx.end(), [&](int i, int j) {return rowValues[i] > rowValues[j];});
645
646 if (rowIdx.size() >= n) {
647 for (int j = 0; j < n; ++j) {
648 result[(i + s) * n + j] = rowIdx[idx[j]];
649 }
650 }
651 }
652
653 // host memory deallocation
654 delete[] spgemM_csrRowoffsets;
655 delete[] spgemM_csrColIdx;
656 delete[] spgemM_csrValues;
657
658 // device destroy descriptors
659 CHECK_CUSPARSE(cusparseDestroySpMat(Mat));
660 CHECK_CUSPARSE(cusparseDestroySpMat(spgemM));
661
662 // device memory deallocation
663 CHECK_CUDA(cudaFree(dM_csrRowoffsets));
664 CHECK_CUDA(cudaFree(dM_csrColIdx));
665 CHECK_CUDA(cudaFree(dM_csrValues));
666 CHECK_CUDA(cudaFree(dspgemM_csrColIdx));
667 CHECK_CUDA(cudaFree(dspgemM_csrValues));
668
669 if (verbose != 0 && (i + batchSize) % verbose == 0) {
670 std::cout << "Searched " << i + batchSize << " spectra in total..." << std::endl;
671 }
672 }
673
674 // device destroy descriptors
675 CHECK_CUSPARSE(cusparseDestroySpMat(mat));
676 CHECK_CUSPARSE(cusparseDestroy(handle));
677
678 // device memory deallocation
679 CHECK_CUDA(cudaFree(dm_csrRowoffsets));
680 CHECK_CUDA(cudaFree(dm_csrColIdx));
681 CHECK_CUDA(cudaFree(dm_csrValues));
682 CHECK_CUDA(cudaFree(dspgemM_csrRowoffsets));
683 CHECK_CUDA(cudaFree(dBuffer1));
684 CHECK_CUDA(cudaFree(dBuffer2));
685
686 // host memory deallocation
687 delete[] csrValues;
688
689 return result;
690}
691
710int* findTopCandidatesCudaBatched2(int* csrRowoffsets, int* csrColIdx,
711 int* spectraValues, int* spectraIdx,
712 int csrRowoffsetsLength, int csrNNZ,
713 int sVLength, int sILength,
714 int n, float tolerance,
715 bool normalize, bool gaussianTol,
716 int batchSize,
717 int verbose) {
718
719 if (n >= csrRowoffsetsLength) {
720 throw std::invalid_argument("Cannot return more hits than number of candidates!");
721 }
722
723 std::cout << "Running CUDA f32 dense matrix search version " << versionMajor << "." << versionMinor << "." << versionFix << std::endl;
724
725 float t = round(tolerance * MASS_MULTIPLIER);
726 int* result = new int[sILength * n];
727 float* csrValues = new float[csrNNZ];
728
729 // create csrValues
730 for (int i = 0; i < csrRowoffsetsLength - 1; ++i) {
731 int startIter = csrRowoffsets[i];
732 int endIter = csrRowoffsets[i + 1];
733 int nrNonZero = endIter - startIter;
734 float val = normalize ? 1.0 / (float) nrNonZero : 1.0;
735 for (int j = startIter; j < endIter; ++j) {
736 csrValues[j] = val;
737 }
738 }
739
740 // cusparse spmM variables
741 float alpha = 1.0;
742 float beta = 0.0;
743
744 // device memory management
745 int* dm_csrRowoffsets;
746 int* dm_csrColIdx;
747 float* dm_csrValues;
748 float* dM_dnValues;
749 float* dspmM_dnValues;
750
751 // device buffer managment
752 void* dBuffer = NULL;
753 size_t bufferSize = 0;
754
755 // device m memory management
756 CHECK_CUDA(cudaMalloc((void**) &dm_csrRowoffsets, csrRowoffsetsLength * sizeof(int)));
757 CHECK_CUDA(cudaMalloc((void**) &dm_csrColIdx, csrNNZ * sizeof(int)));
758 CHECK_CUDA(cudaMalloc((void**) &dm_csrValues, csrNNZ * sizeof(float)));
759
760 // device m memory copy
761 CHECK_CUDA(cudaMemcpy(dm_csrRowoffsets, csrRowoffsets, csrRowoffsetsLength * sizeof(int), cudaMemcpyHostToDevice));
762 CHECK_CUDA(cudaMemcpy(dm_csrColIdx, csrColIdx, csrNNZ * sizeof(int), cudaMemcpyHostToDevice));
763 CHECK_CUDA(cudaMemcpy(dm_csrValues, csrValues, csrNNZ * sizeof(float), cudaMemcpyHostToDevice));
764
765 // device M memory management
766 CHECK_CUDA(cudaMalloc((void**) &dM_dnValues, ENCODING_SIZE * batchSize * sizeof(float)));
767
768 // device spmm memory management
769 CHECK_CUDA(cudaMalloc((void**) &dspmM_dnValues, (csrRowoffsetsLength - 1) * batchSize * sizeof(float)));
770
771 // device setup cusparse m
772 cusparseHandle_t handle = NULL;
773 cusparseSpMatDescr_t mat;
774
775 CHECK_CUSPARSE(cusparseCreate(&handle));
776 CHECK_CUSPARSE(cusparseCreateCsr(&mat, csrRowoffsetsLength - 1, ENCODING_SIZE, csrNNZ,
777 dm_csrRowoffsets, dm_csrColIdx, dm_csrValues,
778 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
779 CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
780
781 for (int i = 0; i < sILength; i += batchSize) {
782
783 // calculate spectrum matrix
784 float* M = new float[ENCODING_SIZE * batchSize] {0.0};
785
786 for (int s = 0; s < batchSize; ++s) {
787
788 if (i + s < sILength) {
789
790 int startIter = spectraIdx[i + s];
791 int endIter = i + s + 1 == sILength ? sVLength : spectraIdx[i + s + 1];
792
793 for (int j = startIter; j < endIter; ++j) {
794 auto currentPeak = spectraValues[j];
795 auto minPeak = currentPeak - t > 0 ? currentPeak - t : 0;
796 auto maxPeak = currentPeak + t < ENCODING_SIZE ? currentPeak + t : ENCODING_SIZE - 1;
797
798 for (int k = minPeak; k <= maxPeak; ++k) {
799 float currentVal = M[s * ENCODING_SIZE + k];
800 float newVal = gaussianTol ? normpdf((float) k, (float) currentPeak, (float) (t / 3.0)) : 1.0;
801 M[s * ENCODING_SIZE + k] = max(currentVal, newVal);
802 }
803 }
804 }
805 }
806
807 // device setup cusparse M
808 cusparseDnMatDescr_t Mat;
809
810 CHECK_CUDA(cudaMemcpy(dM_dnValues, M, ENCODING_SIZE * batchSize * sizeof(float), cudaMemcpyHostToDevice));
811 CHECK_CUSPARSE(cusparseCreateDnMat(&Mat, ENCODING_SIZE, batchSize, ENCODING_SIZE, dM_dnValues,
812 CUDA_R_32F, CUSPARSE_ORDER_COL));
813
814 // host setup result matrix
815 float* spmm = new float[(csrRowoffsetsLength - 1) * batchSize] {0.0};
816
817 // device setup cusparse spmM result
818 cusparseDnMatDescr_t spmM;
819
820 CHECK_CUDA(cudaMemcpy(dspmM_dnValues, spmm, (csrRowoffsetsLength - 1) * batchSize * sizeof(float), cudaMemcpyHostToDevice));
821 CHECK_CUSPARSE(cusparseCreateDnMat(&spmM, csrRowoffsetsLength - 1, batchSize, csrRowoffsetsLength - 1, dspmM_dnValues,
822 CUDA_R_32F, CUSPARSE_ORDER_COL));
823
824 // device allocate buffer
825 CHECK_CUSPARSE(cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
826 &alpha, mat, Mat, &beta, spmM, CUDA_R_32F,
827 CUSPARSE_SPMM_CSR_ALG1, &bufferSize));
828 CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize));
829
830 // device spmm computation
831 CHECK_CUSPARSE(cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
832 &alpha, mat, Mat, &beta, spmM, CUDA_R_32F,
833 CUSPARSE_SPMM_CSR_ALG1, dBuffer));
834
835 // host get spmM result
836 CHECK_CUDA(cudaMemcpy(spmm, dspmM_dnValues, (csrRowoffsetsLength - 1) * batchSize * sizeof(float), cudaMemcpyDeviceToHost));
837
838 // host result max
839 for (int s = 0; s < batchSize; ++s) {
840
841 if (i + s >= sILength) {
842 break;
843 }
844
845 int currentCol = s * (csrRowoffsetsLength - 1);
846 std::vector<float> values(csrRowoffsetsLength - 1, 0.0);
847 for (int j = 0; j < csrRowoffsetsLength - 1; ++j) {
848 values[j] = spmm[currentCol + j];
849 }
850
851 std::vector<int> idx(csrRowoffsetsLength - 1);
852 std::iota(idx.begin(), idx.end(), 0);
853 std::sort(idx.begin(), idx.end(), [&](int i, int j) {return values[i] > values[j];});
854
855 if (idx.size() >= n) {
856 for (int j = 0; j < n; ++j) {
857 result[(i + s) * n + j] = idx[j];
858 }
859 }
860 }
861
862 // host memory deallocation
863 delete[] spmm;
864 delete[] M;
865
866 // device destroy descriptors
867 CHECK_CUSPARSE(cusparseDestroyDnMat(Mat));
868 CHECK_CUSPARSE(cusparseDestroyDnMat(spmM));
869
870 if (verbose != 0 && (i + batchSize) % verbose == 0) {
871 std::cout << "Searched " << i + batchSize << " spectra in total..." << std::endl;
872 }
873 }
874
875 // device destroy descriptors
876 CHECK_CUSPARSE(cusparseDestroySpMat(mat));
877 CHECK_CUSPARSE(cusparseDestroy(handle));
878
879 // device memory deallocation
880 CHECK_CUDA(cudaFree(dm_csrRowoffsets));
881 CHECK_CUDA(cudaFree(dm_csrColIdx));
882 CHECK_CUDA(cudaFree(dm_csrValues));
883 CHECK_CUDA(cudaFree(dM_dnValues));
884 CHECK_CUDA(cudaFree(dspmM_dnValues));
885 CHECK_CUDA(cudaFree(dBuffer));
886
887 // host memory deallocation
888 delete[] csrValues;
889
890 return result;
891}
892
// Releases a result array previously handed out by one of the search
// functions (marshalled to managed code). Always returns 0.
int releaseMemoryCuda(int* result) {
    if (result != nullptr) {
        delete[] result;
    }
    return 0;
}
903
// Returns x squared.
float squared(float x) {
    float sq = x;
    sq *= x;
    return sq;
}
912
920float normpdf(float x, float mu, float sigma) {
921 if (sigma == 0.0) {
922 return 1.0;
923 }
924 return (ONE_OVER_SQRT_PI / sigma) * exp(-0.5 * squared((x - mu) / sigma));
925}
926
/*
 * Gets the row index that owns position colIdxPos of the CSR column-indices
 * array: row i owns positions [csrRowoffsets[i], csrRowoffsets[i + 1]).
 *
 * csrRowoffsets has csrRowoffsetsLength entries and is non-decreasing, so a
 * binary search replaces the original linear scan (this is called once per
 * non-zero of the SpGEMM result, making the scan O(nnz * rows) overall).
 *
 * Throws std::logic_error if colIdxPos lies at or beyond the last offset.
 */
int getRowIdx(int* csrRowoffsets, int csrRowoffsetsLength, int colIdxPos) {
    int* first = csrRowoffsets + 1;
    int* last = csrRowoffsets + csrRowoffsetsLength;

    // first offset strictly greater than colIdxPos marks the owning row
    int* it = std::upper_bound(first, last, colIdxPos);
    if (it == last) {
        throw std::logic_error("Couldn't find row index.");
    }

    return (int) (it - first);
}
945
946BOOL APIENTRY DllMain( HMODULE hModule,
947 DWORD ul_reason_for_call,
948 LPVOID lpReserved
949 )
950{
951 switch (ul_reason_for_call)
952 {
953 case DLL_PROCESS_ATTACH:
954 case DLL_THREAD_ATTACH:
955 case DLL_THREAD_DETACH:
956 case DLL_PROCESS_DETACH:
957 break;
958 }
959 return TRUE;
960}
961
const double ONE_OVER_SQRT_PI
EXPORT int * findTopCandidatesCudaBatched(int *, int *, int *, int *, int, int, int, int, int, float, bool, bool, int, int)
A function that calculates the top n candidates for each spectrum. Uses cusparseSpGEMM to calculate matrix-matrix products in batches.
BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
#define EXPORT
int getRowIdx(int *, int, int)
Gets the row index of a specific position in the csr_column_indices array.
const int versionMajor
int checkCusparse(cusparseStatus_t, int line)
EXPORT int releaseMemoryCuda(int *)
Free memory after result has been marshalled.
const int MASS_MULTIPLIER
EXPORT int * findTopCandidatesCuda(int *, int *, int *, int *, int, int, int, int, int, float, bool, bool, int)
A function that calculates the top n candidates for each spectrum (SpM*V) using f32 operations.
int checkCuda(cudaError_t, int line)
const float ROUNDING_ACCURACY
EXPORT int * findTopCandidatesCudaInt(int *, int *, int *, int *, int, int, int, int, int, float, bool, bool, int)
A function that calculates the top n candidates for each spectrum (SpM*V) using i8 and i32 operations.
float normpdf(float, float, float)
Returns the PDF for a given x for the normal distribution given by mu and sigma.
#define CHECK_CUSPARSE(status)
EXPORT int * findTopCandidatesCudaBatched2(int *, int *, int *, int *, int, int, int, int, int, float, bool, bool, int, int)
A function that calculates the top n candidates for each spectrum. Uses cusparseSpMM to calculate matrix-matrix products in batches.
float squared(float)
Returns the square for a given value x.
#define CHECK_CUDA(status)
const int ENCODING_SIZE
const int versionMinor
const int versionFix
const int MASS_RANGE