cuBLAS is unusually slow compared to cuSPARSE

I am trying to run some tests comparing cuSPARSE and cuBLAS performance under different sparsity levels (on a Titan X). Here is the main code, called "testcusparsevector.cpp":
#include <stdio.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <time.h>
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include <assert.h>
#define CHECK_EQ(a,b) do { \
if ((a) != (b)) { \
cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<a<<"!="<<b<<endl;\
exit(1);\
}\
} while(0)
#define CUBLAS_CHECK(condition) \
do {\
cublasStatus_t status = condition; \
CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \
} while(0)
#define CUSPARSE_CHECK(condition)\
do {\
cusparseStatus_t status = condition; \
switch(status)\
{\
case CUSPARSE_STATUS_NOT_INITIALIZED:\
cout << "CUSPARSE_STATUS_NOT_INITIALIZED" << endl;\
break;\
case CUSPARSE_STATUS_ALLOC_FAILED:\
cout << "CUSPARSE_STATUS_ALLOC_FAILED" << endl;\
break;\
case CUSPARSE_STATUS_INVALID_VALUE:\
cout << "CUSPARSE_STATUS_INVALID_VALUE" << endl;\
break;\
case CUSPARSE_STATUS_ARCH_MISMATCH:\
cout << "CUSPARSE_STATUS_ARCH_MISMATCH" << endl;\
break;\
case CUSPARSE_STATUS_MAPPING_ERROR:\
cout << "CUSPARSE_STATUS_MAPPING_ERROR" << endl;\
break;\
case CUSPARSE_STATUS_EXECUTION_FAILED:\
cout << "CUSPARSE_STATUS_EXECUTION_FAILED" << endl;\
break;\
case CUSPARSE_STATUS_INTERNAL_ERROR:\
cout << "CUSPARSE_STATUS_INTERNAL_ERROR" << endl;\
break;\
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:\
cout << "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED" << endl;\
break;\
case CUSPARSE_STATUS_ZERO_PIVOT:\
cout << "CUSPARSE_STATUS_ZERO_PIVOT" << endl;\
}\
CHECK_EQ(status, CUSPARSE_STATUS_SUCCESS); \
} while(0)
#define CUDA_CHECK(condition)\
do {\
cudaError_t error = condition;\
CHECK_EQ(error, cudaSuccess);\
} while(0)
//check after kernel function
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())
#define __TIMING__ 1
#if __TIMING__
#define INIT_TIMER cudaEvent_t start, stop; \
float milliseconds = 0; \
float sum = 0;\
cudaEventCreate(&start);\
cudaEventCreate(&stop);
#define TIC { cudaEventRecord(start); }
#if __CUDNN__
#define PREDEFNAME "CUDNN"
#else
#define PREDEFNAME "CUDA"
#endif
#define TOC(a) { cudaEventRecord(stop); \
cudaEventSynchronize(stop); \
cudaEventElapsedTime(&milliseconds, start, stop); \
printf("GPU Execution time of %s_%s: %f ms\n",PREDEFNAME, a, milliseconds); \
sum += milliseconds;\
fflush(stdout); }
#define CLOSE_TIMER {cudaEventDestroy(start); cudaEventDestroy(stop); }
#endif
using namespace std;
void dispArray(double* array, size_t width, size_t height) {
for (int i=0; i < height;i++) {
for (int j=0;j < width;j++) {
cout << array[j*height+i] << ' ';
}
cout << endl;
}
cout << endl;
}
int main()
{
srand(time(NULL));
const int num_loop = 1;
const int inside_loop = 1000;
// const int WIDTH = 512*3*3;
// const int HEIGHT = 512;
// const int WIDTHOUT = 36;
const int WIDTH = 4608;
const int HEIGHT = 512;
const int WIDTHOUT = 144;
// const int WIDTH = 18500;
// const int HEIGHT = 512;
// const int WIDTHOUT = 1;
// const int WIDTH = 3;
// const int HEIGHT = 5;
// const int WIDTHOUT = 2;
INIT_TIMER
ofstream myfile;
myfile.open("test_sparsity.log");
cusparseHandle_t handle;
cublasHandle_t handleblas;
double *devPtrOutput;
double *devPtrOutput2;
double *devPtrRand;
double *devPtrSec;
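// devPtrSec: dense HEIGHT x WIDTH matrix (column-major, lda = HEIGHT)
// devPtrRand: dense WIDTH x WIDTHOUT right-hand side
// devPtrOutput / devPtrOutput2: results of the cuSPARSE and cuBLAS paths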
CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput), sizeof(double)*HEIGHT*WIDTHOUT));
CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput2), sizeof(double)*HEIGHT*WIDTHOUT));
CUDA_CHECK(cudaMalloc((void **)&(devPtrRand), sizeof(double)*WIDTH*WIDTHOUT));
CUDA_CHECK(cudaMalloc((void **)&(devPtrSec), sizeof(double)*WIDTH*HEIGHT));
const double alpha=1.0;
const double beta=0.0;
double *csrVal;
int *csrRowPtr;
int *csrColInd;
const bool SPARSE = true;
cusparseMatDescr_t descr;
CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
int nnz;
CUSPARSE_CHECK(cusparseCreate(&handle));
CUBLAS_CHECK(cublasCreate(&handleblas));
int *nnzPerRow_gpu;
CUDA_CHECK(cudaMalloc((void **)&(nnzPerRow_gpu), sizeof(int)*HEIGHT));
CUDA_CHECK(cudaMalloc((void **)&(csrRowPtr), sizeof(int)*(HEIGHT+1)));
double density_array[1] = {0.9999};//, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1 ,0.09, 0.08, 0.07, 0.06, 0.05 ,0.04, 0.03, 0.02, 0.01};
for (int inddense=0;inddense < 1;inddense++) {
double DENSITY = density_array[inddense];
int num_non_zeros = DENSITY * (WIDTH * HEIGHT);
CUDA_CHECK(cudaMalloc((void **)&(csrColInd), sizeof(int)*num_non_zeros));
CUDA_CHECK(cudaMalloc((void **)&(csrVal), sizeof(double)*num_non_zeros));
INIT_TIMER
for (int iter=0; iter < num_loop;iter++) {
vector<double> randVec(WIDTH*WIDTHOUT, 0);
vector<double> secArray(WIDTH*HEIGHT, 0);
vector<int> temp(WIDTH*HEIGHT, 1);
for (int j = 0; j < WIDTH*WIDTHOUT; j++) {
randVec[j]=(double)(rand()%100000)/100;
}
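// Scatter exactly num_non_zeros distinct nonzero entries into secArray,
// rejection-sampling positions until an unused one is found.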
for (int i = 0; i < num_non_zeros; i++) {
int x;
do {
x = rand() % (WIDTH*HEIGHT);
} while (temp[x] == 0);
temp[x] = 0;
secArray[x] = (double)(rand()%100000)/100;
}
int count = 0;
for(int i=0;i < WIDTH*HEIGHT;i++) {
if (secArray[i] != 0) {
count++;
}
}
// randVec = {2,2,2,3,3,3};
// secArray = {0,5,0,2,5,8,7,0,0,0,0,2,0,4,4};
CUDA_CHECK(cudaMemcpy(devPtrRand, &randVec[0], sizeof(double)*WIDTH*WIDTHOUT, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(devPtrSec, &secArray[0], sizeof(double)*WIDTH*HEIGHT, cudaMemcpyHostToDevice));
if (SPARSE) {
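// Count nonzeros per row, then convert the dense column-major matrix
// into CSR (values, row pointers, column indices) on the device.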
CUSPARSE_CHECK(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, HEIGHT, WIDTH, descr, devPtrSec, HEIGHT, nnzPerRow_gpu, &nnz));
CUSPARSE_CHECK(cusparseDdense2csr(handle, HEIGHT, WIDTH, descr,devPtrSec,HEIGHT,nnzPerRow_gpu,csrVal,csrRowPtr,csrColInd));
}
// vector<double> tempcsrVal(nnz,0);
// vector<int> tempcsrRowPtr(HEIGHT+1);
// vector<int> tempcsrColInd(nnz,0);
// CUDA_CHECK(cudaMemcpy(&tempcsrVal[0], csrVal, sizeof(double)*nnz, cudaMemcpyDeviceToHost));
// CUDA_CHECK(cudaMemcpy(&tempcsrRowPtr[0], csrRowPtr, sizeof(int)*(HEIGHT+1), cudaMemcpyDeviceToHost));
// CUDA_CHECK(cudaMemcpy(&tempcsrColInd[0], csrColInd, sizeof(int)*nnz, cudaMemcpyDeviceToHost));
// for (int i =0; i < nnz;i++) {
// cout << tempcsrVal[i] << " ";
// }
// cout << endl;
// for (int i =0; i < HEIGHT+1;i++) {
// cout << tempcsrRowPtr[i] << " ";
// }
// cout << endl;
// for (int i =0; i < nnz;i++) {
// cout << tempcsrColInd[i] << " ";
// }
// cout << endl;
cudaDeviceSynchronize();
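// Time inside_loop sparse products: csrmv for a single output column,
// csrmm for a full WIDTHOUT-column right-hand side.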
TIC
for (int i=0 ; i < inside_loop;i++) {
if (WIDTHOUT == 1) {
// TIC
CUSPARSE_CHECK(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
HEIGHT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, csrColInd,
devPtrRand, &beta, devPtrOutput));
// TOC("csrmv")
} else {
// TIC
CUSPARSE_CHECK(cusparseDcsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
HEIGHT, WIDTHOUT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr,
csrColInd, devPtrRand, WIDTH, &beta, devPtrOutput, HEIGHT));
// TOC("csrmm")
}
}
TOC("csr")
TIC
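// The same products through dense cuBLAS (gemv/gemm) for comparison.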
for (int i=0 ; i < inside_loop;i++) {
if (WIDTHOUT == 1) {
// TIC
CUBLAS_CHECK(cublasDgemv(handleblas, CUBLAS_OP_N, HEIGHT, WIDTH, &alpha, devPtrSec, HEIGHT , devPtrRand, 1, &beta, devPtrOutput2, 1));
// TOC("dgemv")
} else {
// TIC
CUBLAS_CHECK(cublasDgemm(handleblas, CUBLAS_OP_N, CUBLAS_OP_N, HEIGHT, WIDTHOUT, WIDTH, &alpha, devPtrSec, HEIGHT, devPtrRand, WIDTH, &beta, devPtrOutput2, HEIGHT));
// TOC("dgemm")
}
}
TOC("blas")
#if 0
vector<double> output(HEIGHT*WIDTHOUT, 0);
vector<double> output2(HEIGHT*WIDTHOUT, 0);
CUDA_CHECK(cudaMemcpy(&output[0], devPtrOutput, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(&output2[0], devPtrOutput2, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost));
dispArray(&output[0], WIDTHOUT, HEIGHT);
cout << endl;
for (int i=0;i < WIDTHOUT * HEIGHT;i++) {
if (output[i] != output2[i]) {
cout << "error: " << i << " " << (output[i] - output2[i]) << " " << output[i] << endl;
}
}
#endif
}
cout << DENSITY << " " << sum/num_loop << endl;
myfile << DENSITY << " " << sum/num_loop << endl;
cudaFree(csrColInd);
cudaFree(csrVal);
}
myfile.close();
cudaFree(csrRowPtr);
cudaFree(nnzPerRow_gpu);
cudaFree(devPtrOutput);
cudaFree(devPtrOutput2);
cudaFree(devPtrRand);
cudaFree(devPtrSec);
cublasDestroy(handleblas);
cusparseDestroy(handle);
cusparseDestroyMatDescr(descr);
}
However, after compiling the code with
g++ -std=c++1y -O3 -I/usr/local/cuda/include -o testcusparsevector testcusparsevector.cpp -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusparse
here is the output:
GPU Execution time of CUDA_csr: 4818.447266 ms
GPU Execution time of CUDA_blas: 5024.459961 ms
which would mean that even with a density of 0.9999, cusparseDcsrmm still comes out faster than cublasDgemm. I have already checked the results, which are correct, and compared with other examples the problem seems to come from cuBLAS, which is far too slow.
Do you have any idea where this comes from?
EDIT: I tried changing the values to float, and the result is much more like what I was expecting; apparently cuBLAS is not built for double-precision computation. This would fit the hardware: the Titan X is a Maxwell GPU, and its double-precision throughput is only 1/32 of its single-precision rate.
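For reference, the single-precision run only swaps the D-prefixed calls for their S-prefixed counterparts. A minimal sketch of the two timed calls in float (the *F names are hypothetical float copies of the double buffers above; the integer CSR arrays csrRowPtr and csrColInd can be reused unchanged since the sparsity pattern is the same):
const float alphaF = 1.0f;
const float betaF = 0.0f;
// csrValF, devPtrSecF, devPtrRandF, devPtrOutputF, devPtrOutput2F are
// hypothetical float equivalents of the double arrays in the code above.
CUSPARSE_CHECK(cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
HEIGHT, WIDTHOUT, WIDTH, nnz, &alphaF, descr, csrValF, csrRowPtr,
csrColInd, devPtrRandF, WIDTH, &betaF, devPtrOutputF, HEIGHT));
CUBLAS_CHECK(cublasSgemm(handleblas, CUBLAS_OP_N, CUBLAS_OP_N,
HEIGHT, WIDTHOUT, WIDTH, &alphaF, devPtrSecF, HEIGHT,
devPtrRandF, WIDTH, &betaF, devPtrOutput2F, HEIGHT));
A rough sanity check on the double numbers: the dense GEMM does 2*512*144*4608 ≈ 6.8e8 flops per call, so 1000 calls in ~5.02 s is about 135 GFLOP/s, already a sizeable fraction of the Titan X's roughly 0.2 TFLOP/s FP64 peak.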
Thanks in advance.
The code you have posted doesn't compile. It is pretty hard to spot an error in a huge pile of non-compilable code. – talonmies
Oh, yes, my mistake, I made quick changes to make it more readable but forgot to test it. You can compile it now. – Caenorst