cublas ist ungewöhnlich langsam vergleichen cusparse

Ich versuche, einige Tests zu laufen cusparse und cublas Leistung unter differents vergleichen sparsity (mit Titan X), hier ist der Haupt-Code namens „testcusparsevector.cpp“:cublas ist ungewöhnlich langsam vergleichen cusparse

#include <stdio.h> 
#include <iostream> 
#include <vector> 
#include <cstdlib> 
#include <fstream> 
#include <time.h> 
#include <cuda_runtime.h> 
#include <cublas.h> 
#include <cusparse_v2.h> 
#include <cublas_v2.h> 
#include <assert.h> 
#define M 6 
#define N 5 
#define IDX2C(i,j,ld) (((j)*(ld))+(i)) 


// /home/gpu1/Install/OpenBLAS-0.2.14 


#define CHECK_EQ(a,b) do { \ 
    if ((a) != (b)) { \ 
     cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<a<<"!="<<b<<endl;\ 
     exit(1);\ 
    }\ 
} while(0) 

#define CUBLAS_CHECK(condition) \ 
do {\ 
    cublasStatus_t status = condition; \ 
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \ 
} while(0) 

#define CUSPARSE_CHECK(condition)\ 
do {\ 
    cusparseStatus_t status = condition; \ 
    switch(status)\ 
    {\ 
     case CUSPARSE_STATUS_NOT_INITIALIZED:\ 
      cout << "CUSPARSE_STATUS_NOT_INITIALIZED" << endl;\ 
      break;\ 
     case CUSPARSE_STATUS_ALLOC_FAILED:\ 
      cout << "CUSPARSE_STATUS_ALLOC_FAILED" << endl;\ 
      break;\ 
     case CUSPARSE_STATUS_INVALID_VALUE:\ 
      cout << "CUSPARSE_STATUS_INVALID_VALUE" << endl;\ 
      break;\ 
     case CUSPARSE_STATUS_ARCH_MISMATCH:\ 
      cout << "CUSPARSE_STATUS_ARCH_MISMATCH" << endl;\ 
      break;\ 
     case CUSPARSE_STATUS_MAPPING_ERROR:\ 
      cout << "CUSPARSE_STATUS_MAPPING_ERROR" << endl;\ 
      break;\ 
      case CUSPARSE_STATUS_EXECUTION_FAILED:\ 
      cout << "CUSPARSE_STATUS_EXECUTION_FAILED" << endl;\ 
      break;\ 
     case CUSPARSE_STATUS_INTERNAL_ERROR:\ 
      cout << "CUSPARSE_STATUS_INTERNAL_ERROR" << endl;\ 
      break;\ 
     case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:\ 
      cout << "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED" << endl;\ 
      break;\ 
     case CUSPARSE_STATUS_ZERO_PIVOT:\ 
      cout << "CUSPARSE_STATUS_ZERO_PIVOT" << endl;\ 
    }\ 
    CHECK_EQ(status, CUSPARSE_STATUS_SUCCESS); \ 
} while(0) 

#define CUDA_CHECK(condition)\ 
do {\ 
    cudaError_t error = condition;\ 
    CHECK_EQ(error, cudaSuccess);\ 
} while(0) 

//check after kernel function 
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) 



#define __TIMING__ 1 

#if __TIMING__ 


#define INIT_TIMER cudaEvent_t start, stop; \ 
    float milliseconds = 0; \ 
    float sum = 0;\ 
    cudaEventCreate(&start);\ 
    cudaEventCreate(&stop); 

#define TIC { cudaEventRecord(start); } 

#if __CUDNN__ 
    #define PREDEFNAME "CUDNN" 
#else 
    #define PREDEFNAME "CUDA" 
#endif 

#define TOC(a) { cudaEventRecord(stop); \ 
     cudaEventSynchronize(stop); \ 
     cudaEventElapsedTime(&milliseconds, start, stop); \ 
     printf("GPU Execution time of %s_%s: %f ms\n",PREDEFNAME, a, milliseconds); \ 
     sum += milliseconds;\ 
     fflush(stdout); } 

#define CLOSE_TIMER {cudaEventDestroy(start); cudaEventDestroy(stop); } 
#endif 

using namespace std; 

void dispArray(double* array, size_t width, size_t height) { 
    for (int i=0; i < height;i++) { 
     for (int j=0;j < width;j++) { 
      cout << array[j*height+i] << ' '; 
     } 
     cout << endl; 
    } 
    cout << endl; 
} 

int main() 
{ 
    srand(time(NULL)); 
    const int num_loop = 1; 
    const int inside_loop = 1000; 
    // const int WIDTH = 512*3*3; 
    // const int HEIGHT = 512; 
    // const int WIDTHOUT = 36; 
    const int WIDTH = 4608; 
    const int HEIGHT = 512; 
    const int WIDTHOUT = 144; 
    // const int WIDTH = 18500; 
    // const int HEIGHT = 512; 
    // const int WIDTHOUT = 1; 
    // const int WIDTH = 3; 
    // const int HEIGHT = 5; 
    // const int WIDTHOUT = 2; 
    INIT_TIMER 
    ofstream myfile; 
    myfile.open("test_sparsity.log"); 

    cudaError_t cudaStat;  
    cusparseStatus_t stat; 
    cusparseHandle_t handle; 
    cublasHandle_t handleblas; 

    double *devPtrOutput; 
    double *devPtrOutput2; 
    double *devPtrRand; 
    double *devPtrSec; 
    CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput), sizeof(double)*HEIGHT*WIDTHOUT)); 
    CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput2), sizeof(double)*HEIGHT*WIDTHOUT)); 

    CUDA_CHECK(cudaMalloc((void **)&(devPtrRand), sizeof(double)*WIDTH*WIDTHOUT)); 
    CUDA_CHECK(cudaMalloc((void **)&(devPtrSec), sizeof(double)*WIDTH*HEIGHT)); 
    const double alpha=1.0; 
    const double beta=0.0; 
    double *csrVal; 
    int *csrRowPtr; 
    int *csrColInd; 

    const bool SPARSE = true; 
    long a = clock(); 
    long temp = clock(); 
    cusparseMatDescr_t descr; 
    CUSPARSE_CHECK(cusparseCreateMatDescr(&descr)); 
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); 
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); 
    int nnz; 
    CUSPARSE_CHECK(cusparseCreate(&handle)); 
    CUBLAS_CHECK(cublasCreate(&handleblas)); 
    int *nnzPerRow_gpu; 
    CUDA_CHECK(cudaMalloc((void **)&(nnzPerRow_gpu), sizeof(int)*HEIGHT)); 
    CUDA_CHECK(cudaMalloc((void **)&(csrRowPtr), sizeof(int)*(HEIGHT+1))); 
    double density_array[1] = {0.9999};//, 0.8, 0.7, 0.6, 0.5,  0.4, 0.3, 0.2, 0.1 ,0.09,  0.08, 0.07, 0.06, 0.05 ,0.04,  0.03, 0.02, 0.01}; 
    for (int inddense=0;inddense < 1;inddense++) { 
     double DENSITY = density_array[inddense]; 
     int num_non_zeros = DENSITY * (WIDTH * HEIGHT); 

     CUDA_CHECK(cudaMalloc((void **)&(csrColInd), sizeof(int)*num_non_zeros)); 
     CUDA_CHECK(cudaMalloc((void **)&(csrVal), sizeof(double)*num_non_zeros)); 
     INIT_TIMER 
     for (int iter=0; iter < num_loop;iter++) { 
      vector<double> randVec(WIDTH*WIDTHOUT, 0); 
      vector<double> secArray(WIDTH*HEIGHT, 0); 
      vector<int> temp(WIDTH*HEIGHT, 1); 

      for (int j = 0; j < WIDTH*WIDTHOUT; j++) { 
       randVec[j]=(double)(rand()%100000)/100; 
      } 

      for (int x, i = 0; i < num_non_zeros;i++) { 
       do 
       { 
        x = rand() % (WIDTH*HEIGHT); 
       } while(temp[x] == 0); 
       temp[x]=0; 
       secArray[x]=(double)(rand()%100000)/100; 
      } 
      int count = 0; 
      for(int i=0;i < WIDTH*HEIGHT;i++) { 
       if (secArray[i] != 0) { 
        count++; 
       } 
      } 

      // randVec = {2,2,2,3,3,3}; 
      // secArray = {0,5,0,2,5,8,7,0,0,0,0,2,0,4,4}; 
      CUDA_CHECK(cudaMemcpy(devPtrRand, &randVec[0], sizeof(double)*WIDTH*WIDTHOUT, cudaMemcpyHostToDevice)); 
      CUDA_CHECK(cudaMemcpy(devPtrSec, &secArray[0], sizeof(double)*WIDTH*HEIGHT, cudaMemcpyHostToDevice)); 


      if (SPARSE) { 
       CUSPARSE_CHECK(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, HEIGHT, WIDTH, descr, devPtrSec, HEIGHT, nnzPerRow_gpu, &nnz)); 
       CUSPARSE_CHECK(cusparseDdense2csr(handle, HEIGHT, WIDTH, descr,devPtrSec,HEIGHT,nnzPerRow_gpu,csrVal,csrRowPtr,csrColInd)); 
      }  
      // vector<double> tempcsrVal(nnz,0); 
      // vector<int> tempcsrRowPtr(HEIGHT+1); 
      // vector<int> tempcsrColInd(nnz,0); 
      // CUDA_CHECK(cudaMemcpy(&tempcsrVal[0], csrVal, sizeof(double)*nnz, cudaMemcpyDeviceToHost)); 
      // CUDA_CHECK(cudaMemcpy(&tempcsrRowPtr[0], csrRowPtr, sizeof(int)*(HEIGHT+1), cudaMemcpyDeviceToHost)); 
      // CUDA_CHECK(cudaMemcpy(&tempcsrColInd[0], csrColInd, sizeof(int)*nnz, cudaMemcpyDeviceToHost)); 
      // for (int i =0; i < nnz;i++) { 
       // cout << tempcsrVal[i] << " "; 
      // } 
      // cout << endl; 
      // for (int i =0; i < HEIGHT+1;i++) { 
       // cout << tempcsrRowPtr[i] << " "; 
      // } 
      // cout << endl; 
      // for (int i =0; i < nnz;i++) { 
       // cout << tempcsrColInd[i] << " "; 
      // } 
      // cout << endl; 
      cudaDeviceSynchronize(); 
      TIC 
      for (int i=0 ; i < inside_loop;i++) { 
       if (WIDTHOUT == 1) { 
        // TIC 
        CUSPARSE_CHECK(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
        HEIGHT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, csrColInd, 
        devPtrRand, &beta, devPtrOutput)); 
        // TOC("csrmv") 
       } else { 
        // TIC 
        CUSPARSE_CHECK(cusparseDcsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
         HEIGHT, WIDTHOUT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, 
         csrColInd, devPtrRand, WIDTH, &beta, devPtrOutput, HEIGHT)); 
        // TOC("csrmm") 
       } 
      } 
      TOC("csr") 
      TIC 
      for (int i=0 ; i < inside_loop;i++) { 
       if (WIDTHOUT == 1) { 
        // TIC 
        CUBLAS_CHECK(cublasDgemv(handleblas, CUBLAS_OP_N, HEIGHT, WIDTH, &alpha, devPtrSec, HEIGHT , devPtrRand, 1, &beta, devPtrOutput2, 1)); 
        // TOC("dgemv") 
       } else { 
        // TIC 
        CUBLAS_CHECK(cublasDgemm(handleblas, CUBLAS_OP_N, CUBLAS_OP_N, HEIGHT, WIDTHOUT, WIDTH, &alpha, devPtrSec, HEIGHT, devPtrRand, WIDTH, &beta, devPtrOutput2, HEIGHT)); 
        // TOC("dgemm") 
       } 
      } 
      TOC("blas") 


      #if 0 
      vector<double> output(HEIGHT*WIDTHOUT, 0); 
      vector<double> output2(HEIGHT*WIDTHOUT, 0); 
      CUDA_CHECK(cudaMemcpy(&output[0], devPtrOutput, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost)); 
      CUDA_CHECK(cudaMemcpy(&output2[0], devPtrOutput2, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost)); 
      dispArray(&output[0], WIDTHOUT, HEIGHT); 
      cout << endl; 
      for (int i=0;i < WIDTHOUT * HEIGHT;i++) { 
       if (output[i] != output2[i]) { 
        cout << "error: " << i << " " << (output[i] - output2[i]) << " " << output[i] << endl; 
       } 
      } 
      #endif 

     } 

     cout << DENSITY << " " << sum/num_loop << endl; 
     myfile << DENSITY << " " << sum/num_loop << endl; 
     cudaFree(csrColInd); 
     cudaFree(csrVal); 
    } 
    myfile.close(); 
    cudaFree(csrRowPtr); 
    cudaFree(devPtrOutput); 
    cudaFree(devPtrRand); 
    cudaFree(devPtrSec); 

}

jedoch nach dem Code mit

g++ -std=c++1y -O3 -I/usr/local/cuda/include -o testcusparsevector testcusparsevector.cpp -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusparse

hier Kompilieren ist die Ausgabe:

GPU Execution time of CUDA_csr: 4818.447266 ms 
GPU Execution time of CUDA_blas: 5024.459961 ms

wh ich sollte meinen, dass selbst wenn meine Dichte bei 0,999 ist, der cusparseDcsrmm immer noch schneller ist als cublasDgemm, habe ich bereits das Ergebnis überprüft, was gut ist, und im Vergleich zu anderen Beispielen scheint, dass das Problem von cublas kommt, was viel zu langsam ist.

Haben Sie eine Idee, wo kommt es her?

EDIT: Ich habe versucht, die Werte zu ändern zu schweben, und das Ergebnis ist mehr, was ich suchte, offenbar wird cublas nicht für doppelte Berechnung gemacht ...

Dank im voraus.

Quelle

2016-03-31 Caenorst

Der von Ihnen gepostete Code wird nicht kompiliert. Es ist ziemlich schwer, einen Fehler in einem riesigen Stapel von nicht kompilierbarem Code zu erkennen. – talonmies

Oh, ja, mein Fehler, ich habe schnelle Änderungen vorgenommen, um es lesbarer zu machen, habe aber vergessen, es einmal zu versuchen. Sie können es jetzt kompilieren. – Caenorst

Titan X (und alle aktuellen Mitglieder der maxwell GPU-Familie) haben ein Verhältnis des Durchsatzes zwischen Gleitkommaoperationen mit doppelter Genauigkeit und Gleitkommaoperationen mit einfacher Genauigkeit von 1:32.

Normalerweise sind spärliche Matrixoperationen bandbreitengebunden, während dichte Matrix-Matrix-Multiplikation ein Beispiel für ein rechenintensives Problem ist.

In Ihrem Beispiel nehmen Sie ein Problem, das normalerweise compute-gebunden ist, und es als spärliche Matrix multiplizieren auf einem Prozessor, der eine relativ große Menge an Speicherbandbreite hat, und eine relativ kleine Menge an doppelter präziser Rechendurchsatz.

Die Situation kann zu einer Verwischung der Linien zwischen den beiden APIs führen, während die CUBLAS API normalerweise für diesen Vergleich viel schneller wäre.

Wenn Sie Ihren Code auf float statt double umstellen, wie ich denke, dass Sie bereits versucht haben, werden Sie CUBLAS wieder gewinnen sehen. Wenn Sie den Code wie gewohnt auf einer GPU ausführen, die ein anderes Verhältnis zwischen Durchsatz mit einfacher und doppelter Genauigkeit aufweist, wird CUBLAS auch dort wieder gewonnen.

offenbar wird cublas nicht für doppelte Berechnung gemacht ...

anstatt zu sagen, dass, würde ich sagen, dass GTX Titan X (in erster Linie) für doppelte Berechnung nicht gemacht wird. Probieren Sie eine Tesla K80, K40 oder eine andere GPU aus, die ein engeres Verhältnis von doppeltem zu einfachem Durchsatz aufweist.

Hier ist die Ausgabe des Programms auf einem „ungeboostertem“ Tesla K40 läuft:

$ ./testcusparsevector 
GPU Execution time of CUDA_csr: 8870.386719 ms 
GPU Execution time of CUDA_blas: 1045.211792 ms

Haftungsausschluss: Ich habe nicht versucht, Ihren Code zu studieren. Ich schaute es mir an, und keine offensichtlichen Probleme sprangen heraus. Aber es könnte Probleme geben, die ich nicht entdeckt habe.

Quelle

2016-03-31 16:22:10

cublas ist ungewöhnlich langsam vergleichen cusparse

Antwort

Verwandte Themen