@@ -49,40 +49,90 @@ void LinearAlgebraCUDA::spmv(const SparseMatrix& A, const Vector& x, Vector& y)
4949 Scalar* d_A_values; // /< device memory matrix A values
5050 Scalar* d_x; // /< device memory vector x
5151 Scalar* d_y; // /< device memory vector y
52- cusparseHandle_t handle;
53- cusparseMatDescr_t descr;
5452
5553 CALL_CUDA (cudaMalloc ((void **)&d_A_rowptr, sizeArowptr));
5654 CALL_CUDA (cudaMalloc ((void **)&d_A_colidx, sizeAcolidx));
5755 CALL_CUDA (cudaMalloc ((void **)&d_A_values, sizeAvalues));
5856 CALL_CUDA (cudaMalloc ((void **)&d_x, sizex));
5957 CALL_CUDA (cudaMalloc ((void **)&d_y, sizey));
6058
61- CALL_CUSPARSE (cusparseCreate (&handle));
62- CALL_CUSPARSE (cusparseCreateMatDescr (&descr));
63- cusparseSetMatType (descr, CUSPARSE_MATRIX_TYPE_GENERAL);
64- cusparseSetMatIndexBase (descr, CUSPARSE_INDEX_BASE_ZERO);
65-
6659 CALL_CUDA (cudaMemcpy (d_A_rowptr, A.outer (), sizeArowptr, cudaMemcpyHostToDevice));
6760 CALL_CUDA (cudaMemcpy (d_A_colidx, A.inner (), sizeAcolidx, cudaMemcpyHostToDevice));
6861 CALL_CUDA (cudaMemcpy (d_A_values, A.data (), sizeAvalues, cudaMemcpyHostToDevice));
6962 CALL_CUDA (cudaMemcpy (d_x, x.data (), sizex, cudaMemcpyHostToDevice));
7063
64+ cusparseHandle_t handle;
65+ CALL_CUSPARSE (cusparseCreate (&handle));
66+
67+ cusparseSpMatDescr_t matA;
68+ CALL_CUSPARSE ( cusparseCreateCsr (
69+ &matA,
70+ A.rows (), A.cols (), A.nonZeros (),
71+ d_A_rowptr,
72+ d_A_colidx,
73+ d_A_values,
74+ CUSPARSE_INDEX_32I,
75+ CUSPARSE_INDEX_32I,
76+ CUSPARSE_INDEX_BASE_ZERO,
77+ CUDA_R_64F) );
78+
79+ cusparseDnVecDescr_t vecX;
80+ CALL_CUSPARSE ( cusparseCreateDnVec (
81+ &vecX,
82+ x.size (),
83+ d_x,
84+ CUDA_R_64F) );
85+
86+ cusparseDnVecDescr_t vecY;
87+ CALL_CUSPARSE ( cusparseCreateDnVec (
88+ &vecY,
89+ y.size (),
90+ d_y,
91+ CUDA_R_64F) );
92+
7193 const Scalar alpha = 1.0 ;
7294 const Scalar beta = 0.0 ;
73- // cusparseStatus_t
74- // cusparseDcsrmv(cusparseHandle_t handle, cusparseOperation_t transA,
75- // int m, int n, int nnz,
76- // const double *alpha, const cusparseMatDescr_t descrA,
77- // const double *csrValA, const int *csrRowPtrA, const int *csrColIndA,
78- // const double *x, const double *beta, double *y)
79- CALL_CUSPARSE (cusparseDcsrmv (handle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.rows (), A.cols (), A.nonZeros (), &alpha,
80- descr, d_A_values, d_A_rowptr, d_A_colidx, d_x, &beta, d_y));
8195
96+ // Determine buffer size
97+ size_t bufferSize = 0 ;
98+ CALL_CUSPARSE ( cusparseSpMV_bufferSize (
99+ handle,
100+ CUSPARSE_OPERATION_NON_TRANSPOSE,
101+ &alpha,
102+ matA,
103+ vecX,
104+ &beta,
105+ vecY,
106+ CUDA_R_64F,
107+ CUSPARSE_SPMV_ALG_DEFAULT,
108+ &bufferSize) );
109+
110+ // Allocate buffer
111+ char * buffer;
112+ CALL_CUDA ( cudaMalloc (&buffer, bufferSize) );
113+
114+ // Perform SpMV
115+ // y = alpha * A * x + beta * y
116+ CALL_CUSPARSE ( cusparseSpMV (
117+ handle,
118+ CUSPARSE_OPERATION_NON_TRANSPOSE,
119+ &alpha,
120+ matA,
121+ vecX,
122+ &beta,
123+ vecY,
124+ CUDA_R_64F,
125+ CUSPARSE_SPMV_ALG_DEFAULT,
126+ buffer) );
127+
128+ // Copy result back to host
82129 CALL_CUDA (cudaMemcpy (y.data (), d_y, sizey, cudaMemcpyDeviceToHost));
83130
84- CALL_CUSPARSE (cusparseDestroyMatDescr (descr));
85- CALL_CUSPARSE (cusparseDestroy (handle));
131+ CALL_CUSPARSE ( cusparseDestroyDnVec (vecY) );
132+ CALL_CUSPARSE ( cusparseDestroyDnVec (vecX) );
133+ CALL_CUSPARSE ( cusparseDestroySpMat (matA) );
134+ CALL_CUSPARSE ( cusparseDestroy (handle) );
135+
86136
87137 CALL_CUDA (cudaFree (d_A_rowptr));
88138 CALL_CUDA (cudaFree (d_A_colidx));
@@ -107,44 +157,97 @@ void LinearAlgebraCUDA::spmm(const SparseMatrix& A, const Matrix& B, Matrix& C)
107157 Scalar* d_A_values; // /< device memory matrix A values
108158 Scalar* d_B; // /< device memory matrix B
109159 Scalar* d_C; // /< device memory matrix C
110- cusparseHandle_t handle;
111- cusparseMatDescr_t descr;
112160
113161 CALL_CUDA (cudaMalloc ((void **)&d_A_rowptr, sizeArowptr));
114162 CALL_CUDA (cudaMalloc ((void **)&d_A_colidx, sizeAcolidx));
115163 CALL_CUDA (cudaMalloc ((void **)&d_A_values, sizeAvalues));
116164 CALL_CUDA (cudaMalloc ((void **)&d_B, sizeB));
117165 CALL_CUDA (cudaMalloc ((void **)&d_C, sizeC));
118166
119- CALL_CUSPARSE (cusparseCreate (&handle));
120- CALL_CUSPARSE (cusparseCreateMatDescr (&descr));
121- cusparseSetMatType (descr, CUSPARSE_MATRIX_TYPE_GENERAL);
122- cusparseSetMatIndexBase (descr, CUSPARSE_INDEX_BASE_ZERO);
123-
124167 CALL_CUDA (cudaMemcpy (d_A_rowptr, A.outer (), sizeArowptr, cudaMemcpyHostToDevice));
125168 CALL_CUDA (cudaMemcpy (d_A_colidx, A.inner (), sizeAcolidx, cudaMemcpyHostToDevice));
126169 CALL_CUDA (cudaMemcpy (d_A_values, A.data (), sizeAvalues, cudaMemcpyHostToDevice));
127170 CALL_CUDA (cudaMemcpy (d_B, B.data (), sizeB, cudaMemcpyHostToDevice));
128171
129- // FIXME: Should we transpose B and use cusparseDcsrmm2 instread?
130- // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmm2
172+ cusparseHandle_t handle;
173+ CALL_CUSPARSE (cusparseCreate (&handle));
174+
175+ cusparseSpMatDescr_t matA;
176+ CALL_CUSPARSE ( cusparseCreateCsr (
177+ &matA,
178+ A.rows (), A.cols (), A.nonZeros (),
179+ d_A_rowptr,
180+ d_A_colidx,
181+ d_A_values,
182+ CUSPARSE_INDEX_32I,
183+ CUSPARSE_INDEX_32I,
184+ CUSPARSE_INDEX_BASE_ZERO,
185+ CUDA_R_64F) );
186+
187+ // Create dense matrix descriptors
188+ cusparseDnMatDescr_t matB;
189+ CALL_CUSPARSE (cusparseCreateDnMat (
190+ &matB,
191+ B.rows (), // rows
192+ B.cols (), // cols
193+ B.rows (), // leading dimension
194+ d_B,
195+ CUDA_R_64F,
196+ CUSPARSE_ORDER_COL) );
197+
198+ cusparseDnMatDescr_t matC;
199+ CALL_CUSPARSE (cusparseCreateDnMat (
200+ &matC,
201+ C.rows (), // rows
202+ C.cols (), // cols
203+ C.rows (), // leading dimension
204+ d_C,
205+ CUDA_R_64F,
206+ CUSPARSE_ORDER_COL) );
207+
131208 const Scalar alpha = 1.0 ;
132209 const Scalar beta = 0.0 ;
133- // cusparseStatus_t
134- // cusparseDcsrmm(cusparseHandle_t handle, cusparseOperation_t transA,
135- // int m, int n, int k, int nnz,
136- // const double *alpha, const cusparseMatDescr_t descrA,
137- // const double *csrValA, const int *csrRowPtrA, const int *csrColIndA,
138- // const double *B, int ldb, const double *beta, double *C, int ldc)
139- CALL_CUSPARSE (cusparseDcsrmm (handle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.rows (), A.cols (), B.cols (), A.nonZeros (),
140- &alpha, descr, d_A_values, d_A_rowptr, d_A_colidx, d_B, B.rows (), &beta, d_C,
141- C.rows ()));
210+
211+ size_t bufferSize = 0 ;
212+ CALL_CUSPARSE (cusparseSpMM_bufferSize (
213+ handle,
214+ CUSPARSE_OPERATION_NON_TRANSPOSE,
215+ CUSPARSE_OPERATION_NON_TRANSPOSE,
216+ &alpha,
217+ matA,
218+ matB,
219+ &beta,
220+ matC,
221+ CUDA_R_64F,
222+ CUSPARSE_SPMM_ALG_DEFAULT,
223+ &bufferSize));
224+
225+ // Allocate buffer
226+ char * buffer;
227+ CALL_CUDA (cudaMalloc (&buffer, bufferSize));
228+
229+ // Perform SpMM
230+ CALL_CUSPARSE (cusparseSpMM (
231+ handle,
232+ CUSPARSE_OPERATION_NON_TRANSPOSE,
233+ CUSPARSE_OPERATION_NON_TRANSPOSE,
234+ &alpha,
235+ matA,
236+ matB,
237+ &beta,
238+ matC,
239+ CUDA_R_64F,
240+ CUSPARSE_SPMM_ALG_DEFAULT,
241+ buffer));
142242
143243 CALL_CUDA (cudaMemcpy (C.data (), d_C, sizeC, cudaMemcpyDeviceToHost));
144244
145- CALL_CUSPARSE (cusparseDestroyMatDescr (descr));
146245 CALL_CUSPARSE (cusparseDestroy (handle));
246+ CALL_CUSPARSE (cusparseDestroyDnMat (matC));
247+ CALL_CUSPARSE (cusparseDestroyDnMat (matB));
248+ CALL_CUSPARSE (cusparseDestroySpMat (matA));
147249
250+ CALL_CUDA (cudaFree (buffer));
148251 CALL_CUDA (cudaFree (d_A_rowptr));
149252 CALL_CUDA (cudaFree (d_A_colidx));
150253 CALL_CUDA (cudaFree (d_A_values));
0 commit comments