#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
    @brief Implementations of operations using sparse matrices using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
template <typename T>
__global__ void csr_row_info_extractor_kernel(const unsigned int * row_indices,
                                              const unsigned int * column_indices,
                                              const T * elements,
                                              T * result,
                                              unsigned int size,
                                              unsigned int option)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    T value = 0;
    unsigned int row_end = row_indices[row+1];

    switch (option)
    {
      case 0: //inf-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value = max(value, fabs(elements[i]));
        break;
      case 1: //1-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value += fabs(elements[i]);
        break;
      case 2: //2-norm
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
          value += elements[i] * elements[i];
        value = sqrt(value);
        break;
      case 3: //diagonal entry
        for (unsigned int i = row_indices[row]; i < row_end; ++i)
        {
          if (column_indices[i] == row)
          {
            value = elements[i];
            break;
          }
        }
        break;
      default:
        break;
    }
    result[row] = value;
  }
}
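// Editorial note: the kernel above walks its rows with the standard CUDA
// "grid-stride loop", so any <<<blocks, threads>>> launch covers all rows.
// A minimal standalone sketch of the pattern (illustrative only, not part of
// ViennaCL):
//
//   __global__ void scale_kernel(double * data, unsigned int n, double alpha)
//   {
//     for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
//                       i  < n;
//                       i += gridDim.x * blockDim.x)  // one grid-sized stride per pass
//       data[i] *= alpha;
//   }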
template<typename ScalarType, unsigned int MAT_ALIGNMENT>
void row_info(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
              vector_base<ScalarType> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                              detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                              detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                              detail::cuda_arg<ScalarType>(vec),
                                              static_cast<unsigned int>(mat.size1()),
                                              static_cast<unsigned int>(info_selector)
                                             );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
}
template <typename T>
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int * row_indices,
                                                 const unsigned int * column_indices,
                                                 const T * elements,
                                                 const T * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 T * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result,
                                                 unsigned int size_result)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size_result;
                    row += gridDim.x * blockDim.x)
  {
    T dot_prod = (T)0;
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
    result[row * inc_result + start_result] = dot_prod;
  }
}
/** @brief Carries out matrix-vector multiplication with a compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*/
template<class ScalarType, unsigned int ALIGNMENT>
void prod_impl(const viennacl::compressed_matrix<ScalarType, ALIGNMENT> & mat,
               const viennacl::vector_base<ScalarType> & vec,
               viennacl::vector_base<ScalarType> & result)
{
  compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                                 detail::cuda_arg<ScalarType>(vec),
                                                 static_cast<unsigned int>(vec.start()),
                                                 static_cast<unsigned int>(vec.stride()),
                                                 detail::cuda_arg<ScalarType>(result),
                                                 static_cast<unsigned int>(result.start()),
                                                 static_cast<unsigned int>(result.stride()),
                                                 static_cast<unsigned int>(result.size())
                                                );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
}
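// Usage sketch (illustrative; assumes the CUDA backend is active): user code
// reaches the prod_impl() above through the generic viennacl::linalg::prod()
// interface rather than calling it directly:
//
//   viennacl::compressed_matrix<float> A(n, n);   // filled via viennacl::copy()
//   viennacl::vector<float> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);             // sparse matrix-vector product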
/** @brief Helper struct for accessing an element of a row- or column-major matrix.
  *
  * @tparam LayoutT   The layout tag: either viennacl::row_major or viennacl::column_major
  */
template <typename LayoutT>
struct mat_mult_matrix_index
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    // row-major: entries of one row are contiguous
    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
  }
};

template <>
struct mat_mult_matrix_index<viennacl::column_major>
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    // column-major: entries of one column are contiguous
    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
  }
};
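// Worked example (illustrative): with row_start = col_start = 0, unit
// increments and an internal size of 8x8, element (i, j) = (2, 3) maps to
//
//   mat_mult_matrix_index<viennacl::row_major>::apply(2, 3, 0, 1, 0, 1, 8, 8)     == 2 * 8 + 3 == 19
//   mat_mult_matrix_index<viennacl::column_major>::apply(2, 3, 0, 1, 0, 1, 8, 8)  == 2 + 3 * 8 == 26
//
// (device-side calls; shown here only to illustrate the two storage orders)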
template <typename DMatIndexT, typename ResultIndexT, typename T>
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_row_indices,
                                                   const unsigned int * sp_mat_col_indices,
                                                   const T * sp_mat_elements,
                                                   const T * d_mat,
                                                   unsigned int d_mat_row_start,     unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,       unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,      unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                                   T * result,
                                                   unsigned int result_row_start,     unsigned int result_col_start,
                                                   unsigned int result_row_inc,       unsigned int result_col_inc,
                                                   unsigned int result_row_size,      unsigned int result_col_size,
                                                   unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      T r = 0;
      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        T x = sp_mat_elements[k];
        T y = d_mat[ DMatIndexT::apply(j, col,
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }
}
/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a compressed_matrix. */
template<typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
               const viennacl::matrix_base<TYPE, F1> & d_mat,
               viennacl::matrix_base<TYPE, F2> & result)
{
  compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
    (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
     detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
     detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()),

     detail::cuda_arg<TYPE>(d_mat),
     static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

     detail::cuda_arg<TYPE>(result),
     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
}
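// Usage sketch (illustrative): sparse-times-dense products are again reached
// through viennacl::linalg::prod():
//
//   viennacl::compressed_matrix<float> A(m, k);   // filled via viennacl::copy()
//   viennacl::matrix<float> B(k, n), C(m, n);
//   C = viennacl::linalg::prod(A, B);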
template <typename DMatIndexT, typename ResultIndexT, typename T>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_row_indices,
                                                      const unsigned int * sp_mat_col_indices,
                                                      const T * sp_mat_elements,
                                                      const T * d_mat,
                                                      unsigned int d_mat_row_start,     unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,       unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                                      T * result,
                                                      unsigned int result_row_start,     unsigned int result_col_start,
                                                      unsigned int result_row_inc,       unsigned int result_col_inc,
                                                      unsigned int result_row_size,      unsigned int result_col_size,
                                                      unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      T r = 0;
      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        T x = sp_mat_elements[k];
        // transposed access: entry (j, col) of B^T is entry (col, j) of B
        T y = d_mat[ DMatIndexT::apply(col, j,
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }
}
/** @brief Carries out multiplication of a compressed_matrix with a transposed dense matrix. */
template<typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<TYPE, F1>,
                                                  const viennacl::matrix_base<TYPE, F1>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<TYPE, F2> & result)
{
  compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
    (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
     detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
     detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()),

     detail::cuda_arg<TYPE>(d_mat.lhs()),
     static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

     detail::cuda_arg<TYPE>(result),
     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
}
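// Usage sketch (illustrative): the overload above handles a transposed dense
// factor without materializing the transpose:
//
//   viennacl::compressed_matrix<float> A(m, k);
//   viennacl::matrix<float> B(n, k), C(m, n);
//   C = viennacl::linalg::prod(A, viennacl::trans(B));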
template <typename T>
__global__ void compressed_matrix_diagonal_kernel(const unsigned int * row_indices,
                                                  const unsigned int * column_indices,
                                                  const T * elements,
                                                  T * result,
                                                  unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    T diag = (T)0;
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}
/** @brief Inplace solution of a lower triangular compressed_matrix with unit diagonal. Typically used for LU substitutions. */
template<typename SparseMatrixType, class ScalarType>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
inplace_solve(const SparseMatrixType & mat,
              viennacl::vector_base<ScalarType> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                         detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                         detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                         detail::cuda_arg<ScalarType>(vec),
                                         static_cast<unsigned int>(mat.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}
/** @brief Inplace solution of a lower triangular compressed_matrix. Typically used for LU substitutions. */
template<typename SparseMatrixType, class ScalarType>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
inplace_solve(const SparseMatrixType & mat,
              viennacl::vector_base<ScalarType> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                    detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                    detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                    detail::cuda_arg<ScalarType>(vec),
                                    static_cast<unsigned int>(mat.size1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}
/** @brief Inplace solution of an upper triangular compressed_matrix with unit diagonal. Typically used for LU substitutions. */
template<typename SparseMatrixType, class ScalarType>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
inplace_solve(const SparseMatrixType & mat,
              viennacl::vector_base<ScalarType> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}
/** @brief Inplace solution of an upper triangular compressed_matrix. Typically used for LU substitutions. */
template<typename SparseMatrixType, class ScalarType>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
inplace_solve(const SparseMatrixType & mat,
              viennacl::vector_base<ScalarType> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                     detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                     detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                     detail::cuda_arg<ScalarType>(vec),
                                     static_cast<unsigned int>(mat.size1())
                                    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}
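// Usage sketch (illustrative): the four overloads above are selected through
// the generic viennacl::linalg::inplace_solve() interface by a tag type:
//
//   viennacl::compressed_matrix<float> A(n, n);   // triangular, filled elsewhere
//   viennacl::vector<float> b(n);                 // right-hand side, overwritten
//   viennacl::linalg::inplace_solve(A, b, viennacl::linalg::unit_lower_tag());
//   // other tags: lower_tag(), unit_upper_tag(), upper_tag()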
/** @brief Inplace solution of a transposed lower triangular compressed_matrix with unit diagonal. */
template<typename SparseMatrixType, class ScalarType>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
              viennacl::vector_base<ScalarType> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                               detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                               detail::cuda_arg<ScalarType>(vec),
                                               static_cast<unsigned int>(mat.lhs().size1())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}
/** @brief Inplace solution of a transposed lower triangular compressed_matrix. The diagonal is extracted first. */
template<typename SparseMatrixType, class ScalarType>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
              viennacl::vector_base<ScalarType> & vec,
              viennacl::linalg::lower_tag)
{
  viennacl::vector<ScalarType> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(diagonal),
                                          detail::cuda_arg<ScalarType>(vec),
                                          static_cast<unsigned int>(mat.lhs().size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}
/** @brief Inplace solution of a transposed upper triangular compressed_matrix with unit diagonal. */
template<typename SparseMatrixType, class ScalarType>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
              viennacl::vector_base<ScalarType> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}
/** @brief Inplace solution of a transposed upper triangular compressed_matrix. The diagonal is extracted first. */
template<typename SparseMatrixType, class ScalarType>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
              viennacl::vector_base<ScalarType> & vec,
              viennacl::linalg::upper_tag)
{
  viennacl::vector<ScalarType> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                           detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                           detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                           detail::cuda_arg<ScalarType>(diagonal),
                                           detail::cuda_arg<ScalarType>(vec),
                                           static_cast<unsigned int>(mat.lhs().size1())
                                          );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}
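// Usage sketch (illustrative): solving with the transpose of a stored matrix
// uses the same interface with a trans() expression; for the non-unit-diagonal
// variants the diagonal is extracted on the fly as shown above:
//
//   viennacl::linalg::inplace_solve(viennacl::trans(A), b, viennacl::linalg::lower_tag());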
template<typename ScalarType, unsigned int MAT_ALIGNMENT>
void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
                                                 const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
                                                 op_trans> & L,
                         viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                         vector_base<ScalarType> const & /* L_diagonal */,  //ignored
                         vector_base<ScalarType> & vec,
                         viennacl::linalg::unit_lower_tag)
{
  csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
                                                       detail::cuda_arg<ScalarType>(L.lhs().handle().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                       detail::cuda_arg<ScalarType>(vec),
                                                       static_cast<unsigned int>(L.lhs().size1())
                                                      );
}
template<typename ScalarType, unsigned int MAT_ALIGNMENT>
void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
                                                 const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
                                                 op_trans> & U,
                         viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                         vector_base<ScalarType> const & U_diagonal,
                         vector_base<ScalarType> & vec,
                         viennacl::linalg::upper_tag)
{
  csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
                                                   detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
                                                   detail::cuda_arg<ScalarType>(U.lhs().handle().cuda_handle()),
                                                   detail::cuda_arg<ScalarType>(U_diagonal.handle().cuda_handle()),
                                                   detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                   detail::cuda_arg<ScalarType>(vec),
                                                   static_cast<unsigned int>(U.lhs().size1())
                                                  );
}
template <typename T>
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int * row_jumper,
                                                            const unsigned int * row_indices,
                                                            const unsigned int * column_indices,
                                                            const T * elements,
                                                            unsigned int nonzero_rows,
                                                            const T * x,
                                                            unsigned int start_x,
                                                            unsigned int inc_x,
                                                            T * result,
                                                            unsigned int start_result,
                                                            unsigned int inc_result,
                                                            unsigned int size_result)
{
  // zero the full result vector first, because only the nonzero rows are written below
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < size_result;
                    i += gridDim.x * blockDim.x)
    result[i * inc_result + start_result] = 0;

  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    T dot_prod = (T)0;
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
    result[row_indices[i] * inc_result + start_result] = dot_prod;
  }
}
/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix,
*          i.e. a CSR matrix storing only the rows that carry nonzeros.
*/
template<class ScalarType>
void prod_impl(const viennacl::compressed_compressed_matrix<ScalarType> & mat,
               const viennacl::vector_base<ScalarType> & vec,
               viennacl::vector_base<ScalarType> & result)
{
  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                            detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            detail::cuda_arg<ScalarType>(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            detail::cuda_arg<ScalarType>(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size())
                                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}
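// Usage sketch (illustrative; compressed_compressed_matrix stores only rows
// with at least one nonzero, hence the explicit zeroing in the kernel):
//
//   viennacl::compressed_compressed_matrix<float> A(n, n);  // filled via viennacl::copy()
//   viennacl::vector<float> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);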
template <typename T>
__global__ void coo_row_info_extractor(const unsigned int * coords, //(row_index, column_index)
                                       const T * elements,
                                       const unsigned int * group_boundaries,
                                       T * result,
                                       unsigned int option)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ T inter_results[128];

  uint2 tmp;
  T val;
  unsigned int last_index  = blockDim.x - 1;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[last_index]) //row continues in this chunk: merge
      {
        switch (option)
        {
          case 0: case 3: val = max(val, fabs(inter_results[last_index])); break;
          case 1:         val = fabs(val) + inter_results[last_index]; break;
          case 2:         val = sqrt(val * val + inter_results[last_index]); break;
          default: break;
        }
      }
      else //row finished in previous chunk: write out
      {
        switch (option)
        {
          case 0: case 1: case 3:
            result[shared_rows[last_index]] = inter_results[last_index]; break;
          case 2:
            result[shared_rows[last_index]] = sqrt(inter_results[last_index]); break;
          default: break;
        }
      }
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    switch (option)
    {
      case 0: case 3: inter_results[threadIdx.x] = val;       break;
      case 1:         inter_results[threadIdx.x] = fabs(val); break;
      case 2:         inter_results[threadIdx.x] = val * val; break;
      default: break;
    }
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      T left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      switch (option)
      {
        case 0: case 3: inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left); break; //inf-norm
        case 1:         inter_results[threadIdx.x] += left; break; //1-norm
        case 2:         inter_results[threadIdx.x] += left; break; //2-norm (sum of squares)
        default: break;
      }
      __syncthreads();
    }
    //segmented parallel reduction end

    if (threadIdx.x != last_index &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
        inter_results[threadIdx.x] != 0)
    {
      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (threadIdx.x == last_index && inter_results[last_index] != 0)
    result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index];
}
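// Worked example of the segmented reduction above (illustrative): with
// blockDim.x = 4, row indices (1, 1, 2, 2) and summands (3, 4, 5, 6),
// option == 1 gives
//
//   stride 1: (3, 3+4, 5, 5+6) = (3, 7, 5, 11)
//   stride 2: unchanged, since lanes 2 and 3 see a different row two lanes back
//
// so the last lane of each row segment holds that row's total (7 and 11), and
// only those lanes write their value to result[].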
template<typename ScalarType, unsigned int MAT_ALIGNMENT>
void row_info(coordinate_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
              vector_base<ScalarType> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                      detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                      detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                      detail::cuda_arg<ScalarType>(vec),
                                      static_cast<unsigned int>(info_selector)
                                     );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
}
template <typename T>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                 const T * elements,
                                                 const unsigned int * group_boundaries,
                                                 const T * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 T * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ T inter_results[128];

  uint2 tmp;
  T val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    T left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    //segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end)
    result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
}
/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*/
template<class ScalarType, unsigned int ALIGNMENT>
void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & mat,
               const viennacl::vector_base<ScalarType> & vec,
               viennacl::vector_base<ScalarType> & result)
{
  result.clear();  //the kernel only writes rows that carry nonzeros

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                detail::cuda_arg<ScalarType>(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}
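// Usage sketch (illustrative): coordinate_matrix works with the same generic
// interface; note that prod_impl() clears the result first because the kernel
// only writes rows that carry nonzeros:
//
//   viennacl::coordinate_matrix<float> A(n, n);   // filled via viennacl::copy()
//   viennacl::vector<float> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);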
template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                   const ScalarType * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,     unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,       unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,      unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,     unsigned int result_col_start,
                                                   unsigned int result_row_inc,       unsigned int result_col_inc,
                                                   unsigned int result_row_size,      unsigned int result_col_size,
                                                   unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  } //for result_col
}
/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix. */
template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & sp_mat,
               const viennacl::matrix_base<NumericT, F1> & d_mat,
               viennacl::matrix_base<NumericT, F2> & result)
{
  coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>>
    (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
     detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
     detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

     detail::cuda_arg<NumericT>(d_mat),
     static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

     detail::cuda_arg<NumericT>(result),
     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
}
template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                      const ScalarType * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,     unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,       unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,     unsigned int result_col_start,
                                                      unsigned int result_row_inc,       unsigned int result_col_inc,
                                                      unsigned int result_row_size,      unsigned int result_col_size,
                                                      unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      // transposed access: entry (tmp.y, result_col) of B^T is entry (result_col, tmp.y) of B
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols)] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ ResultIndexT::apply(tmp.x, result_col,
                                    result_row_start, result_row_inc,
                                    result_col_start, result_col_inc,
                                    result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ ResultIndexT::apply(tmp.x, result_col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
  } //for result_col
}
/** @brief Carries out multiplication of a coordinate_matrix with a transposed dense matrix. */
template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
                                                  const viennacl::matrix_base<NumericT, F1>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT, F2> & result)
{
  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>>
    (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
     detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
     detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

     detail::cuda_arg<NumericT>(d_mat.lhs()),
     static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

     detail::cuda_arg<NumericT>(result),
     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
}
template <typename T>
__global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
                                          const T * elements,
                                          const T * x,
                                          unsigned int start_x,
                                          unsigned int inc_x,
                                          T * result,
                                          unsigned int start_result,
                                          unsigned int inc_result,
                                          unsigned int row_num,
                                          unsigned int col_num,
                                          unsigned int internal_row_num,
                                          unsigned int items_per_row,
                                          unsigned int aligned_items_per_row)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
  {
    T sum = 0;

    unsigned int offset = row_id;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      T val = elements[offset];

      if (val != (T)0) //skip the padding entries
      {
        int col = coords[offset];
        sum += (x[col * inc_x + start_x] * val);
      }
    }

    result[row_id * inc_result + start_result] = sum;
  }
}
/** @brief Carries out matrix-vector multiplication with an ell_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*/
template<class ScalarType, unsigned int ALIGNMENT>
void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & mat,
               const viennacl::vector_base<ScalarType> & vec,
               viennacl::vector_base<ScalarType> & result)
{
  ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(vec),
                                          static_cast<unsigned int>(vec.start()),
                                          static_cast<unsigned int>(vec.stride()),
                                          detail::cuda_arg<ScalarType>(result),
                                          static_cast<unsigned int>(result.start()),
                                          static_cast<unsigned int>(result.stride()),
                                          static_cast<unsigned int>(mat.size1()),
                                          static_cast<unsigned int>(mat.size2()),
                                          static_cast<unsigned int>(mat.internal_size1()),
                                          static_cast<unsigned int>(mat.maxnnz()),
                                          static_cast<unsigned int>(mat.internal_maxnnz())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
}
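// Usage sketch (illustrative): the ELL format pads every row to maxnnz()
// entries, which suits matrices with a fairly uniform row length:
//
//   viennacl::ell_matrix<float> A;                // filled via viennacl::copy()
//   viennacl::vector<float> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);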
template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
                                            const ScalarType * sp_mat_elements,
                                            unsigned int sp_mat_row_num,
                                            unsigned int sp_mat_col_num,
                                            unsigned int sp_mat_internal_row_num,
                                            unsigned int sp_mat_items_per_row,
                                            unsigned int sp_mat_aligned_items_per_row,
                                            const NumericT * d_mat,
                                            unsigned int d_mat_row_start,     unsigned int d_mat_col_start,
                                            unsigned int d_mat_row_inc,       unsigned int d_mat_col_inc,
                                            unsigned int d_mat_row_size,      unsigned int d_mat_col_size,
                                            unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                            NumericT * result,
                                            unsigned int result_row_start,     unsigned int result_col_start,
                                            unsigned int result_row_inc,       unsigned int result_col_inc,
                                            unsigned int result_row_size,      unsigned int result_col_size,
                                            unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  // one thread per entry of the result matrix:
  for (unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
  {
    unsigned int row = rc % sp_mat_row_num;
    unsigned int col = rc / sp_mat_row_num;

    unsigned int offset = row;
    NumericT r = (NumericT)0;

    for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
    {
      unsigned int j = sp_mat_coords[offset];
      NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);

      if (x != (NumericT)0) //skip the padding entries
      {
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
    }
    result [ ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols) ] = r;
  }
}
/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is an ell_matrix. */
template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
               const viennacl::matrix_base<NumericT, F1> & d_mat,
               viennacl::matrix_base<NumericT, F2> & result)
{
  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
    (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
     detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
     static_cast<unsigned int>(sp_mat.size1()),
     static_cast<unsigned int>(sp_mat.size2()),
     static_cast<unsigned int>(sp_mat.internal_size1()),
     static_cast<unsigned int>(sp_mat.maxnnz()),
     static_cast<unsigned int>(sp_mat.internal_maxnnz()),

     detail::cuda_arg<NumericT>(d_mat),
     static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

     detail::cuda_arg<NumericT>(result),
     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
}
template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
                                               const ScalarType * sp_mat_elements,
                                               unsigned int sp_mat_row_num,
                                               unsigned int sp_mat_col_num,
                                               unsigned int sp_mat_internal_row_num,
                                               unsigned int sp_mat_items_per_row,
                                               unsigned int sp_mat_aligned_items_per_row,
                                               const NumericT * d_mat,
                                               unsigned int d_mat_row_start,     unsigned int d_mat_col_start,
                                               unsigned int d_mat_row_inc,       unsigned int d_mat_col_inc,
                                               unsigned int d_mat_row_size,      unsigned int d_mat_col_size,
                                               unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                               NumericT * result,
                                               unsigned int result_row_start,     unsigned int result_col_start,
                                               unsigned int result_row_inc,       unsigned int result_col_inc,
                                               unsigned int result_row_size,      unsigned int result_col_size,
                                               unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  // the result columns now correspond to the rows of the (untransposed) dense factor:
  for (unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
  {
    unsigned int row = rc % sp_mat_row_num;
    unsigned int col = rc / sp_mat_row_num;

    unsigned int offset = row;
    NumericT r = (NumericT)0;

    for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
    {
      unsigned int j = sp_mat_coords[offset];
      NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);

      if (x != (NumericT)0) //skip the padding entries
      {
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];
        r += x * y;
      }
    }
    result [ ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols) ] = r;
  }
}
/** @brief Carries out multiplication of an ell_matrix with a transposed dense matrix. */
template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
                                                  const viennacl::matrix_base<NumericT, F1>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT, F2> & result)
{
  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
    (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
     detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
     static_cast<unsigned int>(sp_mat.size1()),
     static_cast<unsigned int>(sp_mat.size2()),
     static_cast<unsigned int>(sp_mat.internal_size1()),
     static_cast<unsigned int>(sp_mat.maxnnz()),
     static_cast<unsigned int>(sp_mat.internal_maxnnz()),

     detail::cuda_arg<NumericT>(d_mat.lhs()),
     static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

     detail::cuda_arg<NumericT>(result),
     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
}
template <typename T>
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
                                          const T * ell_elements,
                                          const unsigned int * csr_rows,
                                          const unsigned int * csr_cols,
                                          const T * csr_elements,
                                          const T * x,
                                          unsigned int start_x,
                                          unsigned int inc_x,
                                          T * result,
                                          unsigned int start_result,
                                          unsigned int inc_result,
                                          unsigned int row_num,
                                          unsigned int internal_row_num,
                                          unsigned int items_per_row,
                                          unsigned int aligned_items_per_row)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
  {
    T sum = 0;

    // ELL part:
    unsigned int offset = row_id;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      T val = ell_elements[offset];

      if (val != (T)0) //skip the padding entries
      {
        int col = ell_coords[offset];
        sum += (x[col * inc_x + start_x] * val);
      }
    }

    // CSR part for the overflow entries:
    unsigned int col_begin = csr_rows[row_id];
    unsigned int col_end   = csr_rows[row_id + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += (x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id]);

    result[row_id * inc_result + start_result] = sum;
  }
}
/** @brief Carries out matrix-vector multiplication with a hyb_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*/
template<class ScalarType, unsigned int ALIGNMENT>
void prod_impl(const viennacl::hyb_matrix<ScalarType, ALIGNMENT> & mat,
               const viennacl::vector_base<ScalarType> & vec,
               viennacl::vector_base<ScalarType> & result)
{
  hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.handle5().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(vec),
                                          static_cast<unsigned int>(vec.start()),
                                          static_cast<unsigned int>(vec.stride()),
                                          detail::cuda_arg<ScalarType>(result),
                                          static_cast<unsigned int>(result.start()),
                                          static_cast<unsigned int>(result.stride()),
                                          static_cast<unsigned int>(mat.size1()),
                                          static_cast<unsigned int>(mat.internal_size1()),
                                          static_cast<unsigned int>(mat.ell_nnz()),
                                          static_cast<unsigned int>(mat.internal_ellnnz())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
}
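// Usage sketch (illustrative): hyb_matrix keeps a regular ELL part plus a CSR
// part for the rows exceeding ell_nnz() entries, and the kernel above folds
// both parts into one pass:
//
//   viennacl::hyb_matrix<float> A;                // filled via viennacl::copy()
//   viennacl::vector<float> x(n), y(n);
//   y = viennacl::linalg::prod(A, x);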
template <typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
                                            const NumericT * ell_elements,
                                            const unsigned int * csr_rows,
                                            const unsigned int * csr_cols,
                                            const NumericT * csr_elements,
                                            unsigned int row_num,
                                            unsigned int internal_row_num,
                                            unsigned int items_per_row,
                                            unsigned int aligned_items_per_row,
                                            const NumericT * d_mat,
                                            unsigned int d_mat_row_start,     unsigned int d_mat_col_start,
                                            unsigned int d_mat_row_inc,       unsigned int d_mat_col_inc,
                                            unsigned int d_mat_row_size,      unsigned int d_mat_col_size,
                                            unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                            NumericT * result,
                                            unsigned int result_row_start,     unsigned int result_col_start,
                                            unsigned int result_row_inc,       unsigned int result_col_inc,
                                            unsigned int result_row_size,      unsigned int result_col_size,
                                            unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
    {
      NumericT sum = 0;

      // ELL part:
      unsigned int offset = row_id;
      for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      {
        NumericT val = ell_elements[offset];

        if (val != (NumericT)0) //skip the padding entries
          sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
                                         d_mat_row_start, d_mat_row_inc,
                                         d_mat_col_start, d_mat_col_inc,
                                         d_mat_internal_rows, d_mat_internal_cols)] * val;
      }

      // CSR part for the overflow entries:
      unsigned int col_begin = csr_rows[row_id];
      unsigned int col_end   = csr_rows[row_id + 1];

      for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
        sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];

      result[ResultIndexT::apply(row_id, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = sum;
    }
  }
}
/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a hyb_matrix. */
template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
               const viennacl::matrix_base<NumericT, F1> & d_mat,
               viennacl::matrix_base<NumericT, F2> & result)
{
  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>
    (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
     detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
     detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
     detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
     detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
     static_cast<unsigned int>(mat.size1()),
     static_cast<unsigned int>(mat.internal_size1()),
     static_cast<unsigned int>(mat.ell_nnz()),
     static_cast<unsigned int>(mat.internal_ellnnz()),

     detail::cuda_arg<NumericT>(d_mat),
     static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

     detail::cuda_arg<NumericT>(result),
     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
}
template <typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
                                               const NumericT * ell_elements,
                                               const unsigned int * csr_rows,
                                               const unsigned int * csr_cols,
                                               const NumericT * csr_elements,
                                               unsigned int row_num,
                                               unsigned int internal_row_num,
                                               unsigned int items_per_row,
                                               unsigned int aligned_items_per_row,
                                               const NumericT * d_mat,
                                               unsigned int d_mat_row_start,     unsigned int d_mat_col_start,
                                               unsigned int d_mat_row_inc,       unsigned int d_mat_col_inc,
                                               unsigned int d_mat_row_size,      unsigned int d_mat_col_size,
                                               unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols,
                                               NumericT * result,
                                               unsigned int result_row_start,     unsigned int result_col_start,
                                               unsigned int result_row_inc,       unsigned int result_col_inc,
                                               unsigned int result_row_size,      unsigned int result_col_size,
                                               unsigned int result_internal_rows, unsigned int result_internal_cols)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
    {
      NumericT sum = 0;

      // ELL part (transposed access to the dense factor):
      unsigned int offset = row_id;
      for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
      {
        NumericT val = ell_elements[offset];

        if (val != (NumericT)0) //skip the padding entries
          sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
                                         d_mat_row_start, d_mat_row_inc,
                                         d_mat_col_start, d_mat_col_inc,
                                         d_mat_internal_rows, d_mat_internal_cols)] * val;
      }

      // CSR part for the overflow entries:
      unsigned int col_begin = csr_rows[row_id];
      unsigned int col_end   = csr_rows[row_id + 1];

      for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
        sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
                                       d_mat_row_start, d_mat_row_inc,
                                       d_mat_col_start, d_mat_col_inc,
                                       d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];

      result[ResultIndexT::apply(row_id, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = sum;
    }
  }
}
/** @brief Carries out multiplication of a hyb_matrix with a transposed dense matrix. */
template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
                                                  const viennacl::matrix_base<NumericT, F1>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT, F2> & result)
{
  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>
    (detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
     detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
     detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
     detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
     detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
     static_cast<unsigned int>(mat.size1()),
     static_cast<unsigned int>(mat.internal_size1()),
     static_cast<unsigned int>(mat.ell_nnz()),
     static_cast<unsigned int>(mat.internal_ellnnz()),

     detail::cuda_arg<NumericT>(d_mat.lhs()),
     static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
     static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

     detail::cuda_arg<NumericT>(result),
     static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
     static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
     static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
     static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
}
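// Usage sketch (illustrative): as for the other sparse formats, the overload
// above serves prod() with a transposed dense factor:
//
//   viennacl::hyb_matrix<float> A;                // filled via viennacl::copy()
//   viennacl::matrix<float> B(n, k), C(m, n);
//   C = viennacl::linalg::prod(A, viennacl::trans(B));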
} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif