1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
40 unsigned int A_start1,
unsigned int A_start2,
41 unsigned int A_inc1,
unsigned int A_inc2,
42 unsigned int A_size1,
unsigned int A_size2,
43 unsigned int A_internal_size1,
unsigned int A_internal_size2,
46 unsigned int options2,
48 unsigned int B_start1,
unsigned int B_start2,
49 unsigned int B_inc1,
unsigned int B_inc2,
50 unsigned int B_internal_size1,
unsigned int B_internal_size2)
53 if (options2 & (1 << 0))
56 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
57 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
59 if (options2 & (1 << 1))
61 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
62 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
63 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
67 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
68 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
69 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
77 unsigned int A_start1,
unsigned int A_start2,
78 unsigned int A_inc1,
unsigned int A_inc2,
79 unsigned int A_size1,
unsigned int A_size2,
80 unsigned int A_internal_size1,
unsigned int A_internal_size2,
83 unsigned int options2,
85 unsigned int B_start1,
unsigned int B_start2,
86 unsigned int B_inc1,
unsigned int B_inc2,
87 unsigned int B_internal_size1,
unsigned int B_internal_size2)
90 if (options2 & (1 << 0))
93 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
94 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
96 if (options2 & (1 << 1))
98 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
99 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
100 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
104 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
105 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
106 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
116 template <
typename T>
119 unsigned int A_start1,
unsigned int A_start2,
120 unsigned int A_inc1,
unsigned int A_inc2,
121 unsigned int A_size1,
unsigned int A_size2,
122 unsigned int A_internal_size1,
unsigned int A_internal_size2,
125 unsigned int options2,
127 unsigned int B_start1,
unsigned int B_start2,
128 unsigned int B_inc1,
unsigned int B_inc2,
129 unsigned int B_internal_size1,
unsigned int B_internal_size2,
132 unsigned int options3,
134 unsigned int C_start1,
unsigned int C_start2,
135 unsigned int C_inc1,
unsigned int C_inc2,
136 unsigned int C_internal_size1,
unsigned int C_internal_size2)
139 if (options2 & (1 << 0))
143 if (options3 & (1 << 0))
146 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
147 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
149 if (options2 & (1 << 1))
151 if (options3 & (1 << 1))
153 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
154 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
155 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
156 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
157 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
161 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
162 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
163 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
164 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
165 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
170 if (options3 & (1 << 1))
172 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
173 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
174 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
175 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
176 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
180 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
181 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
182 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
183 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
184 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
191 template <
typename T>
194 unsigned int A_start1,
unsigned int A_start2,
195 unsigned int A_inc1,
unsigned int A_inc2,
196 unsigned int A_size1,
unsigned int A_size2,
197 unsigned int A_internal_size1,
unsigned int A_internal_size2,
200 unsigned int options2,
202 unsigned int B_start1,
unsigned int B_start2,
203 unsigned int B_inc1,
unsigned int B_inc2,
204 unsigned int B_internal_size1,
unsigned int B_internal_size2,
207 unsigned int options3,
209 unsigned int C_start1,
unsigned int C_start2,
210 unsigned int C_inc1,
unsigned int C_inc2,
211 unsigned int C_internal_size1,
unsigned int C_internal_size2)
214 if (options2 & (1 << 0))
218 if (options3 & (1 << 0))
221 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
222 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
224 if (options2 & (1 << 1))
226 if (options3 & (1 << 1))
228 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
229 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
230 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
231 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
232 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
236 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
237 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
238 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
239 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
240 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
245 if (options3 & (1 << 1))
247 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
248 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
249 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
250 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
251 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
255 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
256 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
257 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
258 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
259 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
265 template <
typename T>
268 unsigned int A_start1,
unsigned int A_start2,
269 unsigned int A_inc1,
unsigned int A_inc2,
270 unsigned int A_size1,
unsigned int A_size2,
271 unsigned int A_internal_size1,
unsigned int A_internal_size2,
274 unsigned int options2,
276 unsigned int B_start1,
unsigned int B_start2,
277 unsigned int B_inc1,
unsigned int B_inc2,
278 unsigned int B_internal_size1,
unsigned int B_internal_size2,
281 unsigned int options3,
283 unsigned int C_start1,
unsigned int C_start2,
284 unsigned int C_inc1,
unsigned int C_inc2,
285 unsigned int C_internal_size1,
unsigned int C_internal_size2)
288 if (options2 & (1 << 0))
292 if (options3 & (1 << 0))
295 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
296 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
298 if (options2 & (1 << 1))
300 if (options3 & (1 << 1))
302 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
303 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
304 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
305 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
306 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
310 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
311 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
312 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
313 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
314 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
319 if (options3 & (1 << 1))
321 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
322 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
323 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
324 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
325 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
329 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
330 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
331 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
332 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
333 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
340 template <
typename T>
343 unsigned int A_start1,
unsigned int A_start2,
344 unsigned int A_inc1,
unsigned int A_inc2,
345 unsigned int A_size1,
unsigned int A_size2,
346 unsigned int A_internal_size1,
unsigned int A_internal_size2,
349 unsigned int options2,
351 unsigned int B_start1,
unsigned int B_start2,
352 unsigned int B_inc1,
unsigned int B_inc2,
353 unsigned int B_internal_size1,
unsigned int B_internal_size2,
356 unsigned int options3,
358 unsigned int C_start1,
unsigned int C_start2,
359 unsigned int C_inc1,
unsigned int C_inc2,
360 unsigned int C_internal_size1,
unsigned int C_internal_size2)
363 if (options2 & (1 << 0))
367 if (options3 & (1 << 0))
370 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
371 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
373 if (options2 & (1 << 1))
375 if (options3 & (1 << 1))
377 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
378 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
379 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
380 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
381 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
385 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
386 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
387 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
388 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
389 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
394 if (options3 & (1 << 1))
396 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
397 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
398 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
399 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
400 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
404 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
405 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
406 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
407 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
408 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
419 template <
typename T>
422 unsigned int A_start1,
unsigned int A_start2,
423 unsigned int A_inc1,
unsigned int A_inc2,
424 unsigned int A_size1,
unsigned int A_size2,
425 unsigned int A_internal_size1,
unsigned int A_internal_size2,
428 unsigned int options2,
430 unsigned int B_start1,
unsigned int B_start2,
431 unsigned int B_inc1,
unsigned int B_inc2,
432 unsigned int B_internal_size1,
unsigned int B_internal_size2,
435 unsigned int options3,
437 unsigned int C_start1,
unsigned int C_start2,
438 unsigned int C_inc1,
unsigned int C_inc2,
439 unsigned int C_internal_size1,
unsigned int C_internal_size2)
442 if (options2 & (1 << 0))
446 if (options3 & (1 << 0))
449 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
450 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
452 if (options2 & (1 << 1))
454 if (options3 & (1 << 1))
456 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
457 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
458 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
459 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
460 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
464 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
465 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
466 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
467 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
468 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
473 if (options3 & (1 << 1))
475 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
476 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
477 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
478 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
479 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
483 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
484 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
485 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
486 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
487 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
494 template <
typename T>
497 unsigned int A_start1,
unsigned int A_start2,
498 unsigned int A_inc1,
unsigned int A_inc2,
499 unsigned int A_size1,
unsigned int A_size2,
500 unsigned int A_internal_size1,
unsigned int A_internal_size2,
503 unsigned int options2,
505 unsigned int B_start1,
unsigned int B_start2,
506 unsigned int B_inc1,
unsigned int B_inc2,
507 unsigned int B_internal_size1,
unsigned int B_internal_size2,
510 unsigned int options3,
512 unsigned int C_start1,
unsigned int C_start2,
513 unsigned int C_inc1,
unsigned int C_inc2,
514 unsigned int C_internal_size1,
unsigned int C_internal_size2)
517 if (options2 & (1 << 0))
521 if (options3 & (1 << 0))
524 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
525 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
527 if (options2 & (1 << 1))
529 if (options3 & (1 << 1))
531 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
532 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
533 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
534 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
535 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
539 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
540 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
541 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
542 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
543 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
548 if (options3 & (1 << 1))
550 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
551 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
552 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
553 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
554 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
558 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
559 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
560 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
561 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
562 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
568 template <
typename T>
571 unsigned int A_start1,
unsigned int A_start2,
572 unsigned int A_inc1,
unsigned int A_inc2,
573 unsigned int A_size1,
unsigned int A_size2,
574 unsigned int A_internal_size1,
unsigned int A_internal_size2,
577 unsigned int options2,
579 unsigned int B_start1,
unsigned int B_start2,
580 unsigned int B_inc1,
unsigned int B_inc2,
581 unsigned int B_internal_size1,
unsigned int B_internal_size2,
584 unsigned int options3,
586 unsigned int C_start1,
unsigned int C_start2,
587 unsigned int C_inc1,
unsigned int C_inc2,
588 unsigned int C_internal_size1,
unsigned int C_internal_size2)
591 if (options2 & (1 << 0))
595 if (options3 & (1 << 0))
598 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
599 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
601 if (options2 & (1 << 1))
603 if (options3 & (1 << 1))
605 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
606 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
607 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
608 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
609 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
613 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
614 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
615 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
616 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
617 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
622 if (options3 & (1 << 1))
624 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
625 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
626 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
627 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
628 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
632 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
633 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
634 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
635 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
636 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
643 template <
typename T>
646 unsigned int A_start1,
unsigned int A_start2,
647 unsigned int A_inc1,
unsigned int A_inc2,
648 unsigned int A_size1,
unsigned int A_size2,
649 unsigned int A_internal_size1,
unsigned int A_internal_size2,
652 unsigned int options2,
654 unsigned int B_start1,
unsigned int B_start2,
655 unsigned int B_inc1,
unsigned int B_inc2,
656 unsigned int B_internal_size1,
unsigned int B_internal_size2,
659 unsigned int options3,
661 unsigned int C_start1,
unsigned int C_start2,
662 unsigned int C_inc1,
unsigned int C_inc2,
663 unsigned int C_internal_size1,
unsigned int C_internal_size2)
666 if (options2 & (1 << 0))
670 if (options3 & (1 << 0))
673 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
674 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
676 if (options2 & (1 << 1))
678 if (options3 & (1 << 1))
680 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
681 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
682 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
683 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
684 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
688 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
689 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
690 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
691 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
692 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
697 if (options3 & (1 << 1))
699 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
700 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
701 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
702 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
703 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
707 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
708 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
709 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
710 += B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
711 + C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
720 template <
typename T>
723 unsigned int A_start1,
unsigned int A_start2,
724 unsigned int A_inc1,
unsigned int A_inc2,
725 unsigned int A_size1,
unsigned int A_size2,
726 unsigned int A_internal_size1,
unsigned int A_internal_size2,
729 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
730 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
732 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
733 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
734 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
738 template <
typename T>
741 unsigned int A_start1,
unsigned int A_start2,
742 unsigned int A_inc1,
unsigned int A_inc2,
743 unsigned int A_size1,
unsigned int A_size2,
744 unsigned int A_internal_size1,
unsigned int A_internal_size2,
747 unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
749 for (
unsigned int row = gid;
row < A_size1;
row += blockDim.x * gridDim.x)
750 A[(
row * A_inc1 + A_start1) * A_internal_size2 +
row * A_inc2 + A_start2] = alpha;
757 template <
typename T>
760 unsigned int A_start1,
unsigned int A_start2,
761 unsigned int A_inc1,
unsigned int A_inc2,
762 unsigned int A_size1,
unsigned int A_size2,
763 unsigned int A_internal_size1,
unsigned int A_internal_size2,
766 unsigned int B_start1,
unsigned int B_start2,
767 unsigned int B_inc1,
unsigned int B_inc2,
768 unsigned int B_internal_size1,
unsigned int B_internal_size2,
771 unsigned int C_start1,
unsigned int C_start2,
772 unsigned int C_inc1,
unsigned int C_inc2,
773 unsigned int C_internal_size1,
unsigned int C_internal_size2,
775 unsigned int op_type)
777 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
778 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
782 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
783 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
784 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
785 = pow(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
786 C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
788 else if (op_type == 1)
790 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
791 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
792 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
793 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
794 / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
796 else if (op_type == 0)
798 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
799 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
800 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
801 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
802 * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
806 template <
typename T>
809 unsigned int A_start1,
unsigned int A_start2,
810 unsigned int A_inc1,
unsigned int A_inc2,
811 unsigned int A_size1,
unsigned int A_size2,
812 unsigned int A_internal_size1,
unsigned int A_internal_size2,
815 unsigned int B_start1,
unsigned int B_start2,
816 unsigned int B_inc1,
unsigned int B_inc2,
817 unsigned int B_internal_size1,
unsigned int B_internal_size2,
820 unsigned int C_start1,
unsigned int C_start2,
821 unsigned int C_inc1,
unsigned int C_inc2,
822 unsigned int C_internal_size1,
unsigned int C_internal_size2,
824 unsigned int op_type)
826 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
827 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
831 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
832 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
833 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
834 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
835 / C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
837 else if (op_type == 0)
839 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
840 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
841 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
842 = B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
843 * C[(
row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
852 template <
typename T>
855 unsigned int A_start1,
unsigned int A_start2,
856 unsigned int A_inc1,
unsigned int A_inc2,
857 unsigned int A_size1,
unsigned int A_size2,
858 unsigned int A_internal_size1,
unsigned int A_internal_size2,
861 unsigned int B_start1,
unsigned int B_start2,
862 unsigned int B_inc1,
unsigned int B_inc2,
863 unsigned int B_internal_size1,
unsigned int B_internal_size2)
865 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
866 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
868 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
869 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
870 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
875 template <
typename T>
878 unsigned int A_start1,
unsigned int A_start2,
879 unsigned int A_inc1,
unsigned int A_inc2,
880 unsigned int A_size1,
unsigned int A_size2,
881 unsigned int A_internal_size1,
unsigned int A_internal_size2,
884 unsigned int B_start1,
unsigned int B_start2,
885 unsigned int B_inc1,
unsigned int B_inc2,
886 unsigned int B_internal_size1,
unsigned int B_internal_size2)
888 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
889 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
891 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
892 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
893 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
898 template <
typename T>
901 unsigned int A_start1,
unsigned int A_start2,
902 unsigned int A_inc1,
unsigned int A_inc2,
903 unsigned int A_size1,
unsigned int A_size2,
904 unsigned int A_internal_size1,
unsigned int A_internal_size2,
907 unsigned int B_start1,
unsigned int B_start2,
908 unsigned int B_inc1,
unsigned int B_inc2,
909 unsigned int B_internal_size1,
unsigned int B_internal_size2)
911 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
912 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
914 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
915 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
916 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
921 template <
typename T>
924 unsigned int A_start1,
unsigned int A_start2,
925 unsigned int A_inc1,
unsigned int A_inc2,
926 unsigned int A_size1,
unsigned int A_size2,
927 unsigned int A_internal_size1,
unsigned int A_internal_size2,
930 unsigned int B_start1,
unsigned int B_start2,
931 unsigned int B_inc1,
unsigned int B_inc2,
932 unsigned int B_internal_size1,
unsigned int B_internal_size2)
934 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
935 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
937 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
938 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
939 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
944 template <
typename T>
947 unsigned int A_start1,
unsigned int A_start2,
948 unsigned int A_inc1,
unsigned int A_inc2,
949 unsigned int A_size1,
unsigned int A_size2,
950 unsigned int A_internal_size1,
unsigned int A_internal_size2,
953 unsigned int B_start1,
unsigned int B_start2,
954 unsigned int B_inc1,
unsigned int B_inc2,
955 unsigned int B_internal_size1,
unsigned int B_internal_size2)
957 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
958 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
960 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
961 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
962 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(
row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
/** @brief Element-wise cosine on row-major storage: A(i,j) = cos(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_cos_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise hyperbolic cosine on row-major storage: A(i,j) = cosh(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_cosh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise exponential on row-major storage: A(i,j) = exp(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_exp_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise absolute value on row-major storage: A(i,j) = fabs(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_fabs_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise floor on row-major storage: A(i,j) = floor(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_floor_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise natural logarithm on row-major storage: A(i,j) = log(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_log_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise base-10 logarithm on row-major storage: A(i,j) = log10(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_log10_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise sine on row-major storage: A(i,j) = sin(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_sin_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise hyperbolic sine on row-major storage: A(i,j) = sinh(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_sinh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise square root on row-major storage: A(i,j) = sqrt(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_sqrt_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise tangent on row-major storage: A(i,j) = tan(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_tan_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Element-wise hyperbolic tangent on row-major storage: A(i,j) = tanh(B(i,j)) for i < A_size1, j < A_size2.
*
* Entry (i,j) of X is read/written at X[(i * X_inc1 + X_start1) * X_internal_size2 + j * X_inc2 + X_start2].
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1 and B_internal_size1 are accepted but never read.
*/
template <typename T>
__global__ void matrix_row_element_tanh_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const T * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}
/** @brief Matrix-vector product on row-major storage: result(i) = sum_j A(i,j) * v(j) for i < A_row_size, j < A_col_size.
*
* A(i,j) is read at A[(i * A_row_inc + A_row_start) * A_internal_cols + j * A_col_inc + A_col_start],
* v(j) at v[v_start + v_inc * j], and result(i) is written at result[i * result_inc + result_start].
*
* Each block handles rows blockIdx.x, blockIdx.x + gridDim.x, ...; the threads of a block accumulate
* partial sums over strided columns and combine them through the shared buffer `work`.
*
* Launch requirements that follow from the code below: blockDim.x must not exceed 128 (size of `work`)
* and must be a power of two (the halving reduction otherwise skips elements).
* A_internal_rows, v_size and result_size are accepted but never read.
*
* NOTE(review): the accumulator declaration, the store into `work`, the reduction loop header and the
* `lid == 0` guard were absent from the extracted listing and have been restored - confirm against upstream.
*/
template <typename T>
__global__ void vec_mul_row_kernel(
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          T * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  __shared__ T work[128];

  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int lid = threadIdx.x;

  // row_gid is identical for all threads of a block, so every thread runs the same number of
  // iterations and the barriers inside the loop are reached by the whole block.
  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
  {
    T dot_prod = 0;
    for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
    work[lid] = dot_prod;

    // Pairwise reduction of the per-thread partial sums.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
      __syncthreads();
      if (lid < stride)
        work[lid] += work[lid + stride];
    }

    if (lid == 0)
      result[row * result_inc + result_start] = work[0];

    // Without this barrier a thread could start the next row and overwrite its slot in `work`
    // while another thread is still reading that slot in the last reduction step.
    __syncthreads();
  }
}
/** @brief Transposed matrix-vector product on row-major storage: result(j) = sum_i A(i,j) * v(i) for j < A_col_size, i < A_row_size.
*
* A(i,j) is read at A[(j * A_col_inc + A_col_start) + (i * A_row_inc + A_row_start) * A_internal_cols],
* v(i) at v[v_start + v_inc * i], and result(j) is written at result[j * result_inc + result_start].
*
* One thread computes one entry of the result; the outer loop advances by the total number of
* threads in the grid (gridDim.x * blockDim.x). The loop variable named `row` therefore indexes
* columns of A, and `col` indexes rows of A.
* A_internal_rows, v_size and result_size are accepted but never read.
*
* NOTE(review): the accumulator declaration and the braces were absent from the extracted listing
* and have been restored - confirm against upstream.
*/
template <typename T>
__global__ void trans_vec_mul_row_kernel(
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          T * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
  {
    T dot_prod = 0;
    for (unsigned int col = 0; col < A_row_size; ++col)
      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
    result[row * result_inc + result_start] = dot_prod;
  }
}
/** @brief Scaled rank-1 update on row-major storage: A(i,j) += alpha * vec1(i) * vec2(j) for i < A_size1, j < A_size2.
*
* A(i,j) is updated at A[(i * A_inc1 + A_start1) * A_internal_size2 + j * A_inc2 + A_start2],
* vec1(i) is read at vec1[i * inc1 + start1] and vec2(j) at vec2[j * inc2 + start2].
*
* alpha is derived from `val` and the flag word `options2`:
*   - bit 1 set: alpha is replaced by its reciprocal (visible in the listing);
*   - bit 0 set: alpha is negated.
*
* NOTE(review): the `T alpha = val;` initialisation and the statement guarded by the bit-0 test were
* absent from the extracted listing. The negation is restored on the assumption that bit 0 is a
* sign flip - confirm against upstream before relying on it.
*
* Each block walks rows blockIdx.x, blockIdx.x + gridDim.x, ...; each thread of a block walks
* columns threadIdx.x, threadIdx.x + blockDim.x, ...
* A_internal_size1, size1 and size2 are accepted but never read.
*/
template <typename T>
__global__ void scaled_rank1_update_row_kernel(
          T * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          T val,
          unsigned int options2,

          const T * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const T * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  T alpha = val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = ((T)(1)) / alpha;

  // threadIdx.x < blockDim.x, so (absent 32-bit wrap-around) these equal blockIdx.x and threadIdx.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    // Row factor is constant across the column loop, so it is computed once per row.
    T tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}
// NOTE(review): residue of a second rank-1 update kernel taken from a rendered source listing.
// The leading numbers are listing line numbers; the kernel name, the pointer/scalar parameters,
// the braces and the statement guarded by the bit-0 test of options2 are absent from this view.
//
// What the visible lines establish:
//   - when bit 1 of options2 is set, alpha is replaced by its reciprocal;
//   - for row < A_size1 and col < A_size2 the update is
//       A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
//         += alpha * vec1[row * inc1 + start1] * vec2[col * inc2 + start2];
//   - rows advance by gridDim.x per block, columns by blockDim.x per thread.
//
// Presumably this is the overload of the preceding kernel that receives the scalar through a
// device pointer rather than by value - TODO confirm against upstream before reconstructing.
1374 template <
typename T>
1377 unsigned int A_start1,
unsigned int A_start2,
1378 unsigned int A_inc1,
unsigned int A_inc2,
1379 unsigned int A_size1,
unsigned int A_size2,
1380 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1383 unsigned int options2,
1396 if (options2 & (1 << 0))
1398 if (options2 & (1 << 1))
1399 alpha = ((T)(1)) / alpha;
1401 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1402 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1404 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1406 T tmp = alpha * vec1[
row * inc1 +
start1];
1407 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1408 A[(
row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 +
start2];
__global__ void trans_vec_mul_row_kernel(const T *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, T *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
Definition: matrix_operations_row.hpp:1293
__global__ void matrix_row_element_fabs_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1037
__global__ void matrix_row_element_exp_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1014
__global__ void matrix_row_element_cosh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:991
__global__ void matrix_row_element_sinh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1152
__global__ void matrix_row_diagonal_assign_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T alpha)
Definition: matrix_operations_row.hpp:739
__global__ void am_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:38
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
__global__ void matrix_row_element_acos_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:876
__global__ void element_op_int_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
Definition: matrix_operations_row.hpp:807
__global__ void matrix_row_element_ceil_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:945
__global__ void vec_mul_row_kernel(const T *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const T *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, T *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
Definition: matrix_operations_row.hpp:1248
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
__global__ void matrix_row_element_sin_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1129
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:64
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:245
__global__ void matrix_row_element_atan_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:922
__global__ void matrix_row_assign_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T alpha)
Definition: matrix_operations_row.hpp:721
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
__global__ void matrix_row_element_abs_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:853
__global__ void matrix_row_element_tanh_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1221
__global__ void ambm_m_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, T fac3, unsigned int options3, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
Definition: matrix_operations_row.hpp:420
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:83
__global__ void matrix_row_element_floor_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1060
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:910
__global__ void matrix_row_element_sqrt_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1175
__global__ void matrix_row_element_log10_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1106
void dot_prod(const MatrixType &A, unsigned int beg_ind, ScalarType &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
Definition: qr.hpp:154
__global__ void matrix_row_element_tan_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1198
__global__ void matrix_row_element_cos_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:968
__global__ void ambm_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T fac2, unsigned int options2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, T fac3, unsigned int options3, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
Definition: matrix_operations_row.hpp:117
__global__ void scaled_rank1_update_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, T val, unsigned int options2, const T *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const T *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
Definition: matrix_operations_row.hpp:1335
__global__ void matrix_row_element_log_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:1083
__global__ void element_op_row_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const T *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
Definition: matrix_operations_row.hpp:758
__global__ void matrix_row_element_asin_kernel(T *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const T *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
Definition: matrix_operations_row.hpp:899