ViennaCL - The Vienna Computing Library  1.5.2
scalar_reduction.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP
2 #define VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
21 
27 #include <vector>
28 
30 
32 
35 
37 
38 #include "viennacl/tools/tools.hpp"
39 
40 namespace viennacl{
41 
42  namespace generator{
43 
46  private:
47  typedef std::vector<std::pair<const char *, viennacl::ocl::handle<cl_mem> > > temporaries_type;
48 
49  static void fill_scalartypes(statements_type statements, std::vector<const char *> & res){
50  res.reserve(statements.size());
51  for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
52  if (it->second.lhs.type_family == scheduler::SCALAR_TYPE_FAMILY)
53  {
54  switch(it->second.lhs.numeric_type){
56  res.push_back("float");
57  break;
59  res.push_back("double");
60  break;
61  default:
62  res.push_back("");
63  break;
64  }
65  }
66  else
67  {
68  res.push_back("");
69  }
70  }
71  }
72 
73  public:
74 
75  vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
76  return local_size_1_*scalartype_size;
77  }
78 
79  void init_temporaries(statements_type const & statements) const {
80  if(temporaries_.empty()){
81  //set temporary buffer argument
82  for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
83  scheduler::statement::container_type const & array = it->first.array();
84  vcl_size_t size_of_scalartype;
85  const char * scalartype_name;
86  if (array[0].lhs.type_family != scheduler::SCALAR_TYPE_FAMILY) throw "not implemented";
87  switch(array[0].lhs.numeric_type){
88  case scheduler::FLOAT_TYPE: scalartype_name = "float"; size_of_scalartype = sizeof(float); break;
89  case scheduler::DOUBLE_TYPE: scalartype_name = "double"; size_of_scalartype = sizeof(double); break;
90  default: throw "not implemented";
91  }
92  for(scheduler::statement::container_type::const_iterator iit = array.begin() ; iit != array.end() ; ++iit){
94  temporaries_.push_back(std::make_pair(scalartype_name, viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast<unsigned int>(num_groups_*size_of_scalartype))));
95  }
96  }
97  }
98  }
99  }
100 
101  void set_size_argument(viennacl::scheduler::statement const & s, viennacl::scheduler::statement_node const & /*root_node*/, unsigned int & n_arg, viennacl::ocl::kernel & k) const {
103  for(scheduler::statement::container_type::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
105  //set size argument
106  scheduler::statement_node const * current_node = &(*it);
107 
109  //The LHS of the prod is a vector
111  {
112  vector_size = utils::call_on_vector(current_node->lhs, utils::internal_size_fun());
113  }
114  else{
115  //The LHS of the prod is a vector expression
116  current_node = &exprs[current_node->lhs.node_index];
118  {
119  vector_size = cl_uint(utils::call_on_vector(current_node->lhs, utils::internal_size_fun()));
120  }
121  else if(current_node->rhs.type_family==scheduler::VECTOR_TYPE_FAMILY)
122  {
123  vector_size = cl_uint(utils::call_on_vector(current_node->lhs, utils::internal_size_fun()));
124  }
125  else{
126  assert(false && bool("unexpected expression tree"));
127  }
128  }
129  k.arg(n_arg++, cl_uint(vector_size/vector_size_));
130  }
131  }
132  }
133 
134  public:
136  scalar_reduction(unsigned int vectorization, unsigned int local_size, unsigned int num_groups, unsigned int decomposition) : profile_base(vectorization, local_size, 1, 2), num_groups_(num_groups), decomposition_(decomposition){ }
137 
138 
139  static std::string csv_format() {
140  return "Vec,LSize,NumGroups,GlobalDecomposition";
141  }
142 
143  std::string csv_representation() const{
144  std::ostringstream oss;
145  oss << vector_size_
146  << "," << local_size_1_
147  << "," << num_groups_
148  << "," << decomposition_;
149  return oss.str();
150  }
151 
152  unsigned int num_groups() const { return num_groups_; }
153 
154 
155  unsigned int decomposition() const { return decomposition_; }
156 
157 
158  void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg) const{
159 
160  //create temporaries
161  init_temporaries(statements);
162 
163  //configure ND range
164  if(kernel_id==0){
165  configure_local_sizes(k, 0);
166 
167  vcl_size_t gsize = local_size_1_*num_groups_;
168  k.global_work_size(0,gsize);
169  k.global_work_size(1,1);
170  }
171  else{
172  configure_local_sizes(k, 1);
173 
175  k.global_work_size(1,1);
176  }
177 
178  //set arguments
179  set_size_argument(statements.front().first, statements.front().second, n_arg, k);
180  for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){
181  k.arg(n_arg++, it->second);
182  }
183  }
184 
185  void kernel_arguments(statements_type const & statements, std::string & arguments_string) const{
186  init_temporaries(statements);
187  arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
188  for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){
189  arguments_string += detail::generate_pointer_kernel_argument("__global", it->first, "temp" + utils::to_string(std::distance(temporaries_.begin(), it)));
190  }
191  }
192 
193  private:
194 
195  void core_0(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> const & scalartypes, statements_type const & /*statements*/, std::vector<detail::mapping_type> const & /*mapping*/) const {
196 
197  stream << "unsigned int lid = get_local_id(0);" << std::endl;
198 
199  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
200  stream << scalartypes[k] << " sum" << k << " = 0;" << std::endl;
201 
202  if(decomposition_){
203  stream << "for(unsigned int i = get_global_id(0) ; i < N ; i += get_global_size(0)){" << std::endl;
204  }
205  else{
206  stream << "unsigned int chunk_size = (N + get_num_groups(0)-1)/get_num_groups(0);" << std::endl;
207  stream << "unsigned int chunk_start = get_group_id(0)*chunk_size;" << std::endl;
208  stream << "unsigned int chunk_end = min(chunk_start+chunk_size, N);" << std::endl;
209  stream << "for(unsigned int i = chunk_start + get_local_id(0) ; i < chunk_end ; i += get_local_size(0)){" << std::endl;
210  }
211  stream.inc_tab();
212 
213  //Fetch vector entry
214  std::set<std::string> fetched;
215 
216  for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
217  viennacl::scheduler::statement const & statement = (*it)->statement();
218  viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
219  detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("i", "0"),vector_size_,stream,(*it)->mapping());
220  detail::fetch_all_rhs(fetched,statement,root_node, std::make_pair("i", "0"),vector_size_,stream,(*it)->mapping());
221  }
222 
223 
224  //Update sums;
225  for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
226  viennacl::scheduler::statement const & statement = (*it)->statement();
227  viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
228  if(vector_size_ > 1){
229  for(unsigned int a = 0 ; a < vector_size_ ; ++a){
230  std::string str;
231  detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),a,str,(*it)->mapping());
232  str += "*";
233  detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),a,str,(*it)->mapping());
234  stream << " sum" << std::distance(exprs.begin(),it) << " += " << str << ";" << std::endl;
235  }
236  }
237  else{
238  std::string str;
239  detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
240  str += "*";
241  detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
242  stream << " sum" << std::distance(exprs.begin(),it) << " += " << str << ";" << std::endl;
243  }
244  }
245 
246 
247  stream.dec_tab();
248  stream << "}" << std::endl;
249  //Declare and fill local memory
250  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
251  stream << "__local " << scalartypes[k] << " buf" << k << "[" << local_size_1_ << "];" << std::endl;
252 
253  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
254  stream << "buf" << k << "[lid] = sum" << k << ";" << std::endl;
255 
256  //Reduce local memory
257  for(vcl_size_t stride = local_size_1_/2 ; stride>1 ; stride /=2){
258  stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
259  stream << "if(lid < " << stride << "){" << std::endl;
260  stream.inc_tab();
261  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
262  stream << "buf" << k << "[lid] += buf" << k << "[lid + " << stride << "];" << std::endl;
263  }
264  stream.dec_tab();
265  stream << "}" << std::endl;
266  }
267 
268  //Last reduction and write back to temporary buffer
269  stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
270  stream << "if(lid==0){" << std::endl;
271  stream.inc_tab();
272  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
273  stream << "buf" << k << "[0] += buf" << k << "[1];" << std::endl;
274 
275  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
276  stream << "temp"<< k << "[get_group_id(0)] = buf" << k << "[0];" << std::endl;
277 
278  stream.dec_tab();
279  stream << "}" << std::endl;
280  }
281 
282 
283  void core_1(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> scalartypes, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
284  stream << "unsigned int lid = get_local_id(0);" << std::endl;
285 
286  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
287  stream << "__local " << scalartypes[k] << " buf" << k << "[" << local_size_1_ << "];" << std::endl;
288 
289  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
290  stream << scalartypes[0] << " sum" << k << " = 0;" << std::endl;
291 
292  stream << "for(unsigned int i = lid ; i < " << num_groups_ << " ; i += get_local_size(0)){" << std::endl;
293  stream.inc_tab();
294  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
295  stream << "sum" << k << " += temp" << k << "[i];" << std::endl;
296  stream.dec_tab();
297  stream << "}" << std::endl;
298 
299  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
300  stream << "buf" << k << "[lid] = sum" << k << ";" << std::endl;
301 
302  //Reduce local memory
303  for(vcl_size_t stride = local_size_1_/2 ; stride>1 ; stride /=2){
304  stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
305  stream << "if(lid < " << stride << "){" << std::endl;
306  stream.inc_tab();
307  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
308  stream << "buf" << k << "[lid] += buf" << k << "[lid + " << stride << "];" << std::endl;
309  }
310  stream.dec_tab();
311  stream << "}" << std::endl;
312  }
313 
314  stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
315  stream << "if(lid==0){" << std::endl;
316  stream.inc_tab();
317  for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
318  stream << "buf" << k << "[0] += buf" << k << "[1];" << std::endl;
319  exprs[k]->access_name("buf"+utils::to_string(k)+"[0]");
320  }
321 
322  vcl_size_t i = 0;
323  for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
324  std::string str;
325  detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("0", "0"), -1, str, mapping[i++]), false);
326  stream << str << ";" << std::endl;
327  }
328 
329  stream.dec_tab();
330  stream << "}" << std::endl;
331  }
332 
333  void core(vcl_size_t kernel_id, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
334  std::vector<detail::mapped_scalar_reduction*> exprs;
335  for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it)
336  for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit)
337  if(detail::mapped_scalar_reduction * p = dynamic_cast<detail::mapped_scalar_reduction*>(iit->second.get()))
338  exprs.push_back(p);
339 
340  std::vector<const char *> scalartypes;
341  fill_scalartypes(statements, scalartypes);
342 
343  if(kernel_id==0){
344  core_0(stream,exprs,scalartypes,statements,mapping);
345  }
346  else{
347  core_1(stream,exprs,scalartypes,statements,mapping);
348  }
349  }
350 
351  private:
352  unsigned int num_groups_;
353  unsigned int decomposition_;
354  mutable temporaries_type temporaries_;
355  };
356 
357 
358  }
359 
360 }
361 
362 #endif
A stream class where the kernel sources are streamed to. Takes care of indentation of the sources...
Definition: utils.hpp:233
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
Definition: kernel.hpp:124
std::size_t vcl_size_t
Definition: forwards.h:58
vcl_size_t node_index
Definition: forwards.h:276
statement(container_type const &custom_array)
Definition: forwards.h:454
Internal utils for a dynamic OpenCL kernel generation.
void kernel_arguments(statements_type const &statements, std::string &arguments_string) const
Definition: scalar_reduction.hpp:185
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:59
Base class for an operation profile.
Definition: profile_base.hpp:47
lhs_rhs_element lhs
Definition: forwards.h:422
Various little tools used here and there in ViennaCL.
Definition: forwards.h:217
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
std::list< std::pair< scheduler::statement, scheduler::statement_node > > statements_type
Definition: profile_base.hpp:49
lhs_rhs_element rhs
Definition: forwards.h:424
std::string csv_representation() const
csv representation of an operation
Definition: scalar_reduction.hpp:143
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
several code generation helpers
Definition: forwards.h:170
Base classes for the profiles.
vcl_size_t lmem_used(vcl_size_t scalartype_size) const
Definition: scalar_reduction.hpp:75
static std::string csv_format()
Definition: scalar_reduction.hpp:139
void set_size_argument(viennacl::scheduler::statement const &s, viennacl::scheduler::statement_node const &, unsigned int &n_arg, viennacl::ocl::kernel &k) const
Definition: scalar_reduction.hpp:101
void init_temporaries(statements_type const &statements) const
Definition: scalar_reduction.hpp:79
void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const &statements, viennacl::ocl::kernel &k, unsigned int &n_arg) const
Configures the range and enqueues the arguments associated with the profile.
Definition: scalar_reduction.hpp:158
viennacl::ocl::context & current_context()
Convenience function for returning the current context.
Definition: backend.hpp:192
Definition: forwards.h:173
void configure_local_sizes(viennacl::ocl::kernel &k, vcl_size_t) const
Definition: profile_base.hpp:59
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
std::vector< value_type > container_type
Definition: forwards.h:452
unsigned int vector_size_
Definition: profile_base.hpp:178
unsigned int vector_size() const
Get the vector size of the kernel.
Definition: profile_base.hpp:90
scalar_reduction(unsigned int vectorization, unsigned int local_size, unsigned int num_groups, unsigned int decomposition)
The user constructor.
Definition: scalar_reduction.hpp:136
container_type const & array() const
Definition: forwards.h:473
unsigned int decomposition() const
Definition: scalar_reduction.hpp:155
std::string to_string(T const t)
Definition: utils.hpp:204
statement_node_type_family type_family
Definition: forwards.h:269
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
Definition: forwards.h:447
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:759
Implementations for the OpenCL backend functionality.
vcl_size_t local_size_1_
Definition: profile_base.hpp:179
Definition: forwards.h:216
Main datastructure for an node in the statement tree.
Definition: forwards.h:420
Functor for returning the internal size of a vector.
Definition: utils.hpp:167
unsigned int num_groups() const
Definition: scalar_reduction.hpp:152
OpenCL kernel generation template for scalar reduction operations such as s = norm_2(x).
Definition: scalar_reduction.hpp:45