Performance penalty of using functor to provide a function or an operator as a C++ template parameter? -


I have a family of complex functions performing very similar tasks, except for one operator right in the middle of the function. A simplified version of my code could look like this:

#include <assert.h>

// Element-wise XOR: res[x] = buffer1[x] ^ buffer2[x] for every x in [0, n).
// All three buffers are indexed up to n - 1, so each must hold at least n chars.
static void memopxor(char * buffer1, char * buffer2, char * res, unsigned n){
    for (unsigned x = 0 ; x < n ; x++){
        res[x] = buffer1[x] ^ buffer2[x];
    }
}

// Element-wise addition: res[x] = buffer1[x] + buffer2[x] for every x in [0, n).
// The sum is computed as int and converted back to char on assignment.
static void memopplus(char * buffer1, char * buffer2, char * res, unsigned n){
    for (unsigned x = 0 ; x < n ; x++){
        res[x] = buffer1[x] + buffer2[x];
    }
}

// Element-wise multiplication: res[x] = buffer1[x] * buffer2[x] for every x in [0, n).
// The product is computed as int and converted back to char on assignment.
static void memopmul(char * buffer1, char * buffer2, char * res, unsigned n){
    for (unsigned x = 0 ; x < n ; x++){
        res[x] = buffer1[x] * buffer2[x];
    }
}

// Runs each of the three operations on two identical 5-element inputs and
// checks every output element.
int main(int argc, char ** argv){
    char b1[5] = {0, 1, 2, 3, 4};
    char b2[5] = {0, 1, 2, 3, 4};

    // XOR of a value with itself is always 0.
    char res1[5] = {};
    memopxor(b1, b2, res1, 5);

    assert(res1[0] == 0);
    assert(res1[1] == 0);
    assert(res1[2] == 0);
    assert(res1[3] == 0);
    assert(res1[4] == 0);

    char res2[5] = {};
    memopplus(b1, b2, res2, 5);

    assert(res2[0] == 0);
    assert(res2[1] == 2);
    assert(res2[2] == 4);
    assert(res2[3] == 6);
    assert(res2[4] == 8);

    char res3[5] = {};
    memopmul(b1, b2, res3, 5);

    assert(res3[0] == 0);
    assert(res3[1] == 1);
    assert(res3[2] == 4);
    assert(res3[3] == 9);
    assert(res3[4] == 16);
}

It looks like a case for using C++ templates to avoid duplicating code, hence I'm looking for a way to change my code into something like the below (pseudo code):

#include <assert.h>  template <function> void memop<function>(char * buffer1, char * buffer2, char * res, size_t n){     (size_t x = 0 ; x < n ; x++){         res[x] = function(buffer1[x], buffer2[x]);     } }  int main(int argc, char ** argv){     char b1[5] = {0, 1, 2, 3, 4};     char b2[5] = {0, 1, 2, 3, 4};      char res1[5] = {};     memop<operator^>(b1, b2, res1, 5);      assert(res1[0] == 0);     assert(res1[1] == 0);     assert(res1[2] == 0);     assert(res1[3] == 0);     assert(res1[4] == 0);      char res2[5] = {};     memop<operator+>(b1, b2, res2, 5);      assert(res2[0] == 0);     assert(res2[1] == 2);     assert(res2[2] == 4);     assert(res2[3] == 6);     assert(res2[4] == 8);      char res3[5] = {};     memop<operator*>(b1, b2, res3, 5);      assert(res3[0] == 0);     assert(res3[1] == 1);     assert(res3[2] == 4);     assert(res3[3] == 9);     assert(res3[4] == 16); } 

The hard point is that I'm not willing to accept any slowdown of the resulting code. That means solutions implying indirect calls (either through a vtable or through function pointers) are not OK.

The common C++ solution to this problem seems to be wrapping the operator to call inside the operator() method of a functor class, typically to get something like the code below:

#include <assert.h>

// Applies the binary functor Op element-wise:
//   res[x] = Op()(buffer1[x], buffer2[x]) for every x in [0, n).
// Op must be default-constructible and callable as char(char, char).
// All three buffers are indexed up to n - 1, so each must hold at least n chars.
template <typename Op>
void memop(char * buffer1, char * buffer2, char * res, unsigned n){
    Op o;
    for (unsigned x = 0 ; x < n ; x++){
        res[x] = o(buffer1[x], buffer2[x]);
    }
}

// Bitwise XOR functor. It cannot be named `xor`: C++ reserves that spelling
// as an alternative token for the `^` operator.
struct Xor {
    char operator()(char a, char b) const {
        return a ^ b;
    }
};

// Addition functor (result converted back to char).
struct Plus {
    char operator()(char a, char b) const {
        return a + b;
    }
};

// Multiplication functor (result converted back to char).
struct Mul {
    char operator()(char a, char b) const {
        return a * b;
    }
};

// Instantiates memop once per functor on two identical 5-element inputs and
// checks every output element.
int main(int argc, char ** argv){
    char b1[5] = {0, 1, 2, 3, 4};
    char b2[5] = {0, 1, 2, 3, 4};

    char res1[5] = {};
    memop<Xor>(b1, b2, res1, 5);

    assert(res1[0] == 0);
    assert(res1[1] == 0);
    assert(res1[2] == 0);
    assert(res1[3] == 0);
    assert(res1[4] == 0);

    char res2[5] = {};
    memop<Plus>(b1, b2, res2, 5);

    assert(res2[0] == 0);
    assert(res2[1] == 2);
    assert(res2[2] == 4);
    assert(res2[3] == 6);
    assert(res2[4] == 8);

    char res3[5] = {};
    memop<Mul>(b1, b2, res3, 5);

    assert(res3[0] == 0);
    assert(res3[1] == 1);
    assert(res3[2] == 4);
    assert(res3[3] == 9);
    assert(res3[4] == 16);
}

Is there any performance penalty for doing this?

The code you posted is pretty useless as far as benchmarks go.

char cversion() {     char b1[5] = {0, 1, 2, 3, 4};     char b2[5] = {0, 1, 2, 3, 4};      char res1[5] = {};     memopxor(b1, b2, res1, 5);      return res1[4]; }  char cppversion() {     char b1[5] = {0, 1, 2, 3, 4};     char b2[5] = {0, 1, 2, 3, 4};      char res1[5] = {};     memop<xor>(b1, b2, res1, 5);      return res1[4]; } 

is compiled to the following LLVM IR:

; Optimized output for cversion and cppversion: each body consists solely of
; `ret i8 0`, i.e. no loop and no call remains in either function.
define signext i8 @cversion()() nounwind uwtable readnone {   ret i8 0 }  define signext i8 @cppversion()() nounwind uwtable readnone {   ret i8 0 } 

That is, the compiler performs the whole computation during compilation.

So I took the liberty of defining a new function:

void cppmemopxor(char * buffer1,                  char * buffer2,                  char * res,                  unsigned n) {   memop<xor>(buffer1, buffer2, res, n); } 

and removed the static qualifier on memopxor, then repeated the experiment:

; Optimized IR for the hand-written memopxor loop.
define void @memopxor(char*, char*, char*, unsigned int)(i8* nocapture %buffer1, i8* nocapture %buffer2, i8* nocapture %res, i32 %n) nounwind uwtable {
  ; n == 0: skip the loop entirely.
  %1 = icmp eq i32 %n, 0
  br i1 %1, label %._crit_edge, label %.lr.ph

.lr.ph:                                           ; preds = %.lr.ph, %0
  ; 64-bit induction variable, starting at 0.
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i8* %buffer1, i64 %indvars.iv
  %3 = load i8* %2, align 1, !tbaa !0
  %4 = getelementptr inbounds i8* %buffer2, i64 %indvars.iv
  %5 = load i8* %4, align 1, !tbaa !0
  ; The operation itself: a single xor instruction.
  %6 = xor i8 %5, %3
  %7 = getelementptr inbounds i8* %res, i64 %indvars.iv
  store i8 %6, i8* %7, align 1, !tbaa !0
  %indvars.iv.next = add i64 %indvars.iv, 1
  ; Truncate the counter to 32 bits to compare it against n.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ret void
}

And the C++ version with templates:

; Optimized IR for cppmemopxor. No call instruction appears: the only trace of
; the memop<Xor> instantiation is its mangled name in the exit label.
define void @cppmemopxor(char*, char*, char*, unsigned int)(i8* nocapture %buffer1, i8* nocapture %buffer2, i8* nocapture %res, i32 %n) nounwind uwtable {
  ; n == 0: skip the loop entirely.
  %1 = icmp eq i32 %n, 0
  br i1 %1, label %_ZL5memopI3XorEvPcS1_S1_j.exit, label %.lr.ph.i

.lr.ph.i:                                         ; preds = %.lr.ph.i, %0
  ; 64-bit induction variable, starting at 0.
  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %.lr.ph.i ], [ 0, %0 ]
  %2 = getelementptr inbounds i8* %buffer1, i64 %indvars.iv.i
  %3 = load i8* %2, align 1, !tbaa !0
  %4 = getelementptr inbounds i8* %buffer2, i64 %indvars.iv.i
  %5 = load i8* %4, align 1, !tbaa !0
  ; The functor body, reduced to a single xor instruction.
  %6 = xor i8 %5, %3
  %7 = getelementptr inbounds i8* %res, i64 %indvars.iv.i
  store i8 %6, i8* %7, align 1, !tbaa !0
  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
  ; Truncate the counter to 32 bits to compare it against n.
  %lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %_ZL5memopI3XorEvPcS1_S1_j.exit, label %.lr.ph.i

_ZL5memopI3XorEvPcS1_S1_j.exit:                   ; preds = %.lr.ph.i, %0
  ret void
}

As expected, it is structurally identical, as the functor code has been completely inlined (which is visible even without understanding the IR).

Note that this is not a result in isolation. For example, std::sort performs two to three times as fast as qsort because it uses a functor instead of an indirect function call. Of course, using a templated function and a functor means that each different instantiation will generate new code, as if you had coded the function manually, but this is exactly what you were doing manually anyway.


Comments

Popular posts from this blog

python - ('The SQL contains 0 parameter markers, but 50 parameters were supplied', 'HY000') or TypeError: 'tuple' object is not callable -

objective c - Language Translation API for iPhone -

jasper reports - Fixed header in Excel using JasperReports -