c++ assembly x86 intel-edison int128

An efficient way to do basic 128 bit integer calculations in C++?


Some years ago I needed a way to do some basic 128 bit integer math with Cuda: "128 bit integer on cuda?". Now I am having the same problem, but this time I need to run some basic 128 bit arithmetic (sums, bit shifts and multiplications) on a 32 bit embedded system (Intel Edison) that does not support 128 bit integers of any kind. There are, however, 64 bit integers supported directly (unsigned long long int).

I naively tried to reuse, on the CPU, the asm code from the answer I got last time, but I got a bunch of errors. I am not really experienced with asm, so: given 64 bit integers, what is the most efficient way to implement 128 bit additions, multiplications and bit shifts?


Solution

  • Update: Since the OP hasn't accepted an answer yet (hint, hint), I've attached a bit more code.

    Using the libraries discussed above is probably a good idea. While you might only need a few functions today, eventually you may find that you need one more. Then one more after that. Until eventually you end up writing, debugging and maintaining your own 128bit math library. Which is a waste of your time and effort.

    That said. If you are determined to roll your own:

    1) The cuda question you asked previously already has c code for multiplication. Was there some problem with it?

    2) The shift probably won't benefit from using asm, so a c solution makes sense to me here as well. Although if performance is really an issue here, I'd see if the Edison supports SHLD/SHRD, which might make this a bit faster. Otherwise, maybe an approach like this?

    /*
     * Portable C left shift of a 128-bit value.
     *
     *   a - value to shift; x is the least significant word, w the most
     *       significant (each word is treated as 32 bits wide)
     *   b - number of bit positions to shift by
     *
     * Returns a << b.  A count of 128 or more yields zero.  A count of zero
     * or less returns `a` unchanged (a negative shift count is undefined in
     * C, so it is treated as "no shift" here).
     */
    my_uint128_t lshift_uint128 (const my_uint128_t a, int b)
    {
       my_uint128_t res = { 0, 0, 0, 0 };

       if (b <= 0)
          return a;
       if (b >= 128)
          return res;

       {
          const unsigned int in[4] = { a.x, a.y, a.z, a.w };
          unsigned int out[4] = { 0, 0, 0, 0 };
          const int words = b / 32;   /* whole 32-bit words to move up */
          const int bits = b % 32;    /* remaining shift inside a word */
          int i;

          for (i = words; i < 4; i++) {
             out[i] = in[i - words] << bits;
             /* Shifting a 32-bit word by 32 is undefined, so only pull in
                the carried bits from the lower word when bits != 0. */
             if (bits != 0 && i > words)
                out[i] |= in[i - words - 1] >> (32 - bits);
          }

          res.x = out[0];
          res.y = out[1];
          res.z = out[2];
          res.w = out[3];
       }

       return res;
    }
    

    Update: Since it appears that the Edison may support SHLD/SHRD, here's an alternative which might be more performant than the 'c' code above. As with all code purporting to be faster, you should test it.

    /*
     * Double-word shift left: shifts `into` left by `c` bits and fills the
     * vacated low bits with the top bits of `from`.
     *
     *   into - word that is shifted and returned
     *   from - word supplying the bits shifted in on the right
     *   c    - shift count; only the low 5 bits are used, as the x86 SHLD
     *          instruction does for 32-bit operands, so a count of 0 or 32
     *          returns `into` unchanged
     *
     * Declared `static inline` so each translation unit carries its own
     * definition: a plain `inline` definition in C emits no external symbol
     * and fails to link whenever the call is not actually inlined.
     */
    static inline
    unsigned int __shld(unsigned int into, unsigned int from, unsigned int c)
    {
       c &= 31;

    #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
       if (!(__builtin_constant_p(into) &&
             __builtin_constant_p(from) &&
             __builtin_constant_p(c)))
       {
          unsigned int res = into;

          /* %b2 emits the count as a byte operand: %cl or an immediate. */
          __asm__("shld %b2, %1, %0"
                  : "+r" (res)
                  : "r" (from), "ic" (c)
                  : "cc");
          return res;
       }
    #endif

       /* Plain C path: used when everything is a compile-time constant (so
          the compiler can fold it) and on targets without SHLD.  A count of
          zero is handled separately because `from >> 32` is undefined. */
       if (c == 0)
          return into;

       return (into << c) | (from >> (32 - c));
    }
    
    /*
     * Double-word shift right: shifts `into` right by `c` bits and fills the
     * vacated high bits with the low bits of `from`.
     *
     *   into - word that is shifted and returned
     *   from - word supplying the bits shifted in on the left
     *   c    - shift count; only the low 5 bits are used, as the x86 SHRD
     *          instruction does for 32-bit operands, so a count of 0 or 32
     *          returns `into` unchanged
     *
     * Declared `static inline` so each translation unit carries its own
     * definition: a plain `inline` definition in C emits no external symbol
     * and fails to link whenever the call is not actually inlined.
     */
    static inline
    unsigned int __shrd(unsigned int into, unsigned int from, unsigned int c)
    {
       c &= 31;

    #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
       if (!(__builtin_constant_p(into) &&
             __builtin_constant_p(from) &&
             __builtin_constant_p(c)))
       {
          unsigned int res = into;

          /* %b2 emits the count as a byte operand: %cl or an immediate. */
          __asm__("shrd %b2, %1, %0"
                  : "+r" (res)
                  : "r" (from), "ic" (c)
                  : "cc");
          return res;
       }
    #endif

       /* Plain C path: used when everything is a compile-time constant (so
          the compiler can fold it) and on targets without SHRD.  A count of
          zero is handled separately because `from << 32` is undefined. */
       if (c == 0)
          return into;

       return (into >> c) | (from << (32 - c));
    }
    
    /*
     * Shift the 128-bit value `a` left by `b` bits using __shld to carry
     * bits from each word into the next more significant one.
     *
     * The count is split into a number of whole 32-bit words (b / 32) and a
     * remaining bit count (b % 32).  Words that end up below the shifted
     * data stay zero; a count of 128 or more gives an all-zero result.
     */
    my_uint128_t lshift_uint128 (const my_uint128_t a, unsigned int b)
    {
       my_uint128_t out = { 0, 0, 0, 0 };
       const unsigned int bits = b & 31;

       switch (b >> 5) {
       case 0:
          out.w = __shld(a.w, a.z, bits);
          out.z = __shld(a.z, a.y, bits);
          out.y = __shld(a.y, a.x, bits);
          out.x = a.x << bits;
          break;

       case 1:
          out.w = __shld(a.z, a.y, bits);
          out.z = __shld(a.y, a.x, bits);
          out.y = a.x << bits;
          break;

       case 2:
          out.w = __shld(a.y, a.x, bits);
          out.z = a.x << bits;
          break;

       case 3:
          out.w = a.x << bits;
          break;

       default:
          /* b >= 128: every bit has been shifted out. */
          break;
       }

       return out;
    }
    
    /*
     * Shift the 128-bit value `a` right by `b` bits (logical shift) using
     * __shrd to carry bits from each word into the next less significant one.
     *
     * The count is split into a number of whole 32-bit words (b / 32) and a
     * remaining bit count (b % 32).  Words that end up above the shifted
     * data stay zero; a count of 128 or more gives an all-zero result.
     */
    my_uint128_t rshift_uint128 (const my_uint128_t a, unsigned int b)
    {
       my_uint128_t out = { 0, 0, 0, 0 };
       const unsigned int bits = b & 31;

       switch (b >> 5) {
       case 0:
          out.x = __shrd(a.x, a.y, bits);
          out.y = __shrd(a.y, a.z, bits);
          out.z = __shrd(a.z, a.w, bits);
          out.w = a.w >> bits;
          break;

       case 1:
          out.x = __shrd(a.y, a.z, bits);
          out.y = __shrd(a.z, a.w, bits);
          out.z = a.w >> bits;
          break;

       case 2:
          out.x = __shrd(a.z, a.w, bits);
          out.y = a.w >> bits;
          break;

       case 3:
          out.x = a.w >> bits;
          break;

       default:
          /* b >= 128: every bit has been shifted out. */
          break;
       }

       return out;
    }
    

    3) The addition may benefit from asm. You could try this:

    /*
     * 128-bit unsigned integer stored as four unsigned int words (32 bits
     * each on the x86 targets this code is written for).  x is the least
     * significant word and w the most significant.
     *
     * The typedef makes the bare name `my_uint128_t` usable in C as well as
     * in C++, which is how the functions operating on this type spell it.
     */
    typedef struct my_uint128_t
    {
       unsigned int x;   /* bits   0..31  */
       unsigned int y;   /* bits  32..63  */
       unsigned int z;   /* bits  64..95  */
       unsigned int w;   /* bits  96..127 */
    } my_uint128_t;
    
    my_uint128_t add_uint128 (const my_uint128_t a, const my_uint128_t b)
    {
       my_uint128_t res;
    
        asm ("addl %5, %[resx]\n\t"
             "adcl %7, %[resy]\n\t"
             "adcl %9, %[resz]\n\t"
             "adcl %11, %[resw]\n\t"
             : [resx] "=&r" (res.x), [resy] "=&r" (res.y), 
               [resz] "=&r" (res.z), [resw] "=&r" (res.w)
             : "%0"(a.x), "irm"(b.x), 
               "%1"(a.y), "irm"(b.y), 
               "%2"(a.z), "irm"(b.z), 
               "%3"(a.w), "irm"(b.w)
             : "cc");
    
       return res;
    }
    

    I just dashed this off, so use at your own risk. I don't have an Edison, but this works with x86.

    Update: If you are just doing accumulation (think to += from instead of the code above which is c = a + b), this code might serve you better:

    /*
     * In-place 128-bit accumulate: *to += from.
     *
     * The four words of *to are read-write register operands, added from
     * the least significant (x) to the most significant (w) with ADD and
     * three ADCs.  The words of `from` may be immediates, registers or
     * memory.  The carry out of the top word is discarded.
     */
    inline
    void addto_uint128 (my_uint128_t *to, const my_uint128_t from)
    {
       asm ("addl %[src0], %[dst0]\n\t"
            "adcl %[src1], %[dst1]\n\t"
            "adcl %[src2], %[dst2]\n\t"
            "adcl %[src3], %[dst3]\n\t"
            : [dst0] "+&r"(to->x),
              [dst1] "+&r"(to->y),
              [dst2] "+&r"(to->z),
              [dst3] "+&r"(to->w)
            : [src0] "irm"(from.x),
              [src1] "irm"(from.y),
              [src2] "irm"(from.z),
              [src3] "irm"(from.w)
            : "cc");
    }