Tags: c, gcc, integer-division, libgcc

Where is GCC's __divti3 defined?


For GCC's __int128 type:

__int128 div128(__int128 a, __int128 b) {return a / b;}

GCC will generate a call to __divti3 instead of inlining the division. So where is this actually defined? GitHub's search only returns docs or tests that use it, so I don't know whether GitHub's search is just bad or the function is sneakily renamed somewhere.


Solution

  • GCC defines "Machine Modes", where TImode means 16-byte "tetra integer", i.e. 128-bit for platforms with 8-bit bytes (pretty much all of them).
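
    These modes are exposed in C through the mode attribute, which is (roughly) how libgcc builds its 128-bit typedefs; a paraphrased sketch:

    /* 128-bit integer typedefs via machine modes (paraphrased from
       libgcc2.h; the base type is irrelevant, the mode sets the width).  */
    typedef int TItype __attribute__ ((mode (TI)));
    typedef unsigned int UTItype __attribute__ ((mode (TI)));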

    In libgcc/libgcc2.h, 64-bit platforms get #define DWtype TItype, while 32-bit platforms get #define DWtype DItype (their double-word is 64-bit), so they don't support __int128.

    Further along there is the declaration extern DWtype __divdi3 (DWtype, DWtype);. On 64-bit targets the di-suffixed names in this file are remapped to ti-suffixed symbols by preprocessor macros in the same header, which is why the source spells it __divdi3 even though the emitted symbol is __divti3 (so yes, it is "sneakily renamed").
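
    For reference, the code below accesses the two halves of a double-word through DWunion; a paraphrased sketch of that helper, instantiated for a 64-bit little-endian target (the real header orders the fields by endianness and spells the types Wtype/DWtype):

    typedef union
    {
      struct { long long low, high; } s;   /* the two word-sized halves */
      __int128 ll;                         /* the whole double-word value */
    } DWunion;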

    In libgcc/libgcc2.c there is the following definition, which reduces the signed division to a call to the unsigned routine __udivmoddi4 (a standalone sketch of the same sign-fixup idea with plain 64-bit types follows the listing):

    #ifdef L_divdi3
    DWtype
    __divdi3 (DWtype u, DWtype v)
    {
      Wtype c = 0;
      DWunion uu = {.ll = u};
      DWunion vv = {.ll = v};
      DWtype w;
    
      if (uu.s.high < 0)
        c = ~c,
        uu.ll = -uu.ll;
      if (vv.s.high < 0)
        c = ~c,
        vv.ll = -vv.ll;
    
      w = __udivmoddi4 (uu.ll, vv.ll, (UDWtype *) 0);
      if (c)
        w = -w;
    
      return w;
    }
    #endif
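
    The same sign-fixup idea, written with ordinary 64-bit types so it compiles on its own (a minimal sketch, not the libgcc code):

    /* Reduce signed division to unsigned division, negating the quotient
       when exactly one operand was negative.  Illustration only; like the
       builtin, LLONG_MIN / -1 remains undefined.  */
    long long sdiv64_sketch (long long u, long long v)
    {
      int negate = 0;
      unsigned long long uu = u, vv = v;

      if (u < 0) { negate = !negate; uu = -uu; }
      if (v < 0) { negate = !negate; vv = -vv; }

      unsigned long long w = uu / vv;   /* stand-in for __udivmoddi4 */
      return negate ? -(long long) w : (long long) w;
    }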
    

    There are two implementations of __udivmoddi4, depending on whether a hardware divide instruction is available (a simplified standalone sketch of the software shift-and-subtract approach follows the listing).

    #ifdef L_udivmoddi4
    #ifdef TARGET_HAS_NO_HW_DIVIDE
    
    #if (defined (L_udivdi3) || defined (L_divdi3) || \
         defined (L_umoddi3) || defined (L_moddi3) || \
         defined (L_divmoddi4))
    static inline __attribute__ ((__always_inline__))
    #endif
    UDWtype
    __udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
    {
      UDWtype q = 0, r = n, y = d;
      UWtype lz1, lz2, i, k;
    
      /* Implements align divisor shift dividend method. This algorithm
         aligns the divisor under the dividend and then perform number of
         test-subtract iterations which shift the dividend left. Number of
         iterations is k + 1 where k is the number of bit positions the
         divisor must be shifted left to align it under the dividend.
         quotient bits can be saved in the rightmost positions of the dividend
         as it shifts left on each test-subtract iteration. */
    
      if (y <= r)
        {
          lz1 = __builtin_clzll (d);
          lz2 = __builtin_clzll (n);
    
          k = lz1 - lz2;
          y = (y << k);
    
          /* Dividend can exceed 2 ^ (width - 1) - 1 but still be less than the
         aligned divisor. Normal iteration can drops the high order bit
         of the dividend. Therefore, first test-subtract iteration is a
         special case, saving its quotient bit in a separate location and
         not shifting the dividend. */
          if (r >= y)
        {
          r = r - y;
          q =  (1ULL << k);
        }
    
          if (k > 0)
        {
          y = y >> 1;
    
          /* k additional iterations where k regular test subtract shift
            dividend iterations are done.  */
          i = k;
          do
            {
              if (r >= y)
            r = ((r - y) << 1) + 1;
              else
            r =  (r << 1);
              i = i - 1;
            } while (i != 0);
    
          /* First quotient bit is combined with the quotient bits resulting
             from the k regular iterations.  */
          q = q + r;
          r = r >> k;
          q = q - (r << k);
        }
        }
    
      if (rp)
        *rp = r;
      return q;
    }
    
    #else
    
    #if (defined (L_udivdi3) || defined (L_divdi3) || \
         defined (L_umoddi3) || defined (L_moddi3) || \
         defined (L_divmoddi4))
    static inline __attribute__ ((__always_inline__))
    #endif
    UDWtype
    __udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
    {
      const DWunion nn = {.ll = n};
      const DWunion dd = {.ll = d};
      DWunion rr;
      UWtype d0, d1, n0, n1, n2;
      UWtype q0, q1;
      UWtype b, bm;
    
      d0 = dd.s.low;
      d1 = dd.s.high;
      n0 = nn.s.low;
      n1 = nn.s.high;
    
    #if !UDIV_NEEDS_NORMALIZATION
      if (d1 == 0)
        {
          if (d0 > n1)
        {
          /* 0q = nn / 0D */
    
          udiv_qrnnd (q0, n0, n1, n0, d0);
          q1 = 0;
    
          /* Remainder in n0.  */
        }
          else
        {
          /* qq = NN / 0d */
    
          if (d0 == 0)
            d0 = 1 / d0;    /* Divide intentionally by zero.  */
    
          udiv_qrnnd (q1, n1, 0, n1, d0);
          udiv_qrnnd (q0, n0, n1, n0, d0);
    
          /* Remainder in n0.  */
        }
    
          if (rp != 0)
        {
          rr.s.low = n0;
          rr.s.high = 0;
          *rp = rr.ll;
        }
        }
    
    #else /* UDIV_NEEDS_NORMALIZATION */
    
      if (d1 == 0)
        {
          if (d0 > n1)
        {
          /* 0q = nn / 0D */
    
          count_leading_zeros (bm, d0);
    
          if (bm != 0)
            {
              /* Normalize, i.e. make the most significant bit of the
             denominator set.  */
    
              d0 = d0 << bm;
              n1 = (n1 << bm) | (n0 >> (W_TYPE_SIZE - bm));
              n0 = n0 << bm;
            }
    
          udiv_qrnnd (q0, n0, n1, n0, d0);
          q1 = 0;
    
          /* Remainder in n0 >> bm.  */
        }
          else
        {
          /* qq = NN / 0d */
    
          if (d0 == 0)
            d0 = 1 / d0;    /* Divide intentionally by zero.  */
    
          count_leading_zeros (bm, d0);
    
          if (bm == 0)
            {
              /* From (n1 >= d0) /\ (the most significant bit of d0 is set),
             conclude (the most significant bit of n1 is set) /\ (the
             leading quotient digit q1 = 1).
    
             This special case is necessary, not an optimization.
             (Shifts counts of W_TYPE_SIZE are undefined.)  */
    
              n1 -= d0;
              q1 = 1;
            }
          else
            {
              /* Normalize.  */
    
              b = W_TYPE_SIZE - bm;
    
              d0 = d0 << bm;
              n2 = n1 >> b;
              n1 = (n1 << bm) | (n0 >> b);
              n0 = n0 << bm;
    
              udiv_qrnnd (q1, n1, n2, n1, d0);
            }
    
          /* n1 != d0...  */
    
          udiv_qrnnd (q0, n0, n1, n0, d0);
    
          /* Remainder in n0 >> bm.  */
        }
    
          if (rp != 0)
        {
          rr.s.low = n0 >> bm;
          rr.s.high = 0;
          *rp = rr.ll;
        }
        }
    #endif /* UDIV_NEEDS_NORMALIZATION */
    
      else
        {
          if (d1 > n1)
        {
          /* 00 = nn / DD */
    
          q0 = 0;
          q1 = 0;
    
          /* Remainder in n1n0.  */
          if (rp != 0)
            {
              rr.s.low = n0;
              rr.s.high = n1;
              *rp = rr.ll;
            }
        }
          else
        {
          /* 0q = NN / dd */
    
          count_leading_zeros (bm, d1);
          if (bm == 0)
            {
              /* From (n1 >= d1) /\ (the most significant bit of d1 is set),
             conclude (the most significant bit of n1 is set) /\ (the
             quotient digit q0 = 0 or 1).
    
             This special case is necessary, not an optimization.  */
    
              /* The condition on the next line takes advantage of that
             n1 >= d1 (true due to program flow).  */
              if (n1 > d1 || n0 >= d0)
            {
              q0 = 1;
              sub_ddmmss (n1, n0, n1, n0, d1, d0);
            }
              else
            q0 = 0;
    
              q1 = 0;
    
              if (rp != 0)
            {
              rr.s.low = n0;
              rr.s.high = n1;
              *rp = rr.ll;
            }
            }
          else
            {
              UWtype m1, m0;
              /* Normalize.  */
    
              b = W_TYPE_SIZE - bm;
    
              d1 = (d1 << bm) | (d0 >> b);
              d0 = d0 << bm;
              n2 = n1 >> b;
              n1 = (n1 << bm) | (n0 >> b);
              n0 = n0 << bm;
    
              udiv_qrnnd (q0, n1, n2, n1, d1);
              umul_ppmm (m1, m0, q0, d0);
    
              if (m1 > n1 || (m1 == n1 && m0 > n0))
            {
              q0--;
              sub_ddmmss (m1, m0, m1, m0, d1, d0);
            }
    
              q1 = 0;
    
              /* Remainder in (n1n0 - m1m0) >> bm.  */
              if (rp != 0)
            {
              sub_ddmmss (n1, n0, n1, n0, m1, m0);
              rr.s.low = (n1 << b) | (n0 >> bm);
              rr.s.high = n1 >> bm;
              *rp = rr.ll;
            }
            }
        }
        }
    
      const DWunion ww = {{.low = q0, .high = q1}};
      return ww.ll;
    }
    #endif
    #endif
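
    The no-hardware-divide variant is essentially binary long division. A simpler textbook form of the same test-subtract idea, on ordinary 64-bit operands (a sketch for illustration, not the libgcc code):

    /* Restoring ("shift and subtract") division: bring down one dividend
       bit per iteration, subtract the divisor whenever it fits, and record
       a quotient bit.  Assumes d != 0.  */
    unsigned long long
    udivmod64_sketch (unsigned long long n, unsigned long long d,
                      unsigned long long *rp)
    {
      unsigned long long q = 0, r = 0;

      for (int i = 63; i >= 0; i--)
        {
          /* carry is the bit shifted out of r; when set, the true partial
             remainder exceeds 64 bits and the subtraction always fits
             (analogous to the high-order-bit concern in the libgcc
             comment above).  */
          int carry = (int) (r >> 63);
          r = (r << 1) | ((n >> i) & 1);
          if (carry || r >= d)
            {
              r -= d;
              q |= 1ULL << i;
            }
        }

      if (rp)
        *rp = r;
      return q;
    }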
    

    In include/longlong.h:

    /* Define auxiliary asm macros.
    ...
       3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
       denominator) divides a UDWtype, composed by the UWtype integers
       HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
       in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
       than DENOMINATOR for correct operation.  If, in addition, the most
       significant bit of DENOMINATOR must be 1, then the pre-processor symbol
       UDIV_NEEDS_NORMALIZATION is defined to 1.
    ...
       If any of these macros are left undefined for a particular CPU,
       C macros are used.  */
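
    In portable terms, udiv_qrnnd (q, r, n1, n0, d) divides the two-word value n1:n0 by the single word d, with the precondition n1 < d so the quotient fits in one word. For 64-bit words its contract can be modeled like this (a reference sketch only, since it leans on 128-bit division itself):

    void udiv_qrnnd_ref (unsigned long long *q, unsigned long long *r,
                         unsigned long long n1, unsigned long long n0,
                         unsigned long long d)
    {
      /* Reassemble the two-word numerator and divide; n1 < d guarantees
         the quotient fits in 64 bits.  */
      unsigned __int128 n = ((unsigned __int128) n1 << 64) | n0;

      *q = (unsigned long long) (n / d);
      *r = (unsigned long long) (n % d);
    }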
    

    For x86-64 with a hardware divide, udiv_qrnnd uses the divq instruction (shown in GCC's extended-asm syntax):

    #if defined (__x86_64__) && W_TYPE_SIZE == 64
    ...
    #define udiv_qrnnd(q, r, n1, n0, dv) \
      __asm__ ("div{q} %4"                          \
           : "=a" ((UDItype) (q)),                  \
             "=d" ((UDItype) (r))                   \
           : "0" ((UDItype) (n0)),                  \
             "1" ((UDItype) (n1)),                  \
             "rm" ((UDItype) (dv)))
    #define count_leading_zeros(count, x)   ((count) = __builtin_clzll (x))
    #define count_trailing_zeros(count, x)  ((count) = __builtin_ctzll (x))
    #define UMUL_TIME 40
    #define UDIV_TIME 40
    #endif /* x86_64 */
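
    Here "=a" and "=d" bind the quotient and remainder outputs to rax and rdx, and the "0"/"1" input constraints load the low and high numerator words into those same registers; divq then divides the 128-bit value in rdx:rax by its operand, leaving the quotient in rax and the remainder in rdx, which is exactly the contract udiv_qrnnd promises.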