For GCC's __int128
type
__int128 div128(__int128 a, __int128 b) {return a / b;}
GCC will generate a call to __divti3
instead of inline assembly. So where is this actually defined? Github's search only returns docs or tests that use it. I don't know if github's search function is terrible or the function is sneakily renamed somewhere.
GCC defines "Machine Modes", where TImode
means 16-byte "tetra integer", i.e. 128-bit for platforms with 8-bit bytes (pretty much all of them).
In libgcc/libgcc2.h, 64-bit platforms have #define DWtype TItype
, while for 32-bit platforms have #define DWtype DItype
(their double-word is 64-bit), so don't support __int128
.
Further along there is extern DWtype __divdi3 (DWtype, DWtype);
In libgcc/libgcc2.c, there is the following definition, which simplifies slightly to unsigned division:
#ifdef L_divdi3
DWtype
__divdi3 (DWtype u, DWtype v)
{
Wtype c = 0;
DWunion uu = {.ll = u};
DWunion vv = {.ll = v};
DWtype w;
if (uu.s.high < 0)
c = ~c,
uu.ll = -uu.ll;
if (vv.s.high < 0)
c = ~c,
vv.ll = -vv.ll;
w = __udivmoddi4 (uu.ll, vv.ll, (UDWtype *) 0);
if (c)
w = -w;
return w;
}
#endif
There are two implementations, depending on if hardware divide is available or not.
#ifdef L_udivmoddi4
#ifdef TARGET_HAS_NO_HW_DIVIDE
#if (defined (L_udivdi3) || defined (L_divdi3) || \
defined (L_umoddi3) || defined (L_moddi3) || \
defined (L_divmoddi4))
static inline __attribute__ ((__always_inline__))
#endif
UDWtype
__udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
{
UDWtype q = 0, r = n, y = d;
UWtype lz1, lz2, i, k;
/* Implements align divisor shift dividend method. This algorithm
aligns the divisor under the dividend and then perform number of
test-subtract iterations which shift the dividend left. Number of
iterations is k + 1 where k is the number of bit positions the
divisor must be shifted left to align it under the dividend.
quotient bits can be saved in the rightmost positions of the dividend
as it shifts left on each test-subtract iteration. */
if (y <= r)
{
lz1 = __builtin_clzll (d);
lz2 = __builtin_clzll (n);
k = lz1 - lz2;
y = (y << k);
/* Dividend can exceed 2 ^ (width - 1) - 1 but still be less than the
aligned divisor. Normal iteration can drops the high order bit
of the dividend. Therefore, first test-subtract iteration is a
special case, saving its quotient bit in a separate location and
not shifting the dividend. */
if (r >= y)
{
r = r - y;
q = (1ULL << k);
}
if (k > 0)
{
y = y >> 1;
/* k additional iterations where k regular test subtract shift
dividend iterations are done. */
i = k;
do
{
if (r >= y)
r = ((r - y) << 1) + 1;
else
r = (r << 1);
i = i - 1;
} while (i != 0);
/* First quotient bit is combined with the quotient bits resulting
from the k regular iterations. */
q = q + r;
r = r >> k;
q = q - (r << k);
}
}
if (rp)
*rp = r;
return q;
}
#else
#if (defined (L_udivdi3) || defined (L_divdi3) || \
defined (L_umoddi3) || defined (L_moddi3) || \
defined (L_divmoddi4))
static inline __attribute__ ((__always_inline__))
#endif
UDWtype
__udivmoddi4 (UDWtype n, UDWtype d, UDWtype *rp)
{
const DWunion nn = {.ll = n};
const DWunion dd = {.ll = d};
DWunion rr;
UWtype d0, d1, n0, n1, n2;
UWtype q0, q1;
UWtype b, bm;
d0 = dd.s.low;
d1 = dd.s.high;
n0 = nn.s.low;
n1 = nn.s.high;
#if !UDIV_NEEDS_NORMALIZATION
if (d1 == 0)
{
if (d0 > n1)
{
/* 0q = nn / 0D */
udiv_qrnnd (q0, n0, n1, n0, d0);
q1 = 0;
/* Remainder in n0. */
}
else
{
/* qq = NN / 0d */
if (d0 == 0)
d0 = 1 / d0; /* Divide intentionally by zero. */
udiv_qrnnd (q1, n1, 0, n1, d0);
udiv_qrnnd (q0, n0, n1, n0, d0);
/* Remainder in n0. */
}
if (rp != 0)
{
rr.s.low = n0;
rr.s.high = 0;
*rp = rr.ll;
}
}
#else /* UDIV_NEEDS_NORMALIZATION */
if (d1 == 0)
{
if (d0 > n1)
{
/* 0q = nn / 0D */
count_leading_zeros (bm, d0);
if (bm != 0)
{
/* Normalize, i.e. make the most significant bit of the
denominator set. */
d0 = d0 << bm;
n1 = (n1 << bm) | (n0 >> (W_TYPE_SIZE - bm));
n0 = n0 << bm;
}
udiv_qrnnd (q0, n0, n1, n0, d0);
q1 = 0;
/* Remainder in n0 >> bm. */
}
else
{
/* qq = NN / 0d */
if (d0 == 0)
d0 = 1 / d0; /* Divide intentionally by zero. */
count_leading_zeros (bm, d0);
if (bm == 0)
{
/* From (n1 >= d0) /\ (the most significant bit of d0 is set),
conclude (the most significant bit of n1 is set) /\ (the
leading quotient digit q1 = 1).
This special case is necessary, not an optimization.
(Shifts counts of W_TYPE_SIZE are undefined.) */
n1 -= d0;
q1 = 1;
}
else
{
/* Normalize. */
b = W_TYPE_SIZE - bm;
d0 = d0 << bm;
n2 = n1 >> b;
n1 = (n1 << bm) | (n0 >> b);
n0 = n0 << bm;
udiv_qrnnd (q1, n1, n2, n1, d0);
}
/* n1 != d0... */
udiv_qrnnd (q0, n0, n1, n0, d0);
/* Remainder in n0 >> bm. */
}
if (rp != 0)
{
rr.s.low = n0 >> bm;
rr.s.high = 0;
*rp = rr.ll;
}
}
#endif /* UDIV_NEEDS_NORMALIZATION */
else
{
if (d1 > n1)
{
/* 00 = nn / DD */
q0 = 0;
q1 = 0;
/* Remainder in n1n0. */
if (rp != 0)
{
rr.s.low = n0;
rr.s.high = n1;
*rp = rr.ll;
}
}
else
{
/* 0q = NN / dd */
count_leading_zeros (bm, d1);
if (bm == 0)
{
/* From (n1 >= d1) /\ (the most significant bit of d1 is set),
conclude (the most significant bit of n1 is set) /\ (the
quotient digit q0 = 0 or 1).
This special case is necessary, not an optimization. */
/* The condition on the next line takes advantage of that
n1 >= d1 (true due to program flow). */
if (n1 > d1 || n0 >= d0)
{
q0 = 1;
sub_ddmmss (n1, n0, n1, n0, d1, d0);
}
else
q0 = 0;
q1 = 0;
if (rp != 0)
{
rr.s.low = n0;
rr.s.high = n1;
*rp = rr.ll;
}
}
else
{
UWtype m1, m0;
/* Normalize. */
b = W_TYPE_SIZE - bm;
d1 = (d1 << bm) | (d0 >> b);
d0 = d0 << bm;
n2 = n1 >> b;
n1 = (n1 << bm) | (n0 >> b);
n0 = n0 << bm;
udiv_qrnnd (q0, n1, n2, n1, d1);
umul_ppmm (m1, m0, q0, d0);
if (m1 > n1 || (m1 == n1 && m0 > n0))
{
q0--;
sub_ddmmss (m1, m0, m1, m0, d1, d0);
}
q1 = 0;
/* Remainder in (n1n0 - m1m0) >> bm. */
if (rp != 0)
{
sub_ddmmss (n1, n0, n1, n0, m1, m0);
rr.s.low = (n1 << b) | (n0 >> bm);
rr.s.high = n1 >> bm;
*rp = rr.ll;
}
}
}
}
const DWunion ww = {{.low = q0, .high = q1}};
return ww.ll;
}
#endif
#endif
In include/longlong.h
/* Define auxiliary asm macros.
...
3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
denominator) divides a UDWtype, composed by the UWtype integers
HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
than DENOMINATOR for correct operation. If, in addition, the most
significant bit of DENOMINATOR must be 1, then the pre-processor symbol
UDIV_NEEDS_NORMALIZATION is defined to 1.
...
If any of these macros are left undefined for a particular CPU,
C macros are used. */
For x86-64 HW divide it uses divq
(GCC syntax).
#if defined (__x86_64__) && W_TYPE_SIZE == 64
...
#define udiv_qrnnd(q, r, n1, n0, dv) \
__asm__ ("div{q} %4" \
: "=a" ((UDItype) (q)), \
"=d" ((UDItype) (r)) \
: "0" ((UDItype) (n0)), \
"1" ((UDItype) (n1)), \
"rm" ((UDItype) (dv)))
#define count_leading_zeros(count, x) ((count) = __builtin_clzll (x))
#define count_trailing_zeros(count, x) ((count) = __builtin_ctzll (x))
#define UMUL_TIME 40
#define UDIV_TIME 40
#endif /* x86_64 */