Here they are:
/* Maj(a, b, c): bitwise majority of three vectors, (a & b) ^ (b & c) ^ (a & c). */
#define majority(a, b, c) \
    _mm256_xor_si256( \
        _mm256_xor_si256(_mm256_and_si256((a), (b)), \
                         _mm256_and_si256((b), (c))), \
        _mm256_and_si256((a), (c)))
/* Ch(e, f, g): (e & f) ^ ((e ^ allones) & g).
 * `allones` is not a parameter; it must be in scope wherever the macro is
 * expanded (presumably a vector with every bit set, making e ^ allones
 * equal to ~e -- confirm at its definition). */
#define choose(e, f, g) \
    _mm256_xor_si256( \
        _mm256_and_si256((e), (f)), \
        _mm256_and_si256(_mm256_xor_si256((e), allones), (g)))
/* Rotate every 32-bit element of x right by n bits: the bits shifted out on
 * the right are OR-ed back in on the left. x is expanded twice. */
#define ROR32_AVX(x, n) \
    _mm256_or_si256( \
        _mm256_srli_epi32((x), (n)), \
        _mm256_slli_epi32((x), 32 - (n)))
/* XOR of x rotated right by 2, 13 and 22 bits (the rotation amounts of the
 * SHA-256 "big sigma 0" function applied to working variable a). */
#define sigma0(x) \
    _mm256_xor_si256( \
        _mm256_xor_si256(ROR32_AVX((x), 2), ROR32_AVX((x), 13)), \
        ROR32_AVX((x), 22))
/* XOR of x rotated right by 6, 11 and 25 bits (the rotation amounts of the
 * SHA-256 "big sigma 1" function applied to working variable e). */
#define sigma1(x) \
    _mm256_xor_si256( \
        _mm256_xor_si256(ROR32_AVX((x), 6), ROR32_AVX((x), 11)), \
        ROR32_AVX((x), 25))
/*
 * One round step, expanded in place so it can assign to the caller's
 * variables. maj, ch, s0 and s1 name the helper macros to apply; k and m are
 * added into the sum (the calls in this post pass a round constant and a
 * message word).
 *
 * NOTE(review): the replacement text reads the variables e, f, g, a, b, c
 * directly rather than the parameters ez, fz, gz, az, bz, cz, so rotating
 * the register arguments between calls does not change which values are
 * fed to ch/s1/maj/s0. The parameters bz, cz, fz and gz are never used.
 * The expansion is three separate statements with no enclosing braces.
 */
#define step(maj, ch, s0, s1, az, bz, cz, dz, ez, fz, gz, hz, k, m) \
ez = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(ch(e, f, g), s1(e)), hz), k), m), dz); \
az = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(ch(e, f, g), s1(e)), hz), k), m); \
az = _mm256_add_epi32(az, _mm256_add_epi32(maj(a, b, c), s0(a)));
I have checked the SHA-256 standard and the Wikipedia page, and everything seems to be okay here? I have not yet written the compression function because I wanted to make sure I got the #defines right.
My plan is to call the step
macro like this:
step(majority, choose, s0, s1, a, b, c, d, e, f, g, h, k, m);
step(majority, choose, s0, s1, h, a, b, c, d, e, f, g, k, m);
step(majority, choose, s0, s1, g, h, a, b, c, d, e, f, k, m);
step(majority, choose, s0, s1, f, g, h, a, b, c, d, e, k, m);
EDIT: I have now written the 64 step rounds, set all message blocks to zero, and still got the wrong hash. Or does SHA-256 still need padding even for the "zero input", e.g. the 0x80 byte appended? So should the "empty" message actually be 0x8000.....00001,
with the 1 at the end denoting the length of the message?
I did 64 of these:
step(majority, choose, sigma0, sigma1, a, b, c, d, e, f, g, h, K0x428a2f98, m[0]);
step(majority, choose, sigma0, sigma1, h, a, b, c, d, e, f, g, K0x71374491, m[1]);
step(majority, choose, sigma0, sigma1, g, h, a, b, c, d, e, f, K0xb5c0fbcf, m[2]);
step(majority, choose, sigma0, sigma1, f, g, h, a, b, c, d, e, K0xe9b5dba5, m[3]);
step(majority, choose, sigma0, sigma1, e, f, g, h, a, b, c, d, K0x3956c25b, m[4]);
All the constants are set to their constant values using _mm256_setr_epi32
in another function. I also set the initial working registers to their corresponding values. At the end, when wanting to know the hash, I do this:
H0x6a09e667 = _mm256_add_epi32(H0x6a09e667, a);
H0xbb67ae85 = _mm256_add_epi32(H0xbb67ae85, b);
H0x3c6ef372 = _mm256_add_epi32(H0x3c6ef372, c);
H0xa54ff53a = _mm256_add_epi32(H0xa54ff53a, d);
H0x510e527f = _mm256_add_epi32(H0x510e527f, e);
H0x9b05688c = _mm256_add_epi32(H0x9b05688c, f);
H0x1f83d9ab = _mm256_add_epi32(H0x1f83d9ab, g);
H0x5be0cd19 = _mm256_add_epi32(H0x5be0cd19, h);
/* Read element 0 of each updated state vector: the eight 32-bit words of the
   digest computed in the first lane. */
unsigned int aa = _mm256_extract_epi32(H0x6a09e667, 0);
unsigned int bb = _mm256_extract_epi32(H0xbb67ae85, 0);
unsigned int cc = _mm256_extract_epi32(H0x3c6ef372, 0);
unsigned int dd = _mm256_extract_epi32(H0xa54ff53a, 0);
unsigned int ee = _mm256_extract_epi32(H0x510e527f, 0);
unsigned int ff = _mm256_extract_epi32(H0x9b05688c, 0);
unsigned int gg = _mm256_extract_epi32(H0x1f83d9ab, 0);
unsigned int hh = _mm256_extract_epi32(H0x5be0cd19, 0);
/* %08x, not %x: every word must contribute exactly eight hex digits. Plain %x
   drops leading zeros, so any word below 0x10000000 would shorten the printed
   digest and make it compare unequal to the reference value. */
printf("%08x%08x%08x%08x%08x%08x%08x%08x\n", aa, bb, cc, dd, ee, ff, gg, hh);
Here is the full code:
This ...
/* The step() macro from the question, quoted verbatim. */
#define step(maj, ch, s0, s1, az, bz, cz, dz, ez, fz, gz, hz, k, m) \
ez = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(ch(e, f, g), s1(e)), hz), k), m), dz); \
az = _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(ch(e, f, g), s1(e)), hz), k), m); \
az = _mm256_add_epi32(az, _mm256_add_epi32(maj(a, b, c), s0(a)));
... looks wrong, because the arguments to the embedded macro calls are not drawn from the parameters of the step()
macro. It looks like you copied the names + parameter lists from the definitions of those macros and changed only the macro names to match the parameters to step
, not the arguments to those macros. The compiler nevertheless accepts that because the macro parameters are named the same as suitably typed variables that are in scope at the points where you expand step()
. If what you wrote there is actually what you wanted then you have made step()
more complicated than it needs to be.
Possibly you meant something more like this:
/*
 * One SHA-256 round on all eight 32-bit lanes, written for the rotating call
 * pattern step(..., a, b, c, d, e, f, g, h, ...), step(..., h, a, b, c, d, e, f, g, ...):
 *
 *   T1 = hz + s1(ez) + ch(ez, fz, gz) + k + m
 *   T2 = s0(az) + maj(az, bz, cz)
 *   dz = dz + T1     (read as ez by the next, rotated call)
 *   hz = T1 + T2     (read as az by the next, rotated call)
 *
 * maj, ch, s0, s1 name the helper macros/functions to apply; k is the round
 * constant vector and m the message-word vector. Only dz and hz are assigned.
 * Both temporaries are computed before anything is stored, so no register is
 * read after it has been overwritten. The do/while(0) wrapper makes the
 * expansion a single statement.
 */
#define step(maj, ch, s0, s1, az, bz, cz, dz, ez, fz, gz, hz, k, m) \
    do { \
        const __m256i step_t1_ = _mm256_add_epi32( \
            _mm256_add_epi32(_mm256_add_epi32((hz), s1(ez)), ch(ez, fz, gz)), \
            _mm256_add_epi32((k), (m))); \
        const __m256i step_t2_ = _mm256_add_epi32(s0(az), maj(az, bz, cz)); \
        (dz) = _mm256_add_epi32((dz), step_t1_); \
        (hz) = _mm256_add_epi32(step_t1_, step_t2_); \
    } while (0)
(but absolutely do check that).
Additionally, for the usage depicted in the question, step()
is needlessly complicated by accepting parameters maj
, ch
, s0
, and s1
. All uses of step()
shown pass the same names as arguments to these parameters. Even if the definitions of those names are different in different usage contexts, as long as the names themselves are the same, your code would be shorter and a bit clearer if step
just embedded them directly, like so:
/*
 * One SHA-256 round on all eight 32-bit lanes, written for the rotating call
 * pattern step(a, b, c, d, e, f, g, h, ...), step(h, a, b, c, d, e, f, g, ...):
 *
 *   T1 = hz + sigma1(ez) + choose(ez, fz, gz) + k + m
 *   T2 = sigma0(az) + majority(az, bz, cz)
 *   dz = dz + T1     (read as ez by the next, rotated call)
 *   hz = T1 + T2     (read as az by the next, rotated call)
 *
 * k is the round constant vector and m the message-word vector. Only dz and
 * hz are assigned. Both temporaries are computed before anything is stored,
 * so no register is read after it has been overwritten. The do/while(0)
 * wrapper makes the expansion a single statement.
 */
#define step(az, bz, cz, dz, ez, fz, gz, hz, k, m) \
    do { \
        const __m256i step_t1_ = _mm256_add_epi32( \
            _mm256_add_epi32(_mm256_add_epi32((hz), sigma1(ez)), \
                             choose(ez, fz, gz)), \
            _mm256_add_epi32((k), (m))); \
        const __m256i step_t2_ = \
            _mm256_add_epi32(sigma0(az), majority(az, bz, cz)); \
        (dz) = _mm256_add_epi32((dz), step_t1_); \
        (hz) = _mm256_add_epi32(step_t1_, step_t2_); \
    } while (0)
// ...
step(a, b, c, d, e, f, g, h, k, m);
step(h, a, b, c, d, e, f, g, k, m);
step(g, h, a, b, c, d, e, f, k, m);
step(f, g, h, a, b, c, d, e, k, m);
Moreover, genuine functions are to be preferred to function-like macros. You wrote in comments that ...
I tried to implement it like functions before but that went wrong; I could not write it like step, step, step etc I had to manually write out all the majority and choose and sigma operations by hand, that was no fun :)
But it looks like at minimum, all the macros other than step()
could easily be made functions without disrupting anything:
/* Bitwise majority of three vectors: (a & b) ^ (b & c) ^ (a & c). */
__m256i majority(__m256i a, __m256i b, __m256i c) {
    const __m256i ab = _mm256_and_si256(a, b);
    const __m256i bc = _mm256_and_si256(b, c);
    const __m256i ac = _mm256_and_si256(a, c);

    return _mm256_xor_si256(_mm256_xor_si256(ab, bc), ac);
}
/*
 * Ch(e, f, g) = (e & f) ^ (~e & g): each bit of e selects the matching bit
 * of f (where e is 1) or of g (where e is 0).
 *
 * ~e & g is computed with _mm256_andnot_si256, which complements its first
 * operand itself, so the function does not depend on a separately
 * initialised all-ones vector being in scope.
 */
__m256i choose(__m256i e, __m256i f, __m256i g) {
    const __m256i from_f = _mm256_and_si256(e, f);
    const __m256i from_g = _mm256_andnot_si256(e, g);

    return _mm256_xor_si256(from_f, from_g);
}
/* Rotate every 32-bit element of x right by n bits: the bits shifted out on
 * the right are OR-ed back in on the left. */
__m256i ROR32_AVX(__m256i x, int n) {
    const __m256i shifted_down = _mm256_srli_epi32(x, n);
    const __m256i wrapped_up = _mm256_slli_epi32(x, 32 - n);

    return _mm256_or_si256(shifted_down, wrapped_up);
}
/* XOR of x rotated right by 2, 13 and 22 bits. */
__m256i sigma0(__m256i x) {
    __m256i acc = _mm256_xor_si256(ROR32_AVX(x, 2), ROR32_AVX(x, 13));

    acc = _mm256_xor_si256(acc, ROR32_AVX(x, 22));
    return acc;
}
/* XOR of x rotated right by 6, 11 and 25 bits. */
__m256i sigma1(__m256i x) {
    __m256i acc = _mm256_xor_si256(ROR32_AVX(x, 6), ROR32_AVX(x, 11));

    acc = _mm256_xor_si256(acc, ROR32_AVX(x, 25));
    return acc;
}
As far as I can tell, the only issue with doing the same with step()
is that it modifies some of its arguments, but this can be addressed by passing pointers instead of the direct values:
// One SHA-256 round on all eight 32-bit lanes, written for the rotating call
// pattern step(&a, &b, &c, &d, &e, &f, &g, &h, ...),
//         step(&h, &a, &b, &c, &d, &e, &f, &g, ...), and so on:
//
//   T1  = *hz + sigma1(*ez) + choose(*ez, *fz, *gz) + *k + *m
//   T2  = sigma0(*az) + majority(*az, *bz, *cz)
//   *dz = *dz + T1     (read as *ez by the next, rotated call)
//   *hz = T1 + T2      (read as *az by the next, rotated call)
//
// k points at the round constant vector, m at the message-word vector.
// All arguments are pointers for consistency, though only *dz and *hz are
// written. T1 and T2 are computed before anything is stored, so no register
// is read after it has been overwritten.
void step(__m256i *az, __m256i *bz, __m256i *cz, __m256i *dz, __m256i *ez,
          __m256i *fz, __m256i *gz, __m256i *hz, __m256i *k, __m256i *m) {
    __m256i t1 = _mm256_add_epi32(*hz, sigma1(*ez));
    t1 = _mm256_add_epi32(t1, choose(*ez, *fz, *gz));
    t1 = _mm256_add_epi32(t1, _mm256_add_epi32(*k, *m));

    const __m256i t2 = _mm256_add_epi32(sigma0(*az), majority(*az, *bz, *cz));

    *dz = _mm256_add_epi32(*dz, t1);
    *hz = _mm256_add_epi32(t1, t2);
}
// ... which would be used like so:
step(&a, &b, &c, &d, &e, &f, &g, &h, &k, &m);
step(&h, &a, &b, &c, &d, &e, &f, &g, &k, &m);
step(&g, &h, &a, &b, &c, &d, &e, &f, &k, &m);
step(&f, &g, &h, &a, &b, &c, &d, &e, &k, &m);
Among other things, that addresses the under-parenthesization of the erstwhile macro parameters in the macro replacement text, and it is less prone to the kind of error I think the original macro contained (see above). As long as the definitions and all uses of those functions appear in the same translation unit, I would expect everything to be inlined when compiled at an appropriate optimization level. If you're doubtful or want to make the intent clearer, then you can declare all of those functions static inline
(again assuming that all calls to them are from the same TU). If necessary, your compiler surely also provides options to influence which functions get inlined.