I'm trying to XOR the 128 bit Initialization Vector with the Plaintext as seen here
On Linux x86-64 with GCC 12.2, there's a one-liner:
// XOR the 16 bytes at plaintext, in place, with the 16 bytes at ivvectext as a single 128-bit integer operation.
// NOTE(review): the casts assume both buffers may legally be accessed as unsigned __int128 (alignment, aliasing) - confirm.
*(unsigned __int128 *)( plaintext ) ^= *(unsigned __int128 *)( ivvectext );
For example, https://godbolt.org/z/sc8e66qeo
#include <stdio.h>
#include <stdint.h>
/*
 * XOR a 16-byte plaintext block with a 16-byte IV as one 128-bit integer
 * operation (GCC/Clang unsigned __int128) and print the result as
 * space-separated uppercase hex bytes.
 */
int main()
{
    /*
     * may_alias lets this type legally access the uint8_t arrays, and
     * aligned(1) drops the 16-byte alignment that a plain
     * unsigned __int128 pointer would promise, so the compiler must emit
     * an unaligned access instead of assuming the arrays are aligned.
     */
    typedef unsigned __int128 u128_bytes __attribute__((may_alias, aligned(1)));

    uint8_t plaintext[16] = {'t','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x'};
    uint8_t ivvectext[16] = {'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w'};

    /* Still a one-liner: a single 128-bit XOR, written back into plaintext. */
    *(u128_bytes *)( plaintext ) ^= *(const u128_bytes *)( ivvectext );

    /* size_t index: sizeof yields an unsigned value, so this avoids a signed/unsigned comparison. */
    for (size_t i = 0; i < sizeof(plaintext); i++) { printf("%02X ", (unsigned char)plaintext[i]); }
    return 0;
}
Question
In MSVC, what's the preferred method to XOR these 128 bit values?
Update
As noted in one of the answers, use the compiler intrinsic _mm_xor_si128
#include <stdint.h>
#include <immintrin.h>
#include <iostream>
#include <ios>
#include <iomanip>
// XORs a 16-byte plaintext block with a 16-byte IV using one SSE2 instruction
// and prints the resulting bytes as space-separated, zero-padded uppercase hex.
int main() {
    const uint8_t plaintext[16] = { 't','h','e','q','u','i','c','k','b','r','o','w','n','f','o','x' };
    const uint8_t ivvectext[16] = { 'w','a','1','2','o','b','s','q','m','v','c','s','s','q','u','w' };

    // Unaligned loads: the byte arrays carry no 16-byte alignment guarantee.
    const __m128i block = _mm_loadu_si128(reinterpret_cast<const __m128i*>(plaintext));
    const __m128i iv = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ivvectext));
    const __m128i mixed = _mm_xor_si128(block, iv);

    // Copy the 128-bit result back out to individual bytes for printing.
    uint8_t bytes[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(bytes), mixed);

    // uppercase/hex/setfill are sticky, so set them once; setw resets after every insertion.
    std::cout << std::uppercase << std::hex << std::setfill('0');
    for (const uint8_t byte : bytes) {
        std::cout << std::setw(2) << static_cast<int>(byte) << " ";
    }
    std::cout << std::endl;
    return 0;
}
The output matches the Linux result:
03 09 54 43 1A 0B 10 1A 0F 04 0C 04 1D 17 1A 0F
However, other answers suggest more readable code
// XOR each plaintext byte with the matching IV byte, in place.
// The index is size_t because sizeof yields an unsigned value; an int index
// would make the loop condition a signed/unsigned comparison.
for (size_t i = 0; i < sizeof(plaintext); i++)
{
    plaintext[i] ^= ivvectext[i];
}
and let the compiler optimizations figure out the internal assembly code. :)
If your goal is to optimize your code, then leave this task to the compiler. (Of course you might have to enable optimization.)
You can write a simple loop like
// Byte-wise XOR of the IV into the plaintext buffer, in place.
// size_t matches the unsigned type produced by sizeof, so the loop
// condition does not mix signed and unsigned operands.
for (size_t i = 0; i < sizeof(plaintext); i++)
{
    plaintext[i] ^= ivvectext[i];
}
and let the compiler optimize this.
For example, x86 MSVC v19.latest with option -O2 creates SSE2 instructions from this loop, resulting in a single 128-bit operation:
; x86 MSVC -O2 output for main; the listing is cut off after the XOR result is stored.
_main PROC ; COMDAT
sub esp, 36 ; 00000024H
; Store the security cookie, XORed with esp, in the array-pad slot.
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+36], eax
; Initialize plaintext with four 32-bit stores: "theq" "uick" "brow" "nfox" (little-endian).
mov DWORD PTR _plaintext$[esp+36], 1902471284 ; 71656874H
mov DWORD PTR _plaintext$[esp+40], 1801677173 ; 6b636975H
mov DWORD PTR _plaintext$[esp+44], 2003792482 ; 776f7262H
mov DWORD PTR _plaintext$[esp+48], 2020566638 ; 786f666eH
; Load all 16 plaintext bytes into xmm1 with one unaligned move.
movups xmm1, XMMWORD PTR _plaintext$[esp+36]
; Initialize ivvectext the same way: "wa12" "obsq" "mvcs" "squw".
mov DWORD PTR _ivvectext$[esp+36], 842097015 ; 32316177H
mov DWORD PTR _ivvectext$[esp+40], 1903387247 ; 7173626fH
mov DWORD PTR _ivvectext$[esp+44], 1935898221 ; 7363766dH
mov DWORD PTR _ivvectext$[esp+48], 2004185459 ; 77757173H
; Load all 16 IV bytes into xmm0.
movups xmm0, XMMWORD PTR _ivvectext$[esp+36]
push esi
; esi = 0
xor esi, esi
; The 16-iteration byte loop is reduced to this single 128-bit XOR.
pxor xmm1, xmm0
; Write the result back over plaintext; the offset is +40 rather than +36 because push esi moved esp by 4.
movups XMMWORD PTR _plaintext$[esp+40], xmm1
...
See https://godbolt.org/z/afTPK5von
Additional hints from comments:
Even if you determine that you need to hand-optimize the code and use the intrinsic functions explicitly (e.g., the optimizer doesn't use them for some reason, sad panda), I recommend also keeping the straightforward implementation as a reference implementation for development & debugging purposes. (Eljay's comment)
Sometimes the MS compiler won't optimize what looks like a simple loop; in this case, you can enable the Vectorizer and parallelizer messages, which can give you hints as to why it didn't. (user20716902's comment)