I need to convert big arrays of 16-bit integer values from big-endian to little-endian format.
Currently I use the following functions for the conversion:
// Swaps the two bytes of one 16-bit value: dst[0] = src[1], dst[1] = src[0].
//
// src - points to at least 2 readable bytes (no alignment requirement).
// dst - points to at least 2 writable bytes; may be equal to src (in-place).
//
// The swap is done byte-by-byte rather than through a uint16_t* cast: the cast
// discarded const, required 2-byte alignment that a uint8_t* does not promise,
// and accessed byte storage through an incompatible type (strict aliasing).
// `static` gives the helper internal linkage so the definition is usable from
// plain C translation units as well.
static inline void Reorder16bit(const uint8_t * src, uint8_t * dst)
{
    // Read both bytes before writing so that src == dst is handled correctly.
    const uint8_t first = src[0];
    const uint8_t second = src[1];
    dst[0] = second;
    dst[1] = first;
}
// Byte-swaps every 16-bit value in a buffer.
//
// src  - input bytes.
// size - buffer length in bytes; asserted to be even.
// dst  - output bytes; receives `size` bytes.
void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
{
    assert(size%2 == 0);

    // Walk the buffer one 16-bit value (two bytes) at a time.
    size_t offset = 0;
    while (offset < size)
    {
        Reorder16bit(src + offset, dst + offset);
        offset += 2;
    }
}
I use GCC. The target platform is ARMv7 (Raspberry Pi 2B).
Is there any way to optimize it?
This conversion is needed for loading audio samples, which can be in either little-endian or big-endian format. Of course it is not a bottleneck now, but it takes about 10% of the total processing time, and I think that is too much for such a simple operation.
If you want to improve the performance of your code, you can do the following:
1) Process 4 bytes per step:
// Swaps the two bytes of one 16-bit value: dst[0] = src[1], dst[1] = src[0].
//
// src - points to at least 2 readable bytes (no alignment requirement).
// dst - points to at least 2 writable bytes; may be equal to src (in-place).
//
// The swap is done byte-by-byte rather than through a uint16_t* cast: the cast
// discarded const, required 2-byte alignment that a uint8_t* does not promise,
// and accessed byte storage through an incompatible type (strict aliasing).
// `static` gives the helper internal linkage so the definition is usable from
// plain C translation units as well.
static inline void Reorder16bit(const uint8_t * src, uint8_t * dst)
{
    // Read both bytes before writing so that src == dst is handled correctly.
    const uint8_t first = src[0];
    const uint8_t second = src[1];
    dst[0] = second;
    dst[1] = first;
}
// Byte-swaps two adjacent 16-bit values (4 bytes) in one step:
// dst[0..3] = { src[1], src[0], src[3], src[2] }.
//
// src - points to at least 4 readable bytes (no alignment requirement).
// dst - points to at least 4 writable bytes; may be equal to src (in-place).
//
// Fixes relative to the pointer-cast version:
//  * the store went through a size_t*, which writes 8 bytes on 64-bit targets
//    and clobbers the 4 bytes that follow the pair; exactly 4 bytes are
//    written here;
//  * the uint32_t* casts dropped const, assumed 4-byte alignment and violated
//    strict aliasing; the word is now assembled from and scattered to bytes.
// NOTE(review): optimising compilers usually fold the byte assembly/scatter
// into a single 32-bit load/store where the target allows it - confirm in the
// generated code if this path is hot.
static inline void Reorder16bit2(const uint8_t * src, uint8_t * dst)
{
    // Assemble the four bytes into one word, byte 0 in the low 8 bits.
    const uint32_t value = (uint32_t)src[0]
                         | (uint32_t)src[1] << 8
                         | (uint32_t)src[2] << 16
                         | (uint32_t)src[3] << 24;

    // Exchange the high and low byte of both 16-bit halves at once.
    const uint32_t swapped = (value & 0xFF00FF00u) >> 8 | (value & 0x00FF00FFu) << 8;

    // All reads are complete, so writing is safe even when dst == src.
    dst[0] = (uint8_t)(swapped);
    dst[1] = (uint8_t)(swapped >> 8);
    dst[2] = (uint8_t)(swapped >> 16);
    dst[3] = (uint8_t)(swapped >> 24);
}
// Byte-swaps every 16-bit value in a buffer, handling two values (4 bytes)
// per step for as long as a full 4-byte group remains.
//
// src  - input bytes.
// size - buffer length in bytes; asserted to be even.
// dst  - output bytes; receives `size` bytes.
void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
{
    assert(size%2 == 0);

    // Largest multiple of 4 that does not exceed size.
    const size_t wordBytes = size - size % 4;

    size_t offset = 0;
    while (offset < wordBytes)
    {
        Reorder16bit2(src + offset, dst + offset);
        offset += 4;
    }

    // Remaining tail, processed one 16-bit value at a time.
    while (offset < size)
    {
        Reorder16bit(src + offset, dst + offset);
        offset += 2;
    }
}
If you use a 64-bit platform, you can process 8 bytes per step in the same way.
2) The ARMv7 platform supports SIMD instructions called NEON. Using them, you can make your code even faster than in 1):
// Swaps the two bytes of one 16-bit value: dst[0] = src[1], dst[1] = src[0].
//
// src - points to at least 2 readable bytes (no alignment requirement).
// dst - points to at least 2 writable bytes; may be equal to src (in-place).
//
// The swap is done byte-by-byte rather than through a uint16_t* cast: the cast
// discarded const, required 2-byte alignment that a uint8_t* does not promise,
// and accessed byte storage through an incompatible type (strict aliasing).
// `static` gives the helper internal linkage so the definition is usable from
// plain C translation units as well.
static inline void Reorder16bit(const uint8_t * src, uint8_t * dst)
{
    // Read both bytes before writing so that src == dst is handled correctly.
    const uint8_t first = src[0];
    const uint8_t second = src[1];
    dst[0] = second;
    dst[1] = first;
}
inline void Reorder16bit8(const uint8_t * src, uint8_t * dst)
{
uint8x16_t _src = vld1q_u8(src);
vst1q_u8(dst, vrev16q_u8(_src));
}
// Byte-swaps every 16-bit value in a buffer, handling eight values (16 bytes)
// per step for as long as a full 16-byte group remains.
//
// src  - input bytes.
// size - buffer length in bytes; asserted to be even.
// dst  - output bytes; receives `size` bytes.
void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
{
    assert(size%2 == 0);

    // Largest multiple of 16 that does not exceed size.
    const size_t vectorBytes = size - size % 16;

    size_t offset = 0;
    for (; offset < vectorBytes; offset += 16)
    {
        Reorder16bit8(src + offset, dst + offset);
    }

    // Remaining tail, processed one 16-bit value at a time.
    for (; offset < size; offset += 2)
    {
        Reorder16bit(src + offset, dst + offset);
    }
}