I am a newbie to RVV and am rewriting some assembly functions using RVV 1.0. Now I have encountered some problems with matrix transposition. Arm NEON provides the vtrn instructions, which help to do a transpose:
# element width = 8 bits: in-register transpose of a 4x4 byte matrix,
# one row per d-register, using two rounds of NEON vtrn lane swaps.
.macro TRANSPOSE4x4 r0 r1 r2 r3
vtrn.16 \r0, \r2                        @ swap 16-bit pairs between rows 0/2
vtrn.16 \r1, \r3                        @ swap 16-bit pairs between rows 1/3
vtrn.8 \r0, \r1                        @ swap bytes between rows 0/1
vtrn.8 \r2, \r3                        @ swap bytes between rows 2/3
.endm
# element width = 16 bits: in-register transpose of a 4x4 matrix of
# 16-bit elements, one row per d-register (vtrn.32 then vtrn.16).
.macro TRANSPOSE4x4_16 d0 d1 d2 d3
vtrn.32 \d0, \d2                        @ swap 32-bit halves between rows 0/2
vtrn.32 \d1, \d3                        @ swap 32-bit halves between rows 1/3
vtrn.16 \d0, \d1                        @ swap 16-bit lanes between rows 0/1
vtrn.16 \d2, \d3                        @ swap 16-bit lanes between rows 2/3
.endm
# element width = 8 bits: in-register transpose of an 8x8 byte matrix,
# one row per d-register, via log2(8) = 3 rounds of progressively
# narrower vtrn lane swaps (32-bit, then 16-bit, then 8-bit).
.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
vtrn.32 \r0, \r4                        @ round 1: swap 32-bit halves, distance 4
vtrn.32 \r1, \r5
vtrn.32 \r2, \r6
vtrn.32 \r3, \r7
vtrn.16 \r0, \r2                        @ round 2: swap 16-bit lanes, distance 2
vtrn.16 \r1, \r3
vtrn.16 \r4, \r6
vtrn.16 \r5, \r7
vtrn.8 \r0, \r1                        @ round 3: swap bytes, distance 1
vtrn.8 \r2, \r3
vtrn.8 \r4, \r5
vtrn.8 \r6, \r7
.endm
But in RVV I cannot find instructions like vtrn — why doesn't RVV provide this? If I have a 4x4 matrix whose rows are stored in v1, v2, v3, and v4 respectively, with SEW=16, just like
TRANSPOSE4x4_16
in Arm NEON, how can I transpose it?
I tried to use some methods in this blog, but failed. As for the transpose shapes, 4x4 and 8x8 are needed, and I want to know whether the specific implementation will be affected by SEW.
I am rewriting a video codec library named x264, which has an assembly function named sub4x4_dct:
# \sum = \a + \b, \diff = \a - \b (s16 lanes).
# Order matters: the add runs first, so \diff MAY alias \b (DCT_1D relies
# on this), but \sum must not alias \a or \b.
.macro SUMSUB_AB sum, diff, a, b
vadd.s16 \sum, \a, \b
vsub.s16 \diff, \a, \b
.endm
# 1-D 4-point DCT butterfly (matches the C loop body of sub4x4_dct).
# Inputs:  rows 0..3 in \d4 \d5 \d6 \d7 (s16 lanes); clobbered as scratch.
# Outputs: \d0..\d3 = the four DCT coefficients per lane.
.macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7
SUMSUB_AB \d1, \d6, \d5, \d6            @ \d1 = s12 = r1+r2, \d6 = d12 = r1-r2
SUMSUB_AB \d3, \d7, \d4, \d7            @ \d3 = s03 = r0+r3, \d7 = d03 = r0-r3
vadd.s16 \d0, \d3, \d1                  @ out0 = s03 + s12
vadd.s16 \d4, \d7, \d7                  @ \d4 = 2*d03
vadd.s16 \d5, \d6, \d6                  @ \d5 = 2*d12
vsub.s16 \d2, \d3, \d1                  @ out2 = s03 - s12
vadd.s16 \d1, \d4, \d6                  @ out1 = 2*d03 + d12
vsub.s16 \d3, \d7, \d5                  @ out3 = d03 - 2*d12
.endm
@ void sub4x4_dct(dctcoef dct[16], pixel *pix1, pixel *pix2)
@ r0 = dct output, r1 = pix1 (stride FENC_STRIDE), r2 = pix2 (stride FDEC_STRIDE).
@ Loads are interleaved with the widening subtracts to hide latency.
function sub4x4_dct_neon
mov r3, #FENC_STRIDE                    @ r3 = pix1 row stride
mov ip, #FDEC_STRIDE                    @ ip = pix2 row stride
vld1.32 {d0[]}, [r1,:32], r3            @ 4 bytes of pix1 row 0 (replicated to both lanes)
vld1.32 {d1[]}, [r2,:32], ip            @ 4 bytes of pix2 row 0
vld1.32 {d2[]}, [r1,:32], r3            @ pix1 row 1
vsubl.u8 q8, d0, d1                     @ row 0 diff, widened u8 -> s16 (low half = d16)
vld1.32 {d3[]}, [r2,:32], ip            @ pix2 row 1
vld1.32 {d4[]}, [r1,:32], r3            @ pix1 row 2
vsubl.u8 q9, d2, d3                     @ row 1 diff (d18)
vld1.32 {d5[]}, [r2,:32], ip            @ pix2 row 2
vld1.32 {d6[]}, [r1,:32], r3            @ pix1 row 3
vsubl.u8 q10, d4, d5                    @ row 2 diff (d20)
vld1.32 {d7[]}, [r2,:32], ip            @ pix2 row 3
vsubl.u8 q11, d6, d7                    @ row 3 diff (d22)
DCT_1D d0, d1, d2, d3, d16, d18, d20, d22   @ pass 1: rows in d16..d22, results in d0..d3
TRANSPOSE4x4_16 d0, d1, d2, d3          @ transpose so pass 2 works on columns
DCT_1D d4, d5, d6, d7, d0, d1, d2, d3   @ pass 2: results in d4..d7
vst1.64 {d4-d7}, [r0,:128]              @ store all 16 coefficients to dct
bx lr
endfunc
and the C implementation is :
// dctcoef : int16_t
// pixel: uint8_t
/* Write the element-wise difference of two i_size x i_size pixel blocks
 * (each with its own row stride) into a densely packed dctcoef array:
 * diff[y*i_size + x] = pix1[x + y*i_pix1] - pix2[x + y*i_pix2]. */
static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    /* diff is packed, so a single running output pointer suffices. */
    for( int row = 0; row < i_size; row++, pix1 += i_pix1, pix2 += i_pix2 )
        for( int col = 0; col < i_size; col++ )
            *diff++ = pix1[col] - pix2[col];
}
/* 4x4 forward DCT of the residual pix1 - pix2.
 * pix1 uses FENC_STRIDE, pix2 uses FDEC_STRIDE; results go to dct[16]
 * in row-major order. Two separable 1-D passes: pass 1 writes its
 * output transposed into tmp, pass 2 reads tmp row-wise. */
static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
{
    dctcoef d[16];
    dctcoef tmp[16];
    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
    /* Pass 1: 1-D DCT along each row of d, stored column-wise in tmp. */
    for( int i = 0; i < 4; i++ )
    {
        const int a = d[i*4+0], b = d[i*4+1], c = d[i*4+2], e = d[i*4+3];
        tmp[0*4+i] = (a+e) + (b+c);     /* s03 + s12 */
        tmp[1*4+i] = 2*(a-e) + (b-c);   /* 2*d03 + d12 */
        tmp[2*4+i] = (a+e) - (b+c);     /* s03 - s12 */
        tmp[3*4+i] = (a-e) - 2*(b-c);   /* d03 - 2*d12 */
    }
    /* Pass 2: same butterfly along the rows of tmp. */
    for( int i = 0; i < 4; i++ )
    {
        const int a = tmp[i*4+0], b = tmp[i*4+1], c = tmp[i*4+2], e = tmp[i*4+3];
        dct[i*4+0] = (a+e) + (b+c);
        dct[i*4+1] = 2*(a-e) + (b-c);
        dct[i*4+2] = (a+e) - (b+c);
        dct[i*4+3] = (a-e) - 2*(b-c);
    }
}
Here is how I have implemented TRANSPOSE4x4_16 using vslide:
# TRANSPOSE4x4_16 v24, v25, v26, v27
# NOTE(review): attempted in-register 4x4 (SEW=16) transpose using masked
# vslideup/vslidedown. The snippet is truncated ("....") and the author
# reports wrong results; the annotations below describe what each step
# appears to compute, with rows a=v24, b=v25, c=v26, d=v27.
lui t1, 11
addi t1, t1, -1366                      # t1 = 0xAAAA -> mask selecting odd lanes 1,3,...
vsetivli zero, 4, e16, m1, tu, mu       # vl=4, e16, tail/mask-undisturbed
vmv.s.x v0, t1                          # v0[0] = 0xAAAA: v0 is the mask register
lui t1, 5
addi t1, t1, 1365                       # t1 = 0x5555 -> mask selecting even lanes 0,2,...
vmv.s.x v12, t1                         # stash the even-lane mask in v12 for later
vmv1r.v v15, v24
vslideup.vi v15, v25, 1, v0.t           # v15 = a0,b0,a2,b2 (odd lanes filled from b)
vmv1r.v v14, v26
vslideup.vi v14, v27, 1, v0.t           # v14 = c0,d0,c2,d2
vmv1r.v v0, v12                         # switch active mask to the even lanes
vslidedown.vi v25, v24, 1, v0.t         # v25 = a1,b1,a3,b3
vslidedown.vi v27, v26, 1, v0.t         # v27 = c1,d1,c3,d3
vmv1r.v v12, v15
vslideup.vi v12, v14, 2                 # v12 = a0,b0,c0,d0 (transposed row 0)
vmv1r.v v13, v25
vslideup.vi v13, v27, 2                 # v13 = a1,b1,c1,d1 (transposed row 1)
vsetivli zero, 2, e16, m1, tu, ma       # vl=2: slides below only write lanes 0..1
vslidedown.vi v14, v15, 2               # v14 = a2,b2,c2,d2 (tail keeps c2,d2)
vslidedown.vi v27, v25, 2               # v27 = a3,b3,c3,d3 (tail keeps c3,d3)
vmv1r.v v15, v27                        # NOTE(review): the final write-back looks
vmv1r.v v24, v12                        # unfinished - v25/v26 never receive the
....
....
ret
endfunc
I finally got the wrong answer. I found that I had misunderstood something about the Arm registers, and I am fixing it.
Since RVV has a scalable vector length, it's not always possible to get optimal performance with a single implementation when an algorithm has a fixed size restriction. This is the case for the algorithm in question, as very few implementations dispatch based on VL, and this would cause a VLEN>=128 implementation to do unnecessary work.
When transposing a NxN matrix, you could e.g. load a VLx32 block of elements, and use strided segmented stores to store a transposed version of that block. That would scalably use the full vector registers on all implementations.
Another good case is, when you need/can transpose a lot of smaller, e.g. 4x4, matrices at once. Then you could load every nth element of VL matrices into the nth vector register, and switch up the vector register operand of the subsequent operations.
The above approach could work with your sub4x4_dct
code, but that would require the caller to behave differently, so I don't think it's possible to use for x264.
I see three practical options for your use case:
Using a single LMUL=4 vrgather.vv to permute the elements into place. This has questionable performance, since current processors have very slow LMUL=4 vrgather.vv implementations (C908, C920). Other processors may not have this drawback. It also doesn't really fit into your code framework, as you can't set aside an LMUL=4 register with indices, so you'd need to load them from memory every time.
Using a segmented load or store into a scratch buffer. This works great, because you can just use the dctcoef dct[16]
as the scratch space.
The dav1d AV1 implementation also uses this approach.
(this was added later) Zipping the upper and lower halves of the matrix twice. Zipping a and b means creating a vector of a0,b0,a1,b1,...; applying this twice is equivalent to transposing all 4x4 matrices stored in a vector register. A zip can be implemented using a vwmaccu.vx
and a vwaddu.vv
instruction, as suggested by topperc.
I benchmarked the three implementations on the C908: the vrgather implementation was 3 times slower than the other approaches, and the zip one was slightly faster than the segmented load/store approach (vsseg: 0.277785 seconds, vrgather.vv 1.545038 seconds, zip 0.249973 seconds
).
Here is the code for transposing a 4x4 matrix of 16-bit elements:
# Zip-based 4x4/e16 transpose; rows 0-1 in v0, rows 2-3 in v1 (VLEN=128).
# a2 must hold 0xffff: vwaddu adds x once and vwmaccu adds a2*x, so the
# pair computes x*(a2+1) = x << 16, i.e. an interleave (zip) of the
# two source registers when the widened result is re-read as e16.
vsetivli x0, 8, e16, m1, ta, ma         # vl=8 16-bit elements per register
vwaddu.vv v2, v0, v1                    # v2:v3 (widened pair) = zext(v0) + zext(v1)
vwmaccu.vx v2, a2, v1                   # v2:v3 += 0xffff*v1 -> zip(v0, v1)
vwaddu.vv v0, v2, v3                    # zip the two halves of the first result:
vwmaccu.vx v0, a2, v3                   # v0:v1 = zip(v2, v3); zip twice == transpose
Notice how here the 4x4 matrix is stored in only two vector registers. This assumes a VLEN of 128, and doesn't fit into your surrounding code, so you should probably go for another approach. It also works on up to VLEN=1024 if you set the appropriate fractional LMUL and add a vslide to move the two halves into separate registers.
Here is how you could implement the TRANSPOSE macros using the segmented load/store:
# Transpose a 4x4 byte matrix via a strided segment store + row loads.
# \buf     scratch-buffer pointer register (destructively advanced)
# \bstride register holding the row stride in BYTES
# \v0..\v3 must be four consecutively numbered vector registers: the
#          segment store encodes only \v0 and implicitly uses the next 3.
# Caller is expected to have set vl/vtype (vl=4, e8) beforehand.
.macro TRANSPOSE4x4 buf, bstride, v0, v1, v2, v3
vssseg4e8.v \v0, (\buf), \bstride       # segment i = {v0[i],v1[i],v2[i],v3[i]} at buf+i*bstride
vle8.v \v0, (\buf)                      # reload transposed row 0
add \buf, \buf, \bstride
vle8.v \v1, (\buf)                      # row 1
add \buf, \buf, \bstride
vle8.v \v2, (\buf)                      # row 2
add \buf, \buf, \bstride
vle8.v \v3, (\buf)                      # row 3
.endm
# Transpose a 4x4 matrix of 16-bit elements via segment store + row loads.
# Same contract as TRANSPOSE4x4: \bstride is the row stride in BYTES
# (8 for a packed 4x4 of uint16), \buf is destructively advanced, and
# \v0..\v3 must be consecutively numbered vector registers.
.macro TRANSPOSE4x4_16 buf, bstride, v0, v1, v2, v3
vssseg4e16.v \v0, (\buf), \bstride      # segment i = {v0[i],v1[i],v2[i],v3[i]} at buf+i*bstride
vle16.v \v0, (\buf)                     # reload transposed row 0
add \buf, \buf, \bstride
vle16.v \v1, (\buf)                     # row 1
add \buf, \buf, \bstride
vle16.v \v2, (\buf)                     # row 2
add \buf, \buf, \bstride
vle16.v \v3, (\buf)                     # row 3
.endm
# Transpose an 8x8 byte matrix via an 8-field segment store + row loads.
# \v0..\v7 must be eight consecutively numbered vector registers
# (vssseg8e8.v encodes only \v0; NFIELDS*EMUL must not exceed 8).
# \bstride is the row stride in BYTES; \buf is destructively advanced.
.macro TRANSPOSE8x8 buf, bstride, v0, v1, v2, v3, v4, v5, v6, v7
vssseg8e8.v \v0, (\buf), \bstride       # segment i = column i, stored as row i of transpose
vle8.v \v0, (\buf)                      # reload transposed row 0
add \buf, \buf, \bstride
vle8.v \v1, (\buf)
add \buf, \buf, \bstride
vle8.v \v2, (\buf)
add \buf, \buf, \bstride
vle8.v \v3, (\buf)
add \buf, \buf, \bstride
vle8.v \v4, (\buf)
add \buf, \buf, \bstride
vle8.v \v5, (\buf)
add \buf, \buf, \bstride
vle8.v \v6, (\buf)
add \buf, \buf, \bstride
vle8.v \v7, (\buf)                      # transposed row 7
.endm
Note that bstride
needs to be a register that holds the stride in bytes, and buf
is destructively written to. You should be able to use a0
for it, but make sure to back it up for the last store.
Here is a quick example I used to test the implementation:
# asm.S
# Test harness: fill v0..v7 with an index pattern, then transpose the
# 4x4 block in v0..v3 twice (an involution), writing through buf.
.global trans2
trans2: # a0 = uint16_t buf[16]
vsetvli t0, x0, e16, m8, tu, mu         # vl = VLMAX for e16/m8
vid.v v0                                # v0..v7 = 0,1,2,... (v0=0..7, v1=8..15, ... at VLEN=128)
vsetivli t0, 4, e16, m1, tu, mu         # vl=4 single registers for the 4x4 macro
mv t0, a0                               # t0 = buffer pointer (macro clobbers it)
li t1, 8                                # row stride: 4 elements * 2 bytes
TRANSPOSE4x4_16 t0, t1, v0, v1, v2, v3
mv t0, a0                               # reset pointer for the second transpose
TRANSPOSE4x4_16 t0, t1, v0, v1, v2, v3  # double transpose: buf keeps the last store
ret
// main.c
#include <stdio.h>
#include <stdint.h>

/* Driver: trans2 transposes a vid-initialized 4x4 block twice through
 * buf; the printed 4x4 grid should show the untransposed index pattern. */
int main(void) {
    void trans2(uint16_t b[16]);
    uint16_t buf[16] = {0};
    trans2(buf);
    for (size_t row = 0; row < 4; ++row) {
        for (size_t col = 0; col < 4; ++col)
            printf("%02x ", buf[row * 4 + col]);
        puts("");
    }
    puts("");
    return 0;
}
Two transposes should result in the original value, and as expected the output is the following on a VLEN=128 machine:
00 01 02 03
08 09 0a 0b
10 11 12 13
18 19 1a 1b
Btw, the last line of your sub4x4_dct_rvv
should probably be four regular stores, mirroring the four loads at the start.
I'm also not sure if you use the correct vector registers in every computation, they don't seem to match the choices of the NEON code.