If you want to emulate _mm_movemask_epi8 and you only need an 8-bit scalar mask built from 8 byte elements, then you can do something like this using AltiVec:
#include <stdio.h>
int main(void)
{
const vector unsigned char vShift = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 };
// constant shift vector
vector unsigned char isValid = { 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
// sample input
vector unsigned char v1 = vec_sl(isValid, vShift);
// shift input values
vector unsigned int v2 = vec_sum4s(v1, (vector unsigned int)(0));
vector signed int v3 = vec_sum2s((vector signed int)v2, (vector signed int)(0));
// sum shifted values
vector signed int v4 = vec_splat(v3, 1);
unsigned int mask __attribute__ ((aligned(16)));
vec_ste((vector unsigned int)v4, 0, &mask);
// store sum in scalar
printf("v1 = %vu
", v1);
printf("v2 = %#vlx
", v2);
printf("v3 = %#vlx
", v3);
printf("v4 = %#vlx
", v4);
printf("mask = %#x
", mask);
return 0;
}
This takes 5 AltiVec instructions versus 1 in SSE. You might be able to drop the vec_splat and get it down to 4.
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…