// Plain C reference implementation: convert interleaved RGB pixels to grayscale.
// Gray = (R * 77 + G * 151 + B * 28) / 256
//
// dest: receives one gray byte per pixel (n bytes written).
// src:  n pixels, 3 bytes each, read in R, G, B order.
// n:    pixel count; nothing is read or written when n <= 0.
void reference_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
    const uint8_t *in = src;
    uint8_t *out = dest;

    for (int left = n; left > 0; left--, in += 3, out++) {
        int red = in[0];
        int green = in[1];
        int blue = in[2];
        // The weights sum to 256, so the weighted sum is at most 255 * 256
        // and shifting right by 8 always yields a value in 0..255.
        int weighted = 77 * red + 151 * green + 28 * blue;
        *out = (uint8_t)(weighted >> 8);
    }
}
// NEON-intrinsics version of the RGB -> grayscale conversion.
// Gray = (R * 77 + G * 151 + B * 28) / 256
//
// dest: receives one gray byte per pixel (n bytes written).
// src:  n pixels, 3 bytes each, read in R, G, B order.
// n:    pixel count; nothing is read or written when n <= 0.
//
// Pixels are processed 8 at a time with NEON. The remaining n % 8 pixels are
// converted with scalar code using the same weights, so every one of the n
// pixels is written even when n is not a multiple of 8.
void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
    int i;
    // Broadcast each channel weight into all 8 lanes of a 64-bit register.
    uint8x8_t rfac = vdup_n_u8 (77);  // weight for R
    uint8x8_t gfac = vdup_n_u8 (151); // weight for G
    uint8x8_t bfac = vdup_n_u8 (28);  // weight for B
    int blocks = n / 8; // number of full 8-pixel groups
    int tail = n % 8;   // leftover pixels (<= 0 when n <= 0, so the tail loop is skipped)

    for (i = 0; i < blocks; i++)
    {
        uint16x8_t temp;
        // Load 24 bytes and de-interleave them into three 8-lane vectors:
        // val[0] = R, val[1] = G, val[2] = B.
        uint8x8x3_t rgb = vld3_u8 (src);
        uint8x8_t result;

        temp = vmull_u8 (rgb.val[0], rfac);       // temp  = R * 77  (widened to 16 bits)
        temp = vmlal_u8 (temp, rgb.val[1], gfac); // temp += G * 151
        temp = vmlal_u8 (temp, rgb.val[2], bfac); // temp += B * 28
        result = vshrn_n_u16 (temp, 8);           // shift each 16-bit lane right by 8 and narrow to 8 bits
        vst1_u8 (dest, result);                   // store 8 gray bytes
        src += 8 * 3;
        dest += 8;
    }

    // Scalar tail: convert the pixels that did not fill a whole 8-pixel group.
    for (i = 0; i < tail; i++)
    {
        int y = (src[0] * 77) + (src[1] * 151) + (src[2] * 28);
        *dest++ = (uint8_t)(y >> 8);
        src += 3;
    }
}
// Hand-written NEON assembly version of the grayscale conversion (32-bit ARM).
//
// dest:      receives one gray byte per pixel.
// src:       numPixels pixels, 4 bytes each (vld4.8 de-interleaves four byte
//            planes per pixel). The weights applied are byte0 * 28 (blue),
//            byte1 * 151 (green), byte2 * 77 (red); byte3 is loaded but unused.
//            NOTE(review): this is a 4-byte, blue-first layout, unlike the 3-byte
//            R,G,B layout of the other converters in this file -- confirm that
//            this is the intended input format.
// numPixels: pixel count; nothing is read or written when numPixels <= 0.
//
// Pixels are processed 8 at a time in assembly; the remaining numPixels % 8
// pixels are converted by a scalar loop with the same weights.
static void neon_asm_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int numPixels)
{
    int blocks = numPixels / 8; // full 8-pixel groups handled by the asm loop
    int tail = numPixels % 8;   // leftover pixels handled in C below
    int i;

    // The asm loop tests its counter only after the first iteration, so it
    // must not be entered with a zero (or negative) count.
    if (blocks > 0)
    {
        __asm__ volatile(
            // Build the three per-channel weight vectors.
            "mov r4, #28 \n"   // blue channel multiplier
            "mov r5, #151 \n"  // green channel multiplier
            "mov r6, #77 \n"   // red channel multiplier
            "vdup.8 d4, r4 \n"
            "vdup.8 d5, r5 \n"
            "vdup.8 d6, r6 \n"
            // Numeric local label: stays unique even if this function is
            // inlined at several call sites.
            "1: \n"
            // Load 8 pixels (32 bytes), de-interleaved into d0..d3.
            "vld4.8 {d0-d3}, [%1]! \n"
            // Weighted sum, widened to 16 bits per lane.
            "vmull.u8 q7, d0, d4 \n"
            "vmlal.u8 q7, d1, d5 \n"
            "vmlal.u8 q7, d2, d6 \n"
            // Divide q7 by 256, narrow to 8 bits in d7, and store 8 gray bytes.
            "vshrn.u16 d7, q7, #8 \n"
            "vst1.8 {d7}, [%0]! \n"
            "subs %2, %2, #1 \n" // decrement the remaining group count
            "bne 1b \n"          // repeat until the count reaches zero
            // All three operands are modified by the asm (post-incremented
            // pointers, decremented counter), so they are read-write operands.
            : "+r"(dest), "+r"(src), "+r"(blocks)
            :
            // q0-q3 cover d0-d7; q7 covers d14/d15. Condition flags are changed
            // by subs, and memory is read and written through the pointers.
            : "r4", "r5", "r6", "q0", "q1", "q2", "q3", "q7", "cc", "memory"
        );
    }

    // Scalar tail: dest and src were advanced past the vectorised part above.
    for (i = 0; i < tail; i++)
    {
        int y = (src[0] * 28) + (src[1] * 151) + (src[2] * 77);
        *dest++ = (uint8_t)(y >> 8);
        src += 4;
    }
}
|