图像处理中的边界检查

在图像处理中应用任何滤波器时,我都需要处理边界条件。我的做法是对边框进行外推(复制边缘像素),生成一幅带有新边界的扩展图像。例如,对于 4×3(宽×高)的输入:

//Input
int image[4][3] =
1 2 3 4
2 4 6 8
3 6 9 12

//Output
int extensionimage[6][5] =
1 1 2 3 4 4
1 1 2 3 4 4
2 2 4 6 8 8
3 3 6 9 12 12
3 3 6 9 12 12

我的代码:

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <new>

 void padd_border(int *img, int *extension, int width, int height);

 /*
  * Demo driver: builds a 4x3 test image whose pixel (i, j) holds (i+1)*(j+1),
  * prints it, and pads it by one replicated pixel on every side.
  */
 int main() {
     const int width = 4;
     const int height = 3;

     // Plain `new` throws on failure instead of returning null, so a null
     // check is only meaningful with the nothrow form.
     int *img = new (std::nothrow) int[static_cast<size_t>(width) * height];
     if (!img) {
         return EXIT_FAILURE;
     }

     for (int j = 0; j < height; j++) {
         for (int i = 0; i < width; i++) {
             img[j * width + i] = (i + 1) * (j + 1);
             printf("%d\t", img[j * width + i]);
         }
     }

     // Allocate the padded buffer; the trailing () zero-initialises it.
     const size_t padded_count =
         static_cast<size_t>(width + 2) * static_cast<size_t>(height + 2);
     int *extension = new (std::nothrow) int[padded_count]();
     if (!extension) {
         delete[] img;  // do not leak the source image on the error path
         return EXIT_FAILURE;
     }

     // Pad the input for border conditions.
     padd_border(img, extension, width, height);

     // HERE: "extension" would be used as the filter input.

     delete[] extension;
     delete[] img;
     return 0;
 }

 /*
  * Copies a width x height image into the centre of a (width+2) x (height+2)
  * buffer and fills the one-pixel border by replicating the nearest edge pixel.
  *
  * image     - source pixels, row-major, width * height ints
  * extension - destination, row-major, (width + 2) * (height + 2) ints;
  *             must not overlap image (memcpy is used)
  * width     - source width in pixels
  * height    - source height in pixels
  *
  * Does nothing when a pointer is null or a dimension is not positive, since
  * there is then no edge pixel to replicate.
  */
 void padd_border(int *image, int *extension, int width, int height) {
     if (!image || !extension || width <= 0 || height <= 0) {
         return;
     }

     const size_t src_stride = static_cast<size_t>(width);
     const size_t dst_stride = src_stride + 2;
     const size_t rows = static_cast<size_t>(height);

     // Interior rows: copy the source row shifted right by one, then
     // replicate its first and last pixel into the left/right border columns.
     for (size_t row = 0; row < rows; ++row) {
         const int *src = image + src_stride * row;
         int *dst = extension + dst_stride * (row + 1);

         memcpy(dst + 1, src, src_stride * sizeof(int));
         dst[0] = src[0];
         dst[dst_stride - 1] = src[src_stride - 1];
     }

     // Top border row: duplicate the first padded image row (corners included).
     memcpy(extension, extension + dst_stride, dst_stride * sizeof(int));

     // Bottom border row: duplicate the last padded image row.
     memcpy(extension + dst_stride * (rows + 1),
            extension + dst_stride * rows,
            dst_stride * sizeof(int));
 }

我的问题:

1)我不想额外创建 "extension"(扩展)缓冲区,而是想直接复用原图像缓冲区来完成外推。这可行吗?

2)如何使用 NEON 来实现上述代码?

根据PaulR伪代码更改代码后,我得到一些奇怪的结果:

编辑:处理边框时遇到的运行时问题
我的输入:

 221 220 221 223 230 233 234 235 ..
  71  73  70  70  92 130 141 143 ..

我希望通过下面的运算得到输出:

  -1*v_m1_m1 + 0*v_m1_0 + 1*v_m1_p1
  -1*v_0_m1  + 0*v_0_0  + 1*v_0_p1    -> V_OUT
  -1*v_p1_m1 + 0*v_p1_0 + 1*v_p1_p1

修改边框代码后,我得到的值是:

  221 221 221 221 221 220 221 223 230 233 234 235
  221 221 221 221 221 220 221 223 230 233 234 235
   71  71  71  71  71  73  70  70  92 130 141 143

在标量代码中,若要计算位置 (i, j) = (0, 0) 处的像素 221,加上边框后的 3×3 邻域如下:

  221 221 220
  221 221 220
   71  71  73

但在 NEON 向量化版本中,我得到的结果是错误的:

 v_m1_m1.0  v_m1_0.1  v_m1_p1.2
 v_0_m1.0   v_0_0.1   v_0_p1.2
 v_p1_m1.0  v_p1_0.1  v_p1_p1.2

 221 221 230
 221 221 230
  71  71  92

我的伪代码:

 // 3x3 neighbourhood operation with replicated edge pixels, 4 x 32-bit lanes
 // per NEON vector. Assumes ncols is a multiple of 4 and that src_width is the
 // row stride of src in elements (presumably equal to ncols -- confirm).
 for i = 0 to nrows - 1
     // init row pointers; row indices are clamped so that the first/last row
     // is reused as its own upper/lower neighbour
     p_row_m1 = src + src_width * MAX(i-1, 0);          // pointing to minus1 row
     p_row_0  = src + src_width * i;                    // pointing to current row
     p_row_p1 = src + src_width * MIN(i+1, nrows-1);    // pointing to plus1 row (clamp against the ROW count)

     v_m1_m1 = vdupq_n_u32(p_row_m1[0]);   // fill left vector from src[i-1][0]
     v_0_m1  = vdupq_n_u32(p_row_0[0]);    // fill left vector from src[i][0]
     v_p1_m1 = vdupq_n_u32(p_row_p1[0]);   // fill left vector from src[i+1][0]

     v_m1_0 = vld1q_u32(&p_row_m1[0]);     // load center vector from src[i-1][0..3]
     v_0_0  = vld1q_u32(&p_row_0[0]);      // load center vector from src[i][0..3]
     v_p1_0 = vld1q_u32(&p_row_p1[0]);     // load center vector from src[i+1][0..3]

     // all vectors except the last one of the row; the last one is handled
     // after the loop so that the load at j+4 never runs past the row end
     for j = 0 to (ncols - 8) step 4       // assuming 4 elements per SIMD vector
         v_m1_p1 = vld1q_u32(&p_row_m1[j+4]);   // load right vector from src[i-1][j+4..j+7]
         v_0_p1  = vld1q_u32(&p_row_0[j+4]);    // load right vector from src[i][j+4..j+7]
         v_p1_p1 = vld1q_u32(&p_row_p1[j+4]);   // load right vector from src[i+1][j+4..j+7]

         //
         // you now have a 3x3 arrangement of vectors on which
         // you can perform a neighbourhood operation and generate
         // 4 output pixels for the current iteration:
         //
         //    v_m1_m1    v_m1_0    v_m1_p1
         //    v_0_m1     v_0_0     v_0_p1
         //    v_p1_m1    v_p1_0    v_p1_p1
         //
         //                 |
         //                 V
         //
         //               v_out
         //
         // NOTE: the left/right neighbours of the 4 pixels held in v_0_0 are
         // NOT lanes of v_0_m1 / v_0_p1 taken as-is; they are one-lane shifts
         // across adjacent vectors, e.g. for the current row:
         //    left  = vextq_u32(v_0_m1, v_0_0, 3)   // pixels j-1 .. j+2
         //    right = vextq_u32(v_0_0, v_0_p1, 1)   // pixels j+1 .. j+4
         // (same for the m1 and p1 rows)
         //

         vst1q_s32(&image_out[i][j], v_out)     // store output vector at image_out[i][j..j+3]
                                                // (pointer first, then value; v_out must be int32x4_t)

         // shuffle vectors so that we can use them on next iteration
         v_m1_m1 = v_m1_0
         v_m1_0  = v_m1_p1
         v_0_m1  = v_0_0
         v_0_0   = v_0_p1
         v_p1_m1 = v_p1_0
         v_p1_0  = v_p1_p1
     end_for

     // for final iteration we need to handle right edge pixels...
     v_m1_p1 = vdupq_n_u32(p_row_m1[ncols-1])   // fill right vector from src[i-1][ncols-1]
     v_0_p1  = vdupq_n_u32(p_row_0[ncols-1])    // fill right vector from src[i][ncols-1]
     v_p1_p1 = vdupq_n_u32(p_row_p1[ncols-1])   // fill right vector from src[i+1][ncols-1]

     // calculate v_out as above

     vst1q_s32(&image_out[i][ncols-4], v_out)   // store output vector at image_out[i][ncols-4..ncols-1]
 end_for

下面是一段伪代码,演示如何用 SIMD 执行 3×3 邻域运算,并通过复制边缘像素来处理边界。输入图像是 image[nrows][ncols],输出图像是 image_out[nrows][ncols]。

 // 3x3 neighbourhood operation with replicated edge pixels, 16 elements per
 // SIMD vector. Assumes ncols is a multiple of 16.
 for i = 0 to nrows - 1
     // init row pointers; row indices are clamped so that the first/last row
     // is reused as its own upper/lower neighbour
     p_row_m1 = &image[max(i-1, 0)][0]          // pointer to start of row i-1
     p_row_0  = &image[i][0]                    // pointer to start of row i
     p_row_p1 = &image[min(i+1, nrows-1)][0]    // pointer to start of row i+1 (clamp against the ROW count)

     v_m1_m1 = init_vec(p_row_m1[0])            // fill left vector from image[i-1][0]
     v_0_m1  = init_vec(p_row_0[0])             // fill left vector from image[i][0]
     v_p1_m1 = init_vec(p_row_p1[0])            // fill left vector from image[i+1][0]

     v_m1_0 = load_vec(&p_row_m1[0])            // load centre vector from image[i-1][0..15]
     v_0_0  = load_vec(&p_row_0[0])             // load centre vector from image[i][0..15]
     v_p1_0 = load_vec(&p_row_p1[0])            // load centre vector from image[i+1][0..15]

     // all vectors except the last one of the row; the last one is handled
     // after the loop so that the load at j+16 never runs past the row end
     for j = 0 to (ncols - 32) step 16          // assuming 16 elements per SIMD vector
         v_m1_p1 = load_vec(&p_row_m1[j+16])    // load right vector from image[i-1][j+16..j+31]
         v_0_p1  = load_vec(&p_row_0[j+16])     // load right vector from image[i][j+16..j+31]
         v_p1_p1 = load_vec(&p_row_p1[j+16])    // load right vector from image[i+1][j+16..j+31]

         //
         // you now have a 3x3 arrangement of vectors on which
         // you can perform a neighbourhood operation and generate
         // 16 output pixels for the current iteration:
         //
         //    v_m1_m1    v_m1_0    v_m1_p1
         //    v_0_m1     v_0_0     v_0_p1
         //    v_p1_m1    v_p1_0    v_p1_p1
         //
         //                 |
         //                 V
         //
         //               v_out
         //

         store_vec(v_out, &image_out[i][j])     // store output vector at image_out[i][j..j+15]

         // shuffle vectors so that we can use them on next iteration
         v_m1_m1 = v_m1_0
         v_m1_0  = v_m1_p1
         v_0_m1  = v_0_0
         v_0_0   = v_0_p1
         v_p1_m1 = v_p1_0
         v_p1_0  = v_p1_p1
     end_for

     // for final iteration we need to handle right edge pixels...
     v_m1_p1 = init_vec(p_row_m1[ncols-1])      // fill right vector from image[i-1][ncols-1]
     v_0_p1  = init_vec(p_row_0[ncols-1])       // fill right vector from image[i][ncols-1]
     v_p1_p1 = init_vec(p_row_p1[ncols-1])      // fill right vector from image[i+1][ncols-1]

     // calculate v_out as above

     store_vec(v_out, &image_out[i][ncols-16])  // store output vector at image_out[i][ncols-16..ncols-1]
 end_for

请注意,这假定每个矢量16个像素,并且ncols是16的倍数。