源碼是官方的2014.4 TRD工程里的,整個工程是基于zc702板子的,但手里只有塊小zybo >_< 里面的硬件設計很有參考價值。最近想用FPGA加速surf算法,先在這分析下TRD工程里sobel edge detection的例程。
wiki
Top Function
這里不同于xapp1167直接調用hls::cv庫函數的做法,sobel邊緣提取算法是重新實現的,更方便了解hls的算法實現的特點。
// Top-level HLS function of the edge-detection core.
//
// Converts the incoming AXI4-Stream video into an hls::Mat stream, runs
// sobel_filter_core() on it and converts the result back to AXI4-Stream.
//
// Parameters:
//   video_in / video_out  - AXI4-Stream video input and output.
//   rows, cols            - image size, passed to both YUV_IMAGE objects and
//                           to sobel_filter_core().
//   C_XRiCj               - X-direction kernel coefficient for row i, column j.
//   C_YRiCj               - Y-direction kernel coefficient for row i, column j.
//   c_high_thresh,
//   c_low_thresh,
//   c_invert              - thresholding / inversion controls, forwarded
//                           unchanged to sobel_filter_core().
// All scalar arguments are exposed as registers on the CONTROL_BUS AXI4-Lite
// bundle by the pragmas below.
void image_filter(AXI_STREAM& video_in, AXI_STREAM& video_out, int rows, int cols,
int C_XR0C0, int C_XR0C1, int C_XR0C2, int C_XR1C0, int C_XR1C1, int C_XR1C2, int C_XR2C0, int C_XR2C1, int C_XR2C2,
int C_YR0C0, int C_YR0C1, int C_YR0C2, int C_YR1C0, int C_YR1C1, int C_YR1C2, int C_YR2C0, int C_YR2C1, int C_YR2C2,
int c_high_thresh, int c_low_thresh, int c_invert)
{
//Create AXI streaming interfaces for the core
//The axis interfaces below carry the streamed image data in and out.
#pragma HLS INTERFACE axis port=video_in bundle=INPUT_STREAM
#pragma HLS INTERFACE axis port=video_out bundle=OUTPUT_STREAM
//rows and cols are registers on the AXI4-Lite bus (offsets 0x14 and 0x1C) so
//the processed image size can be changed at run time (the maximum image size
//is 1920*1080). The function's return port is mapped to the same bundle.
#pragma HLS INTERFACE s_axilite port=rows bundle=CONTROL_BUS offset=0x14
#pragma HLS INTERFACE s_axilite port=cols bundle=CONTROL_BUS offset=0x1C
#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
//#pragma HLS INTERFACE ap_stable port=rows
//#pragma HLS INTERFACE ap_stable port=cols
//X- and Y-direction filter kernels of the Sobel operator, exposed as registers
//so the PS side can change the kernel (for example to a Prewitt operator).
#pragma HLS INTERFACE s_axilite port= C_XR0C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR0C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR0C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR1C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR1C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR1C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR2C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR2C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_XR2C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR0C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR0C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR0C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR1C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR1C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR1C2 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR2C0 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR2C1 bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= C_YR2C2 bundle=CONTROL_BUS
//High/low thresholds and the invert flag.
#pragma HLS INTERFACE s_axilite port= c_high_thresh bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= c_low_thresh bundle=CONTROL_BUS
#pragma HLS INTERFACE s_axilite port= c_invert bundle=CONTROL_BUS
//img_0 carries the input to the filter stage, img_1 carries its output.
YUV_IMAGE img_0(rows, cols);
YUV_IMAGE img_1(rows, cols);
//Request dataflow scheduling for the three stages that follow.
#pragma HLS dataflow
//Convert the AXI4-Stream input into an hls::Mat (hls::Mat is itself a stream of pixels).
hls::AXIvideo2Mat(video_in, img_0);
//sobel edge detection implement
sobel_filter_core(img_0, img_1, rows, cols,
C_XR0C0, C_XR0C1, C_XR0C2, C_XR1C0, C_XR1C1, C_XR1C2, C_XR2C0, C_XR2C1, C_XR2C2,
C_YR0C0, C_YR0C1, C_YR0C2, C_YR1C0, C_YR1C1, C_YR1C2, C_YR2C0, C_YR2C1, C_YR2C2,
c_high_thresh, c_low_thresh, c_invert);
//Convert the hls::Mat result back to AXI4-Stream for output.
hls::Mat2AXIvideo(img_1, video_out);
}
top function 是一個標準的hls 圖像處理結構,具體內容請參看xapp1167文檔。
sobel_filter_core
/**
 * Streaming 3x3 edge-detection stage.
 *
 * Reads rows*cols pixels from `src`, stores channel 0 of each pixel in a
 * line buffer, slides a 3x3 window over the buffered lines and writes
 * rows*cols result pixels to `dst`.
 *
 * Both loops run one iteration past the image size: a result is emitted only
 * when row > 0 and col > 0, so the output trails the input by one row and one
 * column and the extra iterations flush the last row and column.
 *
 * Border pixels (first/last row and column of the output) are not filtered;
 * they are emitted with val[0] = 0 and val[1] = 128.
 *
 * @param src            input pixel stream
 * @param dst            output pixel stream
 * @param rows, cols     image size in pixels
 * @param C_XRiCj        X-direction kernel coefficient, row i / column j
 * @param C_YRiCj        Y-direction kernel coefficient, row i / column j
 * @param c_high_thresh  forwarded to sobel_operator() as the high threshold
 * @param c_low_thresh   forwarded to sobel_operator() as the low threshold
 * @param c_invert       forwarded to sobel_operator() as the invert flag
 */
void sobel_filter_core(YUV_IMAGE& src, YUV_IMAGE& dst, int rows, int cols,
        int C_XR0C0, int C_XR0C1, int C_XR0C2, int C_XR1C0, int C_XR1C1, int C_XR1C2, int C_XR2C0, int C_XR2C1, int C_XR2C2,
        int C_YR0C0, int C_YR0C1, int C_YR0C2, int C_YR1C0, int C_YR1C1, int C_YR1C2, int C_YR2C0, int C_YR2C1, int C_YR2C2,
        int c_high_thresh, int c_low_thresh, int c_invert)
{
    // For reference, the article quotes the project's declarations as:
    //   typedef hls::Window<3, 3, unsigned char>             Y_WINDOW;
    //   typedef hls::LineBuffer<3, MAX_WIDTH, unsigned char> Y_BUFFER;
    Y_BUFFER buff_A;  // three buffered image lines (channel 0 only)
    Y_WINDOW buff_C;  // 3x3 processing window handed to sobel_operator()

    for (int row = 0; row < rows + 1; row++) {
        for (int col = 0; col < cols + 1; col++) {
#pragma HLS loop_flatten off
#pragma HLS dependence variable=&buff_A false
#pragma HLS PIPELINE II = 1
            // Pragmas above: keep the two loops un-flattened, declare that
            // accesses to buff_A carry no dependence, and pipeline the loop
            // body with an initiation interval of 1.

            // Temporaries so each line-buffer value is read only once.
            unsigned char temp;
            YUV_PIXEL tempx;

            // Shift this column of the line buffer and capture row 0 right
            // after the shift, before the new pixel is inserted below.
            if (col < cols) {
                buff_A.shift_down(col);
                temp = buff_A.getval(0, col);
            }

            // Only rows*cols real pixels exist; the extra row/column
            // iterations must not read from the input stream.
            if (col < cols && row < rows) {
                YUV_PIXEL new_pix;
                src >> new_pix;
                tempx = new_pix;
                buff_A.insert_bottom(tempx.val[0], col);
            }

            // Make room in the window for the new column.
            buff_C.shift_right();

            // Fill column 0 of the window. Only channel 0 of the pixel is
            // kept. On the flush row (row == rows) tempx has not been
            // assigned, but that iteration always emits the fixed border
            // pixel, so the window contents are not used there.
            if (col < cols) {
                buff_C.insert(buff_A.getval(2, col), 2, 0);
                buff_C.insert(temp, 1, 0);
                buff_C.insert(tempx.val[0], 0, 0);
            }

            YUV_PIXEL edge;

            // The operator is applied to the inner part of the image only;
            // the design assumes there are no edges on the image boundary.
            if (row <= 1 || col <= 1 || row > (rows - 1) || col > (cols - 1)) {
                edge.val[0] = 0;
                edge.val[1] = 128;
            } else {
                // The last Y coefficient must be C_YR2C2: every Y-kernel
                // entry comes from the C_Y* arguments.
                edge = sobel_operator(&buff_C,
                        C_XR0C0, C_XR0C1, C_XR0C2, C_XR1C0, C_XR1C1, C_XR1C2, C_XR2C0, C_XR2C1, C_XR2C2,
                        C_YR0C0, C_YR0C1, C_YR0C2, C_YR1C0, C_YR1C1, C_YR1C2, C_YR2C0, C_YR2C1, C_YR2C2,
                        c_high_thresh, c_low_thresh, c_invert);
            }

            // The output is offset from the input to account for the line
            // buffer, so nothing is emitted on the first row/column iteration.
            if (row > 0 && col > 0) {
                dst << edge;
            }
        }
    }
}
參照ug902文檔
hls::LineBuffer
The main features of the LineBuffer class are:
• Support for all data types through parameterization
• User-defined number of rows and columns
• Automatic banking of rows into separate memory banks for increased memory bandwidth
• Provides all the methods for using and debugging line buffers in an algorithmic design
hls::Window
• Support for all data types through parameterization
• User-defined number of rows and columns
• Automatic partitioning into individual registers for maximum bandwidth
• Provides all the methods to use and debug memory windows in the context of an algorithm
由于采用的是stream流的形式處理圖像數據,所以并不能訪問圖像上任意一點的值。為了方便濾波模板的操作,提供了這兩種數據結構。這里的sobel算子要求有一個3*3大小的窗與模板相乘,要產生這樣的窗則需要一個三行數據的行存。兩層的row、col循環中,每次都對行存中col列的數據做shift_down操作,然后將新得到的數據插入到底部。注意這里和ug902的文檔有出入:查看源碼,發現ug902的文檔上是錯的,源碼中定義的top和bottom方向如下:
/* Member functions of LineBuffer class */
/*
 *     +---+---+-... ...-+---+---+
 * R-1 |   |   |         |   |   |
 *     +---+---+-... ...-+---+---+
 * R-2 |   |   |         |   |   |
 *     +---+---+-... ...-+---+---+
 *       ...     ... ...   ...
 *     +---+---+-... ...-+---+---+
 *   1 |   |   |         |   |   |
 *     +---+---+-... ...-+---+---+
 *   0 |   |   |         |   |   |
 *     +---+---+-... ...-+---+---+
 *       0   1   ... ...  C-2 C-1   (origin is at bottom-left point)
 */
可能是官方的文檔(v2014.1)沒有更新。
sobel_operator
x、y方向模板與窗相乘,沒啥可說的了。
/**
 * Applies the X and Y 3x3 kernels to a window and converts the result into
 * one output pixel.
 *
 * Steps:
 *   1. x_weight / y_weight = sum over the window of value * coefficient.
 *   2. edge_weight = |x_weight| + |y_weight|.
 *   3. edge_val = 255 - edge_weight when edge_weight < 255, otherwise 0.
 *   4. Thresholding: above the high threshold -> 255, below the low
 *      threshold -> 0, anything in between is left unchanged.
 *   5. If invert == 1, edge_val = 255 - edge_val.
 *
 * @param window      3x3 window of values to filter
 * @param XRiCj       X-direction kernel coefficient, row i / column j
 * @param YRiCj       Y-direction kernel coefficient, row i / column j
 * @param high_thesh  high threshold (step 4)
 * @param low_thresh  low threshold (step 4)
 * @param invert      1 to invert the result (step 5)
 * @return pixel with val[0] = edge value and val[1] = 128
 */
YUV_PIXEL sobel_operator(Y_WINDOW *window,
        int XR0C0, int XR0C1, int XR0C2, int XR1C0, int XR1C1, int XR1C2, int XR2C0, int XR2C1, int XR2C2,
        int YR0C0, int YR0C1, int YR0C2, int YR1C0, int YR1C1, int YR1C2, int YR2C0, int YR2C1, int YR2C2,
        int high_thesh, int low_thresh, int invert)
{
    // Coefficients are kept as 8-bit values. They are stored as `signed char`
    // (plain `char` may be unsigned, which would corrupt negative
    // coefficients) and cast explicitly, because an implicit int -> char
    // conversion inside a brace initializer is a narrowing conversion.
    const signed char x_op[3][3] = {
        {static_cast<signed char>(XR0C0), static_cast<signed char>(XR0C1), static_cast<signed char>(XR0C2)},
        {static_cast<signed char>(XR1C0), static_cast<signed char>(XR1C1), static_cast<signed char>(XR1C2)},
        {static_cast<signed char>(XR2C0), static_cast<signed char>(XR2C1), static_cast<signed char>(XR2C2)}};
    const signed char y_op[3][3] = {
        {static_cast<signed char>(YR0C0), static_cast<signed char>(YR0C1), static_cast<signed char>(YR0C2)},
        {static_cast<signed char>(YR1C0), static_cast<signed char>(YR1C1), static_cast<signed char>(YR1C2)},
        {static_cast<signed char>(YR2C0), static_cast<signed char>(YR2C1), static_cast<signed char>(YR2C2)}};

    short x_weight = 0;
    short y_weight = 0;

    // Approximate the gradients in the X and Y directions.
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            x_weight = x_weight + (window->getval(i, j) * x_op[i][j]);
            y_weight = y_weight + (window->getval(i, j) * y_op[i][j]);
        }
    }

    const short edge_weight = ABS(x_weight) + ABS(y_weight);

    // Map the magnitude to an 8-bit value: a larger magnitude gives a
    // smaller value, saturating at 0.
    unsigned char edge_val;
    if (edge_weight < 255)
        edge_val = (255 - (unsigned char)(edge_weight));
    else
        edge_val = 0;

    // Edge thresholding
    if (edge_val > high_thesh)
        edge_val = 255;
    else if (edge_val < low_thresh)
        edge_val = 0;

    // Optional inversion
    if (invert == 1)
        edge_val = 255 - edge_val;

    YUV_PIXEL pixel;
    pixel.val[0] = edge_val;
    pixel.val[1] = 128;
    return pixel;
}
文章轉載自:Thssasuke的博客
評論
查看更多