1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
44 This is a variation of
45 "Stereo Processing by Semiglobal Matching and Mutual Information"
46 by Heiko Hirschmuller.
48 We match blocks rather than individual pixels, thus the algorithm is called
49 SGBM (Semi-global block matching)
// Elementary types used throughout the SGBM implementation.
// NOTE(review): this listing is elided (embedded source line numbers jump),
// so surrounding context (namespace, includes) is not visible here.
58 typedef uchar PixType;   // 8-bit pre-filtered pixel sample
59 typedef short CostType;  // 16-bit matching-cost accumulator (saturating adds are used on it)
60 typedef short DispType;  // 16-bit output disparity value
// NR  - total number of aggregation directions the buffers are sized for;
// NR2 - directions actually processed per pass (two passes cover all of them).
62 enum { NR = 16, NR2 = NR/2 };
// Default constructor: zero/neutral parameters (a follow-up call or the
// parameterized constructor is expected to set real values).
// NOTE(review): body is elided here — lines 65 and 67-77 of the original file
// (including braces and the remaining member initializations) are not visible.
64 StereoSGBM::StereoSGBM()
66     minDisparity = numberOfDisparities = 0;
72     speckleWindowSize = 0;
// Parameterized constructor: copies each tuning knob into the corresponding
// public member. No validation is performed here (defaults/clamping happen in
// computeDisparitySGBM instead — see e.g. its handling of SADWindowSize, P1, P2).
// NOTE(review): the listing is elided — original lines 81-82 (trailing
// parameters such as fullDP and the opening brace) and 86-87 (P1/P2
// assignments) are missing from view; comments below cover only visible lines.
78 StereoSGBM::StereoSGBM( int _minDisparity, int _numDisparities, int _SADWindowSize,
79                     int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
80                     int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
83     minDisparity = _minDisparity;
84     numberOfDisparities = _numDisparities;
85     SADWindowSize = _SADWindowSize;
88     disp12MaxDiff = _disp12MaxDiff;
89     preFilterCap = _preFilterCap;
90     uniquenessRatio = _uniquenessRatio;
91     speckleWindowSize = _speckleWindowSize;
92     speckleRange = _speckleRange;
// Destructor. Its body is elided from this listing (lines 98-99 missing);
// presumably empty since members are value types + cv::Mat (RAII) — TODO confirm.
97 StereoSGBM::~StereoSGBM()
102 For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
103 and for each disparity minD<=d<maxD the function
104 computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
105 row1[x] and row2[x-d]. The subpixel algorithm from
106 "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
107 is used, hence the suffix BT.
109 the temporary buffer should contain width2*2 elements
// Computes the per-pixel Birchfield-Tomasi sampling-insensitive dissimilarity
// for one image row: for every x in [minX1, maxX1) and every disparity
// d in [minD, maxD) it accumulates into cost[(x - minX1)*D + (d - minD)]
// (D = maxD - minD). See the block comment above for the index conventions.
//   cost   - output cost array, width1*D entries, zeroed here before use
//   buffer - scratch: holds the half-pixel min/max rows (width2*2 elements)
//            followed by the pre-filtered rows prow1/prow2
//   tab    - clipping LUT built by the caller (clipTab), indexed with offset tabOfs
// NOTE(review): the listing is elided (braces, the grayscale/color if/else and
// the closing of several loops are not visible); comments describe visible lines.
111 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
112                         int minD, int maxD, CostType* cost,
113                         PixType* buffer, const PixType* tab,
114                         int tabOfs, int ftzero )
116     int x, c, width = img1.cols, cn = img1.channels();
// Valid x-range in img1 given the disparity range, and the matching range in img2.
117     int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
118     int minX2 = max(minX1 - maxD, 0), maxX2 = min(maxX1 - minD, width);
119     int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
120     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
// prow1/prow2 live behind the width2*2 half-pixel scratch area; each holds
// cn*2 planes of width pixels: x-derivative planes first, raw-intensity planes after.
121     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
// Initialize the left/right border pixel of every plane with the LUT zero value.
125     for( c = 0; c < cn*2; c++ )
127         prow1[width*c] = prow1[width*c + width-1] =
128         prow2[width*c] = prow2[width*c + width-1] = tab[0];
// Step to the previous (n*) / next (s*) row, clamped at the image borders.
131     int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
132     int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
// Grayscale branch (presumably guarded by cn == 1 — the guard line is elided):
// Sobel-like horizontal derivative through the clip LUT; note prow2 is stored
// MIRRORED (index width-1-x) so that later prow2[width-1-x+d] addresses img2 at x-d.
136     for( x = 1; x < width-1; x++ )
138         prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
139         prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
141         prow1[x+width] = row1[x];
142         prow2[width-1-x+width] = row2[x];
// Color branch: same derivative + raw-intensity planes, one pair per channel (B,G,R).
147     for( x = 1; x < width-1; x++ )
149         prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
150         prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
151         prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
153         prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
154         prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
155         prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
157         prow1[x+width*3] = row1[x*3];
158         prow1[x+width*4] = row1[x*3+1];
159         prow1[x+width*5] = row1[x*3+2];
161         prow2[width-1-x+width*3] = row2[x*3];
162         prow2[width-1-x+width*4] = row2[x*3+1];
163         prow2[width-1-x+width*5] = row2[x*3+2];
// Zero the output, then bias the pointer so cost[x*D + d] can be used directly.
167     memset( cost, 0, width1*D*sizeof(cost[0]) );
170     cost -= minX1*D + minD; // simplify the cost indices inside the loop
// volatile: prevent the compiler from hoisting/folding the CPU-feature test.
173     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
// First accumulation path (presumably the derivative-based BT cost; the guard
// that selects between this and the fallback at line 243 is elided).
// diff_scale: derivative planes (c < cn) contribute at full weight, raw
// intensity planes are down-weighted by >> 2.
177     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
179         int diff_scale = c < cn ? 0 : 2;
// Precompute, per img2 pixel, the min (v0) and max (v1) over the pixel and its
// two half-pixel interpolated neighbors — the BT sampling-insensitive trick.
182         // v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
183         // v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
184         for( x = minX2; x < maxX2; x++ )
187             int vl = x > 0 ? (v + prow2[x-1])/2 : v;
188             int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
189             int v0 = min(vl, vr); v0 = min(v0, v);
190             int v1 = max(vl, vr); v1 = max(v1, v);
191             buffer[x] = (PixType)v0;
192             buffer[x + width2] = (PixType)v1;
195         for( x = minX1; x < maxX1; x++ )
// Same half-pixel min/max for the img1 pixel u.
198             int ul = x > 0 ? (u + prow1[x-1])/2 : u;
199             int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
200             int u0 = min(ul, ur); u0 = min(u0, u);
201             int u1 = max(ul, ur); u1 = max(u1, u);
// SSE2 path: 16 disparities per iteration; unaligned loads read the mirrored
// prow2/buffer at width-x-1+d, aligned loads/stores hit the cost rows.
206                 __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
207                 __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
208                 __m128i ds = _mm_cvtsi32_si128(diff_scale);
210                 for( int d = minD; d < maxD; d += 16 )
212                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
213                     __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
214                     __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
// BT dissimilarity: min of (how far u lies outside [v0,v1]) and (v outside [u0,u1]).
215                     __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
216                     __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
217                     __m128i diff = _mm_min_epu8(c0, c1);
219                     c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
220                     c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
// Widen bytes to shorts, shift by diff_scale and saturating-add into the cost.
222                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
223                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
// Scalar fallback, same formula as the SSE2 path above.
229                 for( int d = minD; d < maxD; d++ )
231                     int v = prow2[width-x-1 + d];
232                     int v0 = buffer[width-x-1 + d];
233                     int v1 = buffer[width-x-1 + d + width2];
234                     int c0 = max(0, u - v1); c0 = max(c0, v0 - u);
235                     int c1 = max(0, v - u1); c1 = max(c1, u0 - v);
237                     cost[x*D + d] = (CostType)(cost[x*D+d] + (min(c0, c1) >> diff_scale));
// Alternative accumulation path (selection guard elided): plain absolute
// difference |u - v|, without the BT half-pixel treatment.
243     for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
245         for( x = minX1; x < maxX1; x++ )
251                 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
253                 for( int d = minD; d < maxD; d += 16 )
255                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
// |u - v| via two saturating subtractions (no abs instruction for epu8).
256                     __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
257                     __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
258                     __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
260                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
261                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
267                 for( int d = minD; d < maxD; d++ )
269                     int v = prow2[width-1-x + d];
270                     cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
280 computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
281 that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
283 disp2full is the reverse disparity map, that is:
284 disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
286 note that disp1buf will have the same size as the roi and
287 disp2full will have the same size as img1 (or img2).
288 On exit disp2buf is not the final disparity, it is an intermediate result that becomes
289 final after all the tiles are processed.
291 the disparity in disp1buf is written with sub-pixel accuracy
292 (4 fractional bits, see CvStereoSGBM::DISP_SCALE),
293 using quadratic interpolation, while the disparity in disp2buf
294 is written as is, without interpolation.
296 disp2cost also has the same size as img1 (or img2).
297 It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
// Core semi-global block-matching driver (see the block comment above for the
// input/output contract). Pipeline per row: (1) box-aggregate the BT pixel
// costs into C using a sliding horizontal/vertical sum; (2) dynamic-programming
// cost aggregation L_r over NR2=8 directions split across 1 or 2 passes
// (params.fullDP selects 2 passes / all 16 directions); (3) on the final pass,
// winner-take-all over the summed cost S with a uniqueness check, left-right
// consistency check and sub-pixel parabola refinement.
// NOTE(review): this listing is elided — braces, several guards (if/else around
// the SIMD paths, NLR definition, x/d declarations) and some statements are
// missing; comments below are limited to what the visible lines establish.
299 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
300                                  Mat& disp1, const StereoSGBM& params,
// LSBTab[i] = index of the lowest set bit of i (for i in [1,255]); used to pick
// the first lane that achieved the minimal cost out of the SSE comparison mask.
304     static const uchar LSBTab[] =
306         0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
307         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
308         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
309         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
310         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
311         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
312         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
313         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
// volatile: keep the runtime CPU check from being folded away by the optimizer.
316     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
319     const int ALIGN = 16;
320     const int DISP_SHIFT = StereoSGBM::DISP_SHIFT;
321     const int DISP_SCALE = StereoSGBM::DISP_SCALE;
322     const CostType MAX_COST = SHRT_MAX;
// Sanitize parameters: defaults are substituted for non-positive values.
324     int minD = params.minDisparity, maxD = minD + params.numberOfDisparities;
326     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
327     int ftzero = max(params.preFilterCap, 15) | 1;
328     int uniquenessRatio = params.uniquenessRatio > 0 ? params.uniquenessRatio : 10;
329     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
// P2 is forced strictly greater than P1, as the SGM model requires.
330     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = max(params.P2 > 0 ? params.P2 : 5, P1+1);
331     int k, width = disp1.cols, height = disp1.rows;
332     int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
333     int D = maxD - minD, width1 = maxX1 - minX1;
334     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
335     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
336     int npasses = params.fullDP ? 2 : 1;
// Clipping LUT used by calcPixelCostBT: clamps derivatives to [-ftzero, ftzero],
// rebased to [0, 2*ftzero]. TAB_OFS centers the index range.
337     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
338     PixType clipTab[TAB_SIZE];
340     for( k = 0; k < TAB_SIZE; k++ )
341         clipTab[k] = (PixType)(min(max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
// Pre-fill the whole output with "invalid" (covers the x-border not processed below).
345     disp1 = Scalar::all(INVALID_DISP_SCALED);
// The SSE loops process 16 cost values at a time.
349     CV_Assert( D % 16 == 0 );
351     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
352     // if you change NR, please, modify the loop as well.
// D2: D padded by 16 so each per-direction row has guard cells for d-1/d+1 access.
353     int D2 = D+16, NRD2 = NR2*D2;
355     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
356     // for 8-way dynamic programming we need the current row and
357     // the previous row, i.e. 2 rows in total
359     const int LrBorder = NLR - 1;
361     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
362     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
363     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
364     size_t costBufSize = width1*D;
// fullDP keeps C/S for the whole image (needed by the second pass); otherwise one row.
365     size_t CSBufSize = costBufSize*(params.fullDP ? height : 1);
366     size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
367     int hsumBufNRows = SH2*2 + 2;
368     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
369     costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
370     CSBufSize*2*sizeof(CostType) + // C, S
371     width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
372     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
// (Re)allocate the single scratch buffer only when the existing one is too small.
374     if( !buffer.data || !buffer.isContinuous() ||
375         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
376         buffer.create(1, totalBufSize, CV_8U);
// Carve the scratch buffer into sub-arrays; layout must match totalBufSize above.
378     // summary cost over different (nDirs) directions
379     CostType* Cbuf = (CostType*)alignPtr(buffer.data, ALIGN);
380     CostType* Sbuf = Cbuf + CSBufSize;
381     CostType* hsumBuf = Sbuf + CSBufSize;
382     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
384     CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
385     DispType* disp2ptr = (DispType*)(disp2cost + width);
386     PixType* tempBuf = (PixType*)(disp2ptr + width);
388     // add P2 to every C(x,y). it saves a few operations in the inner loops
389     for( k = 0; k < width1*D; k++ )
390         Cbuf[k] = (CostType)P2;
// Pass 1 scans top-left -> bottom-right; pass 2 (fullDP) scans the reverse,
// covering the remaining aggregation directions.
392     for( int pass = 1; pass <= npasses; pass++ )
394         int x1, y1, x2, y2, dx, dy;
398             y1 = 0; y2 = height; dy = 1;
399             x1 = 0; x2 = width1; dx = 1;
403             y1 = height-1; y2 = -1; dy = -1;
404             x1 = width1-1; x2 = -1; dx = -1;
407         CostType *Lr[NLR]={0}, *minLr[NLR]={0};
409         for( k = 0; k < NLR; k++ )
411             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
412             // and will occasionally use negative indices with the arrays
413             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
414             // however, then the alignment will be imperfect, i.e. bad for SSE,
415             // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
416             Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
417             memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
418             minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*2;
419             memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
422         for( int y = y1; y != y2; y += dy )
425             DispType* disp1ptr = disp1.ptr<DispType>(y);
426             CostType* C = Cbuf + (!params.fullDP ? 0 : y*costBufSize);
427             CostType* S = Sbuf + (!params.fullDP ? 0 : y*costBufSize);
429             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
// Rows of pixel cost to (re)compute this iteration: all of [0, SH2] at y==0,
// then only the new row y+SH2 afterwards (classic sliding-window update).
431                 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
433                 for( k = dy1; k <= dy2; k++ )
// hsumBuf is a ring of hsumBufNRows horizontal-sum rows, keyed by k mod rows.
435                     CostType* hsumAdd = hsumBuf + (min(k, height-1) % hsumBufNRows)*costBufSize;
439                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
// Seed the horizontal sum at x==0 (left border pixel replicated SW2+1 times).
441                         memset(hsumAdd, 0, D*sizeof(CostType));
442                         for( x = 0; x <= SW2*D; x += D )
444                             int scale = x == 0 ? SW2 + 1 : 1;
445                             for( d = 0; d < D; d++ )
446                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
// Vertical sliding update: C(y) = C(y-1) + hsum(new row) - hsum(dropped row).
451                         const CostType* hsumSub = hsumBuf + (max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
452                         const CostType* Cprev = !params.fullDP || y == 0 ? C : C - costBufSize;
454                         for( x = D; x < width1*D; x += D )
// Horizontal sliding update, clamped at the row ends.
456                             const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
457                             const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
462                                 for( d = 0; d < D; d += 8 )
464                                     __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
465                                     __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
466                                     hv = _mm_adds_epi16(_mm_subs_epi16(hv,
467                                     _mm_load_si128((const __m128i*)(pixSub + d))),
468                                     _mm_load_si128((const __m128i*)(pixAdd + d)));
469                                     Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
470                                     _mm_load_si128((const __m128i*)(hsumSub + x + d))),
472                                     _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
473                                     _mm_store_si128((__m128i*)(C + x + d), Cx);
// Scalar equivalent of the SSE2 sliding update above.
479                                 for( d = 0; d < D; d++ )
481                                     int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
482                                     C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
// Branch for rows where only the horizontal sum is needed (guard elided).
489                         for( x = D; x < width1*D; x += D )
491                             const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
492                             const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
494                             for( d = 0; d < D; d++ )
495                                 hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
// y == 0 initialization: accumulate the seed rows into C with border replication.
502                     int scale = k == 0 ? SH2 + 1 : 1;
503                     for( x = 0; x < width1*D; x++ )
504                         C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
508                 // also, clear the S buffer
509                 for( k = 0; k < width1*D; k++ )
// Guard cells outside [0, width1) must stay zero for the DP recurrences below.
513             // clear the left and the right borders
514             memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
515             memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
516             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
517             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
520             [formula 13 in the paper]
521             compute L_r(p, d) = C(p, d) +
525             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
526             where p = (x,y), r is one of the directions.
527             we process all the directions at once:
// DP over 4 of the 8 directions per pass: r0 = horizontal (previous x, same
// row, Lr[0]), r1..r3 = the three directions coming from the previous row (Lr[1]).
537             for( x = x1; x != x2; x += dx )
539                 int xm = x*NR2, xd = xm*D2;
// delta_k = min over d of L_rk at the predecessor pixel, plus P2 (large-jump penalty).
541                 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
542                 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
544                 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
545                 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
546                 CostType* Lr_p2 = Lr[1] + xd + D2*2;
547                 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
// Sentinels at d = -1 and d = D so the d±1 terms never win at the range ends.
549                 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
550                 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
552                 CostType* Lr_p = Lr[0] + xd;
553                 const CostType* Cp = C + x*D;
554                 CostType* Sp = S + x*D;
// SSE2 path: 8 disparities, all 4 directions per iteration.
559                     __m128i _P1 = _mm_set1_epi16((short)P1);
561                     __m128i _delta0 = _mm_set1_epi16((short)delta0);
562                     __m128i _delta1 = _mm_set1_epi16((short)delta1);
563                     __m128i _delta2 = _mm_set1_epi16((short)delta2);
564                     __m128i _delta3 = _mm_set1_epi16((short)delta3);
565                     __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
567                     for( d = 0; d < D; d += 8 )
569                         __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
570                         __m128i L0, L1, L2, L3;
572                         L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
573                         L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
574                         L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
575                         L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
// min(L(d), L(d-1)+P1, L(d+1)+P1, delta) realizes the SGM smoothness terms.
577                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
578                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
580                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
581                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
583                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
584                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
586                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
587                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
// Subtract delta (= previous min + P2) to keep L bounded, then add C(p,d).
589                         L0 = _mm_min_epi16(L0, _delta0);
590                         L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
592                         L1 = _mm_min_epi16(L1, _delta1);
593                         L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
595                         L2 = _mm_min_epi16(L2, _delta2);
596                         L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
598                         L3 = _mm_min_epi16(L3, _delta3);
599                         L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
601                         _mm_store_si128( (__m128i*)(Lr_p + d), L0);
602                         _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
603                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
604                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
// Interleave/min shuffles keep the four per-direction minima in separate lanes.
606                         __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
607                         __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
608                         t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
609                         _minL0 = _mm_min_epi16(_minL0, t0);
611                         __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
613                         L0 = _mm_adds_epi16(L0, L1);
614                         L2 = _mm_adds_epi16(L2, L3);
615                         Sval = _mm_adds_epi16(Sval, L0);
616                         Sval = _mm_adds_epi16(Sval, L2);
618                         _mm_store_si128((__m128i*)(Sp + d), Sval);
// Store the 4 direction minima (one short each) into minLr[0][xm..xm+3].
621                     _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
622                     _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
// Scalar path: identical recurrence for the four directions.
627                     int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
629                     for( d = 0; d < D; d++ )
631                         int Cpd = Cp[d], L0, L1, L2, L3;
633                         L0 = Cpd + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
634                         L1 = Cpd + min((int)Lr_p1[d], min(Lr_p1[d-1] + P1, min(Lr_p1[d+1] + P1, delta1))) - delta1;
635                         L2 = Cpd + min((int)Lr_p2[d], min(Lr_p2[d-1] + P1, min(Lr_p2[d+1] + P1, delta2))) - delta2;
636                         L3 = Cpd + min((int)Lr_p3[d], min(Lr_p3[d-1] + P1, min(Lr_p3[d+1] + P1, delta3))) - delta3;
638                         Lr_p[d] = (CostType)L0;
639                         minL0 = min(minL0, L0);
641                         Lr_p[d + D2] = (CostType)L1;
642                         minL1 = min(minL1, L1);
644                         Lr_p[d + D2*2] = (CostType)L2;
645                         minL2 = min(minL2, L2);
647                         Lr_p[d + D2*3] = (CostType)L3;
648                         minL3 = min(minL3, L3);
650                         Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
652                     minLr[0][xm] = (CostType)minL0;
653                     minLr[0][xm+1] = (CostType)minL1;
654                     minLr[0][xm+2] = (CostType)minL2;
655                     minLr[0][xm+3] = (CostType)minL3;
// Final pass: add the last (right-to-left horizontal) direction and pick winners.
659             if( pass == npasses )
661                 for( x = 0; x < width; x++ )
663                     disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
664                     disp2cost[x] = MAX_COST;
// Scan right-to-left so this extra direction can be folded into the same sweep.
667                 for( x = width1 - 1; x >= 0; x-- )
669                     CostType* Sp = S + x*D;
670                     int minS = MAX_COST, bestDisp = -1;
674                         int xm = x*NR2, xd = xm*D2;
676                         int minL0 = MAX_COST;
677                         int delta0 = minLr[0][xm + NR2] + P2;
678                         CostType* Lr_p0 = Lr[0] + xd + NRD2;
679                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
680                         CostType* Lr_p = Lr[0] + xd;
682                         const CostType* Cp = C + x*D;
687                             __m128i _P1 = _mm_set1_epi16((short)P1);
688                             __m128i _delta0 = _mm_set1_epi16((short)delta0);
690                             __m128i _minL0 = _mm_set1_epi16((short)minL0);
691                             __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
// _d8 tracks the disparity index of each lane; bumped by 8 per iteration.
692                             __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
694                             for( d = 0; d < D; d += 8 )
696                                 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
698                                 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
699                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
700                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
701                                 L0 = _mm_min_epi16(L0, _delta0);
702                                 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
704                                 _mm_store_si128((__m128i*)(Lr_p + d), L0);
705                                 _minL0 = _mm_min_epi16(_minL0, L0);
706                                 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
707                                 _mm_store_si128((__m128i*)(Sp + d), L0);
// Branchless argmin: update _bestDisp lanes where a new minimum appeared.
709                                 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
710                                 _minS = _mm_min_epi16(_minS, L0);
711                                 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
712                                 _d8 = _mm_adds_epi16(_d8, _8);
715                             short CV_DECL_ALIGNED(16) bestDispBuf[8];
716                             _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
// Horizontal minimum reductions across the 8 lanes.
718                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
719                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
720                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
722                             __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
723                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
724                             qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
726                             minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
727                             minS = (CostType)_mm_cvtsi128_si32(qS);
// Find which lane held the global minimum; LSBTab picks the lowest such lane.
729                             qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
730                             qS = _mm_cmpeq_epi16(_minS, qS);
731                             int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
733                             bestDisp = bestDispBuf[LSBTab[idx]];
// Scalar equivalent for the last direction + running argmin.
738                             for( d = 0; d < D; d++ )
740                                 int L0 = Cp[d] + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
742                                 Lr_p[d] = (CostType)L0;
743                                 minL0 = min(minL0, L0);
745                                 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
752                         minLr[0][xm] = (CostType)minL0;
// (Non-fullDP branch, guard elided: argmin over Sp without extra direction.)
757                         for( d = 0; d < D; d++ )
// Uniqueness check: reject if another disparity (not adjacent to the best one)
// comes within uniquenessRatio percent of the minimum cost.
768                     for( d = 0; d < D; d++ )
770                         if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
// Record the best match into the reverse (right-image) disparity map.
776                     int x2 = x + minX1 - d - minD;
777                     if( disp2cost[x2] > minS )
779                         disp2cost[x2] = (CostType)minS;
780                         disp2ptr[x2] = (DispType)(d + minD);
783                     if( 0 < d && d < D-1 )
785                         // do subpixel quadratic interpolation:
786                         // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
787                         // then find minimum of the parabola.
788                         int denom2 = max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
789                         d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
793                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
// Left-right consistency: invalidate only if BOTH roundings of d disagree with disp2.
796                 for( x = minX1; x < maxX1; x++ )
798                     // we round the computed disparity both towards -inf and +inf and check
799                     // if either of the corresponding disparities in disp2 is consistent.
800                     // This is to give the computed disparity a chance to look valid if it is.
802                     if( d == INVALID_DISP_SCALED )
804                     int _d = d >> DISP_SHIFT;
805                     int d_ = (d + DISP_SCALE-1) >> DISP_SHIFT;
806                     int _x = x - _d, x_ = x - d_;
807                     if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
808                         0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
809                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
// Current row's Lr/minLr become "previous row" for the next y iteration.
813             // now shift the cyclic buffers
814             std::swap( Lr[0], Lr[1] );
815             std::swap( minLr[0], minLr[1] );
// Compact 16-bit point used by filterSpeckles' wavefront stack (saves memory
// versus cv::Point for the npixels-sized wbuf allocation).
typedef cv::Point_<short> Point2s;
// Removes small "speckle" blobs from a CV_16SC1 disparity map: connected
// regions (4-connectivity, neighbors within maxDiff of each other) of at most
// maxSpeckleSize pixels are overwritten with newVal. Flood fill is done with an
// explicit wavefront stack (wbuf) instead of recursion.
//   img          - disparity map, modified in place
//   _newval      - value marking "invalid"; also the fill value for speckles
//   maxSpeckleSize - regions up to this size (inclusive) are zapped
//   _maxDiff     - max disparity difference between connected neighbors
//   _buf         - reusable scratch buffer, grown on demand
// NOTE(review): the listing is elided (several statements, e.g. the curlabel
// declaration, stack pop, and count increments, are not visible).
void filterSpeckles( Mat& img, double _newval, int maxSpeckleSize, double _maxDiff, Mat& _buf )
    CV_Assert( img.type() == CV_16SC1 );
    int newVal = cvRound(_newval);
    int maxDiff = cvRound(_maxDiff);
    int width = img.cols, height = img.rows, npixels = width*height;
// Scratch layout: int label per pixel + Point2s wavefront slot per pixel +
// one byte region-type flag per label.
    size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
    if( !_buf.isContinuous() || !_buf.data || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
        _buf.create(1, bufSize, CV_8U);
    uchar* buf = _buf.data;
// dstep: image row stride in shorts (labels use 'width' as their stride instead).
    int i, j, dstep = img.step/sizeof(short);
    int* labels = (int*)buf;
    buf += npixels*sizeof(labels[0]);
    Point2s* wbuf = (Point2s*)buf;
    buf += npixels*sizeof(wbuf[0]);
// rtype[label] == 1 means the region was small (a speckle) -> zero its pixels.
    uchar* rtype = (uchar*)buf;
    // clear out label assignments
    memset(labels, 0, npixels*sizeof(labels[0]));
    for( i = 0; i < height; i++ )
        short* ds = img.ptr<short>(i);
        int* ls = labels + width*i;
        for( j = 0; j < width; j++ )
            if( ds[j] != newVal ) // not a bad disparity
                if( ls[j] ) // has a label, check for bad label
                    if( rtype[ls[j]] ) // small region, zero out disparity
                        ds[j] = (short)newVal;
                // no label, assign and propagate
                    Point2s* ws = wbuf; // initialize wavefront
                    Point2s p((short)j, (short)i); // current pixel
                    curlabel++; // next label
                    int count = 0; // current region size
                    // wavefront propagation
                    while( ws >= wbuf ) // wavefront not empty
                        // put neighbors onto wavefront
                        short* dpp = &img.at<short>(p.y, p.x);
                        int* lpp = labels + width*p.y + p.x;
// Note the two strides: lpp uses 'width' (label array), dpp uses 'dstep' (image).
                        if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
                            *ws++ = Point2s(p.x+1, p.y);
                        if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
                            *ws++ = Point2s(p.x-1, p.y);
                        if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
                            lpp[+width] = curlabel;
                            *ws++ = Point2s(p.x, p.y+1);
                        if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
                            lpp[-width] = curlabel;
                            *ws++ = Point2s(p.x, p.y-1);
                        // pop most recent and propagate
                        // NB: could try least recent, maybe better convergence
// After the fill, classify the whole region once and zap the seed pixel if small.
                    if( count <= maxSpeckleSize ) // speckle region
                        rtype[ls[j]] = 1; // small region label
                        ds[j] = (short)newVal;
                        rtype[ls[j]] = 0; // large region label
// Public entry point: computes a CV_16S disparity map (fixed-point, DISP_SCALE
// fractional scaling) for the rectified pair (left, right), then applies a 3x3
// median filter and optional speckle filtering.
// NOTE(review): braces/blank lines are elided from this listing.
// NOTE(review): the guard checks speckleWindowSize > 0, but the call passes the
// literal 100 as maxSpeckleSize and plain DISP_SCALE as maxDiff — the
// speckleWindowSize and speckleRange members set in the constructor are never
// used here. Presumably this should be
//   filterSpeckles(disp, (minDisparity-1)*DISP_SCALE, speckleWindowSize, DISP_SCALE*speckleRange, buffer);
// — confirm against the upstream version before changing.
void StereoSGBM::operator ()( const Mat& left, const Mat& right, Mat& disp )
    CV_Assert( left.size() == right.size() && left.type() == right.type() &&
    left.depth() == DataType<PixType>::depth );
    disp.create( left.size(), CV_16S );
    computeDisparitySGBM( left, right, disp, *this, buffer );
    medianBlur(disp, disp, 3);
    if( speckleWindowSize > 0 )
        filterSpeckles(disp, (minDisparity - 1)*DISP_SCALE, 100, DISP_SCALE, buffer);
// Intersects the two rectification ROIs, shrinks by the disparity search range
// (maxD on the left edge, minD on the right) and by half the SAD window on all
// sides, returning the region where disparities are fully valid. Returns an
// empty Rect when the result would be degenerate.
// NOTE(review): the minDisparity parameter line and braces are elided here.
Rect getValidDisparityROI( Rect roi1, Rect roi2,
    int numberOfDisparities,
    int SW2 = SADWindowSize/2;
    int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
    int xmin = max(roi1.x, roi2.x + maxD) + SW2;
    int xmax = min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
    int ymin = max(roi1.y, roi2.y) + SW2;
    int ymax = min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
    Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
    return r.width > 0 && r.height > 0 ? r : Rect();
// Left-right consistency validation for a CV_16S disparity map with an
// associated per-pixel cost image (CV_16S or CV_32S): builds the implied
// right-image disparity per row (disp2buf, keeping the lowest-cost claimant per
// right pixel), then invalidates left disparities whose floor AND ceil
// roundings both disagree with it by more than disp12MaxDiff.
// NOTE(review): braces and the disp2buf/disp2cost update statements inside the
// per-row loops are elided from this listing.
// NOTE(review): the CV_16S branch tests disp2cost[x2] > c (keep the CHEAPER
// match) while the CV_32S branch tests disp2cost[x2] < c — the comparison is
// inverted relative to its sibling and, with disp2cost initialized to INT_MAX,
// would never fire. Presumably both should be '>'; confirm against upstream.
void validateDisparity( Mat& disp, const Mat& cost, int minDisparity, int numberOfDisparities, int disp12MaxDiff )
    int cols = disp.cols, rows = disp.rows;
    int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
    int x, minX1 = max(maxD, 0), maxX1 = cols + min(minD, 0);
    AutoBuffer<int> _disp2buf(cols*2);
    int* disp2buf = _disp2buf;
    int* disp2cost = disp2buf + cols;
    const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    int costType = cost.type();
// disp values are fixed-point; scale the threshold to match.
    disp12MaxDiff *= DISP_SCALE;
    CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
    (costType == CV_16S || costType == CV_32S) &&
    disp.size() == cost.size() );
    for( int y = 0; y < rows; y++ )
        short* dptr = disp.ptr<short>(y);
// Reset the per-row reverse map before rebuilding it.
        for( x = 0; x < cols; x++ )
            disp2buf[x] = INVALID_DISP;
            disp2cost[x] = INT_MAX;
        if( costType == CV_16S )
            const short* cptr = cost.ptr<short>(y);
            for( x = minX1; x < maxX1; x++ )
                int d = dptr[x], c = cptr[x];
// Round-to-nearest to get the integer disparity, map to the right-image column.
                int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
                if( disp2cost[x2] > c )
// CV_32S branch (else, guard elided): same mapping with int costs.
            const int* cptr = cost.ptr<int>(y);
            for( x = minX1; x < maxX1; x++ )
                int d = dptr[x], c = cptr[x];
                int x2 = x - ((d + DISP_SCALE/2) >> DISP_HIFT);
                if( disp2cost[x2] < c )
        for( x = minX1; x < maxX1; x++ )
            // we round the computed disparity both towards -inf and +inf and check
            // if either of the corresponding disparities in disp2 is consistent.
            // This is to give the computed disparity a chance to look valid if it is.
            if( d == INVALID_DISP_SCALED )
            int d0 = d >> DISP_SHIFT;
            int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
            int x0 = x - d0, x1 = x - d1;
            if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
                (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
                dptr[x] = (short)INVALID_DISP_SCALED;
// C-API wrapper: forwards to cv::getValidDisparityROI, converting the CvRect
// arguments/result implicitly. (Braces elided in this listing.)
CvRect cvGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
                            int numberOfDisparities, int SADWindowSize )
    return (CvRect)cv::getValidDisparityROI( roi1, roi2, minDisparity,
    numberOfDisparities, SADWindowSize );
// C-API wrapper: wraps the CvArr headers as cv::Mat views (no copy — the
// validation mutates the caller's disparity data in place) and forwards to
// cv::validateDisparity. (Closing brace elided in this listing.)
void cvValidateDisparity( CvArr* _disp, const CvArr* _cost, int minDisparity,
                        int numberOfDisparities, int disp12MaxDiff )
    cv::Mat disp = cv::cvarrToMat(_disp), cost = cv::cvarrToMat(_cost);
    cv::validateDisparity( disp, cost, minDisparity, numberOfDisparities, disp12MaxDiff );