1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
44 This is a variation of
45 "Stereo Processing by Semiglobal Matching and Mutual Information"
46 by Heiko Hirschmuller.
48 We match blocks rather than individual pixels, thus the algorithm is called
49 SGBM (Semi-global block matching)
// Basic scalar types used throughout the SGBM implementation.
58 typedef uchar PixType;   // 8-bit pixel intensity (preprocessed derivative value)
59 typedef short CostType;  // 16-bit accumulated matching cost
60 typedef short DispType;  // 16-bit disparity value
// NR  - nominal number of aggregation directions (16);
// NR2 - directions actually kept per pass (NR/2 = 8).
62 enum { NR = 16, NR2 = NR/2 };
// Default constructor: zero-initializes the matcher parameters so that
// computeDisparitySGBM() falls back to its internal defaults.
// NOTE(review): the remaining member initializations and the braces are
// not visible in this excerpt.
64 StereoSGBM::StereoSGBM()
66 minDisparity = numberOfDisparities = 0;
72 speckleWindowSize = 0;
// Full-parameter constructor: stores the user-supplied SGBM tuning
// parameters verbatim into the corresponding members.
// NOTE(review): the trailing parameter(s), the P1/P2 assignments and the
// braces are not visible in this excerpt.
78 StereoSGBM::StereoSGBM( int _minDisparity, int _numDisparities, int _SADWindowSize,
79 int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
80 int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
83 minDisparity = _minDisparity;
84 numberOfDisparities = _numDisparities;
85 SADWindowSize = _SADWindowSize;
88 disp12MaxDiff = _disp12MaxDiff;
89 preFilterCap = _preFilterCap;
90 uniquenessRatio = _uniquenessRatio;
91 speckleWindowSize = _speckleWindowSize;
92 speckleRange = _speckleRange;
// Destructor. NOTE(review): the body is not visible in this excerpt;
// the Mat members release themselves via their own destructors.
97 StereoSGBM::~StereoSGBM()
102 For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
103 and for each disparity minD<=d<maxD the function
104 computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
105 row1[x] and row2[x-d]. The subpixel algorithm from
106 "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
107 is used, hence the suffix BT.
109 the temporary buffer should contain width2*2 + width*cn*2 elements
// Computes, for one image row y, the per-pixel Birchfield-Tomasi matching
// cost: cost[(x - minX1)*D + (d - minD)] accumulated over all channels.
// 'tab' is the derivative clipping table (clipTab); tabOfs re-centers it
// for negative differences (the adjusting statement is not visible in
// this excerpt — TODO confirm).
// NOTE(review): braces and several branch conditions are elided in this
// excerpt; the section comments below describe the visible code only.
111 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
112 int minD, int maxD, CostType* cost,
113 PixType* buffer, const PixType* tab, int tabOfs )
// valid x range in img1 ([minX1, maxX1)) and the corresponding range in img2
115 int x, c, width = img1.cols, cn = img1.channels();
116 int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
117 int minX2 = max(minX1 - maxD, 0), maxX2 = min(maxX1 - minD, width);
118 int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
119 const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
// buffer layout: [0..width2*2) holds the v0/v1 min/max envelopes of row2;
// prow1/prow2 (width*cn each) hold the preprocessed (derivative) rows.
120 PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn;
// border pixels of every channel plane get the neutral clipped value tab[0]
124 for( c = 0; c < cn; c++ )
126 prow1[width*c] = prow1[width*c + width-1] =
127 prow2[width*c] = prow2[width*c + width-1] = tab[0];
// presumably the single-channel branch (condition elided): a 3x3
// Sobel-like horizontal derivative, clipped through tab.
// n1/s1 (n2/s2) are byte offsets to the previous/next row, 0 at the borders.
132 int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
133 int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
135 for( x = 1; x < width-1; x++ )
137 //prow1[x] = tab[row1[x+1] - row1[x-1]];
138 //prow2[width-1-x] = tab[row2[x+1] - row2[x-1]];
// prow2 is stored mirrored (index width-1-x) so the disparity loops
// below can address it with a plain "+ d" offset.
139 prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
140 prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
// presumably the 3-channel branch: per-channel horizontal derivative only
145 for( x = 1; x < width-1; x++ )
147 prow1[x] = tab[row1[x*3+3] - row1[x*3-3]];
148 prow1[x+width] = tab[row1[x*3+4] - row1[x*3-2]];
149 prow1[x+width*2] = tab[row1[x*3+5] - row1[x*3-1]];
151 prow2[width-1-x] = tab[row2[x*3+3] - row2[x*3-3]];
152 prow2[width-1-x+width] = tab[row2[x*3+4] - row2[x*3-2]];
153 prow2[width-1-x+width*2] = tab[row2[x*3+5] - row2[x*3-1]];
157 memset( cost, 0, width1*D*sizeof(cost[0]) );
160 cost -= minX1*D + minD; // simplify the cost indices inside the loop
163 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
// Birchfield-Tomasi dissimilarity, accumulated per channel into cost[]
167 for( c = 0; c < cn; c++, prow1 += width, prow2 += width )
170 // v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
171 // v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
172 for( x = minX2; x < maxX2; x++ )
175 int vl = x > 0 ? (v + prow2[x-1])/2 : v;
176 int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
177 int v0 = min(vl, vr); v0 = min(v0, v);
178 int v1 = max(vl, vr); v1 = max(v1, v);
179 buffer[x] = (PixType)v0;
180 buffer[x + width2] = (PixType)v1;
183 for( x = minX1; x < maxX1; x++ )
// u0/u1: half-pixel interpolated min/max envelope around prow1[x]
186 int ul = x > 0 ? (u + prow1[x-1])/2 : u;
187 int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
188 int u0 = min(ul, ur); u0 = min(u0, u);
189 int u1 = max(ul, ur); u1 = max(u1, u);
// SSE2 path: 16 disparities per iteration. BT cost per lane is
// min( max(u - v1, v0 - u, 0), max(v - u1, u0 - v, 0) ) via saturating
// unsigned subtraction.
194 __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
195 __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
197 for( int d = minD; d < maxD; d += 16 )
199 __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
200 __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
201 __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
202 __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
203 __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
204 __m128i diff = _mm_min_epu8(c0, c1);
// widen the 8-bit costs to 16 bit and add into the cost row
206 c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
207 c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
209 _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
210 _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
// scalar fallback: the same BT formula, one disparity at a time
216 for( int d = minD; d < maxD; d++ )
218 int v = prow2[width-x-1 + d];
219 int v0 = buffer[width-x-1 + d];
220 int v1 = buffer[width-x-1 + d + width2];
221 int c0 = max(0, u - v1); c0 = max(c0, v0 - u);
222 int c1 = max(0, v - u1); c1 = max(c1, u0 - v);
224 cost[x*D + d] = (CostType)(cost[x*D+d] + min(c0, c1));
// alternative branch (condition elided): plain absolute-difference cost
// without the BT half-pixel interpolation
230 for( c = 0; c < cn; c++, prow1 += width, prow2 += width )
232 for( x = minX1; x < maxX1; x++ )
238 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
240 for( int d = minD; d < maxD; d += 16 )
242 __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
// |u - v| computed as saturating (u-v) + (v-u)
243 __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
244 __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
245 __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
247 _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
248 _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
254 for( int d = minD; d < maxD; d++ )
256 int v = prow2[width-1-x + d];
257 cost[x*D + d] = (CostType)(cost[x*D + d] + std::abs(u - v));
267 computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
268 that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
270 disp2full is the reverse disparity map, that is:
271 disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
273 note that disp1buf will have the same size as the roi and
274 disp2full will have the same size as img1 (or img2).
275 On exit disp2buf is not the final disparity, it is an intermediate result that becomes
276 final after all the tiles are processed.
278 the disparity in disp1buf is written with sub-pixel accuracy
279 (4 fractional bits, see StereoSGBM::DISP_SCALE),
280 using quadratic interpolation, while the disparity in disp2buf
281 is written as is, without interpolation.
283 disp2cost also has the same size as img1 (or img2).
284 It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
// Core semi-global block matching: computes the disparity map for img1
// w.r.t. img2 into disp1 (CV_16S, disparities scaled by DISP_SCALE).
// One pass aggregates 4 directions (left/up-left/up/up-right) plus a
// fifth (right-to-left) folded into the winner-take-all scan below; with
// params.fullDP a second, reversed pass adds the remaining directions.
// NOTE(review): braces, the trailing parameter (a Mat& buffer), and some
// branch conditions are elided in this excerpt.
286 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
287 Mat& disp1, const StereoSGBM& params,
// LSBTab[n] = index of the least significant set bit of n (0 for n == 0);
// used below to turn an equality bitmask into the smallest matching lane.
291 static const uchar LSBTab[] =
293 0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
294 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
295 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
296 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
297 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
298 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
299 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
300 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
303 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
306 const int ALIGN = 16;
307 const int DISP_SHIFT = StereoSGBM::DISP_SHIFT;
308 const int DISP_SCALE = StereoSGBM::DISP_SCALE;
309 const CostType MAX_COST = SHRT_MAX;
// sanitize/default the user parameters
311 int minD = params.minDisparity, maxD = minD + params.numberOfDisparities;
313 SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
314 int ftzero = max(params.preFilterCap, 15) | 1;
315 int uniquenessRatio = params.uniquenessRatio > 0 ? params.uniquenessRatio : 10;
316 int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
317 int P1 = params.P1 > 0 ? params.P1 : 2, P2 = max(params.P2 > 0 ? params.P2 : 5, P1+1);
318 int k, width = disp1.cols, height = disp1.rows;
319 int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
320 int D = maxD - minD, width1 = maxX1 - minX1;
321 int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
322 int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
323 int npasses = params.fullDP ? 2 : 1;
// clipTab maps a (possibly negative) derivative difference, offset by
// TAB_OFS, to a value clipped to [-ftzero, ftzero] then shifted to >= 0
324 const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
325 PixType clipTab[TAB_SIZE];
327 for( k = 0; k < TAB_SIZE; k++ )
328 clipTab[k] = (PixType)(min(max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
332 disp1 = Scalar::all(INVALID_DISP_SCALED);
// the SSE2 inner loops process 16 (cost) / 8 (DP) disparities at a time
336 CV_Assert( D % 16 == 0 );
338 // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
339 // if you change NR, please, modify the loop as well.
// D2 pads each D-long Lr row so that the d-1/d+1 neighbor loads stay in bounds
340 int D2 = D+16, NRD2 = NR2*D2;
342 // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
343 // for 8-way dynamic programming we need the current row and
344 // the previous row, i.e. 2 rows in total
346 const int LrBorder = NLR - 1;
348 // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
349 // we keep pixel difference cost (C) and the summary cost over NR directions (S).
350 // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
351 size_t costBufSize = width1*D;
352 size_t CSBufSize = costBufSize*(params.fullDP ? height : 1);
353 size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
// number of horizontal-sum rows kept for the sliding SAD window
354 int hsumBufNRows = SH2*2 + 2;
355 size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
356 costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
357 CSBufSize*2*sizeof(CostType) + // C, S
358 width*8*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
359 width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
// (re)allocate the shared scratch buffer only when too small
361 if( !buffer.data || !buffer.isContinuous() ||
362 buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
363 buffer.create(1, totalBufSize, CV_8U);
365 // summary cost over different (nDirs) directions
366 CostType* Cbuf = (CostType*)alignPtr(buffer.data, ALIGN);
367 CostType* Sbuf = Cbuf + CSBufSize;
368 CostType* hsumBuf = Sbuf + CSBufSize;
369 CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
371 CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
372 DispType* disp2ptr = (DispType*)(disp2cost + width);
373 PixType* tempBuf = (PixType*)(disp2ptr + width);
375 // add P2 to every C(x,y). it saves a few operations in the inner loops
376 for( k = 0; k < width1*D; k++ )
377 Cbuf[k] = (CostType)P2;
// pass 1 scans top-left to bottom-right; the optional pass 2 (fullDP)
// scans bottom-right to top-left to cover the remaining directions
379 for( int pass = 1; pass <= npasses; pass++ )
381 int x1, y1, x2, y2, dx, dy;
385 y1 = 0; y2 = height; dy = 1;
386 x1 = 0; x2 = width1; dx = 1;
390 y1 = height-1; y2 = -1; dy = -1;
391 x1 = width1-1; x2 = -1; dx = -1;
// cyclic pair of DP-cost rows (current / previous scanline)
394 CostType *Lr[NLR]={0}, *minLr[NLR]={0};
396 for( k = 0; k < NLR; k++ )
398 // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
399 // and will occasionally use negative indices with the arrays
400 // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
401 // however, then the alignment will be imperfect, i.e. bad for SSE,
402 // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
403 Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
404 memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
405 minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*2;
406 memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
409 for( int y = y1; y != y2; y += dy )
412 DispType* disp1ptr = disp1.ptr<DispType>(y);
// with fullDP the whole C/S planes are kept per row; otherwise one row is reused
413 CostType* C = Cbuf + (!params.fullDP ? 0 : y*costBufSize);
414 CostType* S = Sbuf + (!params.fullDP ? 0 : y*costBufSize);
416 if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
// rows of pixel cost that enter the vertical SAD window at this y
418 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
420 for( k = dy1; k <= dy2; k++ )
422 CostType* hsumAdd = hsumBuf + (min(k, height-1) % hsumBufNRows)*costBufSize;
426 calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS );
// seed the horizontal running sum for x == 0 (left edge replicated)
428 memset(hsumAdd, 0, D*sizeof(CostType));
429 for( x = 0; x <= SW2*D; x += D )
431 int scale = x == 0 ? SW2 + 1 : 1;
432 for( d = 0; d < D; d++ )
433 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
// sliding update: C(y) = C(y-1) + hsum(new bottom row) - hsum(old top row)
438 const CostType* hsumSub = hsumBuf + (max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
439 const CostType* Cprev = !params.fullDP || y == 0 ? C : C - costBufSize;
441 for( x = D; x < width1*D; x += D )
443 const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
444 const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
// SSE2: update horizontal sum and C, 8 disparities per iteration
449 for( d = 0; d < D; d += 8 )
451 __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
452 __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
453 hv = _mm_adds_epi16(_mm_subs_epi16(hv,
454 _mm_load_si128((const __m128i*)(pixSub + d))),
455 _mm_load_si128((const __m128i*)(pixAdd + d)));
456 Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
457 _mm_load_si128((const __m128i*)(hsumSub + x + d))),
459 _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
460 _mm_store_si128((__m128i*)(C + x + d), Cx);
// scalar fallback of the same sliding-window update
466 for( d = 0; d < D; d++ )
468 int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
469 C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
// y == 0 path: only the horizontal sums are updated here
476 for( x = D; x < width1*D; x += D )
478 const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
479 const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
481 for( d = 0; d < D; d++ )
482 hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
// accumulate the first rows into C (top edge replicated SH2+1 times)
489 int scale = k == 0 ? SH2 + 1 : 1;
490 for( x = 0; x < width1*D; x++ )
491 C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
495 // also, clear the S buffer
496 for( k = 0; k < width1*D; k++ )
500 // clear the left and the right borders
501 memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
502 memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
503 memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
504 memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
507 [formula 13 in the paper]
508 compute L_r(p, d) = C(p, d) +
512 min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
513 where p = (x,y), r is one of the directions.
514 we process all the directions at once:
// direction 0: previous pixel on this row; 1-3: three pixels on the
// previous row (xm-NR2 / xm / xm+NR2), with their P2-penalized minima
524 for( x = x1; x != x2; x += dx )
526 int xm = x*NR2, xd = xm*D2;
528 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
529 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
531 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
532 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
533 CostType* Lr_p2 = Lr[1] + xd + D2*2;
534 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
// sentinel values at d = -1 and d = D so the d+-1 reads never win the min
536 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
537 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
539 CostType* Lr_p = Lr[0] + xd;
540 const CostType* Cp = C + x*D;
541 CostType* Sp = S + x*D;
// SSE2: 4 directions x 8 disparities per iteration
546 __m128i _P1 = _mm_set1_epi16((short)P1);
548 __m128i _delta0 = _mm_set1_epi16((short)delta0);
549 __m128i _delta1 = _mm_set1_epi16((short)delta1);
550 __m128i _delta2 = _mm_set1_epi16((short)delta2);
551 __m128i _delta3 = _mm_set1_epi16((short)delta3);
552 __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
554 for( d = 0; d < D; d += 8 )
556 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
557 __m128i L0, L1, L2, L3;
559 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
560 L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
561 L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
562 L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
// min over {L(d), L(d-1)+P1, L(d+1)+P1, delta}; delta already carries +P2
564 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
565 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
567 L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
568 L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
570 L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
571 L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
573 L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
574 L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
576 L0 = _mm_min_epi16(L0, _delta0);
577 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
579 L1 = _mm_min_epi16(L1, _delta1);
580 L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
582 L2 = _mm_min_epi16(L2, _delta2);
583 L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
585 L3 = _mm_min_epi16(L3, _delta3);
586 L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
588 _mm_store_si128( (__m128i*)(Lr_p + d), L0);
589 _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
590 _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
591 _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
// interleave so each 16-bit lane of _minL0 tracks one direction's minimum
593 __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
594 __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
595 t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
596 _minL0 = _mm_min_epi16(_minL0, t0);
598 __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
600 L0 = _mm_adds_epi16(L0, L1);
601 L2 = _mm_adds_epi16(L2, L3);
602 Sval = _mm_adds_epi16(Sval, L0);
603 Sval = _mm_adds_epi16(Sval, L2);
605 _mm_store_si128((__m128i*)(Sp + d), Sval);
// fold the two 4-lane halves and store the 4 per-direction minima
608 _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
609 _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
// scalar fallback of the same 4-direction DP update
614 int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
616 for( d = 0; d < D; d++ )
618 int Cpd = Cp[d], L0, L1, L2, L3;
620 L0 = Cpd + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
621 L1 = Cpd + min((int)Lr_p1[d], min(Lr_p1[d-1] + P1, min(Lr_p1[d+1] + P1, delta1))) - delta1;
622 L2 = Cpd + min((int)Lr_p2[d], min(Lr_p2[d-1] + P1, min(Lr_p2[d+1] + P1, delta2))) - delta2;
623 L3 = Cpd + min((int)Lr_p3[d], min(Lr_p3[d-1] + P1, min(Lr_p3[d+1] + P1, delta3))) - delta3;
625 Lr_p[d] = (CostType)L0;
626 minL0 = min(minL0, L0);
628 Lr_p[d + D2] = (CostType)L1;
629 minL1 = min(minL1, L1);
631 Lr_p[d + D2*2] = (CostType)L2;
632 minL2 = min(minL2, L2);
634 Lr_p[d + D2*3] = (CostType)L3;
635 minL3 = min(minL3, L3);
637 Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
639 minLr[0][xm] = (CostType)minL0;
640 minLr[0][xm+1] = (CostType)minL1;
641 minLr[0][xm+2] = (CostType)minL2;
642 minLr[0][xm+3] = (CostType)minL3;
// final pass: right-to-left scan adds the 5th direction, then picks the
// winner-take-all disparity, fills disp2 (right-view disparity), applies
// the uniqueness test, sub-pixel refinement and the left-right check
646 if( pass == npasses )
648 for( x = 0; x < width; x++ )
650 disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
651 disp2cost[x] = MAX_COST;
654 for( x = width1 - 1; x >= 0; x-- )
656 CostType* Sp = S + x*D;
657 int minS = MAX_COST, bestDisp = -1;
661 int xm = x*NR2, xd = xm*D2;
663 int minL0 = MAX_COST;
664 int delta0 = minLr[0][xm + NR2] + P2;
665 CostType* Lr_p0 = Lr[0] + xd + NRD2;
666 Lr_p0[-1] = Lr_p0[D] = MAX_COST;
667 CostType* Lr_p = Lr[0] + xd;
669 const CostType* Cp = C + x*D;
// SSE2: DP update for the right-to-left direction fused with the
// arg-min search over S
674 __m128i _P1 = _mm_set1_epi16((short)P1);
675 __m128i _delta0 = _mm_set1_epi16((short)delta0);
677 __m128i _minL0 = _mm_set1_epi16((short)minL0);
678 __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
679 __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
681 for( d = 0; d < D; d += 8 )
683 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
685 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
686 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
687 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
688 L0 = _mm_min_epi16(L0, _delta0);
689 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
691 _mm_store_si128((__m128i*)(Lr_p + d), L0);
692 _minL0 = _mm_min_epi16(_minL0, L0);
693 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
694 _mm_store_si128((__m128i*)(Sp + d), L0);
// blend: keep the lane's current best disparity unless S improved
696 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
697 _minS = _mm_min_epi16(_minS, L0);
698 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
699 _d8 = _mm_adds_epi16(_d8, _8);
702 short CV_DECL_ALIGNED(16) bestDispBuf[8];
703 _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
// horizontal reductions of the per-lane minima
705 _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
706 _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
707 _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
709 __m128i _S = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
710 _S = _mm_min_epi16(_S, _mm_srli_si128(_S, 4));
711 _S = _mm_min_epi16(_S, _mm_srli_si128(_S, 2));
713 minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
714 minS = (CostType)_mm_cvtsi128_si32(_S);
// locate which lane(s) held the global minimum; LSBTab picks the first
716 _S = _mm_shuffle_epi32(_mm_unpacklo_epi16(_S, _S), 0);
717 _S = _mm_cmpeq_epi16(_minS, _S);
718 int idx = _mm_movemask_epi8(_mm_packs_epi16(_S, _S)) & 255;
720 bestDisp = bestDispBuf[LSBTab[idx]];
// scalar fallback: same DP update + S accumulation
725 for( d = 0; d < D; d++ )
727 int L0 = Cp[d] + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
729 Lr_p[d] = (CostType)L0;
730 minL0 = min(minL0, L0);
732 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
739 minLr[0][xm] = (CostType)minL0;
// (non-SSE / non-final variants of the arg-min loop; bodies elided here)
744 for( d = 0; d < D; d++ )
// uniqueness check: reject if some disparity further than 1 from the
// best one has a cost within uniquenessRatio% of the minimum
755 for( d = 0; d < D; d++ )
757 if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
// record the match in the right-view disparity map disp2 (best cost wins)
763 int x2 = x + minX1 - d - minD;
764 if( disp2cost[x2] > minS )
766 disp2cost[x2] = (CostType)minS;
767 disp2ptr[x2] = (DispType)(d + minD);
770 if( 0 < d && d < D-1 )
772 // do subpixel quadratic interpolation:
773 // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
774 // then find minimum of the parabola.
775 int denom2 = max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
776 d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
780 disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
// left-right consistency check against disp2
783 for( x = minX1; x < maxX1; x++ )
785 // we round the computed disparity both towards -inf and +inf and check
786 // if either of the corresponding disparities in disp2 is consistent.
787 // This is to give the computed disparity a chance to look valid if it is.
789 if( d == INVALID_DISP_SCALED )
791 int _d = d >> DISP_SHIFT;
792 int d_ = (d + DISP_SCALE-1) >> DISP_SHIFT;
793 int _x = x - _d, x_ = x - d_;
794 if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
795 0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
796 disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
800 // now shift the cyclic buffers
801 std::swap( Lr[0], Lr[1] );
802 std::swap( minLr[0], minLr[1] );
// 16-bit point type used for the wavefront (flood-fill) stack below.
807 typedef cv::Point_<short> Point2s;
// Removes small connected "speckle" regions from a CV_16SC1 disparity map:
// pixels whose 4-connected component (neighbors within maxDiff of each
// other) has at most maxSpeckleSize pixels are set to newVal.
// _buf is a reusable scratch Mat (labels + wavefront stack + region-type flags).
// NOTE(review): braces, the curlabel declaration and some statements
// (e.g. the wavefront pop, count increment) are elided in this excerpt.
809 void filterSpeckles( Mat& img, double _newval, int maxSpeckleSize, double _maxDiff, Mat& _buf )
811 CV_Assert( img.type() == CV_16SC1 );
813 int newVal = cvRound(_newval);
814 int maxDiff = cvRound(_maxDiff);
815 int width = img.cols, height = img.rows, npixels = width*height;
816 size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
817 if( !_buf.isContinuous() || !_buf.data || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
818 _buf.create(1, bufSize, CV_8U);
// carve the scratch buffer into labels[], wbuf[] (wavefront), rtype[]
820 uchar* buf = _buf.data;
821 int i, j, dstep = img.step/sizeof(short);
822 int* labels = (int*)buf;
823 buf += npixels*sizeof(labels[0]);
824 Point2s* wbuf = (Point2s*)buf;
825 buf += npixels*sizeof(wbuf[0]);
826 uchar* rtype = (uchar*)buf;
829 // clear out label assignments
830 memset(labels, 0, npixels*sizeof(labels[0]));
832 for( i = 0; i < height; i++ )
834 short* ds = img.ptr<short>(i);
835 int* ls = labels + width*i;
837 for( j = 0; j < width; j++ )
839 if( ds[j] != newVal ) // not a bad disparity
841 if( ls[j] ) // has a label, check for bad label
843 if( rtype[ls[j]] ) // small region, zero out disparity
844 ds[j] = (short)newVal;
846 // no label, assign and propagate
849 Point2s* ws = wbuf; // initialize wavefront
850 Point2s p((short)j, (short)i); // current pixel
851 curlabel++; // next label
852 int count = 0; // current region size
855 // wavefront propagation
856 while( ws >= wbuf ) // wavefront not empty
859 // put neighbors onto wavefront
860 short* dpp = &img.at<short>(p.y, p.x);
862 int* lpp = labels + width*p.y + p.x;
// each unlabeled 4-neighbor with a valid, close-enough disparity joins
// the component and is pushed onto the wavefront stack
864 if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
867 *ws++ = Point2s(p.x+1, p.y);
870 if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
873 *ws++ = Point2s(p.x-1, p.y);
876 if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
878 lpp[+width] = curlabel;
879 *ws++ = Point2s(p.x, p.y+1);
882 if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
884 lpp[-width] = curlabel;
885 *ws++ = Point2s(p.x, p.y-1);
888 // pop most recent and propagate
889 // NB: could try least recent, maybe better convergence
// classify the finished component and invalidate it if it is a speckle
894 if( count <= maxSpeckleSize ) // speckle region
896 rtype[ls[j]] = 1; // small region label
897 ds[j] = (short)newVal;
900 rtype[ls[j]] = 0; // large region label
// Public entry point: computes a CV_16S disparity map (scaled by
// DISP_SCALE) for the rectified pair (left, right), then post-filters it
// with a 3x3 median blur and, if enabled, the speckle filter.
// NOTE(review): braces and possibly other statements are elided; the
// speckle call uses the member 'buffer' Mat as shared scratch space.
908 void StereoSGBM::operator ()( const Mat& left, const Mat& right, Mat& disp )
910 CV_Assert( left.size() == right.size() && left.type() == right.type() &&
911 left.depth() == DataType<PixType>::depth );
913 disp.create( left.size(), CV_16S );
915 computeDisparitySGBM( left, right, disp, *this, buffer );
916 medianBlur(disp, disp, 3);
918 if( speckleWindowSize > 0 )
// (minDisparity - 1)*DISP_SCALE is the scaled "invalid" marker value
919 filterSpeckles(disp, (minDisparity - 1)*DISP_SCALE, 100, DISP_SCALE, buffer);
// Returns the rectangle inside which every pixel has the full disparity
// range and SAD window available, given the two rectification ROIs.
// Returns an empty Rect when no such region exists.
// NOTE(review): the parameter list is partially elided in this excerpt
// (minDisparity and SADWindowSize are used but their declarations are
// not visible).
923 Rect getValidDisparityROI( Rect roi1, Rect roi2,
925 int numberOfDisparities,
928 int SW2 = SADWindowSize/2;
929 int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
// shrink horizontally by the disparity search range, plus SW2 on every
// side for the matching window
931 int xmin = max(roi1.x, roi2.x + maxD) + SW2;
932 int xmax = min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
933 int ymin = max(roi1.y, roi2.y) + SW2;
934 int ymax = min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
936 Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
938 return r.width > 0 && r.height > 0 ? r : Rect();
// Left-right consistency filter: rebuilds, per row, the right-view
// disparity from disp+cost, then invalidates left-view disparities that
// disagree with it by more than disp12MaxDiff (in DISP_SCALE units).
// disp is CV_16S (fixed-point, 4 fractional bits); cost is CV_16S or CV_32S.
// NOTE(review): braces and several assignment statements inside the
// per-row loops are elided in this excerpt.
942 void validateDisparity( Mat& disp, const Mat& cost, int minDisparity, int numberOfDisparities, int disp12MaxDiff )
944 int cols = disp.cols, rows = disp.rows;
945 int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
946 int x, minX1 = max(maxD, 0), maxX1 = cols + min(minD, 0);
// per-row scratch: reverse disparity + its best (lowest) cost
947 AutoBuffer<int> _disp2buf(cols*2);
948 int* disp2buf = _disp2buf;
949 int* disp2cost = disp2buf + cols;
950 const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
951 int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
952 int costType = cost.type();
// compare in scaled units, same as the stored disparities
954 disp12MaxDiff *= DISP_SCALE;
956 CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
957 (costType == CV_16S || costType == CV_32S) &&
958 disp.size() == cost.size() );
960 for( int y = 0; y < rows; y++ )
962 short* dptr = disp.ptr<short>(y);
964 for( x = 0; x < cols; x++ )
966 disp2buf[x] = INVALID_DISP;
967 disp2cost[x] = INT_MAX;
// project each left-view match into the right view, keeping the
// lowest-cost candidate per right-view column (16-bit cost variant)
970 if( costType == CV_16S )
972 const short* cptr = cost.ptr<short>(y);
974 for( x = minX1; x < maxX1; x++ )
976 int d = dptr[x], c = cptr[x];
// x2 = matching column in the right view (disparity rounded to nearest)
977 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
979 if( disp2cost[x2] > c )
// same projection for the 32-bit cost variant
// NOTE(review): this comparison uses '<' while the 16-bit branch uses
// '>' — the statements between them are elided, so the intended
// condition here cannot be confirmed from this excerpt.
988 const int* cptr = cost.ptr<int>(y);
990 for( x = minX1; x < maxX1; x++ )
992 int d = dptr[x], c = cptr[x];
993 int x2 = x - ((d + DISP_SCALE/2) >> DISP_HIFT);
995 if( disp2cost[x2] < c )
// consistency test against the reconstructed right-view disparity
1003 for( x = minX1; x < maxX1; x++ )
1005 // we round the computed disparity both towards -inf and +inf and check
1006 // if either of the corresponding disparities in disp2 is consistent.
1007 // This is to give the computed disparity a chance to look valid if it is.
1009 if( d == INVALID_DISP_SCALED )
1011 int d0 = d >> DISP_SHIFT;
1012 int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
1013 int x0 = x - d0, x1 = x - d1;
1014 if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
1015 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
1016 dptr[x] = (short)INVALID_DISP_SCALED;
// Legacy C-API wrapper: forwards to cv::getValidDisparityROI and converts
// the result back to CvRect.
1023 CvRect cvGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
1024 int numberOfDisparities, int SADWindowSize )
1026 return (CvRect)cv::getValidDisparityROI( roi1, roi2, minDisparity,
1027 numberOfDisparities, SADWindowSize );
// Legacy C-API wrapper: wraps the CvArr headers as cv::Mat (no data copy)
// and forwards to cv::validateDisparity; disp is modified in place.
1030 void cvValidateDisparity( CvArr* _disp, const CvArr* _cost, int minDisparity,
1031 int numberOfDisparities, int disp12MaxDiff )
1033 cv::Mat disp = cv::cvarrToMat(_disp), cost = cv::cvarrToMat(_cost);
1034 cv::validateDisparity( disp, cost, minDisparity, numberOfDisparities, disp12MaxDiff );