opencv/src/cv/cvstereosgbm.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 /*
  44  This is a variation of
  45  "Stereo Processing by Semiglobal Matching and Mutual Information"
  46  by Heiko Hirschmuller.
  47
  48  We match blocks rather than individual pixels, thus the algorithm is called
  49  SGBM (Semi-global block matching)
  50  */
  51
  52 #include "_cv.h"
  53 #include <limits.h>
  54
  55 namespace cv
  56 {
  57
  58 typedef uchar PixType;
  59 typedef short CostType;
  60 typedef short DispType;
  61
  62 enum { NR = 16, NR2 = NR/2 };
  63
  64 StereoSGBM::StereoSGBM()
  65 {
  66     minDisparity = numberOfDisparities = 0;
  67     SADWindowSize = 0;
  68     P1 = P2 = 0;
  69     disp12MaxDiff = 0;
  70     preFilterCap = 0;
  71     uniquenessRatio = 0;
  72     speckleWindowSize = 0;
  73     speckleRange = 0;
  74     fullDP = false;
  75 }
  76
  77
  78 StereoSGBM::StereoSGBM( int _minDisparity, int _numDisparities, int _SADWindowSize,
  79                    int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
  80                    int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
  81                    bool _fullDP )
  82 {
  83     minDisparity = _minDisparity;
  84     numberOfDisparities = _numDisparities;
  85     SADWindowSize = _SADWindowSize;
  86     P1 = _P1;
  87     P2 = _P2;
  88     disp12MaxDiff = _disp12MaxDiff;
  89     preFilterCap = _preFilterCap;
  90     uniquenessRatio = _uniquenessRatio;
  91     speckleWindowSize = _speckleWindowSize;
  92     speckleRange = _speckleRange;
  93     fullDP = _fullDP;
  94 }
  95
  96
  97 StereoSGBM::~StereoSGBM()
  98 {
  99 }
 100
 101 /*
 102  For each pixel row1[x], max(-maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
 103  and for each disparity minD<=d<maxD the function
 104  computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
 105  row1[x] and row2[x-d]. The subpixel algorithm from
 106  "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
 107  is used, hence the suffix BT.
 108
 109  the temporary buffer should contain width2*2 elements
 110  */
 111 static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
 112                             int minD, int maxD, CostType* cost,
 113                             PixType* buffer, const PixType* tab, int tabOfs )
 114 {
 115     int x, c, width = img1.cols, cn = img1.channels();
 116     int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
 117     int minX2 = max(minX1 - maxD, 0), maxX2 = min(maxX1 - minD, width);
 118     int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
 119     const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
 120     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn;
 121
 122     tab += tabOfs;
 123
 124     for( c = 0; c < cn; c++ )
 125     {
 126         prow1[width*c] = prow1[width*c + width-1] =
 127         prow2[width*c] = prow2[width*c + width-1] = tab[0];
 128     }
 129
 130     if( cn == 1 )
 131     {
 132         int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
 133         int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
 134
 135         for( x = 1; x < width-1; x++ )
 136         {
 137             //prow1[x] = tab[row1[x+1] - row1[x-1]];
 138             //prow2[width-1-x] = tab[row2[x+1] - row2[x-1]];
 139             prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
 140             prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
 141         }
 142     }
 143     else
 144     {
 145         for( x = 1; x < width-1; x++ )
 146         {
 147             prow1[x] = tab[row1[x*3+3] - row1[x*3-3]];
 148             prow1[x+width] = tab[row1[x*3+4] - row1[x*3-2]];
 149             prow1[x+width*2] = tab[row1[x*3+5] - row1[x*3-1]];
 150
 151             prow2[width-1-x] = tab[row2[x*3+3] - row2[x*3-3]];
 152             prow2[width-1-x+width] = tab[row2[x*3+4] - row2[x*3-2]];
 153             prow2[width-1-x+width*2] = tab[row2[x*3+5] - row2[x*3-1]];
 154         }
 155     }
 156
 157     memset( cost, 0, width1*D*sizeof(cost[0]) );
 158
 159     buffer -= minX2;
 160     cost -= minX1*D + minD; // simplify the cost indices inside the loop
 161
 162 #if CV_SSE2
 163     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 164 #endif
 165
 166 #if 1
 167     for( c = 0; c < cn; c++, prow1 += width, prow2 += width )
 168     {
 169         // precompute
 170         //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
 171         //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
 172         for( x = minX2; x < maxX2; x++ )
 173         {
 174             int v = prow2[x];
 175             int vl = x > 0 ? (v + prow2[x-1])/2 : v;
 176             int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
 177             int v0 = min(vl, vr); v0 = min(v0, v);
 178             int v1 = max(vl, vr); v1 = max(v1, v);
 179             buffer[x] = (PixType)v0;
 180             buffer[x + width2] = (PixType)v1;
 181         }
 182
 183         for( x = minX1; x < maxX1; x++ )
 184         {
 185             int u = prow1[x];
 186             int ul = x > 0 ? (u + prow1[x-1])/2 : u;
 187             int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
 188             int u0 = min(ul, ur); u0 = min(u0, u);
 189             int u1 = max(ul, ur); u1 = max(u1, u);
 190
 191         #if CV_SSE2
 192             if( useSIMD )
 193             {
 194                 __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
 195                 __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
 196
 197                 for( int d = minD; d < maxD; d += 16 )
 198                 {
 199                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
 200                     __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
 201                     __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
 202                     __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
 203                     __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
 204                     __m128i diff = _mm_min_epu8(c0, c1);
 205
 206                     c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
 207                     c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
 208
 209                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
 210                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
 211                 }
 212             }
 213             else
 214         #endif
 215             {
 216                 for( int d = minD; d < maxD; d++ )
 217                 {
 218                     int v = prow2[width-x-1 + d];
 219                     int v0 = buffer[width-x-1 + d];
 220                     int v1 = buffer[width-x-1 + d + width2];
 221                     int c0 = max(0, u - v1); c0 = max(c0, v0 - u);
 222                     int c1 = max(0, v - u1); c1 = max(c1, u0 - v);
 223
 224                     cost[x*D + d] = (CostType)(cost[x*D+d] + min(c0, c1));
 225                 }
 226             }
 227         }
 228     }
 229 #else
 230     for( c = 0; c < cn; c++, prow1 += width, prow2 += width )
 231     {
 232         for( x = minX1; x < maxX1; x++ )
 233         {
 234             int u = prow1[x];
 235         #if CV_SSE2
 236             if( useSIMD )
 237             {
 238                 __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
 239
 240                 for( int d = minD; d < maxD; d += 16 )
 241                 {
 242                     __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
 243                     __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
 244                     __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
 245                     __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
 246
 247                     _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
 248                     _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
 249                 }
 250             }
 251             else
 252         #endif
 253             {
 254                 for( int d = minD; d < maxD; d++ )
 255                 {
 256                     int v = prow2[width-1-x + d];
 257                     cost[x*D + d] = (CostType)(cost[x*D + d] + std::abs(u - v));
 258                 }
 259             }
 260         }
 261     }
 262 #endif
 263 }
 264
 265
 266 /*
 267  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
 268  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
 269  minD <= d < maxD.
 270  disp2full is the reverse disparity map, that is:
 271  disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
 272
 273  note that disp1buf will have the same size as the roi and
 274  disp2full will have the same size as img1 (or img2).
 275  On exit disp2buf is not the final disparity, it is an intermediate result that becomes
 276  final after all the tiles are processed.
 277
 278  the disparity in disp1buf is written with sub-pixel accuracy
 279  (4 fractional bits, see CvStereoSGBM::DISP_SCALE),
 280  using quadratic interpolation, while the disparity in disp2buf
 281  is written as is, without interpolation.
 282
 283  disp2cost also has the same size as img1 (or img2).
 284  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
 285  */
 286 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
 287                                  Mat& disp1, const StereoSGBM& params,
 288                                  Mat& buffer )
 289 {
 290 #if CV_SSE2
 291     static const uchar LSBTab[] =
 292     {
 293         0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 294         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 295         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 296         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 297         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 298         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 299         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
 300         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
 301     };
 302
 303     volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
 304 #endif
 305
 306     const int ALIGN = 16;
 307     const int DISP_SHIFT = StereoSGBM::DISP_SHIFT;
 308     const int DISP_SCALE = StereoSGBM::DISP_SCALE;
 309     const CostType MAX_COST = SHRT_MAX;
 310
 311     int minD = params.minDisparity, maxD = minD + params.numberOfDisparities;
 312     Size SADWindowSize;
 313     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
 314     int ftzero = max(params.preFilterCap, 15) | 1;
 315     int uniquenessRatio = params.uniquenessRatio > 0 ? params.uniquenessRatio : 10;
 316     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
 317     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = max(params.P2 > 0 ? params.P2 : 5, P1+1);
 318     int k, width = disp1.cols, height = disp1.rows;
 319     int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
 320     int D = maxD - minD, width1 = maxX1 - minX1;
 321     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
 322     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
 323     int npasses = params.fullDP ? 2 : 1;
 324     const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
 325     PixType clipTab[TAB_SIZE];
 326
 327     for( k = 0; k < TAB_SIZE; k++ )
 328         clipTab[k] = (PixType)(min(max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
 329
 330     if( minX1 >= maxX1 )
 331     {
 332         disp1 = Scalar::all(INVALID_DISP_SCALED);
 333         return;
 334     }
 335
 336     CV_Assert( D % 16 == 0 );
 337
 338     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
 339     // if you change NR, please, modify the loop as well.
 340     int D2 = D+16, NRD2 = NR2*D2;
 341
 342     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
 343     // for 8-way dynamic programming we need the current row and
 344     // the previous row, i.e. 2 rows in total
 345     const int NLR = 2;
 346     const int LrBorder = NLR - 1;
 347
 348     // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
 349     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
 350     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
 351     size_t costBufSize = width1*D;
 352     size_t CSBufSize = costBufSize*(params.fullDP ? height : 1);
 353     size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
 354     int hsumBufNRows = SH2*2 + 2;
 355     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
 356     costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
 357     CSBufSize*2*sizeof(CostType) + // C, S
 358     width*8*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
 359     width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
 360
 361     if( !buffer.data || !buffer.isContinuous() ||
 362        buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
 363         buffer.create(1, totalBufSize, CV_8U);
 364
 365     // summary cost over different (nDirs) directions
 366     CostType* Cbuf = (CostType*)alignPtr(buffer.data, ALIGN);
 367     CostType* Sbuf = Cbuf + CSBufSize;
 368     CostType* hsumBuf = Sbuf + CSBufSize;
 369     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
 370
 371     CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;
 372     DispType* disp2ptr = (DispType*)(disp2cost + width);
 373     PixType* tempBuf = (PixType*)(disp2ptr + width);
 374
 375     // add P2 to every C(x,y). it saves a few operations in the inner loops
 376     for( k = 0; k < width1*D; k++ )
 377         Cbuf[k] = (CostType)P2;
 378
 379     for( int pass = 1; pass <= npasses; pass++ )
 380     {
 381         int x1, y1, x2, y2, dx, dy;
 382
 383         if( pass == 1 )
 384         {
 385             y1 = 0; y2 = height; dy = 1;
 386             x1 = 0; x2 = width1; dx = 1;
 387         }
 388         else
 389         {
 390             y1 = height-1; y2 = -1; dy = -1;
 391             x1 = width1-1; x2 = -1; dx = -1;
 392         }
 393
 394         CostType *Lr[NLR]={0}, *minLr[NLR]={0};
 395
 396         for( k = 0; k < NLR; k++ )
 397         {
 398             // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
 399             // and will occasionally use negative indices with the arrays
 400             // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
 401             // however, then the alignment will be imperfect, i.e. bad for SSE,
 402             // thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)
 403             Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;
 404             memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
 405             minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*2;
 406             memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
 407         }
 408
 409         for( int y = y1; y != y2; y += dy )
 410         {
 411             int x, d;
 412             DispType* disp1ptr = disp1.ptr<DispType>(y);
 413             CostType* C = Cbuf + (!params.fullDP ? 0 : y*costBufSize);
 414             CostType* S = Sbuf + (!params.fullDP ? 0 : y*costBufSize);
 415
 416             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
 417             {
 418                 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
 419
 420                 for( k = dy1; k <= dy2; k++ )
 421                 {
 422                     CostType* hsumAdd = hsumBuf + (min(k, height-1) % hsumBufNRows)*costBufSize;
 423
 424                     if( k < height )
 425                     {
 426                         calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS );
 427
 428                         memset(hsumAdd, 0, D*sizeof(CostType));
 429                         for( x = 0; x <= SW2*D; x += D )
 430                         {
 431                             int scale = x == 0 ? SW2 + 1 : 1;
 432                             for( d = 0; d < D; d++ )
 433                                 hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
 434                         }
 435
 436                         if( y > 0 )
 437                         {
 438                             const CostType* hsumSub = hsumBuf + (max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
 439                             const CostType* Cprev = !params.fullDP || y == 0 ? C : C - costBufSize;
 440
 441                             for( x = D; x < width1*D; x += D )
 442                             {
 443                                 const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
 444                                 const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
 445
 446                             #if CV_SSE2
 447                                 if( useSIMD )
 448                                 {
 449                                     for( d = 0; d < D; d += 8 )
 450                                     {
 451                                         __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
 452                                         __m128i Cx = _mm_load_si128((__m128i*)(Cprev + x + d));
 453                                         hv = _mm_adds_epi16(_mm_subs_epi16(hv,
 454                                                                            _mm_load_si128((const __m128i*)(pixSub + d))),
 455                                                             _mm_load_si128((const __m128i*)(pixAdd + d)));
 456                                         Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
 457                                                                            _mm_load_si128((const __m128i*)(hsumSub + x + d))),
 458                                                             hv);
 459                                         _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
 460                                         _mm_store_si128((__m128i*)(C + x + d), Cx);
 461                                     }
 462                                 }
 463                                 else
 464                             #endif
 465                                 {
 466                                     for( d = 0; d < D; d++ )
 467                                     {
 468                                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 469                                         C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
 470                                     }
 471                                 }
 472                             }
 473                         }
 474                         else
 475                         {
 476                             for( x = D; x < width1*D; x += D )
 477                             {
 478                                 const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
 479                                 const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
 480
 481                                 for( d = 0; d < D; d++ )
 482                                     hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
 483                             }
 484                         }
 485                     }
 486
 487                     if( y == 0 )
 488                     {
 489                         int scale = k == 0 ? SH2 + 1 : 1;
 490                         for( x = 0; x < width1*D; x++ )
 491                             C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
 492                     }
 493                 }
 494
 495                 // also, clear the S buffer
 496                 for( k = 0; k < width1*D; k++ )
 497                     S[k] = 0;
 498             }
 499
 500             // clear the left and the right borders
 501             memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
 502             memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
 503             memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
 504             memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
 505
 506             /*
 507              [formula 13 in the paper]
 508              compute L_r(p, d) = C(p, d) +
 509              min(L_r(p-r, d),
 510              L_r(p-r, d-1) + P1,
 511              L_r(p-r, d+1) + P1,
 512              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
 513              where p = (x,y), r is one of the directions.
 514              we process all the directions at once:
 515              0: r=(-dx, 0)
 516              1: r=(-1, -dy)
 517              2: r=(0, -dy)
 518              3: r=(1, -dy)
 519              4: r=(-2, -dy)
 520              5: r=(-1, -dy*2)
 521              6: r=(1, -dy*2)
 522              7: r=(2, -dy)
 523              */
 524             for( x = x1; x != x2; x += dx )
 525             {
 526                 int xm = x*NR2, xd = xm*D2;
 527
 528                 int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
 529                 int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
 530
 531                 CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;
 532                 CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
 533                 CostType* Lr_p2 = Lr[1] + xd + D2*2;
 534                 CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
 535
 536                 Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
 537                 Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
 538
 539                 CostType* Lr_p = Lr[0] + xd;
 540                 const CostType* Cp = C + x*D;
 541                 CostType* Sp = S + x*D;
 542
 543             #if CV_SSE2
 544                 if( useSIMD )
 545                 {
 546                     __m128i _P1 = _mm_set1_epi16((short)P1);
 547
 548                     __m128i _delta0 = _mm_set1_epi16((short)delta0);
 549                     __m128i _delta1 = _mm_set1_epi16((short)delta1);
 550                     __m128i _delta2 = _mm_set1_epi16((short)delta2);
 551                     __m128i _delta3 = _mm_set1_epi16((short)delta3);
 552                     __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
 553
 554                     for( d = 0; d < D; d += 8 )
 555                     {
 556                         __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
 557                         __m128i L0, L1, L2, L3;
 558
 559                         L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
 560                         L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
 561                         L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
 562                         L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
 563
 564                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
 565                         L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
 566
 567                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
 568                         L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
 569
 570                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
 571                         L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
 572
 573                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
 574                         L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
 575
 576                         L0 = _mm_min_epi16(L0, _delta0);
 577                         L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
 578
 579                         L1 = _mm_min_epi16(L1, _delta1);
 580                         L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
 581
 582                         L2 = _mm_min_epi16(L2, _delta2);
 583                         L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
 584
 585                         L3 = _mm_min_epi16(L3, _delta3);
 586                         L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
 587
 588                         _mm_store_si128( (__m128i*)(Lr_p + d), L0);
 589                         _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
 590                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
 591                         _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
 592
 593                         __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
 594                         __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
 595                         t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
 596                         _minL0 = _mm_min_epi16(_minL0, t0);
 597
 598                         __m128i Sval = _mm_load_si128((const __m128i*)(Sp + d));
 599
 600                         L0 = _mm_adds_epi16(L0, L1);
 601                         L2 = _mm_adds_epi16(L2, L3);
 602                         Sval = _mm_adds_epi16(Sval, L0);
 603                         Sval = _mm_adds_epi16(Sval, L2);
 604
 605                         _mm_store_si128((__m128i*)(Sp + d), Sval);
 606                     }
 607
 608                     _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
 609                     _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
 610                 }
 611                 else
 612             #endif
 613                 {
 614                     int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
 615
 616                     for( d = 0; d < D; d++ )
 617                     {
 618                         int Cpd = Cp[d], L0, L1, L2, L3;
 619
 620                         L0 = Cpd + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
 621                         L1 = Cpd + min((int)Lr_p1[d], min(Lr_p1[d-1] + P1, min(Lr_p1[d+1] + P1, delta1))) - delta1;
 622                         L2 = Cpd + min((int)Lr_p2[d], min(Lr_p2[d-1] + P1, min(Lr_p2[d+1] + P1, delta2))) - delta2;
 623                         L3 = Cpd + min((int)Lr_p3[d], min(Lr_p3[d-1] + P1, min(Lr_p3[d+1] + P1, delta3))) - delta3;
 624
 625                         Lr_p[d] = (CostType)L0;
 626                         minL0 = min(minL0, L0);
 627
 628                         Lr_p[d + D2] = (CostType)L1;
 629                         minL1 = min(minL1, L1);
 630
 631                         Lr_p[d + D2*2] = (CostType)L2;
 632                         minL2 = min(minL2, L2);
 633
 634                         Lr_p[d + D2*3] = (CostType)L3;
 635                         minL3 = min(minL3, L3);
 636
 637                         Sp[d] = saturate_cast<CostType>(Sp[d] + L0 + L1 + L2 + L3);
 638                     }
 639                     minLr[0][xm] = (CostType)minL0;
 640                     minLr[0][xm+1] = (CostType)minL1;
 641                     minLr[0][xm+2] = (CostType)minL2;
 642                     minLr[0][xm+3] = (CostType)minL3;
 643                 }
 644             }
 645
 646             if( pass == npasses )
 647             {
 648                 for( x = 0; x < width; x++ )
 649                 {
 650                     disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
 651                     disp2cost[x] = MAX_COST;
 652                 }
 653
 654                 for( x = width1 - 1; x >= 0; x-- )
 655                 {
 656                     CostType* Sp = S + x*D;
 657                     int minS = MAX_COST, bestDisp = -1;
 658
 659                     if( npasses == 1 )
 660                     {
 661                         int xm = x*NR2, xd = xm*D2;
 662
 663                         int minL0 = MAX_COST;
 664                         int delta0 = minLr[0][xm + NR2] + P2;
 665                         CostType* Lr_p0 = Lr[0] + xd + NRD2;
 666                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
 667                         CostType* Lr_p = Lr[0] + xd;
 668
 669                         const CostType* Cp = C + x*D;
 670
 671                     #if CV_SSE2
 672                         if( useSIMD )
 673                         {
 674                             __m128i _P1 = _mm_set1_epi16((short)P1);
 675                             __m128i _delta0 = _mm_set1_epi16((short)delta0);
 676
 677                             __m128i _minL0 = _mm_set1_epi16((short)minL0);
 678                             __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
 679                             __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
 680
 681                             for( d = 0; d < D; d += 8 )
 682                             {
 683                                 __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
 684
 685                                 L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
 686                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
 687                                 L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
 688                                 L0 = _mm_min_epi16(L0, _delta0);
 689                                 L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
 690
 691                                 _mm_store_si128((__m128i*)(Lr_p + d), L0);
 692                                 _minL0 = _mm_min_epi16(_minL0, L0);
 693                                 L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
 694                                 _mm_store_si128((__m128i*)(Sp + d), L0);
 695
 696                                 __m128i mask = _mm_cmpgt_epi16(_minS, L0);
 697                                 _minS = _mm_min_epi16(_minS, L0);
 698                                 _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
 699                                 _d8 = _mm_adds_epi16(_d8, _8);
 700                             }
 701
 702                             short CV_DECL_ALIGNED(16) bestDispBuf[8];
 703                             _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
 704
 705                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
 706                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
 707                             _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
 708
 709                             __m128i _S = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
 710                             _S = _mm_min_epi16(_S, _mm_srli_si128(_S, 4));
 711                             _S = _mm_min_epi16(_S, _mm_srli_si128(_S, 2));
 712
 713                             minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
 714                             minS = (CostType)_mm_cvtsi128_si32(_S);
 715
 716                             _S = _mm_shuffle_epi32(_mm_unpacklo_epi16(_S, _S), 0);
 717                             _S = _mm_cmpeq_epi16(_minS, _S);
 718                             int idx = _mm_movemask_epi8(_mm_packs_epi16(_S, _S)) & 255;
 719
 720                             bestDisp = bestDispBuf[LSBTab[idx]];
 721                         }
 722                         else
 723                     #endif
 724                         {
 725                             for( d = 0; d < D; d++ )
 726                             {
 727                                 int L0 = Cp[d] + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
 728
 729                                 Lr_p[d] = (CostType)L0;
 730                                 minL0 = min(minL0, L0);
 731
 732                                 int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
 733                                 if( Sval < minS )
 734                                 {
 735                                     minS = Sval;
 736                                     bestDisp = d;
 737                                 }
 738                             }
 739                             minLr[0][xm] = (CostType)minL0;
 740                         }
 741                     }
 742                     else
 743                     {
 744                         for( d = 0; d < D; d++ )
 745                         {
 746                             int Sval = Sp[d];
 747                             if( Sval < minS )
 748                             {
 749                                 minS = Sval;
 750                                 bestDisp = d;
 751                             }
 752                         }
 753                     }
 754
 755                     for( d = 0; d < D; d++ )
 756                     {
 757                         if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
 758                             break;
 759                     }
 760                     if( d < D )
 761                         continue;
 762                     d = bestDisp;
 763                     int x2 = x + minX1 - d - minD;
 764                     if( disp2cost[x2] > minS )
 765                     {
 766                         disp2cost[x2] = (CostType)minS;
 767                         disp2ptr[x2] = (DispType)(d + minD);
 768                     }
 769
 770                     if( 0 < d && d < D-1 )
 771                     {
 772                         // do subpixel quadratic interpolation:
 773                         //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
 774                         //   then find minimum of the parabola.
 775                         int denom2 = max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
 776                         d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
 777                     }
 778                     else
 779                         d *= DISP_SCALE;
 780                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
 781                 }
 782
 783                 for( x = minX1; x < maxX1; x++ )
 784                 {
 785                     // we round the computed disparity both towards -inf and +inf and check
 786                     // if either of the corresponding disparities in disp2 is consistent.
 787                     // This is to give the computed disparity a chance to look valid if it is.
 788                     int d = disp1ptr[x];
 789                     if( d == INVALID_DISP_SCALED )
 790                         continue;
 791                     int _d = d >> DISP_SHIFT;
 792                     int d_ = (d + DISP_SCALE-1) >> DISP_SHIFT;
 793                     int _x = x - _d, x_ = x - d_;
 794                     if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
 795                        0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
 796                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
 797                 }
 798             }
 799
 800             // now shift the cyclic buffers
 801             std::swap( Lr[0], Lr[1] );
 802             std::swap( minLr[0], minLr[1] );
 803         }
 804     }
 805 }
 806
 807 typedef cv::Point_<short> Point2s;
 808
 809 void filterSpeckles( Mat& img, double _newval, int maxSpeckleSize, double _maxDiff, Mat& _buf )
 810 {
 811     CV_Assert( img.type() == CV_16SC1 );
 812
 813     int newVal = cvRound(_newval);
 814     int maxDiff = cvRound(_maxDiff);
 815     int width = img.cols, height = img.rows, npixels = width*height;
 816     size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
 817     if( !_buf.isContinuous() || !_buf.data || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
 818         _buf.create(1, bufSize, CV_8U);
 819
 820     uchar* buf = _buf.data;
 821     int i, j, dstep = img.step/sizeof(short);
 822     int* labels = (int*)buf;
 823     buf += npixels*sizeof(labels[0]);
 824     Point2s* wbuf = (Point2s*)buf;
 825     buf += npixels*sizeof(wbuf[0]);
 826     uchar* rtype = (uchar*)buf;
 827     int curlabel = 0;
 828
 829     // clear out label assignments
 830     memset(labels, 0, npixels*sizeof(labels[0]));
 831
 832     for( i = 0; i < height; i++ )
 833     {
 834         short* ds = img.ptr<short>(i);
 835         int* ls = labels + width*i;
 836
 837         for( j = 0; j < width; j++ )
 838         {
 839             if( ds[j] != newVal )       // not a bad disparity
 840             {
 841                 if( ls[j] )             // has a label, check for bad label
 842                 {
 843                     if( rtype[ls[j]] ) // small region, zero out disparity
 844                         ds[j] = (short)newVal;
 845                 }
 846                 // no label, assign and propagate
 847                 else
 848                 {
 849                     Point2s* ws = wbuf; // initialize wavefront
 850                     Point2s p((short)j, (short)i);      // current pixel
 851                     curlabel++; // next label
 852                     int count = 0;      // current region size
 853                     ls[j] = curlabel;
 854
 855                     // wavefront propagation
 856                     while( ws >= wbuf ) // wavefront not empty
 857                     {
 858                         count++;
 859                         // put neighbors onto wavefront
 860                         short* dpp = &img.at<short>(p.y, p.x);
 861                         short dp = *dpp;
 862                         int* lpp = labels + width*p.y + p.x;
 863
 864                         if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
 865                         {
 866                             lpp[+1] = curlabel;
 867                             *ws++ = Point2s(p.x+1, p.y);
 868                         }
 869
 870                         if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
 871                         {
 872                             lpp[-1] = curlabel;
 873                             *ws++ = Point2s(p.x-1, p.y);
 874                         }
 875
 876                         if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
 877                         {
 878                             lpp[+width] = curlabel;
 879                             *ws++ = Point2s(p.x, p.y+1);
 880                         }
 881
 882                         if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
 883                         {
 884                             lpp[-width] = curlabel;
 885                             *ws++ = Point2s(p.x, p.y-1);
 886                         }
 887
 888                         // pop most recent and propagate
 889                         // NB: could try least recent, maybe better convergence
 890                         p = *--ws;
 891                     }
 892
 893                     // assign label type
 894                     if( count <= maxSpeckleSize )       // speckle region
 895                     {
 896                         rtype[ls[j]] = 1;       // small region label
 897                         ds[j] = (short)newVal;
 898                     }
 899                     else
 900                         rtype[ls[j]] = 0;       // large region label
 901                 }
 902             }
 903         }
 904     }
 905 }
 906
 907
 908 void StereoSGBM::operator ()( const Mat& left, const Mat& right, Mat& disp )
 909 {
 910     CV_Assert( left.size() == right.size() && left.type() == right.type() &&
 911               left.depth() == DataType<PixType>::depth );
 912
 913     disp.create( left.size(), CV_16S );
 914
 915     computeDisparitySGBM( left, right, disp, *this, buffer );
 916     medianBlur(disp, disp, 3);
 917
 918     if( speckleWindowSize > 0 )
 919         filterSpeckles(disp, (minDisparity - 1)*DISP_SCALE, 100, DISP_SCALE, buffer);
 920 }
 921
 922
 923 Rect getValidDisparityROI( Rect roi1, Rect roi2,
 924                            int minDisparity,
 925                            int numberOfDisparities,
 926                            int SADWindowSize )
 927 {
 928     int SW2 = SADWindowSize/2;
 929     int minD = minDisparity, maxD = minDisparity + numberOfDisparities - 1;
 930
 931     int xmin = max(roi1.x, roi2.x + maxD) + SW2;
 932     int xmax = min(roi1.x + roi1.width, roi2.x + roi2.width - minD) - SW2;
 933     int ymin = max(roi1.y, roi2.y) + SW2;
 934     int ymax = min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
 935
 936     Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
 937
 938     return r.width > 0 && r.height > 0 ? r : Rect();
 939 }
 940
 941
 942 void validateDisparity( Mat& disp, const Mat& cost, int minDisparity, int numberOfDisparities, int disp12MaxDiff )
 943 {
 944     int cols = disp.cols, rows = disp.rows;
 945     int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
 946     int x, minX1 = max(maxD, 0), maxX1 = cols + min(minD, 0);
 947     AutoBuffer<int> _disp2buf(cols*2);
 948     int* disp2buf = _disp2buf;
 949     int* disp2cost = disp2buf + cols;
 950     const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
 951     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
 952     int costType = cost.type();
 953
 954     disp12MaxDiff *= DISP_SCALE;
 955
 956     CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
 957               (costType == CV_16S || costType == CV_32S) &&
 958               disp.size() == cost.size() );
 959
 960     for( int y = 0; y < rows; y++ )
 961     {
 962         short* dptr = disp.ptr<short>(y);
 963
 964         for( x = 0; x < cols; x++ )
 965         {
 966             disp2buf[x] = INVALID_DISP;
 967             disp2cost[x] = INT_MAX;
 968         }
 969
 970         if( costType == CV_16S )
 971         {
 972             const short* cptr = cost.ptr<short>(y);
 973
 974             for( x = minX1; x < maxX1; x++ )
 975             {
 976                 int d = dptr[x], c = cptr[x];
 977                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
 978
 979                 if( disp2cost[x2] > c )
 980                 {
 981                     disp2cost[x2] = c;
 982                     disp2buf[x2] = d;
 983                 }
 984             }
 985         }
 986         else
 987         {
 988             const int* cptr = cost.ptr<int>(y);
 989
 990             for( x = minX1; x < maxX1; x++ )
 991             {
 992                 int d = dptr[x], c = cptr[x];
 993                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
 994
 995                 if( disp2cost[x2] < c )
 996                 {
 997                     disp2cost[x2] = c;
 998                     disp2buf[x2] = d;
 999                 }
1000             }
1001         }
1002
1003         for( x = minX1; x < maxX1; x++ )
1004         {
1005             // we round the computed disparity both towards -inf and +inf and check
1006             // if either of the corresponding disparities in disp2 is consistent.
1007             // This is to give the computed disparity a chance to look valid if it is.
1008             int d = dptr[x];
1009             if( d == INVALID_DISP_SCALED )
1010                 continue;
1011             int d0 = d >> DISP_SHIFT;
1012             int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
1013             int x0 = x - d0, x1 = x - d1;
1014             if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
1015                 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
1016                 dptr[x] = (short)INVALID_DISP_SCALED;
1017         }
1018     }
1019 }
1020
1021 }
1022
1023 CvRect cvGetValidDisparityROI( CvRect roi1, CvRect roi2, int minDisparity,
1024                                int numberOfDisparities, int SADWindowSize )
1025 {
1026     return (CvRect)cv::getValidDisparityROI( roi1, roi2, minDisparity,
1027                                              numberOfDisparities, SADWindowSize );
1028 }
1029
1030 void cvValidateDisparity( CvArr* _disp, const CvArr* _cost, int minDisparity,
1031                           int numberOfDisparities, int disp12MaxDiff )
1032 {
1033     cv::Mat disp = cv::cvarrToMat(_disp), cost = cv::cvarrToMat(_cost);
1034     cv::validateDisparity( disp, cost, minDisparity, numberOfDisparities, disp12MaxDiff );
1035 }