From: vp153 <vp153@73c94f0f-984f-4a5f-82bc-2d8db8d8ee08>
Date: Fri, 19 Feb 2010 23:14:03 +0000 (+0000)
Subject: optimized SGBM stereo correspondence algorithm with SSE
X-Git-Url: http://rtime.felk.cvut.cz/gitweb/opencv.git/commitdiff_plain/890d74e0eb006c40f3164f84a6cde5bb9adfd81e

optimized SGBM stereo correspondence algorithm with SSE

git-svn-id: https://code.ros.org/svn/opencv/trunk@2707 73c94f0f-984f-4a5f-82bc-2d8db8d8ee08
---

diff --git a/opencv/src/cv/cvstereosgbm.cpp b/opencv/src/cv/cvstereosgbm.cpp
index fd188849..00e67a4d 100644
--- a/opencv/src/cv/cvstereosgbm.cpp
+++ b/opencv/src/cv/cvstereosgbm.cpp
@@ -50,13 +50,13 @@
 */ 
 
 #include "_cv.h"
-#include <limits>
+#include <limits.h>
 
 namespace cv
 {
 
 typedef uchar PixType;
-typedef int CostType;
+typedef short CostType;
 typedef short DispType;
     
 enum { NR = 16, NR2 = NR/2 };
@@ -117,42 +117,46 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
     PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn;
     
     tab += tabOfs;
-    #define clip(x) (PixType)tab[(x)]
     
     for( c = 0; c < cn; c++ )
     {
         prow1[width*c] = prow1[width*c + width-1] = 
-        prow2[width*c] = prow2[width*c + width-1] = clip(0);
+        prow2[width*c] = prow2[width*c + width-1] = tab[0];
     }
     
     if( cn == 1 )
     {
-        for( x = 0; x < width-1; x++ )
+        int n1 = y > 0 ? -img1.step : 0, s1 = y < img1.rows-1 ? img1.step : 0;
+        int n2 = y > 0 ? -img2.step : 0, s2 = y < img2.rows-1 ? img2.step : 0;
+        
+        for( x = 1; x < width-1; x++ )
         {
-            prow1[x] = clip(row1[x+1] - row1[x-1]);
-            prow2[x] = clip(row2[x+1] - row2[x-1]);
+            //prow1[x] = tab[row1[x+1] - row1[x-1]];
+            //prow2[width-1-x] = tab[row2[x+1] - row2[x-1]];
+            prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
+            prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
         }
     }
     else
     {
-        for( x = 0; x < width-1; x++ )
+        for( x = 1; x < width-1; x++ )
         {
-            prow1[x] = clip(row1[x*3+3] - row1[x*3-3]);
-            prow1[x+width] = clip(row1[x*3+4] - row1[x*3-2]);
-            prow1[x+width*2] = clip(row1[x*3+5] - row1[x*3-1]);
+            prow1[x] = tab[row1[x*3+3] - row1[x*3-3]];
+            prow1[x+width] = tab[row1[x*3+4] - row1[x*3-2]];
+            prow1[x+width*2] = tab[row1[x*3+5] - row1[x*3-1]];
             
-            prow2[x] = clip(row2[x*3+3] - row2[x*3-3]);
-            prow2[x+width] = clip(row2[x*3+4] - row2[x*3-2]);
-            prow2[x+width*2] = clip(row2[x*3+5] - row2[x*3-1]);
+            prow2[width-1-x] = tab[row2[x*3+3] - row2[x*3-3]];
+            prow2[width-1-x+width] = tab[row2[x*3+4] - row2[x*3-2]];
+            prow2[width-1-x+width*2] = tab[row2[x*3+5] - row2[x*3-1]];
         }
     }
     
-    #undef clip
     memset( cost, 0, width1*D*sizeof(cost[0]) );
     
     buffer -= minX2;
     cost -= minX1*D + minD; // simplify the cost indices inside the loop
     
+#if 1    
     for( c = 0; c < cn; c++, prow1 += width, prow2 += width )
     {
         // precompute
@@ -177,21 +181,71 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
             int u0 = min(ul, ur); u0 = min(u0, u);
             int u1 = max(ul, ur); u1 = max(u1, u);
             
+        #if CV_SSE2
+            __m128i _u = _mm_set1_epi8(u), _u0 = _mm_set1_epi8(u0);
+            __m128i _u1 = _mm_set1_epi8(u1), z = _mm_setzero_si128();
+            
+            for( int d = minD; d < maxD; d += 16 )
+            {
+                __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-x-1 + d));
+                __m128i _v0 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d));
+                __m128i _v1 = _mm_loadu_si128((const __m128i*)(buffer + width-x-1 + d + width2));
+                __m128i c0 = _mm_max_epu8(_mm_subs_epu8(_u, _v1), _mm_subs_epu8(_v0, _u));
+                __m128i c1 = _mm_max_epu8(_mm_subs_epu8(_v, _u1), _mm_subs_epu8(_u0, _v));
+                __m128i diff = _mm_min_epu8(c0, c1);
+                
+                c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
+                c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
+                
+                _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
+                _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
+            }
+        #else
             for( int d = minD; d < maxD; d++ )
             {
-                int v = prow2[x - d];
-                int v0 = buffer[x - d];
-                int v1 = buffer[x - d + width2];
+                int v = prow2[width-x-1 + d];
+                int v0 = buffer[width-x-1 + d];
+                int v1 = buffer[width-x-1 + d + width2];
                 int c0 = max(0, u - v1); c0 = max(c0, v0 - u);
                 int c1 = max(0, v - u1); c1 = max(c1, u0 - v);
                 
-                cost[x*D + d] = cost[x*D + d] + (CostType)min(c0, c1);
+                cost[x*D + d] = (CostType)(cost[x*D+d] + min(c0, c1));
+            }
+        #endif
+        }
+    }
+#else
+    for( c = 0; c < cn; c++, prow1 += width, prow2 += width )
+    {
+        for( x = minX1; x < maxX1; x++ )
+        {
+            int u = prow1[x];
+        #if CV_SSE2
+            __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
+            
+            for( int d = minD; d < maxD; d += 16 )
+            {
+                __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
+                __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
+                __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
+                __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
+                
+                _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
+                _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
             }
+        #else
+            for( int d = minD; d < maxD; d++ )
+            {
+                int v = prow2[width-1-x + d];
+                cost[x*D + d] = (CostType)(cost[x*D + d] + std::abs(u - v));
+            }
+        #endif
         }
     }
+#endif
 }
 
-
+    
 /*
   computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
   that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
@@ -216,14 +270,15 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                                   Mat& disp1, const StereoSGBM& params, 
                                   Mat& buffer )
 {
+    const int ALIGN = 16;
     const int DISP_SHIFT = StereoSGBM::DISP_SHIFT;
     const int DISP_SCALE = StereoSGBM::DISP_SCALE;
-    const CostType MAX_COST = std::numeric_limits<CostType>::max();
+    const CostType MAX_COST = SHRT_MAX;
     
     int minD = params.minDisparity, maxD = minD + params.numberOfDisparities;
     Size SADWindowSize;
     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
-    int ftzero = max(params.preFilterCap, 31) | 1;
+    int ftzero = max(params.preFilterCap, 15) | 1;
     int uniquenessRatio = params.uniquenessRatio > 0 ? params.uniquenessRatio : 10;
     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = max(params.P2 > 0 ? params.P2 : 5, P1+1);
@@ -232,7 +287,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     int D = maxD - minD, width1 = maxX1 - minX1;
     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
     int SW2 = SADWindowSize.width/2, SH2 = SADWindowSize.height/2;
-    const int TAB_OFS = 256, TAB_SIZE = TAB_OFS*3;
+    const int TAB_OFS = 256*4, TAB_SIZE = 256 + TAB_OFS*2;
     PixType clipTab[TAB_SIZE];
     
     for( k = 0; k < TAB_SIZE; k++ )
@@ -248,7 +303,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     
     // NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.
     // if you change NR, please, modify the loop as well.
-    int NRD2 = NR2*D;
+    int D2 = D+16, NRD2 = NR2*D2;
     
     // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
     // for 8-way dynamic programming we need the current row and
@@ -260,19 +315,19 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     // we keep pixel difference cost (C) and the summary cost over NR directions (S).
     // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
     size_t costBufSize = width1*D;
-    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D;
+    size_t minLrSize = (width1 + LrBorder*2)*NR2, LrSize = minLrSize*D2;
     int hsumBufNRows = SH2*2 + 2;
     size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
-    costBufSize*(hsumBufNRows + 3)*sizeof(CostType) + // hsumBuf, pixdiff, C and S
-    width*8*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
-    width*(sizeof(CostType) + sizeof(DispType)); // disp2cost + disp2
+        costBufSize*(hsumBufNRows + 3)*sizeof(CostType) + // hsumBuf, pixdiff, C and S
+        width*8*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
+        width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
     
     if( !buffer.data || !buffer.isContinuous() ||
         buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize )
         buffer.create(1, totalBufSize, CV_8U);
     
     // summary cost over different (nDirs) directions
-    CostType* C = (CostType*)buffer.data;
+    CostType* C = (CostType*)alignPtr(buffer.data, ALIGN);
     CostType* hsumBuf = C + costBufSize;
     CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;
     CostType* S = pixDiff + costBufSize;
@@ -281,8 +336,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     {
         // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
         // and will occasionally use negative indices with the arrays
-        Lr[k] = S + costBufSize + LrSize*k + NRD2*2;
-        memset( Lr[k] - LrBorder*NRD2, 0, LrSize*sizeof(CostType) );
+        Lr[k] = S + costBufSize + LrSize*k + NRD2*LrBorder + 8;
+        memset( Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType) );
         minLr[k] = S + costBufSize + LrSize*NLR + minLrSize*k + NR2*2;
         memset( minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType) );
     }
@@ -292,11 +347,19 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
     
     PixType* tempBuf = (PixType*)(disp2ptr + width);
     
-    memset( C, 0, costBufSize*sizeof(C[0]) );
+    for( k = 0; k < width1*D; k++ )
+        C[k] = P2;
     
     for( int y = 0; y < height; y++ )
     {
         int x, d, y1 = y == 0 ? 0 : y + SH2, y2 = y == 0 ? SH2 : y1;
+        DispType* disp1ptr = disp1.ptr<DispType>(y);
+        
+        for( x = 0; x < width; x++ )
+        {
+            disp1ptr[x] = disp2ptr[x] = INVALID_DISP_SCALED;
+            disp2cost[x] = MAX_COST;
+        }
         
         for( k = y1; k <= y2; k++ )
         {
@@ -313,14 +376,48 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                     for( d = 0; d < D; d++ )
                         hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d]*scale);
                 }
-                
-                for( x = D; x < width1*D; x += D )
+
+                if( y > 0 )
                 {
-                    const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
-                    const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
-                    
-                    for( d = 0; d < D; d++ )
-                        hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                    const CostType* hsumSub = hsumBuf + (max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
+                    for( x = D; x < width1*D; x += D )
+                    {
+                        const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
+                        const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
+                        
+                    #if CV_SSE2
+                        for( d = 0; d < D; d += 8 )
+                        {
+                            __m128i hv = _mm_load_si128((const __m128i*)(hsumAdd + x - D + d));
+                            __m128i Cx = _mm_load_si128((__m128i*)(C + x + d));
+                            hv = _mm_adds_epi16(_mm_subs_epi16(hv,
+                                            _mm_load_si128((const __m128i*)(pixSub + d))),
+                                            _mm_load_si128((const __m128i*)(pixAdd + d)));
+                            Cx = _mm_adds_epi16(_mm_subs_epi16(Cx,
+                                                _mm_load_si128((const __m128i*)(hsumSub + x + d))),
+                                                hv);
+                            _mm_store_si128((__m128i*)(hsumAdd + x + d), hv);
+                            _mm_store_si128((__m128i*)(C + x + d), Cx);
+                        }
+                    #else
+                        for( d = 0; d < D; d++ )
+                        {
+                            int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                            C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
+                        }
+                    #endif
+                    }
+                }
+                else
+                {
+                    for( x = D; x < width1*D; x += D )
+                    {
+                        const CostType* pixAdd = pixDiff + min(x + SW2*D, (width1-1)*D);
+                        const CostType* pixSub = pixDiff + max(x - (SW2+1)*D, 0);
+                        
+                        for( d = 0; d < D; d++ )
+                            hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
+                    }
                 }
             }
             
@@ -330,20 +427,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 for( x = 0; x < width1*D; x++ )
                     C[x] = (CostType)(C[x] + hsumAdd[x]*scale);
             }
-            else
-            {
-                const CostType* hsumSub = hsumBuf + (max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;
-                for( x = 0; x < width1*D; x++ )
-                    C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]);
-            }
         }
         
         // clear the left and the right borders
-        memset( Lr[0] - NRD2*LrBorder, 0, NRD2*LrBorder*sizeof(CostType) );
-        memset( Lr[0] + width1*NRD2, 0, NRD2*LrBorder*sizeof(CostType) );
+        memset( Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType) );
+        memset( Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType) );
         memset( minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType) );
         memset( minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType) );
-
+        
         /*
            [formula 13 in the paper]
            compute L_r(p, d) = C(p, d) +
@@ -364,157 +455,220 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
         */
         for( x = 0; x < width1; x++ )
         {
-            int xm = x*NR2, xd = xm*D;
+            int xm = x*NR2, xd = xm*D2;
             
-            int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
-            int delta0 = minLr[0][xm - NR2], delta1 = minLr[1][xm - NR2 + 1];
-            int delta2 = minLr[1][xm + 2], delta3 = minLr[1][xm + NR2 + 3];
+            int delta0 = minLr[0][xm - NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;
+            int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;
             
-            const CostType* Lr_p0 = Lr[0] + xd - NRD2;
-            const CostType* Lr_p1 = Lr[1] + xd - NRD2 + D;
-            const CostType* Lr_p2 = Lr[1] + xd + D*2;
-            const CostType* Lr_p3 = Lr[1] + xd + NRD2 + D*3;
+            CostType* Lr_p0 = Lr[0] + xd - NRD2;
+            CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;
+            CostType* Lr_p2 = Lr[1] + xd + D2*2;
+            CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2*3;
             
-        #define STEREO_HH_16 1
+            Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =
+            Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;
             
-        #if STEREO_HH_16                
-            int minL4 = MAX_COST, minL5 = MAX_COST, minL6 = MAX_COST, minL7 = MAX_COST;
-            int delta4 = minLr[1][xm - NR2*2 + 4], delta5 = minLr[2][xm - NR2 + 5];
-            int delta6 = minLr[2][xm + NR2 + 6], delta7 = minLr[1][xm + NR2*2 + 7];
+            CostType* Lr_p = Lr[0] + xd;
+            const CostType* Cp = C + x*D;
+            CostType* Sp = S + x*D;
             
-            const CostType* Lr_p4 = Lr[1] + xd - NRD2*2 + D*4;
-            const CostType* Lr_p5 = Lr[2] + xd - NRD2 + D*5;
-            const CostType* Lr_p6 = Lr[2] + xd + NRD2 + D*6;
-            const CostType* Lr_p7 = Lr[1] + xd + NRD2*2 + D*7;
-        #endif            
+        #if CV_SSE2
+            __m128i _P1 = _mm_set1_epi16((short)P1);
             
-            for( d = 0; d < D; d++ )
+            __m128i _delta0 = _mm_set1_epi16((short)delta0);
+            __m128i _delta1 = _mm_set1_epi16((short)delta1);
+            __m128i _delta2 = _mm_set1_epi16((short)delta2);
+            __m128i _delta3 = _mm_set1_epi16((short)delta3);
+            __m128i _minL0 = _mm_set1_epi16((short)MAX_COST);
+            
+            for( d = 0; d < D; d += 8 )
             {
-                int d1 = max(d-1, 0), d2 = min(d+1, D-1), Cpd = C[x*D + d];
-                int L0, L1, L2, L3;
+                __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d));
+                __m128i L0, L1, L2, L3;
                 
-                L0 = Cpd + min((int)Lr_p0[d], min(Lr_p0[d1] + P1, min(Lr_p0[d2] + P1, delta0 + P2))) - delta0;
-                L1 = Cpd + min((int)Lr_p1[d], min(Lr_p1[d1] + P1, min(Lr_p1[d2] + P1, delta1 + P2))) - delta1;                    
-                L2 = Cpd + min((int)Lr_p2[d], min(Lr_p2[d1] + P1, min(Lr_p2[d2] + P1, delta2 + P2))) - delta2;
-                L3 = Cpd + min((int)Lr_p3[d], min(Lr_p3[d1] + P1, min(Lr_p3[d2] + P1, delta3 + P2))) - delta3;
+                L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
+                L1 = _mm_load_si128((const __m128i*)(Lr_p1 + d));
+                L2 = _mm_load_si128((const __m128i*)(Lr_p2 + d));
+                L3 = _mm_load_si128((const __m128i*)(Lr_p3 + d));
                 
-                CostType* Lr_p = Lr[0] + xd;
+                L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
+                L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
                 
-                Lr_p[d] = (CostType)L0;
-                minL0 = min(minL0, L0);
+                L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d - 1)), _P1));
+                L1 = _mm_min_epi16(L1, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p1 + d + 1)), _P1));
                 
-                Lr_p[d + D] = (CostType)L1;
-                minL1 = min(minL1, L1);
+                L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d - 1)), _P1));
+                L2 = _mm_min_epi16(L2, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p2 + d + 1)), _P1));
                 
-                Lr_p[d + D*2] = (CostType)L2;
-                minL2 = min(minL2, L2);
+                L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d - 1)), _P1));
+                L3 = _mm_min_epi16(L3, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p3 + d + 1)), _P1));
                 
-                Lr_p[d + D*3] = (CostType)L3;
-                minL3 = min(minL3, L3);
+                L0 = _mm_min_epi16(L0, _delta0);
+                L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
+                
+                L1 = _mm_min_epi16(L1, _delta1);
+                L1 = _mm_adds_epi16(_mm_subs_epi16(L1, _delta1), Cpd);
+                
+                L2 = _mm_min_epi16(L2, _delta2);
+                L2 = _mm_adds_epi16(_mm_subs_epi16(L2, _delta2), Cpd);
+                
+                L3 = _mm_min_epi16(L3, _delta3);
+                L3 = _mm_adds_epi16(_mm_subs_epi16(L3, _delta3), Cpd);
+                
+                _mm_store_si128( (__m128i*)(Lr_p + d), L0);
+                _mm_store_si128( (__m128i*)(Lr_p + d + D2), L1);
+                _mm_store_si128( (__m128i*)(Lr_p + d + D2*2), L2);
+                _mm_store_si128( (__m128i*)(Lr_p + d + D2*3), L3);
+                
+                __m128i t0 = _mm_min_epi16(_mm_unpacklo_epi16(L0, L2), _mm_unpackhi_epi16(L0, L2));
+                __m128i t1 = _mm_min_epi16(_mm_unpacklo_epi16(L1, L3), _mm_unpackhi_epi16(L1, L3));
+                t0 = _mm_min_epi16(_mm_unpacklo_epi16(t0, t1), _mm_unpackhi_epi16(t0, t1));
+                _minL0 = _mm_min_epi16(_minL0, t0);
                 
-                S[x*D + d] = (CostType)(L0 + L1 + L2 + L3);
+                L0 = _mm_adds_epi16(L0, L1);
+                L2 = _mm_adds_epi16(L2, L3);
+                L0 = _mm_adds_epi16(L0, L2);
                 
-            #if STEREO_HH_16
-                int L4, L5, L6, L7;
+                _mm_store_si128((__m128i*)(Sp + d), L0);
+            }
+            
+            _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
+            _mm_storel_epi64((__m128i*)&minLr[0][xm], _minL0);
+            
+        #else
+            int minL0 = MAX_COST, minL1 = MAX_COST, minL2 = MAX_COST, minL3 = MAX_COST;
+            
+            for( d = 0; d < D; d++ )
+            {
+                int Cpd = Cp[d], L0, L1, L2, L3;
                 
-                L4 = Cpd + min((int)Lr_p4[d], min(Lr_p4[d1] + P1, min(Lr_p4[d2] + P1, delta4 + P2))) - delta4;                    
-                L5 = Cpd + min((int)Lr_p5[d], min(Lr_p5[d1] + P1, min(Lr_p5[d2] + P1, delta5 + P2))) - delta5;
-                L6 = Cpd + min((int)Lr_p6[d], min(Lr_p6[d1] + P1, min(Lr_p6[d2] + P1, delta6 + P2))) - delta6;
-                L7 = Cpd + min((int)Lr_p7[d], min(Lr_p7[d1] + P1, min(Lr_p7[d2] + P1, delta7 + P2))) - delta7;
+                L0 = Cpd + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
+                L1 = Cpd + min((int)Lr_p1[d], min(Lr_p1[d-1] + P1, min(Lr_p1[d+1] + P1, delta1))) - delta1;                    
+                L2 = Cpd + min((int)Lr_p2[d], min(Lr_p2[d-1] + P1, min(Lr_p2[d+1] + P1, delta2))) - delta2;
+                L3 = Cpd + min((int)Lr_p3[d], min(Lr_p3[d-1] + P1, min(Lr_p3[d+1] + P1, delta3))) - delta3;
                 
-                Lr_p[d + D*4] = (CostType)L4;
-                minL4 = min(minL4, L4);
+                Lr_p[d] = (CostType)L0;
+                minL0 = min(minL0, L0);
                 
-                Lr_p[d + D*5] = (CostType)L5;
-                minL5 = min(minL5, L5);
+                Lr_p[d + D2] = (CostType)L1;
+                minL1 = min(minL1, L1);
                 
-                Lr_p[d + D*6] = (CostType)L6;
-                minL6 = min(minL6, L6);
+                Lr_p[d + D2*2] = (CostType)L2;
+                minL2 = min(minL2, L2);
                 
-                Lr_p[d + D*7] = (CostType)L7;
-                minL7 = min(minL7, L7);
+                Lr_p[d + D2*3] = (CostType)L3;
+                minL3 = min(minL3, L3);
                 
-                S[x*D + d] = (CostType)(S[x*D + d] + L4 + L5 + L6 + L7);
-            #endif    
+                Sp[d] = saturate_cast<CostType>(L0 + L1 + L2 + L3);
             }
             minLr[0][xm] = (CostType)minL0;
             minLr[0][xm+1] = (CostType)minL1;
             minLr[0][xm+2] = (CostType)minL2;
             minLr[0][xm+3] = (CostType)minL3;
-            
-        #if STEREO_HH_16
-            minLr[0][xm+4] = (CostType)minL4;
-            minLr[0][xm+5] = (CostType)minL5;
-            minLr[0][xm+6] = (CostType)minL6;
-            minLr[0][xm+7] = (CostType)minL7;
         #endif
         }
         
         for( x = width1 - 1; x >= 0; x-- )
         {
-            int xm = x*NR2, xd = xm*D;
+            int xm = x*NR2, xd = xm*D2;
             
             int minL0 = MAX_COST;
-            int delta0 = minLr[0][xm + NR2];
-            const CostType* Lr_p0 = Lr[0] + xd + NRD2;
+            int delta0 = minLr[0][xm + NR2] + P2;
+            CostType* Lr_p0 = Lr[0] + xd + NRD2;
+            Lr_p0[-1] = Lr_p0[D] = MAX_COST;
+            CostType* Lr_p = Lr[0] + xd;
             
-            for( d = 0; d < D; d++ )
+            const CostType* Cp = C + x*D;
+            CostType* Sp = S + x*D;
+            CostType minS = MAX_COST;
+            int bestDisp = -1;
+
+        #if CV_SSE2
+            __m128i _P1 = _mm_set1_epi16((short)P1);
+            __m128i _delta0 = _mm_set1_epi16((short)delta0);
+            
+            __m128i _minL0 = _mm_set1_epi16((short)minL0);
+            __m128i _minS = _mm_set1_epi16(MAX_COST), _bestDisp = _mm_set1_epi16(-1);
+            __m128i _d8 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7), _8 = _mm_set1_epi16(8);
+            
+            for( d = 0; d < D; d += 8 )
             {
-                int d1 = max(d-1, 0), d2 = min(d+1, D-1), Cpd = C[x*D + d];
-                int L0 = Cpd + min((int)Lr_p0[d], min(Lr_p0[d1] + P1, min(Lr_p0[d2] + P1, delta0 + P2))) - delta0;
+                __m128i Cpd = _mm_load_si128((const __m128i*)(Cp + d)), L0;
                 
-                CostType* Lr_p = Lr[0] + xd;
+                L0 = _mm_load_si128((const __m128i*)(Lr_p0 + d));
+                L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d - 1)), _P1));
+                L0 = _mm_min_epi16(L0, _mm_adds_epi16(_mm_loadu_si128((const __m128i*)(Lr_p0 + d + 1)), _P1));
+                L0 = _mm_min_epi16(L0, _delta0);
+                L0 = _mm_adds_epi16(_mm_subs_epi16(L0, _delta0), Cpd);
                 
-                Lr_p[d + D*4] = (CostType)L0;
-                minL0 = min(minL0, L0);
+                _mm_store_si128((__m128i*)(Lr_p + d), L0);
+                _minL0 = _mm_min_epi16(_minL0, L0);
+                L0 = _mm_adds_epi16(L0, *(__m128i*)(Sp + d));
+                _mm_store_si128((__m128i*)(Sp + d), L0);
                 
-                S[x*D + d] = (CostType)(S[x*D + d] + L0);
+                __m128i mask = _mm_cmpgt_epi16(_minS, L0);
+                _minS = _mm_min_epi16(_minS, L0);
+                _bestDisp = _mm_xor_si128(_bestDisp, _mm_and_si128(_mm_xor_si128(_bestDisp,_d8), mask));
+                _d8 = _mm_adds_epi16(_d8, _8);
             }
-            minLr[0][xm] = (CostType)minL0;
-        }
-        
-        // now shift the cyclic buffers
-        std::swap( Lr[0], Lr[1] );
-        std::swap( Lr[0], Lr[2] );
-        std::swap( minLr[0], minLr[1] );
-        std::swap( minLr[0], minLr[2] );
-        
-        DispType* disp1ptr = disp1.ptr<DispType>(y);
-        
-        for( x = 0; x < width; x++ )
-        {
-            disp1ptr[x] = disp2ptr[x] = INVALID_DISP_SCALED;
-            disp2cost[x] = MAX_COST;
-        }
-        
-        for( x = 0; x < width1; x++ )
-        {
-            CostType minS = MAX_COST;
-            CostType* Sp = S + x*D;
-            int bestDisp = -1;
+            
+            short CV_DECL_ALIGNED(16) bestDispBuf[8];
+            _mm_store_si128((__m128i*)bestDispBuf, _bestDisp);
+            
+            _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 8));
+            _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
+            _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
+            
+            __m128i _S = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
+            _S = _mm_min_epi16(_S, _mm_srli_si128(_S, 4));
+            _S = _mm_min_epi16(_S, _mm_srli_si128(_S, 2));
+            
+            minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
+            minS = (CostType)_mm_cvtsi128_si32(_S);
+            
+            _S = _mm_shuffle_epi32(_mm_unpacklo_epi16(_S, _S), 0);
+            _S = _mm_cmpeq_epi16(_minS, _S);
+            int idx = _mm_movemask_epi8(_mm_packs_epi16(_S, _S)) & 255;
+            
+            static const uchar LSBTab[] =
+            {
+                0, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+            };
+            
+            bestDisp = bestDispBuf[LSBTab[idx]];
+        #else
             for( d = 0; d < D; d++ )
             {
-                if( minS > Sp[d] )
+                int L0 = Cp[d] + min((int)Lr_p0[d], min(Lr_p0[d-1] + P1, min(Lr_p0[d+1] + P1, delta0))) - delta0;
+                
+                Lr_p[d] = (CostType)L0;
+                minL0 = min(minL0, L0);
+                
+                int Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
+                if( Sval < minS )
                 {
-                    minS = Sp[d];
+                    minS = Sval;
                     bestDisp = d;
                 }
             }
-                
-            //d = bestDisp;
+            minLr[0][xm] = (CostType)minL0;
+        #endif
+            
             for( d = 0; d < D; d++ )
             {
-                if( Sp[d]*(100 - uniquenessRatio) < Sp[bestDisp]*100 && std::abs(bestDisp - d) > 1 )
+                if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
                     break;
             }
             if( d < D )
-            {
-                disp1ptr[x + minX1] = (DispType)INVALID_DISP_SCALED;
                 continue;
-            }
             d = bestDisp;
-            
-            // update disp2full & disp2cost
             int x2 = x + minX1 - d - minD;
             if( disp2cost[x2] > minS )
             {
@@ -527,7 +681,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                 // do subpixel quadratic interpolation:
                 //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                 //   then find minimum of the parabola.
-                int denom2 = Sp[d-1] + Sp[d+1] - 2*Sp[d];
+                int denom2 = max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
                 d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
             }
             else
@@ -547,9 +701,15 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
             int d_ = (d + DISP_SCALE-1) >> DISP_SHIFT;
             int _x = x - _d, x_ = x - d_;
             if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
-                0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
+               0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
                 disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
         }
+        
+        // now shift the cyclic buffers
+        std::swap( Lr[0], Lr[1] );
+        std::swap( Lr[0], Lr[2] );
+        std::swap( minLr[0], minLr[1] );
+        std::swap( minLr[0], minLr[2] );
     }
 }