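Renamed all the _[A-Z] variables (an underscore followed by an uppercase letter, e.g. _S -> qS; such identifiers are reserved for the C/C++ implementation) to avoid possible name conflicts.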

[opencv.git] / opencv / src / cv / cvstereosgbm.cpp
diff --git a/opencv/src/cv/cvstereosgbm.cpp b/opencv/src/cv/cvstereosgbm.cpp

index 267b75f7f053297a66d1a3a125b3b3c6ab2f3e11..bbd6ea83cabf9023b856e00e3cbb3350986e61db 100644 (file)
--- a/opencv/src/cv/cvstereosgbm.cpp
+++ b/opencv/src/cv/cvstereosgbm.cpp
@@ -110,47 +110,57 @@ StereoSGBM::~StereoSGBM()
   */
  static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                              int minD, int maxD, CostType* cost,
-                            PixType* buffer, const PixType* tab, int tabOfs )
+                            PixType* buffer, const PixType* tab,
+                            int tabOfs, int ftzero )
  {
      int x, c, width = img1.cols, cn = img1.channels();
      int minX1 = max(maxD, 0), maxX1 = width + min(minD, 0);
      int minX2 = max(minX1 - maxD, 0), maxX2 = min(maxX1 - minD, width);
      int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;
      const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
-    PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn;
+    PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;
      
      tab += tabOfs;
      
-    for( c = 0; c < cn; c++ )
+    for( c = 0; c < cn*2; c++ )
      {
          prow1[width*c] = prow1[width*c + width-1] = 
          prow2[width*c] = prow2[width*c + width-1] = tab[0];
      }
      
+    int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
+    int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;
+    
      if( cn == 1 )
      {
-        int n1 = y > 0 ? -img1.step : 0, s1 = y < img1.rows-1 ? img1.step : 0;
-        int n2 = y > 0 ? -img2.step : 0, s2 = y < img2.rows-1 ? img2.step : 0;
-        
          for( x = 1; x < width-1; x++ )
          {
-            //prow1[x] = tab[row1[x+1] - row1[x-1]];
-            //prow2[width-1-x] = tab[row2[x+1] - row2[x-1]];
              prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
              prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];
+            
+            prow1[x+width] = row1[x];
+            prow2[width-1-x+width] = row2[x];
          }
      }
      else
      {
          for( x = 1; x < width-1; x++ )
          {
-            prow1[x] = tab[row1[x*3+3] - row1[x*3-3]];
-            prow1[x+width] = tab[row1[x*3+4] - row1[x*3-2]];
-            prow1[x+width*2] = tab[row1[x*3+5] - row1[x*3-1]];
+            prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
+            prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
+            prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];
+            
+            prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
+            prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
+            prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];
              
-            prow2[width-1-x] = tab[row2[x*3+3] - row2[x*3-3]];
-            prow2[width-1-x+width] = tab[row2[x*3+4] - row2[x*3-2]];
-            prow2[width-1-x+width*2] = tab[row2[x*3+5] - row2[x*3-1]];
+            prow1[x+width*3] = row1[x*3];
+            prow1[x+width*4] = row1[x*3+1];
+            prow1[x+width*5] = row1[x*3+2];
+            
+            prow2[width-1-x+width*3] = row2[x*3];
+            prow2[width-1-x+width*4] = row2[x*3+1];
+            prow2[width-1-x+width*5] = row2[x*3+2];
          }
      }
      
@@ -159,11 +169,15 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
      buffer -= minX2;
      cost -= minX1*D + minD; // simplify the cost indices inside the loop
      
+#if CV_SSE2    
      volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
+#endif
      
  #if 1    
-    for( c = 0; c < cn; c++, prow1 += width, prow2 += width )
+    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
      {
+        int diff_scale = c < cn ? 0 : 2;
+        
          // precompute
          //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
          //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
@@ -189,8 +203,9 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
          #if CV_SSE2
              if( useSIMD )
              {
-                __m128i _u = _mm_set1_epi8(u), _u0 = _mm_set1_epi8(u0);
-                __m128i _u1 = _mm_set1_epi8(u1), z = _mm_setzero_si128();
+                __m128i _u = _mm_set1_epi8((char)u), _u0 = _mm_set1_epi8((char)u0);
+                __m128i _u1 = _mm_set1_epi8((char)u1), z = _mm_setzero_si128();
+                __m128i ds = _mm_cvtsi32_si128(diff_scale);
                  
                  for( int d = minD; d < maxD; d += 16 )
                  {
@@ -204,8 +219,8 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                      c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
                      c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
                      
-                    _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
-                    _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
+                    _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_srl_epi16(_mm_unpacklo_epi8(diff,z), ds)));
+                    _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_srl_epi16(_mm_unpackhi_epi8(diff,z), ds)));
                  }
              }
              else
@@ -219,13 +234,13 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                      int c0 = max(0, u - v1); c0 = max(c0, v0 - u);
                      int c1 = max(0, v - u1); c1 = max(c1, u0 - v);
                      
-                    cost[x*D + d] = (CostType)(cost[x*D+d] + min(c0, c1));
+                    cost[x*D + d] = (CostType)(cost[x*D+d] + (min(c0, c1) >> diff_scale));
                  }
              }
          }
      }
  #else
-    for( c = 0; c < cn; c++, prow1 += width, prow2 += width )
+    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
      {
          for( x = minX1; x < maxX1; x++ )
          {
@@ -252,7 +267,7 @@ static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                  for( int d = minD; d < maxD; d++ )
                  {
                      int v = prow2[width-1-x + d];
-                    cost[x*D + d] = (CostType)(cost[x*D + d] + std::abs(u - v));
+                    cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
                  }
              }
          }
@@ -297,6 +312,8 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
          6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
          5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
      };
+    
+    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
  #endif    
      
      const int ALIGN = 16;
@@ -351,7 +368,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
      size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]
      costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff
      CSBufSize*2*sizeof(CostType) + // C, S
-    width*8*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
+    width*16*img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost
      width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2
      
      if( !buffer.data || !buffer.isContinuous() ||
@@ -368,11 +385,9 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
      DispType* disp2ptr = (DispType*)(disp2cost + width);
      PixType* tempBuf = (PixType*)(disp2ptr + width);
      
-    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
-    
      // add P2 to every C(x,y). it saves a few operations in the inner loops
      for( k = 0; k < width1*D; k++ )
-        Cbuf[k] = P2;
+        Cbuf[k] = (CostType)P2;
      
      for( int pass = 1; pass <= npasses; pass++ )
      {
@@ -421,7 +436,7 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                      
                      if( k < height )
                      {
-                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS );
+                        calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero );
                          
                          memset(hsumAdd, 0, D*sizeof(CostType));
                          for( x = 0; x <= SW2*D; x += D )
@@ -645,15 +660,14 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
              {
                  for( x = 0; x < width; x++ )
                  {
-                    disp1ptr[x] = disp2ptr[x] = INVALID_DISP_SCALED;
+                    disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
                      disp2cost[x] = MAX_COST;
                  }
                  
                  for( x = width1 - 1; x >= 0; x-- )
                  {
                      CostType* Sp = S + x*D;
-                    CostType minS = MAX_COST;
-                    int bestDisp = -1;
+                    int minS = MAX_COST, bestDisp = -1;
                      
                      if( npasses == 1 )
                      {
@@ -705,16 +719,16 @@ static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                              _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 4));
                              _minL0 = _mm_min_epi16(_minL0, _mm_srli_si128(_minL0, 2));
                              
-                            __m128i _S = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
-                            _S = _mm_min_epi16(_S, _mm_srli_si128(_S, 4));
-                            _S = _mm_min_epi16(_S, _mm_srli_si128(_S, 2));
+                            __m128i qS = _mm_min_epi16(_minS, _mm_srli_si128(_minS, 8));
+                            qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 4));
+                            qS = _mm_min_epi16(qS, _mm_srli_si128(qS, 2));
                              
                              minLr[0][xm] = (CostType)_mm_cvtsi128_si32(_minL0);
-                            minS = (CostType)_mm_cvtsi128_si32(_S);
+                            minS = (CostType)_mm_cvtsi128_si32(qS);
                              
-                            _S = _mm_shuffle_epi32(_mm_unpacklo_epi16(_S, _S), 0);
-                            _S = _mm_cmpeq_epi16(_minS, _S);
-                            int idx = _mm_movemask_epi8(_mm_packs_epi16(_S, _S)) & 255;
+                            qS = _mm_shuffle_epi32(_mm_unpacklo_epi16(qS, qS), 0);
+                            qS = _mm_cmpeq_epi16(_minS, qS);
+                            int idx = _mm_movemask_epi8(_mm_packs_epi16(qS, qS)) & 255;
                              
                              bestDisp = bestDispBuf[LSBTab[idx]];
                          }
@@ -840,13 +854,13 @@ void filterSpeckles( Mat& img, double _newval, int maxSpeckleSize, double _maxDi
                  if( ls[j] )            // has a label, check for bad label
                  {  
                      if( rtype[ls[j]] ) // small region, zero out disparity
-                        ds[j] = newVal;
+                        ds[j] = (short)newVal;
                  }
                  // no label, assign and propagate
                  else
                  {
                      Point2s* ws = wbuf;        // initialize wavefront
-                    Point2s p(j, i);   // current pixel
+                    Point2s p((short)j, (short)i);     // current pixel
                      curlabel++;        // next label
                      int count = 0;     // current region size
                      ls[j] = curlabel;
@@ -893,7 +907,7 @@ void filterSpeckles( Mat& img, double _newval, int maxSpeckleSize, double _maxDi
                      if( count <= maxSpeckleSize )      // speckle region
                      {
                          rtype[ls[j]] = 1;      // small region label
-                        ds[j] = newVal;
+                        ds[j] = (short)newVal;
                      }
                      else
                          rtype[ls[j]] = 0;      // large region label