From: vp153 <vp153@73c94f0f-984f-4a5f-82bc-2d8db8d8ee08>
Date: Thu, 25 Mar 2010 00:33:27 +0000 (+0000)
Subject: converted HOG & LKPyramid tracker from OpenMP to TBB. OpenMP is now commented off... 
X-Git-Url: https://rtime.felk.cvut.cz/gitweb/opencv.git/commitdiff_plain/e79c7fa63cea3883a1c1fc48694b190b7c1d85c0?ds=sidebyside

converted HOG & LKPyramid tracker from OpenMP to TBB. OpenMP is now commented off in CMake.

git-svn-id: https://code.ros.org/svn/opencv/trunk@2903 73c94f0f-984f-4a5f-82bc-2d8db8d8ee08
---

diff --git a/opencv/CMakeLists.txt b/opencv/CMakeLists.txt
index b0be94e6..5223ce4f 100644
--- a/opencv/CMakeLists.txt
+++ b/opencv/CMakeLists.txt
@@ -203,12 +203,12 @@ if(UNIX)
     include(CheckIncludeFile)
 endif()
 
-if(MSVC)
-	set(DEFAULT_ENABLE_OPENMP ON)
-else()
-	set(DEFAULT_ENABLE_OPENMP OFF)
-endif()
-set(ENABLE_OPENMP ${DEFAULT_ENABLE_OPENMP} CACHE BOOL "")
+#if(MSVC)
+#	set(DEFAULT_ENABLE_OPENMP ON)
+#else()
+#	set(DEFAULT_ENABLE_OPENMP OFF)
+#endif()
+#set(ENABLE_OPENMP ${DEFAULT_ENABLE_OPENMP} CACHE BOOL "")
 
 if(CMAKE_COMPILER_IS_GNUCXX)
     set(ENABLE_PROFILING OFF CACHE BOOL "Enable profiling in the GCC compiler (Add flags: -g -pg)")
@@ -658,9 +658,9 @@ if(MSVC)
 
     set(EXTRA_EXE_LINKER_FLAGS_RELEASE "${EXTRA_EXE_LINKER_FLAGS_RELEASE} /debug")
 
-    if(ENABLE_OPENMP)
-        set(EXTRA_C_FLAGS_RELEASE "${EXTRA_C_FLAGS_RELEASE} /openmp")
-    endif()
+    #if(ENABLE_OPENMP)
+    #    set(EXTRA_C_FLAGS_RELEASE "${EXTRA_C_FLAGS_RELEASE} /openmp")
+    #endif()
 
     # Remove unreferenced functions: function level linking
     set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} /Gy")
@@ -754,10 +754,10 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     endif()
 
     # Parallel mode
-    if(ENABLE_OPENMP)
-        set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -D_GLIBCXX_PARALLEL -fopenmp")
-        set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} gomp)
-    endif()
+    #if(ENABLE_OPENMP)
+    #    set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -D_GLIBCXX_PARALLEL -fopenmp")
+    #    set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} gomp)
+    #endif()
 
     set(EXTRA_C_FLAGS_RELEASE "${EXTRA_C_FLAGS_RELEASE} -DNDEBUG")
     set(EXTRA_C_FLAGS_DEBUG "${EXTRA_C_FLAGS_DEBUG} -O0 -DDEBUG -D_DEBUG")
diff --git a/opencv/samples/c/peopledetect.cpp b/opencv/samples/c/peopledetect.cpp
index 3c516ae0..9e4ec3be 100644
--- a/opencv/samples/c/peopledetect.cpp
+++ b/opencv/samples/c/peopledetect.cpp
@@ -36,6 +36,7 @@ int main(int argc, char** argv)
 
     HOGDescriptor hog;
     hog.setSVMDetector(HOGDescriptor::getDefaultPeopleDetector());
+    namedWindow("people detector", 1);
 
     for(;;)
     {
diff --git a/opencv/src/cv/cvlkpyramid.cpp b/opencv/src/cv/cvlkpyramid.cpp
index 1b735bf8..ffda27ae 100644
--- a/opencv/src/cv/cvlkpyramid.cpp
+++ b/opencv/src/cv/cvlkpyramid.cpp
@@ -391,7 +391,7 @@ icvInitPyramidalAlgorithm( const CvMat* imgA, const CvMat* imgB,
         CV_Error( CV_StsNullPtr, "Some of the precomputed pyramids are missing" );
 
     if( level < 0 )
-        CV_Error( CV_StsOutOfRange, "The number of pyramid layers is negative" );
+        CV_Error( CV_StsOutOfRange, "The number of pyramid levels is negative" );
 
     switch( criteria->type )
     {
@@ -542,147 +542,50 @@ icvCalcIxIy_32f( const float* src, int src_step, float* dstX, float* dstY, int d
 }
 
 
-CV_IMPL void
-cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
-                        void* pyrarrA, void* pyrarrB,
-                        const CvPoint2D32f * featuresA,
-                        CvPoint2D32f * featuresB,
-                        int count, CvSize winSize, int level,
-                        char *status, float *error,
-                        CvTermCriteria criteria, int flags )
+namespace cv
 {
-    cv::AutoBuffer<uchar> pyrBuffer;
-    cv::AutoBuffer<uchar> buffer;
-    cv::AutoBuffer<char> _status;
-
-    const int MAX_ITERS = 100;
-
-    CvMat stubA, *imgA = (CvMat*)arrA;
-    CvMat stubB, *imgB = (CvMat*)arrB;
-    CvMat pstubA, *pyrA = (CvMat*)pyrarrA;
-    CvMat pstubB, *pyrB = (CvMat*)pyrarrB;
-    CvSize imgSize;
-    static const float smoothKernel[] = { 0.09375, 0.3125, 0.09375 };  /* 3/32, 10/32, 3/32 */
-    
-    int bufferBytes = 0;
-    uchar **imgI = 0;
-    uchar **imgJ = 0;
-    int *step = 0;
-    double *scale = 0;
-    CvSize* size = 0;
-
-    int threadCount = cvGetNumThreads();
-    float* _patchI[CV_MAX_THREADS];
-    float* _patchJ[CV_MAX_THREADS];
-    float* _Ix[CV_MAX_THREADS];
-    float* _Iy[CV_MAX_THREADS];
-
-    int i, l;
-
-    CvSize patchSize = cvSize( winSize.width * 2 + 1, winSize.height * 2 + 1 );
-    int patchLen = patchSize.width * patchSize.height;
-    int srcPatchLen = (patchSize.width + 2)*(patchSize.height + 2);
-
-    imgA = cvGetMat( imgA, &stubA );
-    imgB = cvGetMat( imgB, &stubB );
-
-    if( CV_MAT_TYPE( imgA->type ) != CV_8UC1 )
-        CV_Error( CV_StsUnsupportedFormat, "" );
-
-    if( !CV_ARE_TYPES_EQ( imgA, imgB ))
-        CV_Error( CV_StsUnmatchedFormats, "" );
-
-    if( !CV_ARE_SIZES_EQ( imgA, imgB ))
-        CV_Error( CV_StsUnmatchedSizes, "" );
-
-    if( imgA->step != imgB->step )
-        CV_Error( CV_StsUnmatchedSizes, "imgA and imgB must have equal steps" );
-
-    imgSize = cvGetMatSize( imgA );
-
-    if( pyrA )
-    {
-        pyrA = cvGetMat( pyrA, &pstubA );
-
-        if( pyrA->step*pyrA->height < icvMinimalPyramidSize( imgSize ) )
-            CV_Error( CV_StsBadArg, "pyramid A has insufficient size" );
-    }
-    else
-    {
-        pyrA = &pstubA;
-        pyrA->data.ptr = 0;
-    }
-
-    if( pyrB )
-    {
-        pyrB = cvGetMat( pyrB, &pstubB );
-
-        if( pyrB->step*pyrB->height < icvMinimalPyramidSize( imgSize ) )
-            CV_Error( CV_StsBadArg, "pyramid B has insufficient size" );
-    }
-    else
-    {
-        pyrB = &pstubB;
-        pyrB->data.ptr = 0;
-    }
-
-    if( count == 0 )
-        return;
 
-    if( !featuresA || !featuresB )
-        CV_Error( CV_StsNullPtr, "Some of arrays of point coordinates are missing" );
-
-    if( count < 0 )
-        CV_Error( CV_StsOutOfRange, "The number of tracked points is negative or zero" );
-
-    if( winSize.width <= 1 || winSize.height <= 1 )
-        CV_Error( CV_StsBadSize, "Invalid search window size" );
-
-    for( i = 0; i < threadCount; i++ )
-        _patchI[i] = _patchJ[i] = _Ix[i] = _Iy[i] = 0;
-
-    icvInitPyramidalAlgorithm( imgA, imgB, pyrA, pyrB,
-        level, &criteria, MAX_ITERS, flags,
-        &imgI, &imgJ, &step, &size, &scale, &pyrBuffer );
-
-    if( !status )
+struct LKTrackerInvoker
+{
+    LKTrackerInvoker( const CvMat* _imgI, const CvMat* _imgJ,
+                      const CvPoint2D32f* _featuresA,
+                      CvPoint2D32f* _featuresB,
+                      char* _status, float* _error,
+                      CvTermCriteria _criteria,
+                      CvSize _winSize, int _level, int _flags )
     {
-        _status.allocate(count);
+        imgI = _imgI;
+        imgJ = _imgJ;
+        featuresA = _featuresA;
+        featuresB = _featuresB;
         status = _status;
+        error = _error;
+        criteria = _criteria;
+        winSize = _winSize;
+        level = _level;
+        flags = _flags;
     }
-
-    /* buffer_size = <size for patches> + <size for pyramids> */
-    bufferBytes = (srcPatchLen + patchLen * 3) * sizeof( _patchI[0][0] ) * threadCount;
-    buffer.allocate(bufferBytes);
-
-    for( i = 0; i < threadCount; i++ )
-    {
-        _patchI[i] = i == 0 ? (float*)(uchar*)buffer : _Iy[i-1] + patchLen;
-        _patchJ[i] = _patchI[i] + srcPatchLen;
-        _Ix[i] = _patchJ[i] + patchLen;
-        _Iy[i] = _Ix[i] + patchLen;
-    }
-
-    memset( status, 1, count );
-    if( error )
-        memset( error, 0, count*sizeof(error[0]) );
-
-    if( !(flags & CV_LKFLOW_INITIAL_GUESSES) )
-        memcpy( featuresB, featuresA, count*sizeof(featuresA[0]));
-
-    /* do processing from top pyramid level (smallest image)
-       to the bottom (original image) */
-    for( l = level; l >= 0; l-- )
+    
+    void operator()(const BlockedRange& range) const
     {
-        CvSize levelSize = size[l];
-        int levelStep = step[l];
-
-        {
-#ifdef _OPENMP
-        #pragma omp parallel for num_threads(threadCount) schedule(dynamic) 
-#endif // _OPENMP
-        /* find flow for each given point */
-        for( i = 0; i < count; i++ )
+        static const float smoothKernel[] = { 0.09375, 0.3125, 0.09375 };  // 3/32, 10/32, 3/32
+        
+        int i, i1 = range.begin(), i2 = range.end();
+        
+        CvSize patchSize = cvSize( winSize.width * 2 + 1, winSize.height * 2 + 1 );
+        int patchLen = patchSize.width * patchSize.height;
+        int srcPatchLen = (patchSize.width + 2)*(patchSize.height + 2);
+        
+        AutoBuffer<float> buf(patchLen*3 + srcPatchLen);
+        float* patchI = buf;
+        float* patchJ = patchI + srcPatchLen;
+        float* Ix = patchJ + patchLen;
+        float* Iy = Ix + patchLen;
+        float scaleL = 1.f/(1 << level);
+        CvSize levelSize = cvGetMatSize(imgI);
+        
+        // find flow for each given point
+        for( i = i1; i < i2; i++ )
         {
             CvPoint2D32f v;
             CvPoint minI, maxI, minJ, maxJ;
@@ -693,91 +596,76 @@ cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
             double Gxx = 0, Gxy = 0, Gyy = 0, D = 0, minEig = 0;
             float prev_mx = 0, prev_my = 0;
             int j, x, y;
-            int threadIdx = cvGetThreadNum();
-            float* patchI = _patchI[threadIdx];
-            float* patchJ = _patchJ[threadIdx];
-            float* Ix = _Ix[threadIdx];
-            float* Iy = _Iy[threadIdx];
-
-            v.x = featuresB[i].x;
-            v.y = featuresB[i].y;
-            if( l < level )
-            {
-                v.x += v.x;
-                v.y += v.y;
-            }
-            else
-            {
-                v.x = (float)(v.x * scale[l]);
-                v.y = (float)(v.y * scale[l]);
-            }
-
+            
+            v.x = featuresB[i].x*2;
+            v.y = featuresB[i].y*2;
+            
             pt_status = status[i];
             if( !pt_status )
                 continue;
-
-            minI = maxI = minJ = maxJ = cvPoint( 0, 0 );
-
-            u.x = (float) (featuresA[i].x * scale[l]);
-            u.y = (float) (featuresA[i].y * scale[l]);
-
+            
+            minI = maxI = minJ = maxJ = cvPoint(0, 0);
+            
+            u.x = featuresA[i].x * scaleL;
+            u.y = featuresA[i].y * scaleL;
+            
             intersect( u, winSize, levelSize, &minI, &maxI );
             isz = jsz = cvSize(maxI.x - minI.x + 2, maxI.y - minI.y + 2);
             u.x += (minI.x - (patchSize.width - maxI.x + 1))*0.5f;
             u.y += (minI.y - (patchSize.height - maxI.y + 1))*0.5f;
-
+            
             if( isz.width < 3 || isz.height < 3 ||
-                icvGetRectSubPix_8u32f_C1R( imgI[l], levelStep, levelSize,
-                    patchI, isz.width*sizeof(patchI[0]), isz, u ) < 0 )
+                icvGetRectSubPix_8u32f_C1R( imgI->data.ptr, imgI->step, levelSize,
+                                            patchI, isz.width*sizeof(patchI[0]), isz, u ) < 0 )
             {
-                /* point is outside the image. take the next */
+                // point is outside the first image. take the next
                 status[i] = 0;
                 continue;
             }
-
+            
             icvCalcIxIy_32f( patchI, isz.width*sizeof(patchI[0]), Ix, Iy,
-                (isz.width-2)*sizeof(patchI[0]), isz, smoothKernel, patchJ );
-
+                             (isz.width-2)*sizeof(patchI[0]), isz, smoothKernel, patchJ );
+            
             for( j = 0; j < criteria.max_iter; j++ )
             {
                 double bx = 0, by = 0;
                 float mx, my;
                 CvPoint2D32f _v;
-
+                
                 intersect( v, winSize, levelSize, &minJ, &maxJ );
-
+                
                 minJ.x = MAX( minJ.x, minI.x );
                 minJ.y = MAX( minJ.y, minI.y );
-
+                
                 maxJ.x = MIN( maxJ.x, maxI.x );
                 maxJ.y = MIN( maxJ.y, maxI.y );
-
+                
                 jsz = cvSize(maxJ.x - minJ.x, maxJ.y - minJ.y);
-
+                
                 _v.x = v.x + (minJ.x - (patchSize.width - maxJ.x + 1))*0.5f;
                 _v.y = v.y + (minJ.y - (patchSize.height - maxJ.y + 1))*0.5f;
-
+                
                 if( jsz.width < 1 || jsz.height < 1 ||
-                    icvGetRectSubPix_8u32f_C1R( imgJ[l], levelStep, levelSize, patchJ,
+                    icvGetRectSubPix_8u32f_C1R( imgJ->data.ptr, imgJ->step, levelSize, patchJ,
                                                 jsz.width*sizeof(patchJ[0]), jsz, _v ) < 0 )
                 {
-                    /* point is outside image. take the next */
+                    // point is outside of the second image. take the next
                     pt_status = 0;
                     break;
                 }
-
+                
                 if( maxJ.x == prev_maxJ.x && maxJ.y == prev_maxJ.y &&
                     minJ.x == prev_minJ.x && minJ.y == prev_minJ.y )
                 {
                     for( y = 0; y < jsz.height; y++ )
                     {
                         const float* pi = patchI +
-                            (y + minJ.y - minI.y + 1)*isz.width + minJ.x - minI.x + 1;
+                        (y + minJ.y - minI.y + 1)*isz.width + minJ.x - minI.x + 1;
                         const float* pj = patchJ + y*jsz.width;
                         const float* ix = Ix +
-                            (y + minJ.y - minI.y)*(isz.width-2) + minJ.x - minI.x;
+                        (y + minJ.y - minI.y)*(isz.width-2) + minJ.x - minI.x;
                         const float* iy = Iy + (ix - Ix);
-
+                        
                         for( x = 0; x < jsz.width; x++ )
                         {
                             double t0 = pi[x] - pj[x];
@@ -792,12 +680,12 @@ cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
                     for( y = 0; y < jsz.height; y++ )
                     {
                         const float* pi = patchI +
-                            (y + minJ.y - minI.y + 1)*isz.width + minJ.x - minI.x + 1;
+                        (y + minJ.y - minI.y + 1)*isz.width + minJ.x - minI.x + 1;
                         const float* pj = patchJ + y*jsz.width;
                         const float* ix = Ix +
-                            (y + minJ.y - minI.y)*(isz.width-2) + minJ.x - minI.x;
+                        (y + minJ.y - minI.y)*(isz.width-2) + minJ.x - minI.x;
                         const float* iy = Iy + (ix - Ix);
-
+                        
                         for( x = 0; x < jsz.width; x++ )
                         {
                             double t = pi[x] - pj[x];
@@ -808,33 +696,33 @@ cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
                             Gyy += iy[x] * iy[x];
                         }
                     }
-
+                    
                     D = Gxx * Gyy - Gxy * Gxy;
                     if( D < DBL_EPSILON )
                     {
                         pt_status = 0;
                         break;
                     }
-
+                    
                     // Adi Shavit - 2008.05
                     if( flags & CV_LKFLOW_GET_MIN_EIGENVALS )
                         minEig = (Gyy + Gxx - sqrt((Gxx-Gyy)*(Gxx-Gyy) + 4.*Gxy*Gxy))/(2*jsz.height*jsz.width);
-
+                        
                     D = 1. / D;
-
+                        
                     prev_minJ = minJ;
                     prev_maxJ = maxJ;
                 }
-
+                
                 mx = (float) ((Gyy * bx - Gxy * by) * D);
                 my = (float) ((Gxx * by - Gxy * bx) * D);
-
+                
                 v.x += mx;
                 v.y += my;
-
+                
                 if( mx * mx + my * my < criteria.epsilon )
                     break;
-
+                
                 if( j > 0 && fabs(mx + prev_mx) < 0.01 && fabs(my + prev_my) < 0.01 )
                 {
                     v.x -= mx*0.5f;
@@ -844,12 +732,12 @@ cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
                 prev_mx = mx;
                 prev_my = my;
             }
-
+            
             featuresB[i] = v;
             status[i] = (char)pt_status;
-            if( l == 0 && error && pt_status )
+            if( level == 0 && error && pt_status )
             {
-                /* calc error */
+                // calc error
                 double err = 0;
                 if( flags & CV_LKFLOW_GET_MIN_EIGENVALS )
                     err = minEig;
@@ -858,9 +746,9 @@ cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
                     for( y = 0; y < jsz.height; y++ )
                     {
                         const float* pi = patchI +
-                            (y + minJ.y - minI.y + 1)*isz.width + minJ.x - minI.x + 1;
+                        (y + minJ.y - minI.y + 1)*isz.width + minJ.x - minI.x + 1;
                         const float* pj = patchJ + y*jsz.width;
-
+                        
                         for( x = 0; x < jsz.width; x++ )
                         {
                             double t = pi[x] - pj[x];
@@ -872,7 +760,143 @@ cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
                 error[i] = (float)err;
             }
         } // end of point processing loop (i)
-        }
+    }
+    
+    const CvMat* imgI;
+    const CvMat* imgJ;
+    const CvPoint2D32f* featuresA;
+    CvPoint2D32f* featuresB;
+    char* status;
+    float* error;
+    CvTermCriteria criteria;
+    CvSize winSize;
+    int level;
+    int flags;
+};
+    
+    
+}
+
+
+CV_IMPL void
+cvCalcOpticalFlowPyrLK( const void* arrA, const void* arrB,
+                        void* pyrarrA, void* pyrarrB,
+                        const CvPoint2D32f * featuresA,
+                        CvPoint2D32f * featuresB,
+                        int count, CvSize winSize, int level,
+                        char *status, float *error,
+                        CvTermCriteria criteria, int flags )
+{
+    cv::AutoBuffer<uchar> pyrBuffer;
+    cv::AutoBuffer<uchar> buffer;
+    cv::AutoBuffer<char> _status;
+
+    const int MAX_ITERS = 100;
+
+    CvMat stubA, *imgA = (CvMat*)arrA;
+    CvMat stubB, *imgB = (CvMat*)arrB;
+    CvMat pstubA, *pyrA = (CvMat*)pyrarrA;
+    CvMat pstubB, *pyrB = (CvMat*)pyrarrB;
+    CvSize imgSize;
+    
+    uchar **imgI = 0;
+    uchar **imgJ = 0;
+    int *step = 0;
+    double *scale = 0;
+    CvSize* size = 0;
+
+    int i, l;
+
+    imgA = cvGetMat( imgA, &stubA );
+    imgB = cvGetMat( imgB, &stubB );
+
+    if( CV_MAT_TYPE( imgA->type ) != CV_8UC1 )
+        CV_Error( CV_StsUnsupportedFormat, "" );
+
+    if( !CV_ARE_TYPES_EQ( imgA, imgB ))
+        CV_Error( CV_StsUnmatchedFormats, "" );
+
+    if( !CV_ARE_SIZES_EQ( imgA, imgB ))
+        CV_Error( CV_StsUnmatchedSizes, "" );
+
+    if( imgA->step != imgB->step )
+        CV_Error( CV_StsUnmatchedSizes, "imgA and imgB must have equal steps" );
+
+    imgSize = cvGetMatSize( imgA );
+
+    if( pyrA )
+    {
+        pyrA = cvGetMat( pyrA, &pstubA );
+
+        if( pyrA->step*pyrA->height < icvMinimalPyramidSize( imgSize ) )
+            CV_Error( CV_StsBadArg, "pyramid A has insufficient size" );
+    }
+    else
+    {
+        pyrA = &pstubA;
+        pyrA->data.ptr = 0;
+    }
+
+    if( pyrB )
+    {
+        pyrB = cvGetMat( pyrB, &pstubB );
+
+        if( pyrB->step*pyrB->height < icvMinimalPyramidSize( imgSize ) )
+            CV_Error( CV_StsBadArg, "pyramid B has insufficient size" );
+    }
+    else
+    {
+        pyrB = &pstubB;
+        pyrB->data.ptr = 0;
+    }
+
+    if( count == 0 )
+        return;
+
+    if( !featuresA || !featuresB )
+        CV_Error( CV_StsNullPtr, "Some of arrays of point coordinates are missing" );
+
+    if( count < 0 )
+        CV_Error( CV_StsOutOfRange, "The number of tracked points is negative or zero" );
+
+    if( winSize.width <= 1 || winSize.height <= 1 )
+        CV_Error( CV_StsBadSize, "Invalid search window size" );
+
+    icvInitPyramidalAlgorithm( imgA, imgB, pyrA, pyrB,
+        level, &criteria, MAX_ITERS, flags,
+        &imgI, &imgJ, &step, &size, &scale, &pyrBuffer );
+
+    if( !status )
+    {
+        _status.allocate(count);
+        status = _status;
+    }
+
+    memset( status, 1, count );
+    if( error )
+        memset( error, 0, count*sizeof(error[0]) );
+
+    if( !(flags & CV_LKFLOW_INITIAL_GUESSES) )
+        memcpy( featuresB, featuresA, count*sizeof(featuresA[0]));
+    
+    for( i = 0; i < count; i++ )
+    {
+        featuresB[i].x = (float)(featuresB[i].x * scale[level] * 0.5);
+        featuresB[i].y = (float)(featuresB[i].y * scale[level] * 0.5);
+    }
+
+    /* do processing from top pyramid level (smallest image)
+       to the bottom (original image) */
+    for( l = level; l >= 0; l-- )
+    {
+        CvMat imgI_l, imgJ_l;        
+        cvInitMatHeader(&imgI_l, size[l].height, size[l].width, imgA->type, imgI[l], step[l]);
+        cvInitMatHeader(&imgJ_l, size[l].height, size[l].width, imgB->type, imgJ[l], step[l]);
+        
+        cv::parallel_for(cv::BlockedRange(0, count),
+                         cv::LKTrackerInvoker(&imgI_l, &imgJ_l, featuresA,
+                                              featuresB, status, error,
+                                              criteria, winSize, l, flags));
     } // end of pyramid levels loop (l)
 }
 
diff --git a/opencv/src/cvaux/cvhog.cpp b/opencv/src/cvaux/cvhog.cpp
index 1b8a4c65..a3bc42d3 100644
--- a/opencv/src/cvaux/cvhog.cpp
+++ b/opencv/src/cvaux/cvhog.cpp
@@ -785,73 +785,84 @@ void HOGDescriptor::detect(const Mat& img,
     }
 }
 
-
-struct HOGThreadData
+    
+struct HOGInvoker
 {
-    vector<Rect> rectangles;
-    vector<Point> locations;
-    Mat smallerImgBuf;
+    HOGInvoker( const HOGDescriptor* _hog, const Mat& _img,
+                double _hitThreshold, Size _winStride, Size _padding,
+                const double* _levelScale, ConcurrentRectVector* _vec ) 
+    {
+        hog = _hog;
+        img = _img;
+        hitThreshold = _hitThreshold;
+        winStride = _winStride;
+        padding = _padding;
+        levelScale = _levelScale;
+        vec = _vec;
+    }
+    
+    void operator()( const BlockedRange& range ) const
+    {
+        int i, i1 = range.begin(), i2 = range.end();
+        double minScale = i1 > 0 ? levelScale[i1] : i2 > 1 ? levelScale[i1+1] : std::max(img.cols, img.rows);
+        Size maxSz(cvCeil(img.cols/minScale), cvCeil(img.rows/minScale));
+        Mat smallerImgBuf(maxSz, img.type());
+        vector<Point> locations;
+        
+        for( i = i1; i < i2; i++ )
+        {
+            double scale = levelScale[i];
+            Size sz(cvRound(img.cols/scale), cvRound(img.rows/scale));
+            Mat smallerImg(sz, img.type(), smallerImgBuf.data);
+            if( sz == img.size() )
+                smallerImg = Mat(sz, img.type(), img.data, img.step);
+            else
+                resize(img, smallerImg, sz);
+            hog->detect(smallerImg, locations, hitThreshold, winStride, padding);
+            Size scaledWinSize = Size(cvRound(hog->winSize.width*scale), cvRound(hog->winSize.height*scale));
+            for( size_t j = 0; j < locations.size(); j++ )
+                vec->push_back(Rect(cvRound(locations[j].x*scale),
+                                    cvRound(locations[j].y*scale),
+                                    scaledWinSize.width, scaledWinSize.height));
+        }
+    }
+    
+    const HOGDescriptor* hog;
+    Mat img;
+    double hitThreshold;
+    Size winStride;
+    Size padding;
+    const double* levelScale;
+    ConcurrentRectVector* vec;
 };
 
+
 void HOGDescriptor::detectMultiScale(
     const Mat& img, vector<Rect>& foundLocations,
     double hitThreshold, Size winStride, Size padding,
     double scale0, int groupThreshold) const
 {
     double scale = 1.;
-    foundLocations.clear();
-    int i, levels = 0;
     const int maxLevels = 64;
 
-    int t, nthreads = getNumThreads();
-    vector<HOGThreadData> threadData(nthreads);
-
-    for( t = 0; t < nthreads; t++ )
-        threadData[t].smallerImgBuf.create(img.size(), img.type());
-
-    vector<double> levelScale(maxLevels);
-    for( levels = 0; levels < maxLevels; levels++ )
+    vector<double> levelScale;
+    for( int levels = 0; levels < maxLevels; levels++ )
     {
-        levelScale[levels] = scale;
+        levelScale.push_back(scale);
         if( cvRound(img.cols/scale) < winSize.width ||
             cvRound(img.rows/scale) < winSize.height ||
             scale0 <= 1 )
             break;
         scale *= scale0;
     }
-    levels = std::max(levels, 1);
-    levelScale.resize(levels);
-
-    {
-#ifdef _OPENMP
-    #pragma omp parallel for num_threads(nthreads) schedule(dynamic)
-#endif // _OPENMP
-    for( i = 0; i < levels; i++ )
-    {
-        HOGThreadData& tdata = threadData[getThreadNum()];
-        double scale = levelScale[i];
-        Size sz(cvRound(img.cols/scale), cvRound(img.rows/scale));
-        Mat smallerImg(sz, img.type(), tdata.smallerImgBuf.data);
-        if( sz == img.size() )
-            smallerImg = Mat(sz, img.type(), img.data, img.step);
-        else
-            resize(img, smallerImg, sz);
-        detect(smallerImg, tdata.locations, hitThreshold, winStride, padding);
-        Size scaledWinSize = Size(cvRound(winSize.width*scale), cvRound(winSize.height*scale));
-        for( size_t j = 0; j < tdata.locations.size(); j++ )
-            tdata.rectangles.push_back(Rect(
-                cvRound(tdata.locations[j].x*scale),
-                cvRound(tdata.locations[j].y*scale),
-                scaledWinSize.width, scaledWinSize.height));
-    }
-    }
 
-    for( t = 0; t < nthreads; t++ )
-    {
-        HOGThreadData& tdata = threadData[t];
-        std::copy(tdata.rectangles.begin(), tdata.rectangles.end(),
-            std::back_inserter(foundLocations));
-    }
+    ConcurrentRectVector allCandidates;
+    
+    parallel_for(BlockedRange(0, (int)levelScale.size()),
+                 HOGInvoker(this, img, hitThreshold, winStride, padding, &levelScale[0], &allCandidates));
+    
+    foundLocations.resize(allCandidates.size());
+    std::copy(allCandidates.begin(), allCandidates.end(), foundLocations.begin());
 
     groupRectangles(foundLocations, groupThreshold, 0.2);
 }