1 //M*//////////////////////////////////////////////////////////////////////////////////////
\r
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
\r
5 // By downloading, copying, installing or using the software you agree to this license.
\r
6 // If you do not agree to this license, do not download, install,
\r
7 // copy or use the software.
\r
10 // Intel License Agreement
\r
11 // For Open Source Computer Vision Library
\r
13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
\r
14 // Third party copyrights are property of their respective owners.
\r
16 // Redistribution and use in source and binary forms, with or without modification,
\r
17 // are permitted provided that the following conditions are met:
\r
19 // * Redistribution's of source code must retain the above copyright notice,
\r
20 // this list of conditions and the following disclaimer.
\r
22 // * Redistribution's in binary form must reproduce the above copyright notice,
\r
23 // this list of conditions and the following disclaimer in the documentation
\r
24 // and/or other materials provided with the distribution.
\r
26 // * The name of Intel Corporation may not be used to endorse or promote products
\r
27 // derived from this software without specific prior written permission.
\r
29 // This software is provided by the copyright holders and contributors "as is" and
\r
30 // any express or implied warranties, including, but not limited to, the implied
\r
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
\r
32 // In no event shall the Intel Corporation or contributors be liable for any direct,
\r
33 // indirect, incidental, special, exemplary, or consequential damages
\r
34 // (including, but not limited to, procurement of substitute goods or services;
\r
35 // loss of use, data, or profits; or business interruption) however caused
\r
36 // and on any theory of liability, whether in contract, strict liability,
\r
37 // or tort (including negligence or otherwise) arising in any way out of
\r
38 // the use of this software, even if advised of the possibility of such damage.
\r
42 /****************************************************************************************\
\r
43 * Very fast SAD-based (Sum-of-Absolute-Differences) stereo correspondence algorithm. *
\r
44 * Contributed by Kurt Konolige *
\r
45 \****************************************************************************************/
\r
52 //#include "emmintrin.h"
\r
57 CV_IMPL CvStereoBMState* cvCreateStereoBMState( int /*preset*/, int numberOfDisparities )
\r
59 CvStereoBMState* state = (CvStereoBMState*)cvAlloc( sizeof(*state) );
\r
63 state->preFilterType = CV_STEREO_BM_XSOBEL; //CV_STEREO_BM_NORMALIZED_RESPONSE;
\r
64 state->preFilterSize = 9;
\r
65 state->preFilterCap = 31;
\r
66 state->SADWindowSize = 15;
\r
67 state->minDisparity = 0;
\r
68 state->numberOfDisparities = numberOfDisparities > 0 ? numberOfDisparities : 64;
\r
69 state->textureThreshold = 10;
\r
70 state->uniquenessRatio = 15;
\r
71 state->speckleRange = state->speckleWindowSize = 0;
\r
72 state->trySmallerWindows = 0;
\r
73 state->roi1 = state->roi2 = cvRect(0,0,0,0);
\r
74 state->disp12MaxDiff = -1;
\r
76 state->preFilteredImg0 = state->preFilteredImg1 = state->slidingSumBuf =
\r
77 state->disp = state->cost = 0;
\r
82 CV_IMPL void cvReleaseStereoBMState( CvStereoBMState** state )
\r
85 CV_Error( CV_StsNullPtr, "" );
\r
90 cvReleaseMat( &(*state)->preFilteredImg0 );
\r
91 cvReleaseMat( &(*state)->preFilteredImg1 );
\r
92 cvReleaseMat( &(*state)->slidingSumBuf );
\r
93 cvReleaseMat( &(*state)->disp );
\r
94 cvReleaseMat( &(*state)->cost );
\r
// prefilterNorm: for every pixel of src, writes tab[val + OFS] into dst, where
//   val = ((weighted sum of the pixel and its 4-neighbours)*scale_g - sum*scale_s) >> 10
// and sum is the running horizontal sum of the per-column vertical sums in vsum.
// tab clamps (index - OFS) to [-ftzero, ftzero] and adds ftzero, so outputs lie in [0, 2*ftzero].
//   src     - input image, read as bytes through src.data / src.step
//   dst     - output image, written row by row through dst.ptr<uchar>(y)
//   winsize - window size; wsz2 = winsize/2 is the half-width of the running sums
//   ftzero  - clamp bound used to build tab
//   buf     - scratch memory holding vsum; indices -wsz2-1 .. size.width+wsz2 are written
// NOTE(review): the declaration of `tab`, the braces and a few short statements are not
// visible in this listing - confirm against the complete file.
101 static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf )
\r
103 int x, y, wsz2 = winsize/2;
\r
// vsum starts (wsz2 + 1) ints into buf (then passed through alignPtr(..., 32)) so that the
// negative indices written further down stay inside buf.
104 int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32);
\r
105 int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
\r
106 const int OFS = 256*5, TABSZ = OFS*2 + 256;
\r
108 const uchar* sptr = src.data;
\r
109 int srcstep = src.step;
\r
110 Size size = src.size();
\r
112 scale_g *= scale_s;
\r
// Clamp table: tab[v + OFS] = clamp(v, -ftzero, ftzero) + ftzero.
114 for( x = 0; x < TABSZ; x++ )
\r
115 tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
\r
// Seed the column sums: row 0 weighted by (wsz2 + 2), then rows 1 .. wsz2-1 added once each.
117 for( x = 0; x < size.width; x++ )
\r
118 vsum[x] = (ushort)(sptr[x]*(wsz2 + 2));
\r
120 for( y = 1; y < wsz2; y++ )
\r
122 for( x = 0; x < size.width; x++ )
\r
123 vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
\r
126 for( y = 0; y < size.height; y++ )
\r
// top / bottom: the rows leaving / entering the vertical window, clamped to the image.
// prev / next: the clamped vertical neighbours of the current row.
128 const uchar* top = sptr + srcstep*MAX(y-wsz2-1,0);
\r
129 const uchar* bottom = sptr + srcstep*MIN(y+wsz2,size.height-1);
\r
130 const uchar* prev = sptr + srcstep*MAX(y-1,0);
\r
131 const uchar* curr = sptr + srcstep*y;
\r
132 const uchar* next = sptr + srcstep*MIN(y+1,size.height-1);
\r
133 uchar* dptr = dst.ptr<uchar>(y);
\r
// Slide every column sum down by one row; the ushort cast truncates the result to 16 bits.
136 for( ; x < size.width; x++ )
\r
137 vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);
\r
// Replicate the first and last column sums into the padding on both sides of vsum.
139 for( x = 0; x <= wsz2; x++ )
\r
141 vsum[-x-1] = vsum[0];
\r
142 vsum[size.width+x] = vsum[size.width-1];
\r
// Horizontal sum for x = 0 (the body of the loop below is not visible in this listing).
145 int sum = vsum[0]*(wsz2 + 1);
\r
146 for( x = 1; x <= wsz2; x++ )
\r
// First column: curr[x-1] is not read, and the centre weight is 5 instead of 4.
149 int val = ((curr[0]*5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
\r
150 dptr[0] = tab[val + OFS];
\r
152 for( x = 1; x < size.width-1; x++ )
\r
// Advance the horizontal window by one column, then filter the pixel.
154 sum += vsum[x+wsz2] - vsum[x-wsz2-1];
\r
155 val = ((curr[x]*4 + curr[x-1] + curr[x+1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
\r
156 dptr[x] = tab[val + OFS];
\r
// Last column (x == size.width-1 after the loop): curr[x+1] is not read, centre weight 5.
159 sum += vsum[x+wsz2] - vsum[x-wsz2-1];
\r
160 val = ((curr[x]*5 + curr[x-1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
\r
161 dptr[x] = tab[val + OFS];
\r
167 prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
\r
170 const int OFS = 256*4, TABSZ = OFS*2 + 256;
\r
172 Size size = src.size();
\r
174 for( x = 0; x < TABSZ; x++ )
\r
175 tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
\r
176 uchar val0 = tab[0 + OFS];
\r
179 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
\r
182 for( y = 0; y < size.height-1; y += 2 )
\r
184 const uchar* srow1 = src.ptr<uchar>(y);
\r
185 const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
\r
186 const uchar* srow2 = y < size.height-1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
\r
187 const uchar* srow3 = y < size.height-2 ? srow1 + src.step*2 : srow1;
\r
188 uchar* dptr0 = dst.ptr<uchar>(y);
\r
189 uchar* dptr1 = dptr0 + dst.step;
\r
191 dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
\r
197 __m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16(ftzero), ftz2 = _mm_set1_epi8(CV_CAST_8U(ftzero*2));
\r
198 for( ; x <= size.width-9; x += 8 )
\r
200 __m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z);
\r
201 __m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z);
\r
202 __m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z);
\r
203 __m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z);
\r
205 d0 = _mm_sub_epi16(d0, c0);
\r
206 d1 = _mm_sub_epi16(d1, c1);
\r
208 __m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z);
\r
209 __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z);
\r
210 __m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z);
\r
211 __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z);
\r
213 d2 = _mm_sub_epi16(d2, c2);
\r
214 d3 = _mm_sub_epi16(d3, c3);
\r
216 __m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1)));
\r
217 __m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2)));
\r
218 v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz));
\r
219 v0 = _mm_min_epu8(v0, ftz2);
\r
221 _mm_storel_epi64((__m128i*)(dptr0 + x), v0);
\r
222 _mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0));
\r
227 for( ; x < size.width-1; x++ )
\r
229 int d0 = srow0[x+1] - srow0[x-1], d1 = srow1[x+1] - srow1[x-1],
\r
230 d2 = srow2[x+1] - srow2[x-1], d3 = srow3[x+1] - srow3[x-1];
\r
231 int v0 = tab[d0 + d1*2 + d2 + OFS];
\r
232 int v1 = tab[d1 + d2*2 + d3 + OFS];
\r
233 dptr0[x] = (uchar)v0;
\r
234 dptr1[x] = (uchar)v1;
\r
238 for( ; y < size.height; y++ )
\r
240 uchar* dptr = dst.ptr<uchar>(y);
\r
241 for( x = 0; x < size.width; x++ )
\r
// Number of fractional bits in the 16-bit disparity values: the FILTERED marker is
// (minDisparity - 1) << DISPARITY_SHIFT, and conversion to float multiplies by
// 1./(1 << DISPARITY_SHIFT).
247 static const int DISPARITY_SHIFT = 4;
\r
#if CV_SSE2
// SSE2 block matcher working on 16-bit SAD accumulators.
//   left, right - pre-filtered 8-bit images (one horizontal stripe)
//   disp        - 16-bit output disparities with DISPARITY_SHIFT fractional bits
//   cost        - optional 16-bit cost image (empty => costs are discarded)
//   state       - matcher parameters
//   buf         - scratch memory from which sad / hsad0 / htext / cbuf0 are carved
//   _dy0, _dy1  - rows available above / below the stripe, clamped to wsz2+1
// Pixels that fail the texture or uniqueness test, and the left/right borders,
// receive FILTERED = (minDisparity - 1) << DISPARITY_SHIFT.
// Fix: the non-interpolated branch (best match at either end of the search range)
// now adds minDisparity, like the interpolated branch and the scalar version do.
static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
                                             Mat& disp, Mat& cost, CvStereoBMState& state,
                                             uchar* buf, int _dy0, int _dy1 )
{
    const int ALIGN = 16;
    int x, y, d;
    int wsz = state.SADWindowSize, wsz2 = wsz/2;
    int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
    int ndisp = state.numberOfDisparities;
    int mindisp = state.minDisparity;
    int lofs = MAX(ndisp - 1 + mindisp, 0);
    int rofs = -MIN(ndisp - 1 + mindisp, 0);
    int width = left.cols, height = left.rows;
    int width1 = width - rofs - ndisp + 1;
    int ftzero = state.preFilterCap;
    int textureThreshold = state.textureThreshold;
    int uniquenessRatio = state.uniquenessRatio*256/100;   // percent -> 1/256 units
    short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);

    ushort *sad, *hsad0, *hsad, *hsad_sub;
    int *htext;
    uchar *cbuf0, *cbuf;
    const uchar* lptr0 = left.data + lofs;
    const uchar* rptr0 = right.data + rofs;
    const uchar *lptr, *lptr_sub, *rptr;
    short* dptr = (short*)disp.data;
    int sstep = left.step;
    int dstep = disp.step/sizeof(dptr[0]);
    int cstep = (height + dy0 + dy1)*ndisp;
    short costbuf = 0;   // sink used when no cost image is supplied
    int coststep = cost.data ? cost.step/sizeof(costbuf) : 0;
    const int TABSZ = 256;
    uchar tab[TABSZ];
    const __m128i d0_8 = _mm_setr_epi16(0,1,2,3,4,5,6,7), dd_8 = _mm_set1_epi16(8);

    // Carve the scratch buffer: window SADs, per-row horizontal SADs, per-row texture
    // sums and the ring buffer of per-column absolute differences.
    sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
    hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
    htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
    cbuf0 = (uchar*)alignPtr(htext + height + wsz2 + 2 + dy0*ndisp, ALIGN);

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)std::abs(x - ftzero);

    // initialize buffers
    memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
    memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );

    // Accumulate the columns of the window that precedes the first output column.
    for( x = -wsz2-1; x < wsz2; x++ )
    {
        hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
        lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-1) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
            for( d = 0; d < ndisp; d += 16 )
            {
                __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
                __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
                __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
                // |lv - rv| from two saturating subtractions
                __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
                _mm_store_si128((__m128i*)(cbuf + d), diff);
                hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z));
                hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z));
                _mm_store_si128((__m128i*)(hsad + d), hsad_l);
                _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
            }
            htext[y] += tab[lval];
        }
    }

    // initialize the left and right borders of the disparity map
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < lofs; x++ )
            dptr[y*dstep + x] = FILTERED;
        for( x = lofs + width1; x < width; x++ )
            dptr[y*dstep + x] = FILTERED;
    }
    dptr += lofs;

    for( x = 0; x < width1; x++, dptr++ )
    {
        short* costptr = cost.data ? (short*)cost.data + lofs + x : &costbuf;
        int x0 = x - wsz2 - 1, x1 = x + wsz2;
        // Ring buffer of wsz+1 columns: x0 leaves the window, x1 enters it.
        const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        uchar* cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        hsad = hsad0 - dy0*ndisp;
        lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
        lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
             hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
            for( d = 0; d < ndisp; d += 16 )
            {
                __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
                __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
                __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
                __m128i cbs = _mm_load_si128((const __m128i*)(cbuf_sub + d));
                __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
                __m128i diff_h = _mm_sub_epi16(_mm_unpackhi_epi8(diff, z), _mm_unpackhi_epi8(cbs, z));
                _mm_store_si128((__m128i*)(cbuf + d), diff);
                diff = _mm_sub_epi16(_mm_unpacklo_epi8(diff, z), _mm_unpacklo_epi8(cbs, z));
                hsad_h = _mm_add_epi16(hsad_h, diff_h);
                hsad_l = _mm_add_epi16(hsad_l, diff);
                _mm_store_si128((__m128i*)(hsad + d), hsad_l);
                _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
            }
            htext[y] += tab[lval] - tab[lptr_sub[0]];
        }

        // fill borders
        for( y = dy1; y <= wsz2; y++ )
            htext[height+y] = htext[height+dy1-1];
        for( y = -wsz2-1; y < -dy0; y++ )
            htext[y] = htext[-dy0];

        // initialize sums
        for( d = 0; d < ndisp; d++ )
            sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));

        hsad = hsad0 + (1 - dy0)*ndisp;
        for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
            for( d = 0; d < ndisp; d += 16 )
            {
                __m128i s0 = _mm_load_si128((__m128i*)(sad + d));
                __m128i s1 = _mm_load_si128((__m128i*)(sad + d + 8));
                __m128i t0 = _mm_load_si128((__m128i*)(hsad + d));
                __m128i t1 = _mm_load_si128((__m128i*)(hsad + d + 8));
                s0 = _mm_add_epi16(s0, t0);
                s1 = _mm_add_epi16(s1, t1);
                _mm_store_si128((__m128i*)(sad + d), s0);
                _mm_store_si128((__m128i*)(sad + d + 8), s1);
            }
        int tsum = 0;
        for( y = -wsz2-1; y < wsz2; y++ )
            tsum += htext[y];

        // finally, start the real processing
        for( y = 0; y < height; y++ )
        {
            int minsad = INT_MAX, mind = -1;
            hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
            hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
            __m128i minsad8 = _mm_set1_epi16(SHRT_MAX);
            __m128i mind8 = _mm_set1_epi16(0), d8 = d0_8, mask;

            // Slide the window one row down and track the per-lane minimum SAD / index.
            for( d = 0; d < ndisp; d += 16 )
            {
                __m128i u0 = _mm_load_si128((__m128i*)(hsad_sub + d));
                __m128i u1 = _mm_load_si128((__m128i*)(hsad + d));

                __m128i v0 = _mm_load_si128((__m128i*)(hsad_sub + d + 8));
                __m128i v1 = _mm_load_si128((__m128i*)(hsad + d + 8));

                __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
                __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));

                u1 = _mm_sub_epi16(u1, u0);
                v1 = _mm_sub_epi16(v1, v0);
                usad8 = _mm_add_epi16(usad8, u1);
                vsad8 = _mm_add_epi16(vsad8, v1);

                mask = _mm_cmpgt_epi16(minsad8, usad8);
                minsad8 = _mm_min_epi16(minsad8, usad8);
                mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));

                _mm_store_si128((__m128i*)(sad + d), usad8);
                _mm_store_si128((__m128i*)(sad + d + 8), vsad8);

                mask = _mm_cmpgt_epi16(minsad8, vsad8);
                minsad8 = _mm_min_epi16(minsad8, vsad8);

                d8 = _mm_add_epi16(d8, dd_8);
                mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
                d8 = _mm_add_epi16(d8, dd_8);
            }

            tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
            if( tsum < textureThreshold )
            {
                dptr[y*dstep] = FILTERED;
                continue;
            }

            // Horizontal reduction of the 8 lanes: 8 -> 4 -> 2 -> 1.
            __m128i minsad82 = _mm_unpackhi_epi64(minsad8, minsad8);
            __m128i mind82 = _mm_unpackhi_epi64(mind8, mind8);
            mask = _mm_cmpgt_epi16(minsad8, minsad82);
            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
            minsad8 = _mm_min_epi16(minsad8, minsad82);

            minsad82 = _mm_shufflelo_epi16(minsad8, _MM_SHUFFLE(3,2,3,2));
            mind82 = _mm_shufflelo_epi16(mind8, _MM_SHUFFLE(3,2,3,2));
            mask = _mm_cmpgt_epi16(minsad8, minsad82);
            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
            minsad8 = _mm_min_epi16(minsad8, minsad82);

            minsad82 = _mm_shufflelo_epi16(minsad8, 1);
            mind82 = _mm_shufflelo_epi16(mind8, 1);
            mask = _mm_cmpgt_epi16(minsad8, minsad82);
            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
            mind = (short)_mm_cvtsi128_si32(mind8);
            minsad = sad[mind];

            if( uniquenessRatio > 0 )
            {
                // Reject the match if any disparity outside mind-1..mind+1 has
                // SAD <= thresh.
                int thresh = minsad + ((minsad * uniquenessRatio) >> 8);
                __m128i thresh8 = _mm_set1_epi16((short)(thresh + 1));
                __m128i d1 = _mm_set1_epi16((short)(mind-1)), d2 = _mm_set1_epi16((short)(mind+1));
                __m128i dd_16 = _mm_add_epi16(dd_8, dd_8), d8 = _mm_sub_epi16(d0_8, dd_16);

                for( d = 0; d < ndisp; d += 16 )
                {
                    __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
                    __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
                    mask = _mm_cmpgt_epi16( thresh8, _mm_min_epi16(usad8,vsad8));
                    d8 = _mm_add_epi16(d8, dd_16);
                    if( !_mm_movemask_epi8(mask) )
                        continue;
                    mask = _mm_cmpgt_epi16( thresh8, usad8);
                    mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,d8), _mm_cmpgt_epi16(d8,d2)));
                    if( _mm_movemask_epi8(mask) )
                        break;
                    __m128i t8 = _mm_add_epi16(d8, dd_8);
                    mask = _mm_cmpgt_epi16( thresh8, vsad8);
                    mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,t8), _mm_cmpgt_epi16(t8,d2)));
                    if( _mm_movemask_epi8(mask) )
                        break;
                }
                if( d < ndisp )
                {
                    dptr[y*dstep] = FILTERED;
                    continue;
                }
            }

            if( 0 < mind && mind < ndisp - 1 )
            {
                // Sub-pixel refinement from the two neighbouring SAD values.
                int p = sad[mind+1], n = sad[mind-1], d = p + n - 2*sad[mind];
                dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*128/d : 0) + 15) >> 4);
            }
            else
                // No neighbour on one side: integer disparity, still offset by mindisp.
                dptr[y*dstep] = (short)((ndisp - mind - 1 + mindisp)*16);
            costptr[y*coststep] = sad[mind];
        }
    }
}
#endif
\r
506 findStereoCorrespondenceBM( const Mat& left, const Mat& right,
\r
507 Mat& disp, Mat& cost, const CvStereoBMState& state,
\r
508 uchar* buf, int _dy0, int _dy1 )
\r
510 const int ALIGN = 16;
\r
512 int wsz = state.SADWindowSize, wsz2 = wsz/2;
\r
513 int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
\r
514 int ndisp = state.numberOfDisparities;
\r
515 int mindisp = state.minDisparity;
\r
516 int lofs = MAX(ndisp - 1 + mindisp, 0);
\r
517 int rofs = -MIN(ndisp - 1 + mindisp, 0);
\r
518 int width = left.cols, height = left.rows;
\r
519 int width1 = width - rofs - ndisp + 1;
\r
520 int ftzero = state.preFilterCap;
\r
521 int textureThreshold = state.textureThreshold;
\r
522 int uniquenessRatio = state.uniquenessRatio;
\r
523 short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
\r
525 int *sad, *hsad0, *hsad, *hsad_sub, *htext;
\r
526 uchar *cbuf0, *cbuf;
\r
527 const uchar* lptr0 = left.data + lofs;
\r
528 const uchar* rptr0 = right.data + rofs;
\r
529 const uchar *lptr, *lptr_sub, *rptr;
\r
530 short* dptr = (short*)disp.data;
\r
531 int sstep = left.step;
\r
532 int dstep = disp.step/sizeof(dptr[0]);
\r
533 int cstep = (height+dy0+dy1)*ndisp;
\r
535 int coststep = cost.data ? cost.step/sizeof(costbuf) : 0;
\r
536 const int TABSZ = 256;
\r
539 sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
\r
540 hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
\r
541 htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
\r
542 cbuf0 = (uchar*)alignPtr(htext + height + wsz2 + 2 + dy0*ndisp, ALIGN);
\r
544 for( x = 0; x < TABSZ; x++ )
\r
545 tab[x] = (uchar)std::abs(x - ftzero);
\r
547 // initialize buffers
\r
548 memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
\r
549 memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );
\r
551 for( x = -wsz2-1; x < wsz2; x++ )
\r
553 hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
\r
554 lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
\r
555 rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-1) - dy0*sstep;
\r
557 for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
\r
559 int lval = lptr[0];
\r
560 for( d = 0; d < ndisp; d++ )
\r
562 int diff = std::abs(lval - rptr[d]);
\r
563 cbuf[d] = (uchar)diff;
\r
564 hsad[d] = (int)(hsad[d] + diff);
\r
566 htext[y] += tab[lval];
\r
570 // initialize the left and right borders of the disparity map
\r
571 for( y = 0; y < height; y++ )
\r
573 for( x = 0; x < lofs; x++ )
\r
574 dptr[y*dstep + x] = FILTERED;
\r
575 for( x = lofs + width1; x < width; x++ )
\r
576 dptr[y*dstep + x] = FILTERED;
\r
580 for( x = 0; x < width1; x++, dptr++ )
\r
582 int* costptr = cost.data ? (int*)cost.data + lofs + x : &costbuf;
\r
583 int x0 = x - wsz2 - 1, x1 = x + wsz2;
\r
584 const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
\r
585 uchar* cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
\r
586 hsad = hsad0 - dy0*ndisp;
\r
587 lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
\r
588 lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
\r
589 rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep;
\r
591 for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
\r
592 hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
\r
594 int lval = lptr[0];
\r
595 for( d = 0; d < ndisp; d++ )
\r
597 int diff = std::abs(lval - rptr[d]);
\r
598 cbuf[d] = (uchar)diff;
\r
599 hsad[d] = hsad[d] + diff - cbuf_sub[d];
\r
601 htext[y] += tab[lval] - tab[lptr_sub[0]];
\r
605 for( y = dy1; y <= wsz2; y++ )
\r
606 htext[height+y] = htext[height+dy1-1];
\r
607 for( y = -wsz2-1; y < -dy0; y++ )
\r
608 htext[y] = htext[-dy0];
\r
611 for( d = 0; d < ndisp; d++ )
\r
612 sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));
\r
614 hsad = hsad0 + (1 - dy0)*ndisp;
\r
615 for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
\r
616 for( d = 0; d < ndisp; d++ )
\r
617 sad[d] = (int)(sad[d] + hsad[d]);
\r
619 for( y = -wsz2-1; y < wsz2; y++ )
\r
622 // finally, start the real processing
\r
623 for( y = 0; y < height; y++ )
\r
625 int minsad = INT_MAX, mind = -1;
\r
626 hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
\r
627 hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
\r
629 for( d = 0; d < ndisp; d++ )
\r
631 int currsad = sad[d] + hsad[d] - hsad_sub[d];
\r
633 if( currsad < minsad )
\r
639 tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
\r
640 if( tsum < textureThreshold )
\r
642 dptr[y*dstep] = FILTERED;
\r
646 if( uniquenessRatio > 0 )
\r
648 int thresh = minsad + (minsad * uniquenessRatio/100);
\r
649 for( d = 0; d < ndisp; d++ )
\r
651 if( sad[d] <= thresh && (d < mind-1 || d > mind+1))
\r
656 dptr[y*dstep] = FILTERED;
\r
663 sad[ndisp] = sad[ndisp-2];
\r
664 int p = sad[mind+1], n = sad[mind-1], d = p + n - 2*sad[mind];
\r
665 dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*128/d : 0) + 15) >> 4);
\r
666 costptr[y*coststep] = sad[mind];
\r
672 struct PrefilterInvoker
\r
674 PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
\r
675 uchar* buf0, uchar* buf1, CvStereoBMState* _state )
\r
677 imgs0[0] = &left0; imgs0[1] = &right0;
\r
678 imgs[0] = &left; imgs[1] = &right;
\r
679 buf[0] = buf0; buf[1] = buf1;
\r
683 void operator()( int ind ) const
\r
685 if( state->preFilterType == CV_STEREO_BM_NORMALIZED_RESPONSE )
\r
686 prefilterNorm( *imgs0[ind], *imgs[ind], state->preFilterSize, state->preFilterCap, buf[ind] );
\r
688 prefilterXSobel( *imgs0[ind], *imgs[ind], state->preFilterCap );
\r
691 const Mat* imgs0[2];
\r
694 CvStereoBMState *state;
\r
698 struct FindStereoCorrespInvoker
\r
700 FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
\r
701 Mat& _disp, CvStereoBMState* _state,
\r
702 int _nstripes, int _stripeBufSize,
\r
703 bool _useShorts, Rect _validDisparityRect )
\r
705 left = &_left; right = &_right;
\r
706 disp = &_disp; state = _state;
\r
707 nstripes = _nstripes; stripeBufSize = _stripeBufSize;
\r
708 useShorts = _useShorts;
\r
709 validDisparityRect = _validDisparityRect;
\r
712 void operator()( const BlockedRange& range ) const
\r
714 int cols = left->cols, rows = left->rows;
\r
715 int _row0 = min(cvRound(range.begin() * rows / nstripes), rows);
\r
716 int _row1 = min(cvRound(range.end() * rows / nstripes), rows);
\r
717 uchar *ptr = state->slidingSumBuf->data.ptr + range.begin() * stripeBufSize;
\r
718 int FILTERED = (state->minDisparity - 1)*16;
\r
720 Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1);
\r
721 if( roi.height == 0 )
\r
724 int row1 = roi.y + roi.height;
\r
729 part = disp->rowRange(_row0, row0);
\r
730 part = Scalar::all(FILTERED);
\r
734 part = disp->rowRange(row1, _row1);
\r
735 part = Scalar::all(FILTERED);
\r
738 Mat left_i = left->rowRange(row0, row1);
\r
739 Mat right_i = right->rowRange(row0, row1);
\r
740 Mat disp_i = disp->rowRange(row0, row1);
\r
741 Mat cost_i = state->disp12MaxDiff >= 0 ? Mat(state->cost).rowRange(row0, row1) : Mat();
\r
745 findStereoCorrespondenceBM_SSE2( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
\r
748 findStereoCorrespondenceBM( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
\r
750 if( state->disp12MaxDiff >= 0 )
\r
751 validateDisparity( disp_i, cost_i, state->minDisparity, state->numberOfDisparities, state->disp12MaxDiff );
\r
755 part = disp_i.colRange(0, roi.x);
\r
756 part = Scalar::all(FILTERED);
\r
758 if( roi.x + roi.width < cols )
\r
760 part = disp_i.colRange(roi.x + roi.width, cols);
\r
761 part = Scalar::all(FILTERED);
\r
766 const Mat *left, *right;
\r
768 CvStereoBMState *state;
\r
773 Rect validDisparityRect;
\r
776 static void findStereoCorrespondenceBM( const Mat& left0, const Mat& right0, Mat& disp0, CvStereoBMState* state)
\r
778 if (left0.size() != right0.size() || disp0.size() != left0.size())
\r
779 CV_Error( CV_StsUnmatchedSizes, "All the images must have the same size" );
\r
781 if (left0.type() != CV_8UC1 || right0.type() != CV_8UC1)
\r
782 CV_Error( CV_StsUnsupportedFormat, "Both input images must have CV_8UC1" );
\r
784 if (disp0.type() != CV_16SC1 && disp0.type() != CV_32FC1)
\r
785 CV_Error( CV_StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format" );
\r
788 CV_Error( CV_StsNullPtr, "Stereo BM state is NULL." );
\r
790 if( state->preFilterType != CV_STEREO_BM_NORMALIZED_RESPONSE && state->preFilterType != CV_STEREO_BM_XSOBEL )
\r
791 CV_Error( CV_StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE" );
\r
793 if( state->preFilterSize < 5 || state->preFilterSize > 255 || state->preFilterSize % 2 == 0 )
\r
794 CV_Error( CV_StsOutOfRange, "preFilterSize must be odd and be within 5..255" );
\r
796 if( state->preFilterCap < 1 || state->preFilterCap > 63 )
\r
797 CV_Error( CV_StsOutOfRange, "preFilterCap must be within 1..63" );
\r
799 if( state->SADWindowSize < 5 || state->SADWindowSize > 255 || state->SADWindowSize % 2 == 0 ||
\r
800 state->SADWindowSize >= min(left0.cols, left0.rows) )
\r
801 CV_Error( CV_StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" );
\r
803 if( state->numberOfDisparities <= 0 || state->numberOfDisparities % 16 != 0 )
\r
804 CV_Error( CV_StsOutOfRange, "numberOfDisparities must be positive and divisble by 16" );
\r
806 if( state->textureThreshold < 0 )
\r
807 CV_Error( CV_StsOutOfRange, "texture threshold must be non-negative" );
\r
809 if( state->uniquenessRatio < 0 )
\r
810 CV_Error( CV_StsOutOfRange, "uniqueness ratio must be non-negative" );
\r
812 if( !state->preFilteredImg0 || state->preFilteredImg0->cols * state->preFilteredImg0->rows < left0.cols * left0.rows )
\r
814 cvReleaseMat( &state->preFilteredImg0 );
\r
815 cvReleaseMat( &state->preFilteredImg1 );
\r
816 cvReleaseMat( &state->cost );
\r
818 state->preFilteredImg0 = cvCreateMat( left0.rows, left0.cols, CV_8U );
\r
819 state->preFilteredImg1 = cvCreateMat( left0.rows, left0.cols, CV_8U );
\r
820 state->cost = cvCreateMat( left0.rows, left0.cols, CV_16S );
\r
822 Mat left(left0.size(), CV_8U, state->preFilteredImg0->data.ptr);
\r
823 Mat right(right0.size(), CV_8U, state->preFilteredImg1->data.ptr);
\r
825 int mindisp = state->minDisparity;
\r
826 int ndisp = state->numberOfDisparities;
\r
828 int width = left0.cols;
\r
829 int height = left0.rows;
\r
830 int lofs = max(ndisp - 1 + mindisp, 0);
\r
831 int rofs = -min(ndisp - 1 + mindisp, 0);
\r
832 int width1 = width - rofs - ndisp + 1;
\r
833 int FILTERED = (state->minDisparity - 1) << DISPARITY_SHIFT;
\r
835 if( lofs >= width || rofs >= width || width1 < 1 )
\r
837 disp0 = Scalar::all( FILTERED * ( disp0.type() < CV_32F ? 1 : 1./(1 << DISPARITY_SHIFT) ) );
\r
843 if( disp0.type() == CV_32F)
\r
845 if( !state->disp || state->disp->rows != disp0.rows || state->disp->cols != disp0.cols )
\r
847 cvReleaseMat( &state->disp );
\r
848 state->disp = cvCreateMat(disp0.rows, disp0.cols, CV_16S);
\r
850 disp = cv::cvarrToMat(state->disp);
\r
853 int wsz = state->SADWindowSize;
\r
854 int bufSize0 = (ndisp + 2)*sizeof(int) + (height+wsz+2)*ndisp*sizeof(int) +
\r
855 (height + wsz + 2)*sizeof(int) +
\r
856 (height+wsz+2)*ndisp*(wsz+1)*sizeof(uchar) + 256;
\r
857 int bufSize1 = (width + state->preFilterSize + 2) * sizeof(int) + 256;
\r
859 if( state->speckleRange >= 0 && state->speckleWindowSize > 0 )
\r
860 bufSize2 = width*height*(sizeof(cv::Point_<short>) + sizeof(int) + sizeof(uchar));
\r
863 bool useShorts = state->preFilterCap <= 31 && state->SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2);
\r
865 const bool useShorts = false;
\r
869 const double SAD_overhead_coeff = 10.0;
\r
870 double N0 = 100000 / (useShorts ? 1 : 4); // approx tbb's min number instructions reasonable for one thread
\r
871 double maxStripeSize = min(max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height);
\r
872 int nstripes = cvCeil(height / maxStripeSize);
\r
874 const int nstripes = 1;
\r
877 int bufSize = max(bufSize0 * nstripes, max(bufSize1 * 2, bufSize2));
\r
879 if( !state->slidingSumBuf || state->slidingSumBuf->cols < bufSize )
\r
881 cvReleaseMat( &state->slidingSumBuf );
\r
882 state->slidingSumBuf = cvCreateMat( 1, bufSize, CV_8U );
\r
885 uchar *_buf = state->slidingSumBuf->data.ptr;
\r
887 parallel_do(idx, idx+2, PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, state));
\r
889 Rect validDisparityRect(0, 0, width, height), R1 = state->roi1, R2 = state->roi2;
\r
890 validDisparityRect = getValidDisparityROI(R1.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
\r
891 R2.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
\r
892 state->minDisparity, state->numberOfDisparities,
\r
893 state->SADWindowSize);
\r
895 parallel_for(BlockedRange(0, nstripes),
\r
896 FindStereoCorrespInvoker(left, right, disp, state, nstripes,
\r
897 bufSize0, useShorts, validDisparityRect));
\r
899 if( state->speckleRange >= 0 && state->speckleWindowSize > 0 )
\r
901 Mat buf(state->slidingSumBuf);
\r
902 filterSpeckles(disp, FILTERED, state->speckleRange, state->speckleWindowSize, buf);
\r
905 if (disp0.data != disp.data)
\r
906 disp.convertTo(disp0, disp0.type(), 1./(1 << DISPARITY_SHIFT), 0);
\r
909 StereoBM::StereoBM()
\r
910 { state = cvCreateStereoBMState(); }
\r
912 StereoBM::StereoBM(int _preset, int _ndisparities, int _SADWindowSize)
\r
913 { init(_preset, _ndisparities, _SADWindowSize); }
\r
915 void StereoBM::init(int _preset, int _ndisparities, int _SADWindowSize)
\r
917 state = cvCreateStereoBMState(_preset, _ndisparities);
\r
918 state->SADWindowSize = _SADWindowSize;
\r
921 void StereoBM::operator()( const Mat& left, const Mat& right, Mat& disparity, int disptype )
\r
923 CV_Assert( disptype == CV_16S || disptype == CV_32F );
\r
924 disparity.create(left.size(), disptype);
\r
926 findStereoCorrespondenceBM(left, right, disparity, state);
\r
933 CV_IMPL void cvFindStereoCorrespondenceBM( const CvArr* leftarr, const CvArr* rightarr,
\r
934 CvArr* disparr, CvStereoBMState* state )
\r
936 cv::Mat left = cv::cvarrToMat(leftarr),
\r
937 right = cv::cvarrToMat(rightarr),
\r
938 disp = cv::cvarrToMat(disparr);
\r
939 cv::findStereoCorrespondenceBM(left, right, disp, state);
\r