// Row loop: y runs from -dy0 up to (but not including) height + dy1. On each
// iteration hsad and cbuf advance by ndisp elements and lptr/rptr advance by
// sstep, so every row writes to its own ndisp-wide slice of hsad and cbuf.
for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )\r
{\r
// lval is the single value at the current lptr position; it is compared
// against ndisp consecutive values starting at rptr.
int lval = lptr[0];\r
// Removed scalar version: one candidate offset d per iteration.
- for( d = 0; d < ndisp; d++ )\r
// Added SSE2 version: lv holds lval broadcast to all 16 byte lanes; z is an
// all-zero register used below to widen 8-bit differences to 16 bits.
+ __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();\r
// Processes 16 values of d per iteration.
// NOTE(review): this reads/writes whole 16-element chunks, so it presumably
// relies on ndisp being a multiple of 16 — confirm against the caller.
+ for( d = 0; d < ndisp; d += 16 )\r
{\r
// Scalar reference: absolute difference stored to cbuf[d] as a byte and
// accumulated into hsad[d] with truncation to ushort.
- int diff = abs(lval - rptr[d]);\r
- cbuf[d] = (uchar)diff;\r
- hsad[d] = (ushort)(hsad[d] + diff);\r
// Unaligned load of 16 bytes from rptr + d.
+ __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));\r
// Aligned loads of the 16 running sums for this chunk, as two vectors of
// eight 16-bit values (hsad + d and hsad + d + 8).
// NOTE(review): _mm_load_si128/_mm_store_si128 require 16-byte-aligned
// addresses; assumes hsad + d and cbuf + d are so aligned — TODO confirm.
+ __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));\r
+ __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));\r
// Per-byte unsigned |lv - rv|: for each lane one of the two saturating
// subtractions is zero and the other is the magnitude, so their sum is the
// absolute difference.
+ __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));\r
// Store the 16 byte differences to cbuf (aligned store).
+ _mm_store_si128((__m128i*)(cbuf + d), diff);\r
// Zero-extend the low and high 8 differences to 16 bits by interleaving with
// z, then add them to the running sums; _mm_add_epi16 wraps on overflow,
// which mirrors the (ushort) truncation in the scalar code.
+ hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z));\r
+ hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z));\r
// Write the updated sums back to hsad.
+ _mm_store_si128((__m128i*)(hsad + d), hsad_l);\r
+ _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);\r
}\r
// Accumulate the table value looked up by lval into the per-row entry htext[y].
htext[y] += tab[lval];\r
}\r