2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/x86_cpu.h"
26 #include "libavcodec/dsputil.h"
27 #include "libavcodec/mpegvideo.h"
28 #include "dsputil_mmx.h"
31 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
/* Widen an 8x8 block of unsigned pixel bytes into signed 16-bit DCTELEMs.
 * Two image rows are handled per pass: bytes are zero-extended against the
 * cleared mm7 register and stored as four quadwords. REG_a counts from -128
 * up toward 0, so stores are addressed relative to block+64 (operand %1).
 * NOTE(review): the asm opener, loop label, branch and clobber list are not
 * visible in this chunk — confirm against the full file. */
34 "mov $-128, %%"REG_a" \n\t"
35 "pxor %%mm7, %%mm7 \n\t"
/* load row N (%0) and row N+1 (%0 + line_size, passed in %2) */
38 "movq (%0), %%mm0 \n\t"
39 "movq (%0, %2), %%mm2 \n\t"
40 "movq %%mm0, %%mm1 \n\t"
41 "movq %%mm2, %%mm3 \n\t"
/* zero-extend low/high byte halves to words (mm7 == 0) */
42 "punpcklbw %%mm7, %%mm0 \n\t"
43 "punpckhbw %%mm7, %%mm1 \n\t"
44 "punpcklbw %%mm7, %%mm2 \n\t"
45 "punpckhbw %%mm7, %%mm3 \n\t"
/* store 16 words (two DCT rows) at block+64+REG_a */
46 "movq %%mm0, (%1, %%"REG_a") \n\t"
47 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
48 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
49 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
51 "add $32, %%"REG_a" \n\t"
54 : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
59 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
/* Compute block[i] = s1[i] - s2[i] for an 8x8 region as 16-bit words.
 * One row of 8 pixels per pass: both sources are zero-extended via mm7 and
 * subtracted wordwise; REG_a runs -128..0 addressing relative to block+64.
 * NOTE(review): loop label/branch/clobbers are outside this visible chunk. */
62 "pxor %%mm7, %%mm7 \n\t"
63 "mov $-128, %%"REG_a" \n\t"
66 "movq (%0), %%mm0 \n\t"
67 "movq (%1), %%mm2 \n\t"
68 "movq %%mm0, %%mm1 \n\t"
69 "movq %%mm2, %%mm3 \n\t"
/* zero-extend both operands to 16-bit lanes */
70 "punpcklbw %%mm7, %%mm0 \n\t"
71 "punpckhbw %%mm7, %%mm1 \n\t"
72 "punpcklbw %%mm7, %%mm2 \n\t"
73 "punpckhbw %%mm7, %%mm3 \n\t"
/* wordwise difference s1 - s2 */
74 "psubw %%mm2, %%mm0 \n\t"
75 "psubw %%mm3, %%mm1 \n\t"
76 "movq %%mm0, (%2, %%"REG_a") \n\t"
77 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
80 "add $16, %%"REG_a" \n\t"
82 : "+r" (s1), "+r" (s2)
83 : "r" (block+64), "r" ((x86_reg)stride)
88 static int pix_sum16_mmx(uint8_t * pix, int line_size){
/* Sum all pixels of a 16-wide block. The negative index walks forward from
 * pix (passed as pix - index) so the loop ends when index reaches 0.
 * mm6 accumulates 16-bit partial sums; the tail folds them to one scalar.
 * NOTE(review): `h` and the surrounding declarations/loop control are not
 * visible here — presumably h == 16; confirm in the full file. */
91 x86_reg index= -line_size*h;
94 "pxor %%mm7, %%mm7 \n\t"
95 "pxor %%mm6, %%mm6 \n\t"
/* each 8-byte half is loaded twice so low/high bytes can be unpacked */
97 "movq (%2, %1), %%mm0 \n\t"
98 "movq (%2, %1), %%mm1 \n\t"
99 "movq 8(%2, %1), %%mm2 \n\t"
100 "movq 8(%2, %1), %%mm3 \n\t"
101 "punpcklbw %%mm7, %%mm0 \n\t"
102 "punpckhbw %%mm7, %%mm1 \n\t"
103 "punpcklbw %%mm7, %%mm2 \n\t"
104 "punpckhbw %%mm7, %%mm3 \n\t"
105 "paddw %%mm0, %%mm1 \n\t"
106 "paddw %%mm2, %%mm3 \n\t"
107 "paddw %%mm1, %%mm3 \n\t"
108 "paddw %%mm3, %%mm6 \n\t"
/* horizontal reduction: fold dwords then words of mm6 into its low word */
111 "movq %%mm6, %%mm5 \n\t"
112 "psrlq $32, %%mm6 \n\t"
113 "paddw %%mm5, %%mm6 \n\t"
114 "movq %%mm6, %%mm5 \n\t"
115 "psrlq $16, %%mm6 \n\t"
116 "paddw %%mm5, %%mm6 \n\t"
117 "movd %%mm6, %0 \n\t"
/* only the low 16 bits of the fold are the valid sum */
118 "andl $0xFFFF, %0 \n\t"
119 : "=&r" (sum), "+r" (index)
120 : "r" (pix - index), "r" ((x86_reg)line_size)
126 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
/* Sum of squared pixel values for a 16-wide block: unpack bytes to words,
 * square-and-pair with pmaddwd, accumulate dwords in mm7, then fold.
 * NOTE(review): mm0 zeroing, loop control and the final movd of the result
 * are in lines not visible in this chunk. */
133 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
134 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
136 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
138 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
139 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
141 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
142 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
143 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
/* pmaddwd of a register with itself: pairwise sums of squares */
145 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
146 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
148 "pmaddwd %%mm3,%%mm3\n"
149 "pmaddwd %%mm4,%%mm4\n"
151 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
152 pix2^2+pix3^2+pix6^2+pix7^2) */
153 "paddd %%mm3,%%mm4\n"
154 "paddd %%mm2,%%mm7\n"
157 "paddd %%mm4,%%mm7\n"
/* horizontal fold of the two dword accumulators */
162 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
163 "paddd %%mm7,%%mm1\n"
165 : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
169 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
/* Sum of squared errors between two 8-wide blocks, two rows per pass.
 * Absolute byte differences are formed by the dual saturated-subtract + OR
 * trick, widened to words, squared with pmaddwd, and accumulated in mm7.
 * NOTE(review): the mm5/mm6 copies feeding the psubusb pair, the loop
 * control and the final movd are on lines not visible in this chunk. */
174 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
175 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
177 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
178 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
179 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
180 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
182 /* todo: mm1-mm2, mm3-mm4 */
183 /* algo: subtract mm1 from mm2 with saturation and vice versa */
184 /* OR the results to get absolute difference */
187 "psubusb %%mm2,%%mm1\n"
188 "psubusb %%mm4,%%mm3\n"
189 "psubusb %%mm5,%%mm2\n"
190 "psubusb %%mm6,%%mm4\n"
195 /* now convert to 16-bit vectors so we can square them */
199 "punpckhbw %%mm0,%%mm2\n"
200 "punpckhbw %%mm0,%%mm4\n"
201 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
202 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
204 "pmaddwd %%mm2,%%mm2\n"
205 "pmaddwd %%mm4,%%mm4\n"
206 "pmaddwd %%mm1,%%mm1\n"
207 "pmaddwd %%mm3,%%mm3\n"
209 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
210 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
212 "paddd %%mm2,%%mm1\n"
213 "paddd %%mm4,%%mm3\n"
214 "paddd %%mm1,%%mm7\n"
215 "paddd %%mm3,%%mm7\n"
/* fold the two running dword sums in mm7 */
221 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
222 "paddd %%mm7,%%mm1\n"
224 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
225 : "r" ((x86_reg)line_size) , "m" (h)
230 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
/* Sum of squared errors for 16-wide blocks, one row (16 bytes) per pass.
 * Same abs-diff / pmaddwd structure as sse8_mmx above.
 * NOTE(review): mm5/mm6 setup, pointer advance, loop control and final movd
 * are on lines not visible in this chunk. */
234 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
235 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
237 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
238 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
239 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
240 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
242 /* todo: mm1-mm2, mm3-mm4 */
243 /* algo: subtract mm1 from mm2 with saturation and vice versa */
244 /* OR the results to get absolute difference */
247 "psubusb %%mm2,%%mm1\n"
248 "psubusb %%mm4,%%mm3\n"
249 "psubusb %%mm5,%%mm2\n"
250 "psubusb %%mm6,%%mm4\n"
255 /* now convert to 16-bit vectors so we can square them */
259 "punpckhbw %%mm0,%%mm2\n"
260 "punpckhbw %%mm0,%%mm4\n"
261 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
262 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
264 "pmaddwd %%mm2,%%mm2\n"
265 "pmaddwd %%mm4,%%mm4\n"
266 "pmaddwd %%mm1,%%mm1\n"
267 "pmaddwd %%mm3,%%mm3\n"
272 "paddd %%mm2,%%mm1\n"
273 "paddd %%mm4,%%mm3\n"
274 "paddd %%mm1,%%mm7\n"
275 "paddd %%mm3,%%mm7\n"
/* horizontal fold of mm7 */
281 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
282 "paddd %%mm7,%%mm1\n"
284 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
285 : "r" ((x86_reg)line_size) , "m" (h)
290 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
/* SSE2 version of sse16: 128-bit registers, two 16-byte rows per pass.
 * Unaligned loads (movdqu), abs-diff via dual psubusb + por, squares via
 * pmaddwd, dword accumulation in xmm7, then a qword+dword fold at the end.
 * NOTE(review): the loop label/branch and the final move of the folded sum
 * into `tmp` are on lines not visible in this chunk; h is a "+r" operand,
 * presumably used as the row counter. */
294 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
295 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
297 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
298 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
299 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
300 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
302 /* todo: mm1-mm2, mm3-mm4 */
303 /* algo: subtract mm1 from mm2 with saturation and vice versa */
304 /* OR the results to get absolute difference */
305 "movdqa %%xmm1,%%xmm5\n"
306 "movdqa %%xmm3,%%xmm6\n"
307 "psubusb %%xmm2,%%xmm1\n"
308 "psubusb %%xmm4,%%xmm3\n"
309 "psubusb %%xmm5,%%xmm2\n"
310 "psubusb %%xmm6,%%xmm4\n"
312 "por %%xmm1,%%xmm2\n"
313 "por %%xmm3,%%xmm4\n"
315 /* now convert to 16-bit vectors so we can square them */
316 "movdqa %%xmm2,%%xmm1\n"
317 "movdqa %%xmm4,%%xmm3\n"
319 "punpckhbw %%xmm0,%%xmm2\n"
320 "punpckhbw %%xmm0,%%xmm4\n"
321 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
322 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
324 "pmaddwd %%xmm2,%%xmm2\n"
325 "pmaddwd %%xmm4,%%xmm4\n"
326 "pmaddwd %%xmm1,%%xmm1\n"
327 "pmaddwd %%xmm3,%%xmm3\n"
329 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
330 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
332 "paddd %%xmm2,%%xmm1\n"
333 "paddd %%xmm4,%%xmm3\n"
334 "paddd %%xmm1,%%xmm7\n"
335 "paddd %%xmm3,%%xmm7\n"
/* horizontal fold: high qword, then high dword, into xmm7's low dword */
340 "movdqa %%xmm7,%%xmm1\n"
341 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
342 "paddd %%xmm1,%%xmm7\n"
343 "movdqa %%xmm7,%%xmm1\n"
344 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
345 "paddd %%xmm1,%%xmm7\n"
347 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
348 : "r" ((x86_reg)line_size));
352 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
/* "High-frequency noise" metric over an 8-wide column: accumulates the
 * absolute second vertical differences |(a-b) - (b-c)| of adjacent rows.
 * Each stanza computes a row-pair difference in 16-bit lanes, subtracts the
 * previous pair's difference, takes the absolute value with the classic
 * pcmpgtw/pxor/psubw sign trick, and adds into the mm6 word accumulator.
 * NOTE(review): the initial loads, pointer advances, the loop label/branch
 * and the final store into `tmp` are on lines not visible in this chunk;
 * the constraint list shows the loop runs h-2 times. */
360 "movq %%mm0, %%mm1\n"
364 "movq %%mm0, %%mm2\n"
365 "movq %%mm1, %%mm3\n"
366 "punpcklbw %%mm7,%%mm0\n"
367 "punpcklbw %%mm7,%%mm1\n"
368 "punpckhbw %%mm7,%%mm2\n"
369 "punpckhbw %%mm7,%%mm3\n"
370 "psubw %%mm1, %%mm0\n"
371 "psubw %%mm3, %%mm2\n"
376 "movq %%mm4, %%mm1\n"
380 "movq %%mm4, %%mm5\n"
381 "movq %%mm1, %%mm3\n"
382 "punpcklbw %%mm7,%%mm4\n"
383 "punpcklbw %%mm7,%%mm1\n"
384 "punpckhbw %%mm7,%%mm5\n"
385 "punpckhbw %%mm7,%%mm3\n"
386 "psubw %%mm1, %%mm4\n"
387 "psubw %%mm3, %%mm5\n"
/* second difference: (row0-row1) - (row1-row2) */
388 "psubw %%mm4, %%mm0\n"
389 "psubw %%mm5, %%mm2\n"
/* abs(): x = (x ^ sign) - sign, where sign = (x < 0) ? -1 : 0 */
390 "pxor %%mm3, %%mm3\n"
391 "pxor %%mm1, %%mm1\n"
392 "pcmpgtw %%mm0, %%mm3\n\t"
393 "pcmpgtw %%mm2, %%mm1\n\t"
394 "pxor %%mm3, %%mm0\n"
395 "pxor %%mm1, %%mm2\n"
396 "psubw %%mm3, %%mm0\n"
397 "psubw %%mm1, %%mm2\n"
398 "paddw %%mm0, %%mm2\n"
399 "paddw %%mm2, %%mm6\n"
/* next row pair, same pattern with roles of (mm0,mm2)/(mm4,mm5) swapped */
405 "movq %%mm0, %%mm1\n"
409 "movq %%mm0, %%mm2\n"
410 "movq %%mm1, %%mm3\n"
411 "punpcklbw %%mm7,%%mm0\n"
412 "punpcklbw %%mm7,%%mm1\n"
413 "punpckhbw %%mm7,%%mm2\n"
414 "punpckhbw %%mm7,%%mm3\n"
415 "psubw %%mm1, %%mm0\n"
416 "psubw %%mm3, %%mm2\n"
417 "psubw %%mm0, %%mm4\n"
418 "psubw %%mm2, %%mm5\n"
419 "pxor %%mm3, %%mm3\n"
420 "pxor %%mm1, %%mm1\n"
421 "pcmpgtw %%mm4, %%mm3\n\t"
422 "pcmpgtw %%mm5, %%mm1\n\t"
423 "pxor %%mm3, %%mm4\n"
424 "pxor %%mm1, %%mm5\n"
425 "psubw %%mm3, %%mm4\n"
426 "psubw %%mm1, %%mm5\n"
427 "paddw %%mm4, %%mm5\n"
428 "paddw %%mm5, %%mm6\n"
/* third stanza of the unrolled loop body */
433 "movq %%mm4, %%mm1\n"
437 "movq %%mm4, %%mm5\n"
438 "movq %%mm1, %%mm3\n"
439 "punpcklbw %%mm7,%%mm4\n"
440 "punpcklbw %%mm7,%%mm1\n"
441 "punpckhbw %%mm7,%%mm5\n"
442 "punpckhbw %%mm7,%%mm3\n"
443 "psubw %%mm1, %%mm4\n"
444 "psubw %%mm3, %%mm5\n"
445 "psubw %%mm4, %%mm0\n"
446 "psubw %%mm5, %%mm2\n"
447 "pxor %%mm3, %%mm3\n"
448 "pxor %%mm1, %%mm1\n"
449 "pcmpgtw %%mm0, %%mm3\n\t"
450 "pcmpgtw %%mm2, %%mm1\n\t"
451 "pxor %%mm3, %%mm0\n"
452 "pxor %%mm1, %%mm2\n"
453 "psubw %%mm3, %%mm0\n"
454 "psubw %%mm1, %%mm2\n"
455 "paddw %%mm0, %%mm2\n"
456 "paddw %%mm2, %%mm6\n"
/* widen the mm6 word sums to dwords and fold */
462 "movq %%mm6, %%mm0\n"
463 "punpcklwd %%mm7,%%mm0\n"
464 "punpckhwd %%mm7,%%mm6\n"
465 "paddd %%mm0, %%mm6\n"
469 "paddd %%mm6,%%mm0\n"
471 : "+r" (pix1), "=r"(tmp)
472 : "r" ((x86_reg)line_size) , "g" (h-2)
477 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
/* 16-wide variant of hf_noise8_mmx: the asm handles the left 8 columns and
 * the return statement adds the right 8 columns via hf_noise8_mmx.
 * Same second-vertical-difference / abs / accumulate structure as above.
 * NOTE(review): initial loads, pointer advances, loop label/branch and the
 * store into `tmp` are on lines not visible in this chunk. */
487 "movq %%mm0, %%mm2\n"
488 "movq %%mm1, %%mm3\n"
489 "punpcklbw %%mm7,%%mm0\n"
490 "punpcklbw %%mm7,%%mm1\n"
491 "punpckhbw %%mm7,%%mm2\n"
492 "punpckhbw %%mm7,%%mm3\n"
493 "psubw %%mm1, %%mm0\n"
494 "psubw %%mm3, %%mm2\n"
500 "movq %%mm4, %%mm5\n"
501 "movq %%mm1, %%mm3\n"
502 "punpcklbw %%mm7,%%mm4\n"
503 "punpcklbw %%mm7,%%mm1\n"
504 "punpckhbw %%mm7,%%mm5\n"
505 "punpckhbw %%mm7,%%mm3\n"
506 "psubw %%mm1, %%mm4\n"
507 "psubw %%mm3, %%mm5\n"
508 "psubw %%mm4, %%mm0\n"
509 "psubw %%mm5, %%mm2\n"
/* abs() via sign mask, as in hf_noise8_mmx */
510 "pxor %%mm3, %%mm3\n"
511 "pxor %%mm1, %%mm1\n"
512 "pcmpgtw %%mm0, %%mm3\n\t"
513 "pcmpgtw %%mm2, %%mm1\n\t"
514 "pxor %%mm3, %%mm0\n"
515 "pxor %%mm1, %%mm2\n"
516 "psubw %%mm3, %%mm0\n"
517 "psubw %%mm1, %%mm2\n"
518 "paddw %%mm0, %%mm2\n"
519 "paddw %%mm2, %%mm6\n"
526 "movq %%mm0, %%mm2\n"
527 "movq %%mm1, %%mm3\n"
528 "punpcklbw %%mm7,%%mm0\n"
529 "punpcklbw %%mm7,%%mm1\n"
530 "punpckhbw %%mm7,%%mm2\n"
531 "punpckhbw %%mm7,%%mm3\n"
532 "psubw %%mm1, %%mm0\n"
533 "psubw %%mm3, %%mm2\n"
534 "psubw %%mm0, %%mm4\n"
535 "psubw %%mm2, %%mm5\n"
536 "pxor %%mm3, %%mm3\n"
537 "pxor %%mm1, %%mm1\n"
538 "pcmpgtw %%mm4, %%mm3\n\t"
539 "pcmpgtw %%mm5, %%mm1\n\t"
540 "pxor %%mm3, %%mm4\n"
541 "pxor %%mm1, %%mm5\n"
542 "psubw %%mm3, %%mm4\n"
543 "psubw %%mm1, %%mm5\n"
544 "paddw %%mm4, %%mm5\n"
545 "paddw %%mm5, %%mm6\n"
551 "movq %%mm4, %%mm5\n"
552 "movq %%mm1, %%mm3\n"
553 "punpcklbw %%mm7,%%mm4\n"
554 "punpcklbw %%mm7,%%mm1\n"
555 "punpckhbw %%mm7,%%mm5\n"
556 "punpckhbw %%mm7,%%mm3\n"
557 "psubw %%mm1, %%mm4\n"
558 "psubw %%mm3, %%mm5\n"
559 "psubw %%mm4, %%mm0\n"
560 "psubw %%mm5, %%mm2\n"
561 "pxor %%mm3, %%mm3\n"
562 "pxor %%mm1, %%mm1\n"
563 "pcmpgtw %%mm0, %%mm3\n\t"
564 "pcmpgtw %%mm2, %%mm1\n\t"
565 "pxor %%mm3, %%mm0\n"
566 "pxor %%mm1, %%mm2\n"
567 "psubw %%mm3, %%mm0\n"
568 "psubw %%mm1, %%mm2\n"
569 "paddw %%mm0, %%mm2\n"
570 "paddw %%mm2, %%mm6\n"
/* widen mm6 word sums to dwords and fold */
576 "movq %%mm6, %%mm0\n"
577 "punpcklwd %%mm7,%%mm0\n"
578 "punpckhwd %%mm7,%%mm6\n"
579 "paddd %%mm0, %%mm6\n"
583 "paddd %%mm6,%%mm0\n"
585 : "+r" (pix1), "=r"(tmp)
586 : "r" ((x86_reg)line_size) , "g" (h-2)
/* NOTE(review): `pix` is not declared in this function — the parameter is
 * named `pix1`. This looks like it should be hf_noise8_mmx(pix1+8, ...);
 * confirm against the upstream file (it will not compile as written unless
 * a `pix` exists on one of the lines missing from this chunk). */
588 return tmp + hf_noise8_mmx(pix+8, line_size, h);
591 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
592 MpegEncContext *c = p;
/* Noise-preserving SSE (16-wide): plain SSE plus a weighted penalty for the
 * difference in high-frequency noise between the two blocks. When called
 * without a context (c == NULL) a fixed weight of 8 is used. */
595 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
596 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
597 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
599 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
600 else return score1 + FFABS(score2)*8;
603 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
604 MpegEncContext *c = p;
/* Noise-preserving SSE (8-wide): SSE plus weighted |noise(pix1)-noise(pix2)|.
 * Unlike nsse16_mmx this always uses the local sse8_mmx for score1. */
605 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
606 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
608 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
609 else return score1 + FFABS(score2)*8;
612 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
/* Vertical SAD within one picture: sum of |pix[y][x] - pix[y+1][x]| over a
 * 16-wide column. The SUM macro compares the current row (loaded from (%0))
 * against the previous row kept in in0/in1, using the saturated-subtract +
 * OR abs-diff trick, then widens and accumulates into mm6.
 * NOTE(review): initial row load, pointer advances, loop control and the
 * final fold/store into `tmp` are on lines not visible in this chunk. */
615 assert( (((int)pix) & 7) == 0);
616 assert((line_size &7) ==0);
618 #define SUM(in0, in1, out0, out1) \
619 "movq (%0), %%mm2\n"\
620 "movq 8(%0), %%mm3\n"\
622 "movq %%mm2, " #out0 "\n"\
623 "movq %%mm3, " #out1 "\n"\
624 "psubusb " #in0 ", %%mm2\n"\
625 "psubusb " #in1 ", %%mm3\n"\
626 "psubusb " #out0 ", " #in0 "\n"\
627 "psubusb " #out1 ", " #in1 "\n"\
628 "por %%mm2, " #in0 "\n"\
629 "por %%mm3, " #in1 "\n"\
630 "movq " #in0 ", %%mm2\n"\
631 "movq " #in1 ", %%mm3\n"\
632 "punpcklbw %%mm7, " #in0 "\n"\
633 "punpcklbw %%mm7, " #in1 "\n"\
634 "punpckhbw %%mm7, %%mm2\n"\
635 "punpckhbw %%mm7, %%mm3\n"\
636 "paddw " #in1 ", " #in0 "\n"\
637 "paddw %%mm3, %%mm2\n"\
638 "paddw %%mm2, " #in0 "\n"\
639 "paddw " #in0 ", %%mm6\n"
650 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
653 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
655 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
662 "paddw %%mm6,%%mm0\n"
665 "paddw %%mm6,%%mm0\n"
667 : "+r" (pix), "=r"(tmp)
668 : "r" ((x86_reg)line_size) , "m" (h)
674 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
/* MMX2 version of vsad_intra16: psadbw computes the 8-byte absolute-
 * difference sum directly, replacing the manual abs/widen/add sequence.
 * NOTE(review): loop control and the final store into `tmp` are on lines
 * not visible in this chunk. */
677 assert( (((int)pix) & 7) == 0);
678 assert((line_size &7) ==0);
680 #define SUM(in0, in1, out0, out1) \
681 "movq (%0), " #out0 "\n"\
682 "movq 8(%0), " #out1 "\n"\
684 "psadbw " #out0 ", " #in0 "\n"\
685 "psadbw " #out1 ", " #in1 "\n"\
686 "paddw " #in1 ", " #in0 "\n"\
687 "paddw " #in0 ", %%mm6\n"
697 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
700 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
702 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
708 : "+r" (pix), "=r"(tmp)
709 : "r" ((x86_reg)line_size) , "m" (h)
715 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
/* Vertical SAD of the inter difference: sums |d(y) - d(y+1)| where
 * d(y) = pix1[y] - pix2[y]. Byte differences are biased by 0x80 (the
 * pcmpeqw/packsswb setup builds mm7 = 0x80 repeated, applied via pxor) so
 * they fit unsigned bytes; consecutive rows then go through the abs-diff
 * trick and are widened and accumulated into mm6.
 * NOTE(review): loads feeding mm2/mm3, mm7 re-zeroing before the SUM calls,
 * loop control and the final fold into `tmp` are on lines not visible. */
718 assert( (((int)pix1) & 7) == 0);
719 assert( (((int)pix2) & 7) == 0);
720 assert((line_size &7) ==0);
722 #define SUM(in0, in1, out0, out1) \
724 "movq (%1)," #out0 "\n"\
725 "movq 8(%0),%%mm3\n"\
726 "movq 8(%1)," #out1 "\n"\
729 "psubb " #out0 ", %%mm2\n"\
730 "psubb " #out1 ", %%mm3\n"\
731 "pxor %%mm7, %%mm2\n"\
732 "pxor %%mm7, %%mm3\n"\
733 "movq %%mm2, " #out0 "\n"\
734 "movq %%mm3, " #out1 "\n"\
735 "psubusb " #in0 ", %%mm2\n"\
736 "psubusb " #in1 ", %%mm3\n"\
737 "psubusb " #out0 ", " #in0 "\n"\
738 "psubusb " #out1 ", " #in1 "\n"\
739 "por %%mm2, " #in0 "\n"\
740 "por %%mm3, " #in1 "\n"\
741 "movq " #in0 ", %%mm2\n"\
742 "movq " #in1 ", %%mm3\n"\
743 "punpcklbw %%mm7, " #in0 "\n"\
744 "punpcklbw %%mm7, " #in1 "\n"\
745 "punpckhbw %%mm7, %%mm2\n"\
746 "punpckhbw %%mm7, %%mm3\n"\
747 "paddw " #in1 ", " #in0 "\n"\
748 "paddw %%mm3, %%mm2\n"\
749 "paddw %%mm2, " #in0 "\n"\
750 "paddw " #in0 ", %%mm6\n"
756 "pcmpeqw %%mm7,%%mm7\n"
758 "packsswb %%mm7, %%mm7\n"
766 "psubb %%mm2, %%mm0\n"
767 "psubb %%mm3, %%mm1\n"
768 "pxor %%mm7, %%mm0\n"
769 "pxor %%mm7, %%mm1\n"
770 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
773 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
775 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
782 "paddw %%mm6,%%mm0\n"
785 "paddw %%mm6,%%mm0\n"
787 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
788 : "r" ((x86_reg)line_size) , "m" (h)
794 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
/* MMX2 version of vsad16: same 0x80-biased difference setup, but psadbw
 * replaces the manual abs-diff/widen/accumulate sequence of vsad16_mmx.
 * NOTE(review): loads feeding mm2/mm3, loop control and the final store
 * into `tmp` are on lines not visible in this chunk. */
797 assert( (((int)pix1) & 7) == 0);
798 assert( (((int)pix2) & 7) == 0);
799 assert((line_size &7) ==0);
801 #define SUM(in0, in1, out0, out1) \
802 "movq (%0)," #out0 "\n"\
804 "movq 8(%0)," #out1 "\n"\
805 "movq 8(%1),%%mm3\n"\
808 "psubb %%mm2, " #out0 "\n"\
809 "psubb %%mm3, " #out1 "\n"\
810 "pxor %%mm7, " #out0 "\n"\
811 "pxor %%mm7, " #out1 "\n"\
812 "psadbw " #out0 ", " #in0 "\n"\
813 "psadbw " #out1 ", " #in1 "\n"\
814 "paddw " #in1 ", " #in0 "\n"\
815 "paddw " #in0 ", %%mm6\n"
820 "pcmpeqw %%mm7,%%mm7\n"
822 "packsswb %%mm7, %%mm7\n"
830 "psubb %%mm2, %%mm0\n"
831 "psubb %%mm3, %%mm1\n"
832 "pxor %%mm7, %%mm0\n"
833 "pxor %%mm7, %%mm1\n"
834 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
837 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
839 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
845 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
846 : "r" ((x86_reg)line_size) , "m" (h)
852 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
/* dst[i] = src1[i] - src2[i] (bytes, wrap-around). The asm handles 16 bytes
 * per pass up to w-15; the scalar tail at the bottom finishes the rest.
 * NOTE(review): loop counter setup, label/branch and the tail loop header
 * are on lines not visible in this chunk. */
856 "movq (%2, %0), %%mm0 \n\t"
857 "movq (%1, %0), %%mm1 \n\t"
858 "psubb %%mm0, %%mm1 \n\t"
859 "movq %%mm1, (%3, %0) \n\t"
860 "movq 8(%2, %0), %%mm0 \n\t"
861 "movq 8(%1, %0), %%mm1 \n\t"
862 "psubb %%mm0, %%mm1 \n\t"
863 "movq %%mm1, 8(%3, %0) \n\t"
868 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
/* scalar tail for the final (w % 16) bytes */
871 dst[i+0] = src1[i+0]-src2[i+0];
874 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
/* HuffYUV median-prediction residual: dst = src2 - median(L, T, L+T-LT),
 * where T = top (src1), L = left (src2-1), LT = top-left (src1-1).
 * median(a,b,c) is computed as max(min(L,T), min(max(L,T), L+T-LT)) with
 * pmaxub/pminub. The first output byte is fixed up in scalar code since it
 * has no valid left neighbour; *left / *left_top carry state across calls.
 * NOTE(review): loop control and the scalar prologue/epilogue around the
 * visible lines are not fully present in this chunk. */
880 "movq -1(%1, %0), %%mm0 \n\t" // LT
881 "movq (%1, %0), %%mm1 \n\t" // T
882 "movq -1(%2, %0), %%mm2 \n\t" // L
883 "movq (%2, %0), %%mm3 \n\t" // X
884 "movq %%mm2, %%mm4 \n\t" // L
885 "psubb %%mm0, %%mm2 \n\t"
886 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
887 "movq %%mm4, %%mm5 \n\t" // L
888 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
889 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
890 "pminub %%mm2, %%mm4 \n\t"
891 "pmaxub %%mm1, %%mm4 \n\t"
892 "psubb %%mm4, %%mm3 \n\t" // dst - pred
893 "movq %%mm3, (%3, %0) \n\t"
898 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
/* scalar fix-up of element 0 and state carry-over for the next stripe */
904 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
906 *left_top= src1[w-1];
/* DIFF_PIXELS_1: load 8 (or 4) pixels from p1 and p2, interleave-expand and
 * subtract, leaving the 16-bit differences in register `a` (`t` is a temp). */
910 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
911 "mov"#m" "#p1", "#a" \n\t"\
912 "mov"#m" "#p2", "#t" \n\t"\
913 "punpcklbw "#a", "#t" \n\t"\
914 "punpcklbw "#a", "#a" \n\t"\
915 "psubw "#t", "#a" \n\t"\
/* DIFF_PIXELS_8: emit an asm block computing 8 rows of pixel differences
 * into mm/xmm registers 0-7; `temp` spills one register since only 8 are
 * available. NOTE(review): some interior lines (asm opener, pointer
 * advances) are missing from this chunk. */
917 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
918 uint8_t *p1b=p1, *p2b=p2;\
920 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
921 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
922 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
925 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
926 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
927 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
928 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
929 "mov"#m1" "#mm"0, %0 \n\t"\
930 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
931 "mov"#m1" %0, "#mm"0 \n\t"\
932 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
933 : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
936 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
/* 4x8 uses movd/movq on MMX regs; 8x8 uses movq/movdqa on XMM regs */
938 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
939 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
/* LBUTTERFLY2: two butterflies at once — (a,b) -> (a+b, a-b) computed as
 * a+=b; b+=b; b-=a, avoiding an extra move. */
941 #define LBUTTERFLY2(a1,b1,a2,b2)\
942 "paddw " #b1 ", " #a1 " \n\t"\
943 "paddw " #b2 ", " #a2 " \n\t"\
944 "paddw " #b1 ", " #b1 " \n\t"\
945 "paddw " #b2 ", " #b2 " \n\t"\
946 "psubw " #a1 ", " #b1 " \n\t"\
947 "psubw " #a2 ", " #b2 " \n\t"
/* HADAMARD8: 8-point Hadamard transform over 8 registers — three butterfly
 * stages with strides 1, 2 and 4. */
949 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
950 LBUTTERFLY2(m0, m1, m2, m3)\
951 LBUTTERFLY2(m4, m5, m6, m7)\
952 LBUTTERFLY2(m0, m2, m1, m3)\
953 LBUTTERFLY2(m4, m6, m5, m7)\
954 LBUTTERFLY2(m0, m4, m1, m5)\
955 LBUTTERFLY2(m2, m6, m3, m7)\
/* MMX-register instantiation used by the hadamard8_diff functions */
957 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
/* MMABS_MMX: a = |a| via the sign-mask trick; z is clobbered as scratch. */
959 #define MMABS_MMX(a,z)\
960 "pxor " #z ", " #z " \n\t"\
961 "pcmpgtw " #a ", " #z " \n\t"\
962 "pxor " #z ", " #a " \n\t"\
963 "psubw " #z ", " #a " \n\t"
/* MMABS_MMX2: cheaper abs using pmaxsw(a, -a). */
965 #define MMABS_MMX2(a,z)\
966 "pxor " #z ", " #z " \n\t"\
967 "psubw " #a ", " #z " \n\t"\
968 "pmaxsw " #z ", " #a " \n\t"
/* MMABS_SSSE3: single-instruction abs; z unused. */
970 #define MMABS_SSSE3(a,z)\
971 "pabsw " #a ", " #a " \n\t"
/* MMABS_SUM: abs(a) (expansion line not visible here) then saturating add
 * into `sum`. */
973 #define MMABS_SUM(a,z, sum)\
975 "paddusw " #a ", " #sum " \n\t"
/* x86-64 variant: xmm8/xmm9 available, so no spill to memory is needed. */
977 #define MMABS_SUM_8x8_NOSPILL\
978 MMABS(%%xmm0, %%xmm8)\
979 MMABS(%%xmm1, %%xmm9)\
980 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
981 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
982 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
983 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
984 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
985 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
986 "paddusw %%xmm1, %%xmm0 \n\t"
/* On x86-64 the SSE2 path can reuse the no-spill variant... */
989 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
/* ...otherwise xmm7 is spilled to (%1) and restored as xmm2. */
991 #define MMABS_SUM_8x8_SSE2\
992 "movdqa %%xmm7, (%1) \n\t"\
993 MMABS(%%xmm0, %%xmm7)\
994 MMABS(%%xmm1, %%xmm7)\
995 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
996 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
997 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
998 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
999 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
1000 "movdqa (%1), %%xmm2 \n\t"\
1001 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
1002 "paddusw %%xmm1, %%xmm0 \n\t"
/* LOAD4/STORE4: move four consecutive quadwords at offset o of (%1). */
1005 #define LOAD4(o, a, b, c, d)\
1006 "movq "#o"(%1), "#a" \n\t"\
1007 "movq "#o"+8(%1), "#b" \n\t"\
1008 "movq "#o"+16(%1), "#c" \n\t"\
1009 "movq "#o"+24(%1), "#d" \n\t"\
1011 #define STORE4(o, a, b, c, d)\
1012 "movq "#a", "#o"(%1) \n\t"\
1013 "movq "#b", "#o"+8(%1) \n\t"\
1014 "movq "#c", "#o"+16(%1) \n\t"\
1015 "movq "#d", "#o"+24(%1) \n\t"\
1017 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1018 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1019 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
/* HSUM_MMX: horizontal saturating word-sum of `a` into scalar dst (t temp). */
1020 #define HSUM_MMX(a, t, dst)\
1021 "movq "#a", "#t" \n\t"\
1022 "psrlq $32, "#a" \n\t"\
1023 "paddusw "#t", "#a" \n\t"\
1024 "movq "#a", "#t" \n\t"\
1025 "psrlq $16, "#a" \n\t"\
1026 "paddusw "#t", "#a" \n\t"\
1027 "movd "#a", "#dst" \n\t"\
/* HSUM_MMX2: same fold using pshufw instead of shifts. */
1029 #define HSUM_MMX2(a, t, dst)\
1030 "pshufw $0x0E, "#a", "#t" \n\t"\
1031 "paddusw "#t", "#a" \n\t"\
1032 "pshufw $0x01, "#a", "#t" \n\t"\
1033 "paddusw "#t", "#a" \n\t"\
1034 "movd "#a", "#dst" \n\t"\
/* HSUM_SSE2: 128-bit fold — high qword, then two pshuflw steps. */
1036 #define HSUM_SSE2(a, t, dst)\
1037 "movhlps "#a", "#t" \n\t"\
1038 "paddusw "#t", "#a" \n\t"\
1039 "pshuflw $0x0E, "#a", "#t" \n\t"\
1040 "paddusw "#t", "#a" \n\t"\
1041 "pshuflw $0x01, "#a", "#t" \n\t"\
1042 "paddusw "#t", "#a" \n\t"\
1043 "movd "#a", "#dst" \n\t"\
/* HADAMARD8_DIFF_MMX(cpu): generates hadamard8_diff_<cpu> — SATD of an 8x8
 * difference block. MMX only has 8 registers of 4 words each, so the 8x8
 * block is processed as two 4x8 halves; each is Hadamard-transformed,
 * transposed via TRANSPOSE4, spilled to temp[], transformed again, and the
 * absolute values are summed with the MMABS/HSUM macros bound at expansion
 * time. WRAPPER8_16_SQ builds the 16x16 variant from the 8x8 one.
 * NOTE(review): several interior lines (asm opener, HADAMARD48 invocations,
 * constraint list, return) are missing from this chunk. */
1045 #define HADAMARD8_DIFF_MMX(cpu) \
1046 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1047 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1052 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1057 "movq %%mm7, 96(%1) \n\t"\
1059 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1060 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1062 "movq 96(%1), %%mm7 \n\t"\
1063 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1064 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1070 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1075 "movq %%mm7, 96(%1) \n\t"\
1077 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1078 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1080 "movq 96(%1), %%mm7 \n\t"\
1081 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1082 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1083 "movq %%mm6, %%mm7 \n\t"\
1084 "movq %%mm0, %%mm6 \n\t"\
1086 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1089 "movq %%mm7, 64(%1) \n\t"\
1090 MMABS(%%mm0, %%mm7)\
1091 MMABS(%%mm1, %%mm7)\
1092 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1093 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1094 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1095 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1096 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1097 "movq 64(%1), %%mm2 \n\t"\
1098 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1099 "paddusw %%mm1, %%mm0 \n\t"\
1100 "movq %%mm0, 64(%1) \n\t"\
1102 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1103 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1106 "movq %%mm7, (%1) \n\t"\
1107 MMABS(%%mm0, %%mm7)\
1108 MMABS(%%mm1, %%mm7)\
1109 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1110 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1111 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1112 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1113 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1114 "movq (%1), %%mm2 \n\t"\
1115 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1116 "paddusw 64(%1), %%mm0 \n\t"\
1117 "paddusw %%mm1, %%mm0 \n\t"\
1119 HSUM(%%mm0, %%mm1, %0)\
1126 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* HADAMARD8_DIFF_SSE2(cpu): SSE2 SATD — the whole 8x8 difference block fits
 * in xmm0-7, so it is transformed, transposed (note the register permutation
 * TRANSPOSE8 leaves behind), transformed again, abs-summed (MMABS_SUM_8x8,
 * expansion line not visible here) and folded with HSUM_SSE2.
 * NOTE(review): asm opener, constraint list and return are missing from
 * this chunk. */
1128 #define HADAMARD8_DIFF_SSE2(cpu) \
1129 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1130 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1135 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1138 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1139 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1140 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1142 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1148 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* Instantiate hadamard8_diff(16)_{mmx,mmx2,sse2,ssse3} by binding MMABS /
 * HSUM / MMABS_SUM_8x8 to the per-CPU variants before each expansion.
 * NOTE(review): the matching #undef lines between blocks are partly missing
 * from this chunk. */
1150 #define MMABS(a,z) MMABS_MMX(a,z)
1151 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1152 HADAMARD8_DIFF_MMX(mmx)
1156 #define MMABS(a,z) MMABS_MMX2(a,z)
1157 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1158 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1159 HADAMARD8_DIFF_MMX(mmx2)
1160 HADAMARD8_DIFF_SSE2(sse2)
1162 #undef MMABS_SUM_8x8
/* SSSE3 build: pabsw-based abs, no spill needed */
1166 #define MMABS(a,z) MMABS_SSSE3(a,z)
1167 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1168 HADAMARD8_DIFF_SSE2(ssse3)
1170 #undef MMABS_SUM_8x8
/* DCT_SAD4: load four rows of coefficients at offset o and accumulate their
 * absolute values into the mm0/mm1 accumulator pair. */
1173 #define DCT_SAD4(m,mm,o)\
1174 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1175 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1176 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1177 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1178 MMABS_SUM(mm##2, mm##6, mm##0)\
1179 MMABS_SUM(mm##3, mm##7, mm##1)\
1180 MMABS_SUM(mm##4, mm##6, mm##0)\
1181 MMABS_SUM(mm##5, mm##7, mm##1)\
/* DCT_SAD_MMX: sum |block[i]| over all 64 coefficients, 4 quadwords at a
 * time (offsets cover the 8x8 int16 block in 4 strips). */
1183 #define DCT_SAD_MMX\
1184 "pxor %%mm0, %%mm0 \n\t"\
1185 "pxor %%mm1, %%mm1 \n\t"\
1186 DCT_SAD4(q, %%mm, 0)\
1187 DCT_SAD4(q, %%mm, 8)\
1188 DCT_SAD4(q, %%mm, 64)\
1189 DCT_SAD4(q, %%mm, 72)\
1190 "paddusw %%mm1, %%mm0 \n\t"\
1191 HSUM(%%mm0, %%mm1, %0)
/* DCT_SAD_SSE2: same sum with 16-byte loads, two strips. */
1193 #define DCT_SAD_SSE2\
1194 "pxor %%xmm0, %%xmm0 \n\t"\
1195 "pxor %%xmm1, %%xmm1 \n\t"\
1196 DCT_SAD4(dqa, %%xmm, 0)\
1197 DCT_SAD4(dqa, %%xmm, 64)\
1198 "paddusw %%xmm1, %%xmm0 \n\t"\
1199 HSUM(%%xmm0, %%xmm1, %0)
/* DCT_SAD_FUNC(cpu): generates sum_abs_dctelem_<cpu>, the sum of absolute
 * DCT coefficients of one block, wrapping the DCT_SAD macro bound below.
 * NOTE(review): the function body lines and the #undef/instantiation lines
 * between the bindings are partly missing from this chunk. */
1201 #define DCT_SAD_FUNC(cpu) \
1202 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1212 #define DCT_SAD DCT_SAD_MMX
1213 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1214 #define MMABS(a,z) MMABS_MMX(a,z)
1219 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1220 #define MMABS(a,z) MMABS_MMX2(a,z)
1225 #define DCT_SAD DCT_SAD_SSE2
1226 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1231 #define MMABS(a,z) MMABS_SSSE3(a,z)
1238 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
/* Sum of squared differences between an int8 array and an int16 array.
 * The int8 values are sign-extended to 16 bits by duplicating each byte
 * into both halves of a word (punpck + arithmetic shift right 8), then
 * pmaddwd squares the differences; mm4 accumulates dwords.
 * NOTE(review): mm3 setup before the punpckhbw, loop control and the final
 * movd into the result are on lines not visible in this chunk. */
1242 "pxor %%mm4, %%mm4 \n"
1245 "movq (%2,%0), %%mm2 \n"
1246 "movq (%3,%0,2), %%mm0 \n"
1247 "movq 8(%3,%0,2), %%mm1 \n"
/* sign-extend int8 lanes: duplicate byte into word, then psraw $8 */
1248 "punpckhbw %%mm2, %%mm3 \n"
1249 "punpcklbw %%mm2, %%mm2 \n"
1250 "psraw $8, %%mm3 \n"
1251 "psraw $8, %%mm2 \n"
1252 "psubw %%mm3, %%mm1 \n"
1253 "psubw %%mm2, %%mm0 \n"
1254 "pmaddwd %%mm1, %%mm1 \n"
1255 "pmaddwd %%mm0, %%mm0 \n"
1256 "paddd %%mm1, %%mm4 \n"
1257 "paddd %%mm0, %%mm4 \n"
/* fold the two dword partial sums */
1259 "movq %%mm4, %%mm3 \n"
1260 "psrlq $32, %%mm3 \n"
1261 "paddd %%mm3, %%mm4 \n"
1264 :"r"(pix1), "r"(pix2)
/* PHADDD: horizontal dword add — fold the high dword of `a` into its low
 * dword using `t` as scratch. */
1269 #define PHADDD(a, t)\
1270 "movq "#a", "#t" \n\t"\
1271 "psrlq $32, "#a" \n\t"\
1272 "paddd "#t", "#a" \n\t"
/* Reference semantics of the three candidate high-multiply instructions: */
1274 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
1275 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
1276 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
/* MMX build of the QNS helpers: emulate rounding pmulhrw with pmulhw +
 * add-one + shift (SCALE_OFFSET compensates for the extra shift). */
1278 #define PMULHRW(x, y, s, o)\
1279 "pmulhw " #s ", "#x " \n\t"\
1280 "pmulhw " #s ", "#y " \n\t"\
1281 "paddw " #o ", "#x " \n\t"\
1282 "paddw " #o ", "#y " \n\t"\
1283 "psraw $1, "#x " \n\t"\
1284 "psraw $1, "#y " \n\t"
1285 #define DEF(x) x ## _mmx
1286 #define SET_RND MOVQ_WONE
1287 #define SCALE_OFFSET 1
1289 #include "dsputil_mmx_qns.h"
/* 3DNow build: native rounding multiply, no offset needed. */
1296 #define DEF(x) x ## _3dnow
1298 #define SCALE_OFFSET 0
1299 #define PMULHRW(x, y, s, o)\
1300 "pmulhrw " #s ", "#x " \n\t"\
1301 "pmulhrw " #s ", "#y " \n\t"
1303 #include "dsputil_mmx_qns.h"
/* SSSE3 build: pmulhrsw (different rounding position, hence offset -1) and
 * a pshufw-based PHADDD. */
1312 #define DEF(x) x ## _ssse3
1314 #define SCALE_OFFSET -1
1315 #define PHADDD(a, t)\
1316 "pshufw $0x0E, "#a", "#t" \n\t"\
1317 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
1318 #define PMULHRW(x, y, s, o)\
1319 "pmulhrsw " #s ", "#x " \n\t"\
1320 "pmulhrsw " #s ", "#y " \n\t"
1322 #include "dsputil_mmx_qns.h"
1333 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
1337 void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
/* Install the x86-optimized encoder DSP functions into `c`, gated by the
 * CPU feature flags (mm_flags) and, for non-bitexact functions, by
 * CODEC_FLAG_BITEXACT. Later, more specific CPU blocks override the
 * baseline MMX pointers.
 * NOTE(review): opening brace, some closing braces and a few interior lines
 * are not visible in this chunk. */
1339 if (mm_flags & MM_MMX) {
/* forward DCT: pick the best available implementation */
1340 const int dct_algo = avctx->dct_algo;
1341 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1342 if(mm_flags & MM_SSE2){
1343 c->fdct = ff_fdct_sse2;
1344 }else if(mm_flags & MM_MMXEXT){
1345 c->fdct = ff_fdct_mmx2;
1347 c->fdct = ff_fdct_mmx;
/* baseline MMX function pointers */
1351 c->get_pixels = get_pixels_mmx;
1352 c->diff_pixels = diff_pixels_mmx;
1353 c->pix_sum = pix_sum16_mmx;
1355 c->diff_bytes= diff_bytes_mmx;
1356 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
1358 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1359 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1361 c->pix_norm1 = pix_norm1_mmx;
1362 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
1363 c->sse[1] = sse8_mmx;
1364 c->vsad[4]= vsad_intra16_mmx;
1366 c->nsse[0] = nsse16_mmx;
1367 c->nsse[1] = nsse8_mmx;
/* vsad[0] is not bit-exact across implementations */
1368 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1369 c->vsad[0] = vsad16_mmx;
1372 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1373 c->try_8x8basis= try_8x8basis_mmx;
1375 c->add_8x8basis= add_8x8basis_mmx;
1377 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
/* MMX2 (MMXEXT) overrides */
1380 if (mm_flags & MM_MMXEXT) {
1381 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1382 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
1383 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1384 c->vsad[4]= vsad_intra16_mmx2;
1386 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1387 c->vsad[0] = vsad16_mmx2;
1390 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
/* SSE2 overrides */
1393 if(mm_flags & MM_SSE2){
1394 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
1395 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
1396 c->hadamard8_diff[1]= hadamard8_diff_sse2;
1397 if (ENABLE_FLAC_ENCODER)
1398 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
/* SSSE3 overrides */
1402 if(mm_flags & MM_SSSE3){
1403 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1404 c->try_8x8basis= try_8x8basis_ssse3;
1406 c->add_8x8basis= add_8x8basis_ssse3;
1407 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
1408 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
1409 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
/* 3DNow! QNS overrides */
1413 if(mm_flags & MM_3DNOW){
1414 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1415 c->try_8x8basis= try_8x8basis_3dnow;
1417 c->add_8x8basis= add_8x8basis_3dnow;
/* shared pixel-ops init (motion estimation comparison functions etc.) */
1421 dsputil_init_pix_mmx(c, avctx);