2 * ARM NEON optimised DSP functions
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ Body of a 16-byte-wide row loop. The enclosing macro header is not visible
@ in this view (presumably pixels16, which is instantiated further down --
@ TODO confirm).
@ NOTE(review): the leading integers look like original line numbers fused in
@ by extraction; the gaps between them suggest instructions are missing here.
@
@ Load four 16-byte source rows from r1, advancing r1 by r2 after each load.
32 1: vld1.64 {d0, d1}, [r1], r2
33 vld1.64 {d2, d3}, [r1], r2
34 vld1.64 {d4, d5}, [r1], r2
36 vld1.64 {d6, d7}, [r1], r2
@ Load four more 16-byte rows through ip (128-bit alignment hint), stepping
@ by r2. How ip is set up and how d16-d23 are combined with d0-d7 is not
@ shown; presumably this is an averaging path re-reading the destination --
@ verify against the full file.
41 vld1.64 {d16,d17}, [ip,:128], r2
43 vld1.64 {d18,d19}, [ip,:128], r2
45 vld1.64 {d20,d21}, [ip,:128], r2
47 vld1.64 {d22,d23}, [ip,:128], r2
@ Store the four rows held in d0-d7 to r0 (128-bit alignment hint),
@ advancing r0 by r2 after each store.
51 vst1.64 {d0, d1}, [r0,:128], r2
52 vst1.64 {d2, d3}, [r0,:128], r2
53 vst1.64 {d4, d5}, [r0,:128], r2
54 vst1.64 {d6, d7}, [r0,:128], r2
@ pixels16_x2: macro with one parameter, vhadd, defaulting to vrhadd.u8
@ (rounding halving add). The place where \vhadd is expanded is not visible.
@ NOTE(review): gaps in the leading numbers suggest instructions are missing
@ from this view, including the macro's terminating directive.
59 .macro pixels16_x2 vhadd=vrhadd.u8
@ Each pass reads two source rows of 24 bytes (three D registers) from r1,
@ stepping by r2; only 16 bytes per row are written back below. Presumably
@ the extra bytes supply the one-pixel-right neighbour -- TODO confirm.
60 1: vld1.64 {d0-d2}, [r1], r2
61 vld1.64 {d4-d6}, [r1], r2
@ Store two 16-byte result rows to r0 (128-bit alignment hint), step r2.
69 vst1.64 {d0, d1}, [r0,:128], r2
70 vst1.64 {d4, d5}, [r0,:128], r2
@ pixels16_y2: macro with one parameter, vhadd, defaulting to vrhadd.u8
@ (rounding halving add). The place where \vhadd is expanded is not visible.
@ NOTE(review): gaps in the leading numbers suggest instructions are missing
@ from this view (setup of ip and lr, the combining step, loop control).
75 .macro pixels16_y2 vhadd=vrhadd.u8
@ Two read pointers, r1 and ip, are used alternately; each fetches a 16-byte
@ row and then advances by lr. Presumably ip is one row below r1 and lr is
@ twice the row stride -- verify against the full file.
79 vld1.64 {d0, d1}, [r1], lr
80 vld1.64 {d2, d3}, [ip], lr
83 vld1.64 {d0, d1}, [r1], lr
85 vld1.64 {d2, d3}, [ip], lr
@ Results are written from q2 (d4,d5) and q3 (d6,d7) to r0, stepping by r2.
88 vst1.64 {d4, d5}, [r0,:128], r2
89 vst1.64 {d6, d7}, [r0,:128], r2
@ pixels16_xy2: macro with parameters vshrn (default vrshrn.u16, a rounding
@ narrowing shift) and no_rnd (default 0). Their expansion sites are not
@ visible in this view.
@ NOTE(review): gaps in the leading numbers suggest many instructions are
@ missing; the comments below describe only what is shown.
94 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
@ Prime two rows of 24 bytes each, one through r1 and one through ip; both
@ pointers advance by lr.
98 vld1.64 {d0-d2}, [r1], lr
99 vld1.64 {d4-d6}, [ip], lr
@ vext.8 ... #1 takes 16 bytes starting one byte into the register pair,
@ i.e. the same row shifted left by one pixel.
105 vext.8 q1, q0, q1, #1
106 vext.8 q3, q2, q3, #1
112 vld1.64 {d0-d2}, [r1], lr
@ Sums are kept as unsigned 16-bit lanes so byte additions cannot overflow.
116 vadd.u16 q12, q12, q13
118 vext.8 q15, q0, q1, #1
119 vadd.u16 q1 , q10, q11
126 vld1.64 {d2-d4}, [ip], lr
@ Widening add: u8 lanes of d1 and d31 are added into u16 lanes of q10.
127 vaddl.u8 q10, d1, d31
@ Store one finished 16-byte row from q14 (d28,d29); r0 advances by r2.
128 vst1.64 {d28,d29}, [r0,:128], r2
132 vadd.u16 q12, q12, q13
134 vext.8 q2, q1, q2, #1
135 vadd.u16 q0, q10, q11
@ Store the next finished row from q15 (d30,d31).
143 vst1.64 {d30,d31}, [r0,:128], r2
@ Body of an 8-byte-wide row loop. The enclosing macro header is not visible
@ in this view (presumably pixels8 -- TODO confirm).
@ NOTE(review): gaps in the leading numbers suggest instructions are missing
@ between the loads and the stores.
@
@ Load four 8-byte source rows from r1, advancing r1 by r2 after each load.
149 1: vld1.64 {d0}, [r1], r2
150 vld1.64 {d1}, [r1], r2
151 vld1.64 {d2}, [r1], r2
153 vld1.64 {d3}, [r1], r2
@ Store the four rows to r0 (64-bit alignment hint), advancing r0 by r2.
158 vst1.64 {d0}, [r0,:64], r2
159 vst1.64 {d1}, [r0,:64], r2
160 vst1.64 {d2}, [r0,:64], r2
161 vst1.64 {d3}, [r0,:64], r2
@ pixels8_x2: macro with one parameter, vhadd, defaulting to vrhadd.u8
@ (rounding halving add). The place where \vhadd is expanded is not visible.
@ NOTE(review): gaps in the leading numbers suggest instructions are missing
@ between the vext steps and the stores.
166 .macro pixels8_x2 vhadd=vrhadd.u8
@ Read a 16-byte row from r1 (step r2), then overwrite d1 with the 8 bytes
@ starting one byte into d0:d1, so d0 holds the row and d1 the row shifted
@ by one pixel.
167 1: vld1.64 {d0, d1}, [r1], r2
168 vext.8 d1, d0, d1, #1
@ Same for the second row, using d2/d3.
169 vld1.64 {d2, d3}, [r1], r2
170 vext.8 d3, d2, d3, #1
@ Store two 8-byte rows to r0 (64-bit alignment hint), advancing by r2.
@ The instructions that place the results in d0 and d1 are not shown.
176 vst1.64 {d0}, [r0,:64], r2
177 vst1.64 {d1}, [r0,:64], r2
@ pixels8_y2: macro with one parameter, vhadd, defaulting to vrhadd.u8
@ (rounding halving add). The place where \vhadd is expanded is not visible.
@ NOTE(review): gaps in the leading numbers suggest instructions are missing
@ from this view (setup of ip and lr, the combining step, loop control).
182 .macro pixels8_y2 vhadd=vrhadd.u8
@ Two read pointers, r1 and ip, are used alternately; each fetches an 8-byte
@ row and then advances by lr. Presumably ip is one row below r1 and lr is
@ twice the row stride -- verify against the full file.
186 vld1.64 {d0}, [r1], lr
187 vld1.64 {d1}, [ip], lr
190 vld1.64 {d0}, [r1], lr
192 vld1.64 {d1}, [ip], lr
@ Results are written from d4 and d5 to r0 (64-bit alignment hint), step r2.
195 vst1.64 {d4}, [r0,:64], r2
196 vst1.64 {d5}, [r0,:64], r2
@ pixels8_xy2: macro with parameters vshrn (default vrshrn.u16, a rounding
@ narrowing shift) and no_rnd (default 0). Their expansion sites are not
@ visible in this view.
@ NOTE(review): gaps in the leading numbers suggest many instructions are
@ missing; the comments below describe only what is shown.
201 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
@ Prime two 16-byte rows, one through r1 and one through ip; both pointers
@ advance by lr.
205 vld1.64 {d0, d1}, [r1], lr
206 vld1.64 {d2, d3}, [ip], lr
@ d4 / d6 = the 8 bytes starting one byte into each row (row shifted by one
@ pixel).
212 vext.8 d4, d0, d1, #1
213 vext.8 d6, d2, d3, #1
217 vld1.64 {d0, d1}, [r1], lr
220 vext.8 d4, d0, d1, #1
@ Sums are accumulated in unsigned 16-bit lanes.
222 vadd.u16 q10, q10, q11
226 vld1.64 {d2, d3}, [ip], lr
230 vadd.u16 q10, q10, q11
@ Finished 8-byte rows are stored from d5 and d7; r0 advances by r2. The
@ narrowing step that fills d5/d7 is not shown.
232 vst1.64 {d5}, [r0,:64], r2
234 vext.8 d6, d2, d3, #1
236 vst1.64 {d7}, [r0,:64], r2
@ pixfunc: macro taking pfx, name, suf, rnd_op and a variadic args tail.
@ It opens an exported function whose symbol is built by pasting the pieces
@ together: ff_<pfx><name><suf>_neon. The \() after \suf ends the parameter
@ name so that "_neon" is not read as part of it.
@ NOTE(review): `function` is a macro defined outside this view, and the rest
@ of the body (where rnd_op and args would be used) is not shown.
241 .macro pixfunc pfx name suf rnd_op args:vararg
242 function ff_\pfx\name\suf\()_neon, export=1
@ pixfunc2: macro taking pfx, name and a variadic args tail. The visible line
@ forwards all three to pixfunc, so args supplies pixfunc's suf, rnd_op and
@ remaining arguments.
@ NOTE(review): the gap in the leading numbers (247 -> 249) suggests one more
@ line is missing here; presumably a second pixfunc call -- TODO confirm.
247 .macro pixfunc2 pfx name args:vararg
249 pixfunc \pfx \name \args
@ Exported entry points and macro instantiations for the pixel copy/average
@ routines.
@ NOTE(review): gaps in the leading numbers suggest lines are missing; the
@ bodies of the ff_*_h264_qpel*_mc00_neon functions are not visible, so how
@ they relate to the pixfunc-generated functions that follow cannot be
@ confirmed from here.
252 function ff_put_h264_qpel16_mc00_neon, export=1
@ Expands to ff_put_pixels16_neon (no suffix, no extra arguments).
256 pixfunc put_ pixels16
@ Through pixfunc2 -> pixfunc these pass suf=_no_rnd and rnd_op=vhadd.u8
@ (non-rounding halving add), yielding ff_put_pixels16_x2_no_rnd_neon etc.
257 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
258 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
@ Here rnd_op is vshrn.u16 (non-rounding narrowing shift) and the trailing 1
@ is forwarded as an extra argument; presumably it sets no_rnd -- verify.
259 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
261 function ff_avg_h264_qpel16_mc00_neon, export=1
@ Empty suf (the doubled comma), so the symbol is ff_avg_pixels16_neon; the 1
@ lands in rnd_op. How it is consumed is not visible here.
265 pixfunc avg_ pixels16,, 1
267 function ff_put_h264_qpel8_mc00_neon, export=1
@ 8-pixel-wide counterparts of the 16-pixel instantiations above.
272 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
273 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
274 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
@ ff_put_signed_pixels_clamped_neon
@ Visible data flow: eight 16-byte loads from r0 (128-bit alignment hint,
@ pointer written back after each load) and eight 8-byte stores to r1
@ (64-bit alignment hint, advancing by r2). That is 128 bytes in, 64 bytes
@ out as 8 rows of 8.
@ Presumably r0 = block of 16-bit coefficients, r1 = destination pixels,
@ r2 = destination line size -- TODO confirm against the C prototype.
@ NOTE(review): gaps in the leading numbers suggest the narrowing/clamping
@ instructions between loads and stores are missing from this view.
276 function ff_put_signed_pixels_clamped_neon, export=1
278 vld1.64 {d16-d17}, [r0,:128]!
280 vld1.64 {d18-d19}, [r0,:128]!
@ q8/q9 are reloaded here, so the first two loads must already have been
@ consumed by instructions that are not shown.
282 vld1.64 {d16-d17}, [r0,:128]!
284 vld1.64 {d18-d19}, [r0,:128]!
286 vld1.64 {d20-d21}, [r0,:128]!
288 vld1.64 {d22-d23}, [r0,:128]!
@ Output rows 0-2.
290 vst1.64 {d0}, [r1,:64], r2
292 vst1.64 {d1}, [r1,:64], r2
294 vst1.64 {d2}, [r1,:64], r2
@ Last two input loads.
296 vld1.64 {d24-d25}, [r0,:128]!
298 vld1.64 {d26-d27}, [r0,:128]!
@ Output rows 3-7.
301 vst1.64 {d3}, [r1,:64], r2
303 vst1.64 {d4}, [r1,:64], r2
305 vst1.64 {d5}, [r1,:64], r2
308 vst1.64 {d6}, [r1,:64], r2
309 vst1.64 {d7}, [r1,:64], r2
@ ff_add_pixels_clamped_neon
@ Visible data flow, repeated for 8 rows:
@   - load an 8-byte row from r1 (64-bit alignment hint, step r2)
@   - load 16 bytes from r0 (128-bit alignment hint, pointer written back)
@   - store an 8-byte row to r3 (64-bit alignment hint, step r2)
@ Presumably r0 = block of 16-bit coefficients, r1 = pixels to read,
@ r2 = line size, and r3 = a copy of the original r1 used as write pointer --
@ TODO confirm; the setup of r3 is not visible.
@ NOTE(review): gaps in the leading numbers suggest the add/saturate
@ instructions are missing from this view.
313 function ff_add_pixels_clamped_neon, export=1
315 vld1.64 {d16}, [r1,:64], r2
316 vld1.64 {d0-d1}, [r0,:128]!
318 vld1.64 {d17}, [r1,:64], r2
319 vld1.64 {d2-d3}, [r0,:128]!
321 vld1.64 {d18}, [r1,:64], r2
323 vld1.64 {d4-d5}, [r0,:128]!
@ First result row goes out while later rows are still being loaded.
325 vst1.64 {d0}, [r3,:64], r2
327 vld1.64 {d19}, [r1,:64], r2
328 vld1.64 {d6-d7}, [r0,:128]!
331 vst1.64 {d2}, [r3,:64], r2
@ Second group of four rows reuses d16-d19 and q0-q3.
332 vld1.64 {d16}, [r1,:64], r2
334 vld1.64 {d0-d1}, [r0,:128]!
336 vst1.64 {d4}, [r3,:64], r2
337 vld1.64 {d17}, [r1,:64], r2
338 vld1.64 {d2-d3}, [r0,:128]!
340 vst1.64 {d6}, [r3,:64], r2
342 vld1.64 {d18}, [r1,:64], r2
343 vld1.64 {d4-d5}, [r0,:128]!
345 vst1.64 {d0}, [r3,:64], r2
347 vld1.64 {d19}, [r1,:64], r2
349 vld1.64 {d6-d7}, [r0,:128]!
@ Remaining result rows.
351 vst1.64 {d2}, [r3,:64], r2
353 vst1.64 {d4}, [r3,:64], r2
354 vst1.64 {d6}, [r3,:64], r2
@ ff_float_to_int16_neon
@ Converts 32-bit floats read from r1 into 16-bit integers written to r0.
@ Both pointers carry 128-bit alignment hints and are written back after
@ each 16-byte access.
@ Conversion recipe, as visible below:
@   vcvt.s32.f32 ..., #16  float -> signed 32-bit fixed point with 16
@                          fractional bits
@   vshrn.s32    ..., #16  keep the upper 16 bits of each 32-bit lane
@ Loads and conversions for the next batch are interleaved with the
@ narrowing and stores of the previous one.
@ NOTE(review): gaps in the leading numbers suggest the length handling and
@ branches are missing; presumably r2 holds the element count -- TODO confirm.
358 function ff_float_to_int16_neon, export=1
@ Prime the pipeline: first 8 floats -> q8, q9.
360 vld1.64 {d0-d1}, [r1,:128]!
361 vcvt.s32.f32 q8, q0, #16
362 vld1.64 {d2-d3}, [r1,:128]!
363 vcvt.s32.f32 q9, q1, #16
@ Steady state: narrow q8/q9 into q2, convert 8 more floats in q0/q1 and
@ narrow them into q3, then preload the following 8 floats into q8/q9.
368 vshrn.s32 d4, q8, #16
369 vld1.64 {d0-d1}, [r1,:128]!
370 vcvt.s32.f32 q0, q0, #16
371 vshrn.s32 d5, q9, #16
372 vld1.64 {d2-d3}, [r1,:128]!
373 vcvt.s32.f32 q1, q1, #16
374 vshrn.s32 d6, q0, #16
375 vst1.64 {d4-d5}, [r0,:128]!
376 vshrn.s32 d7, q1, #16
377 vld1.64 {d16-d17},[r1,:128]!
378 vcvt.s32.f32 q8, q8, #16
379 vld1.64 {d18-d19},[r1,:128]!
380 vcvt.s32.f32 q9, q9, #16
381 vst1.64 {d6-d7}, [r0,:128]!
@ Label 2: same narrow/convert/store sequence without preloading a further
@ batch. The branch that reaches it is not visible.
385 2: vld1.64 {d0-d1}, [r1,:128]!
386 vshrn.s32 d4, q8, #16
387 vcvt.s32.f32 q0, q0, #16
388 vld1.64 {d2-d3}, [r1,:128]!
389 vshrn.s32 d5, q9, #16
390 vcvt.s32.f32 q1, q1, #16
391 vshrn.s32 d6, q0, #16
392 vst1.64 {d4-d5}, [r0,:128]!
393 vshrn.s32 d7, q1, #16
394 vst1.64 {d6-d7}, [r0,:128]!
@ Label 3: drain only -- narrow and store the 8 values already in q8/q9.
396 3: vshrn.s32 d4, q8, #16
397 vshrn.s32 d5, q9, #16
398 vst1.64 {d4-d5}, [r0,:128]!
@ ff_float_to_int16_interleave_neon
@ Converts floats from several input streams into 16-bit integers and writes
@ them interleaved. Every conversion uses vcvt.s32.f32 ..., #16 (float ->
@ signed fixed point with 16 fractional bits); the 16-bit result is the upper
@ half of each 32-bit lane.
@ Four code shapes are visible:
@   - two input pointers (r3, r1), contiguous 16-byte stores to r0
@   - four input pointers (r4-r7), 8-byte stores to r8 with stride ip
@   - two input pointers (r4, r5), 4-byte lane stores to r8 with stride ip
@   - one input pointer (r4), 2-byte lane stores to r5 with stride ip
@ NOTE(review): gaps in the leading numbers suggest all argument decoding,
@ compares, branches and several merge instructions are missing from this
@ view; which path serves which channel count is presumed, not shown.
402 function ff_float_to_int16_interleave_neon, export=1
@ Tail-branch to the non-interleaving routine when the preceding (unseen)
@ compare yields "less than".
405 blt ff_float_to_int16_neon
@ ---- Path with two input pointers: r3 and r1 ----
@ Prime: 8 floats from r3 -> q8,q9 and 8 floats from r1 -> q10,q11.
412 vld1.64 {d0-d1}, [r3,:128]!
413 vcvt.s32.f32 q8, q0, #16
414 vld1.64 {d2-d3}, [r3,:128]!
415 vcvt.s32.f32 q9, q1, #16
416 vld1.64 {d20-d21},[r1,:128]!
417 vcvt.s32.f32 q10, q10, #16
418 vld1.64 {d22-d23},[r1,:128]!
419 vcvt.s32.f32 q11, q11, #16
@ Steady state: convert the next batch (q0,q1 from r3; q12,q13 from r1)
@ while storing merged results to r0.
424 vld1.64 {d0-d1}, [r3,:128]!
425 vcvt.s32.f32 q0, q0, #16
427 vld1.64 {d2-d3}, [r3,:128]!
428 vcvt.s32.f32 q1, q1, #16
429 vld1.64 {d24-d25},[r1,:128]!
430 vcvt.s32.f32 q12, q12, #16
431 vld1.64 {d26-d27},[r1,:128]!
433 vst1.64 {d20-d21},[r0,:128]!
434 vcvt.s32.f32 q13, q13, #16
435 vst1.64 {d22-d23},[r0,:128]!
437 vld1.64 {d16-d17},[r3,:128]!
439 vst1.64 {d24-d25},[r0,:128]!
440 vcvt.s32.f32 q8, q8, #16
441 vld1.64 {d18-d19},[r3,:128]!
442 vcvt.s32.f32 q9, q9, #16
443 vld1.64 {d20-d21},[r1,:128]!
444 vcvt.s32.f32 q10, q10, #16
445 vld1.64 {d22-d23},[r1,:128]!
446 vcvt.s32.f32 q11, q11, #16
447 vst1.64 {d26-d27},[r0,:128]!
@ vsri.32 qA, qB, #16 shifts each lane of qB right by 16 and inserts it into
@ the low half of qA, keeping qA's high half. Each 32-bit lane then holds
@ one 16-bit sample from the r3 stream (low half) and one from the r1
@ stream (high half).
451 2: vsri.32 q10, q8, #16
452 vld1.64 {d0-d1}, [r3,:128]!
453 vcvt.s32.f32 q0, q0, #16
454 vld1.64 {d2-d3}, [r3,:128]!
455 vcvt.s32.f32 q1, q1, #16
456 vld1.64 {d24-d25},[r1,:128]!
457 vcvt.s32.f32 q12, q12, #16
459 vld1.64 {d26-d27},[r1,:128]!
460 vcvt.s32.f32 q13, q13, #16
461 vst1.64 {d20-d21},[r0,:128]!
463 vst1.64 {d22-d23},[r0,:128]!
465 vst1.64 {d24-d27},[r0,:128]!
@ Label 3: drain -- merge and store the final batch held in q8-q11.
467 3: vsri.32 q10, q8, #16
469 vst1.64 {d20-d23},[r0,:128]!
@ ---- Path with four input pointers ----
@ r1 points at an array of pointers; fetch four of them into r4-r7 and
@ advance r1 past them.
478 5: ldmia r1!, {r4-r7}
481 vld1.64 {d16-d17},[r4,:128]!
482 vcvt.s32.f32 q8, q8, #16
483 vld1.64 {d18-d19},[r5,:128]!
484 vcvt.s32.f32 q9, q9, #16
485 vld1.64 {d20-d21},[r6,:128]!
486 vcvt.s32.f32 q10, q10, #16
487 vld1.64 {d22-d23},[r7,:128]!
488 vcvt.s32.f32 q11, q11, #16
490 vld1.64 {d0-d1}, [r4,:128]!
491 vcvt.s32.f32 q0, q0, #16
493 vld1.64 {d2-d3}, [r5,:128]!
494 vcvt.s32.f32 q1, q1, #16
@ Pair up the r6 stream (low halves) with the r7 stream (high halves).
495 vsri.32 q11, q10, #16
496 vld1.64 {d4-d5}, [r6,:128]!
497 vcvt.s32.f32 q2, q2, #16
499 vld1.64 {d6-d7}, [r7,:128]!
500 vcvt.s32.f32 q3, q3, #16
@ 8-byte stores to r8, advancing by ip. No alignment hint is given on r8.
@ The merge that fills q9 is not visible.
502 vst1.64 {d18}, [r8], ip
504 vst1.64 {d22}, [r8], ip
506 vst1.64 {d19}, [r8], ip
508 vst1.64 {d23}, [r8], ip
@ Preload the next batch while the second batch (q1/q3) is written out.
511 vld1.64 {d16-d17},[r4,:128]!
512 vcvt.s32.f32 q8, q8, #16
513 vst1.64 {d2}, [r8], ip
514 vld1.64 {d18-d19},[r5,:128]!
515 vcvt.s32.f32 q9, q9, #16
516 vst1.64 {d6}, [r8], ip
517 vld1.64 {d20-d21},[r6,:128]!
518 vcvt.s32.f32 q10, q10, #16
519 vst1.64 {d3}, [r8], ip
520 vld1.64 {d22-d23},[r7,:128]!
521 vcvt.s32.f32 q11, q11, #16
522 vst1.64 {d7}, [r8], ip
@ Label 7: drain -- write the last batch without loading more.
524 7: vst1.64 {d2}, [r8], ip
525 vst1.64 {d6}, [r8], ip
526 vst1.64 {d3}, [r8], ip
527 vst1.64 {d7}, [r8], ip
@ ---- Path with two input pointers: r4 and r5 ----
@ Prime: 8 floats from each stream (r4 -> q8,q10; r5 -> q9,q11).
541 vld1.64 {d16-d17},[r4,:128]!
542 vcvt.s32.f32 q8, q8, #16
543 vld1.64 {d18-d19},[r5,:128]!
544 vcvt.s32.f32 q9, q9, #16
545 vld1.64 {d20-d21},[r4,:128]!
546 vcvt.s32.f32 q10, q10, #16
547 vld1.64 {d22-d23},[r5,:128]!
548 vcvt.s32.f32 q11, q11, #16
@ Merge so each 32-bit lane = {r4 sample (low half), r5 sample (high half)},
@ then store one lane (4 bytes) at a time to r8, advancing by ip.
552 vsri.32 d18, d16, #16
553 vsri.32 d19, d17, #16
554 vld1.64 {d16-d17},[r4,:128]!
555 vcvt.s32.f32 q8, q8, #16
556 vst1.32 {d18[0]}, [r8], ip
557 vsri.32 d22, d20, #16
558 vst1.32 {d18[1]}, [r8], ip
559 vsri.32 d23, d21, #16
560 vst1.32 {d19[0]}, [r8], ip
561 vst1.32 {d19[1]}, [r8], ip
562 vld1.64 {d18-d19},[r5,:128]!
563 vcvt.s32.f32 q9, q9, #16
564 vst1.32 {d22[0]}, [r8], ip
565 vst1.32 {d22[1]}, [r8], ip
566 vld1.64 {d20-d21},[r4,:128]!
567 vcvt.s32.f32 q10, q10, #16
568 vst1.32 {d23[0]}, [r8], ip
569 vst1.32 {d23[1]}, [r8], ip
570 vld1.64 {d22-d23},[r5,:128]!
571 vcvt.s32.f32 q11, q11, #16
@ Second shape of the same loop: an extra batch is converted into q0-q3
@ while q8-q11 are merged and stored.
573 vld1.64 {d0-d1}, [r4,:128]!
574 vcvt.s32.f32 q0, q0, #16
575 vsri.32 d18, d16, #16
576 vld1.64 {d2-d3}, [r5,:128]!
577 vcvt.s32.f32 q1, q1, #16
578 vsri.32 d19, d17, #16
579 vld1.64 {d4-d5}, [r4,:128]!
580 vcvt.s32.f32 q2, q2, #16
581 vld1.64 {d6-d7}, [r5,:128]!
582 vcvt.s32.f32 q3, q3, #16
583 vst1.32 {d18[0]}, [r8], ip
584 vsri.32 d22, d20, #16
585 vst1.32 {d18[1]}, [r8], ip
586 vsri.32 d23, d21, #16
587 vst1.32 {d19[0]}, [r8], ip
589 vst1.32 {d19[1]}, [r8], ip
591 vst1.32 {d22[0]}, [r8], ip
593 vst1.32 {d22[1]}, [r8], ip
595 vst1.32 {d23[0]}, [r8], ip
596 vst1.32 {d23[1]}, [r8], ip
@ Preload the following batch while q1/q3 lanes are written. The merges
@ that fill q1 and q3 are not visible.
598 vld1.64 {d16-d17},[r4,:128]!
599 vcvt.s32.f32 q8, q8, #16
600 vst1.32 {d2[0]}, [r8], ip
601 vst1.32 {d2[1]}, [r8], ip
602 vld1.64 {d18-d19},[r5,:128]!
603 vcvt.s32.f32 q9, q9, #16
604 vst1.32 {d3[0]}, [r8], ip
605 vst1.32 {d3[1]}, [r8], ip
606 vld1.64 {d20-d21},[r4,:128]!
607 vcvt.s32.f32 q10, q10, #16
608 vst1.32 {d6[0]}, [r8], ip
609 vst1.32 {d6[1]}, [r8], ip
610 vld1.64 {d22-d23},[r5,:128]!
611 vcvt.s32.f32 q11, q11, #16
612 vst1.32 {d7[0]}, [r8], ip
613 vst1.32 {d7[1]}, [r8], ip
@ Label 6: drain q1/q3 without loading more input.
615 6: vst1.32 {d2[0]}, [r8], ip
616 vst1.32 {d2[1]}, [r8], ip
617 vst1.32 {d3[0]}, [r8], ip
618 vst1.32 {d3[1]}, [r8], ip
619 vst1.32 {d6[0]}, [r8], ip
620 vst1.32 {d6[1]}, [r8], ip
621 vst1.32 {d7[0]}, [r8], ip
622 vst1.32 {d7[1]}, [r8], ip
@ Label 7: drain q8-q11 -- merge the two streams and store lane by lane.
624 7: vsri.32 d18, d16, #16
625 vsri.32 d19, d17, #16
626 vst1.32 {d18[0]}, [r8], ip
627 vsri.32 d22, d20, #16
628 vst1.32 {d18[1]}, [r8], ip
629 vsri.32 d23, d21, #16
630 vst1.32 {d19[0]}, [r8], ip
631 vst1.32 {d19[1]}, [r8], ip
632 vst1.32 {d22[0]}, [r8], ip
633 vst1.32 {d22[1]}, [r8], ip
634 vst1.32 {d23[0]}, [r8], ip
635 vst1.32 {d23[1]}, [r8], ip
@ ---- Path with a single input pointer: r4 ----
@ After conversion, the odd 16-bit lanes (d?[1], d?[3]) are the upper halves
@ of the 32-bit results, i.e. the 16-bit samples. Each is stored as 2 bytes
@ to r5 (16-bit alignment hint), advancing by ip.
645 vld1.64 {d0-d1}, [r4,:128]!
646 vcvt.s32.f32 q0, q0, #16
647 vld1.64 {d2-d3}, [r4,:128]!
648 vcvt.s32.f32 q1, q1, #16
651 vld1.64 {d4-d5}, [r4,:128]!
652 vcvt.s32.f32 q2, q2, #16
653 vld1.64 {d6-d7}, [r4,:128]!
654 vcvt.s32.f32 q3, q3, #16
655 vst1.16 {d0[1]}, [r5,:16], ip
656 vst1.16 {d0[3]}, [r5,:16], ip
657 vst1.16 {d1[1]}, [r5,:16], ip
658 vst1.16 {d1[3]}, [r5,:16], ip
659 vst1.16 {d2[1]}, [r5,:16], ip
660 vst1.16 {d2[3]}, [r5,:16], ip
661 vst1.16 {d3[1]}, [r5,:16], ip
662 vst1.16 {d3[3]}, [r5,:16], ip
@ Refill q0/q1 for the next pass before writing out q2/q3.
664 vld1.64 {d0-d1}, [r4,:128]!
665 vcvt.s32.f32 q0, q0, #16
666 vld1.64 {d2-d3}, [r4,:128]!
667 vcvt.s32.f32 q1, q1, #16
668 7: vst1.16 {d4[1]}, [r5,:16], ip
669 vst1.16 {d4[3]}, [r5,:16], ip
670 vst1.16 {d5[1]}, [r5,:16], ip
671 vst1.16 {d5[3]}, [r5,:16], ip
672 vst1.16 {d6[1]}, [r5,:16], ip
673 vst1.16 {d6[3]}, [r5,:16], ip
674 vst1.16 {d7[1]}, [r5,:16], ip
675 vst1.16 {d7[3]}, [r5,:16], ip
@ Store 8 samples from q0/q1, then convert another 8 floats. The
@ surrounding labels and branches are not visible.
679 vst1.16 {d0[1]}, [r5,:16], ip
680 vst1.16 {d0[3]}, [r5,:16], ip
681 vst1.16 {d1[1]}, [r5,:16], ip
682 vst1.16 {d1[3]}, [r5,:16], ip
683 vst1.16 {d2[1]}, [r5,:16], ip
684 vst1.16 {d2[3]}, [r5,:16], ip
685 vst1.16 {d3[1]}, [r5,:16], ip
686 vst1.16 {d3[3]}, [r5,:16], ip
688 vld1.64 {d0-d1}, [r4,:128]!
689 vcvt.s32.f32 q0, q0, #16
690 vld1.64 {d2-d3}, [r4,:128]!
691 vcvt.s32.f32 q1, q1, #16
@ ff_vector_fmul_neon
@ Visible data flow: two input streams are read from r0 and r1 and results
@ are written through r3. All three pointers carry 128-bit alignment hints
@ and are written back after each access.
@ NOTE(review): gaps in the leading numbers suggest the multiply
@ instructions, the setup of r3 and the loop control are missing from this
@ view. Presumably r3 is a copy of the original r0 so that the product
@ overwrites the first operand -- TODO confirm.
695 function ff_vector_fmul_neon, export=1
@ Prime: 8 floats from each operand (32 bytes per load).
698 vld1.64 {d0-d3}, [r0,:128]!
699 vld1.64 {d4-d7}, [r1,:128]!
@ Steady state: operands are reloaded 4 floats at a time while earlier
@ results (q8-q9, then q10-q11) are stored 32 bytes at a time.
706 vld1.64 {d0-d1}, [r0,:128]!
707 vld1.64 {d4-d5}, [r1,:128]!
709 vld1.64 {d2-d3}, [r0,:128]!
710 vld1.64 {d6-d7}, [r1,:128]!
712 vst1.64 {d16-d19},[r3,:128]!
713 vld1.64 {d0-d1}, [r0,:128]!
714 vld1.64 {d4-d5}, [r1,:128]!
716 vld1.64 {d2-d3}, [r0,:128]!
717 vld1.64 {d6-d7}, [r1,:128]!
719 vst1.64 {d20-d23},[r3,:128]!
@ Label 2: variant that stores q8 and q9 separately, interleaved with the
@ loads. The branch that reaches it is not visible.
723 2: vld1.64 {d0-d1}, [r0,:128]!
724 vld1.64 {d4-d5}, [r1,:128]!
725 vst1.64 {d16-d17},[r3,:128]!
727 vld1.64 {d2-d3}, [r0,:128]!
728 vld1.64 {d6-d7}, [r1,:128]!
729 vst1.64 {d18-d19},[r3,:128]!
@ Label 3: final store of q8-q9.
731 3: vst1.64 {d16-d19},[r3,:128]!
@ ff_vector_fmul_window_neon
@ Visible data flow: four read pointers (r1, r2, r3, r4) and two write
@ pointers (r0, ip). r1, r3 and r0 advance by the access size; r2, r4 and ip
@ advance by r5 after each access.
@ Presumably r5 is a negative step so that r2/r4/ip walk backwards from the
@ far end of their buffers -- TODO confirm; the code that sets r5 is not
@ visible.
@ NOTE(review): gaps in the leading numbers suggest most of the arithmetic
@ and the loop control are missing, and the end of the function is not shown.
735 function ff_vector_fmul_window_neon, export=1
@ Load one 32-bit word from the top of the stack and replicate it into all
@ four lanes of q8. This is the first instruction of the function, so the
@ word is presumably the first stack-passed argument -- verify.
736 vld1.32 {d16[],d17[]}, [sp,:32]
@ Derive the secondary pointers from r5: r2 += r5*4, r4 = r3 + r5*8,
@ ip = r0 + r5*8.
741 add r2, r2, r5, lsl #2
742 add r4, r3, r5, lsl #3
743 add ip, r0, r5, lsl #3
@ Prime: 16 bytes from each of the four read pointers.
745 vld1.64 {d0,d1}, [r1,:128]!
746 vld1.64 {d2,d3}, [r2,:128], r5
747 vld1.64 {d4,d5}, [r3,:128]!
748 vld1.64 {d6,d7}, [r4,:128], r5
@ Steady state: reload the operands and store two 16-byte results, one
@ through r0 and one through ip.
760 vld1.64 {d0,d1}, [r1,:128]!
762 vld1.64 {d18,d19},[r2,:128], r5
764 vld1.64 {d24,d25},[r3,:128]!
766 vld1.64 {d6,d7}, [r4,:128], r5
771 vst1.64 {d20,d21},[r0,:128]!
772 vst1.64 {d22,d23},[ip,:128], r5
@ Label 2: d22 += d3 * d7 (two-lane float multiply-accumulate), followed by
@ the final pair of stores.
774 2: vmla.f32 d22, d3, d7
780 vst1.64 {d20,d21},[r0,:128]!
781 vst1.64 {d22,d23},[ip,:128], r5