2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ NOTE(review): this file is a garbled extraction — the integer fused at the
@ start of each line is the original source's line number, and the gaps in
@ that numbering show that most body lines (including the .endm terminators
@ of these macros) are missing from this view. Code left byte-identical.
@ Register-matrix helpers; names suggest an 8x8 transpose and 4x4 sub-block
@ swap/transpose over NEON d/q registers — bodies not visible, TODO confirm.
25 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
40 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
47 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
@ NOTE(review): incomplete fragment — interior lines and the closing
@ endfunc/.endm are missing from this view; code kept byte-identical.
@ 8-pixel-wide H.264 chroma motion compensation. \type presumably selects
@ put vs avg; the loads through [lr] feeding vrhadd.u8 look like the
@ averaging (avg) path — TODO confirm against the missing .ifc guards.
@ The rsb/sub sequence below appears to build the four bilinear weights
@ from x (r4) and y (r5) scaled by 8 — verify against missing setup code.
58 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
59 .macro h264_chroma_mc8 type
60 function ff_\type\()_h264_chroma_mc8_neon, export=1
70 rsb r6, r7, r5, lsl #3
71 rsb ip, r7, r4, lsl #3
72 sub r4, r7, r4, lsl #3
73 sub r4, r4, r5, lsl #3
@ Full bilinear case: load two rows (current and next) per iteration.
83 vld1.64 {d4, d5}, [r1], r4
85 vld1.64 {d6, d7}, [r5], r4
94 vld1.64 {d4, d5}, [r1], r4
@ vrshrn.u16 ... #6: rounding narrow, (acc + 32) >> 6, back to 8 bit.
103 vrshrn.u16 d16, q8, #6
104 vld1.64 {d6, d7}, [r5], r4
106 vrshrn.u16 d17, q9, #6
108 vld1.64 {d20}, [lr,:64], r2
109 vld1.64 {d21}, [lr,:64], r2
110 vrhadd.u8 q8, q8, q10
112 vext.8 d7, d6, d7, #1
113 vst1.64 {d16}, [r0,:64], r2
114 vst1.64 {d17}, [r0,:64], r2
@ Second variant: single-d-register loads — presumably the x==0 or y==0
@ (one-dimensional filter) special case; TODO confirm, branch lines missing.
128 vld1.64 {d4}, [r1], r4
129 vld1.64 {d6}, [r5], r4
134 vld1.64 {d4}, [r1], r4
137 vld1.64 {d6}, [r5], r4
138 vrshrn.u16 d16, q8, #6
139 vrshrn.u16 d17, q9, #6
141 vld1.64 {d20}, [lr,:64], r2
142 vld1.64 {d21}, [lr,:64], r2
143 vrhadd.u8 q8, q8, q10
147 vst1.64 {d16}, [r0,:64], r2
148 vst1.64 {d17}, [r0,:64], r2
@ Label 4: horizontal-only variant — vext.8 #1 forms the src[i+1] vector
@ for the two-tap horizontal filter.
153 4: vld1.64 {d4, d5}, [r1], r2
154 vld1.64 {d6, d7}, [r1], r2
155 vext.8 d5, d4, d5, #1
156 vext.8 d7, d6, d7, #1
162 vld1.64 {d4, d5}, [r1], r2
166 vext.8 d5, d4, d5, #1
167 vrshrn.u16 d16, q8, #6
168 vrshrn.u16 d17, q9, #6
170 vld1.64 {d20}, [lr,:64], r2
171 vld1.64 {d21}, [lr,:64], r2
172 vrhadd.u8 q8, q8, q10
174 vld1.64 {d6, d7}, [r1], r2
175 vext.8 d7, d6, d7, #1
176 vst1.64 {d16}, [r0,:64], r2
177 vst1.64 {d17}, [r0,:64], r2
@ NOTE(review): incomplete fragment — interior lines and terminators missing
@ from this view; code kept byte-identical, comments only.
@ 4-pixel-wide chroma MC, same structure as the mc8 macro above but working
@ on half-width data (32-bit lane loads/stores). \type presumably put/avg;
@ the [lr] loads + vrhadd.u8 look like the avg path — TODO confirm.
184 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
185 .macro h264_chroma_mc4 type
186 function ff_\type\()_h264_chroma_mc4_neon, export=1
196 rsb r6, r7, r5, lsl #3
197 rsb ip, r7, r4, lsl #3
198 sub r4, r7, r4, lsl #3
199 sub r4, r4, r5, lsl #3
209 vld1.64 {d4}, [r1], r4
211 vld1.64 {d6}, [r5], r4
214 vext.8 d5, d4, d5, #1
215 vext.8 d7, d6, d7, #1
225 vld1.64 {d4}, [r1], r4
226 vext.8 d5, d4, d5, #1
230 vld1.64 {d6}, [r5], r4
@ Pairwise folding of the widened accumulators before the rounding narrow.
231 vadd.i16 d16, d16, d17
232 vadd.i16 d17, d18, d19
233 vrshrn.u16 d16, q8, #6
237 vld1.32 {d20[0]}, [lr,:32], r2
238 vld1.32 {d20[1]}, [lr,:32], r2
239 vrhadd.u8 d16, d16, d20
241 vext.8 d7, d6, d7, #1
243 vst1.32 {d16[0]}, [r0,:32], r2
244 vst1.32 {d16[1]}, [r0,:32], r2
@ One-dimensional (vertical-only?) variant — two rows packed per d register.
257 vext.32 d1, d0, d1, #1
260 vld1.32 {d4[0]}, [r1], r4
261 vld1.32 {d4[1]}, [r5], r4
265 vld1.32 {d4[0]}, [r1], r4
267 vld1.32 {d4[1]}, [r5], r4
268 vadd.i16 d16, d16, d17
269 vadd.i16 d17, d18, d19
270 vrshrn.u16 d16, q8, #6
272 vld1.32 {d20[0]}, [lr,:32], r2
273 vld1.32 {d20[1]}, [lr,:32], r2
274 vrhadd.u8 d16, d16, d20
278 vst1.32 {d16[0]}, [r0,:32], r2
279 vst1.32 {d16[1]}, [r0,:32], r2
@ Label 4/5: horizontal-only variant; vext.8 #1 builds src[i+1].
284 4: vld1.64 {d4}, [r1], r2
285 vld1.64 {d6}, [r1], r2
286 vext.8 d5, d4, d5, #1
287 vext.8 d7, d6, d7, #1
291 5: vmull.u8 q8, d4, d0
294 vld1.64 {d4}, [r1], r2
295 vext.8 d5, d4, d5, #1
297 vadd.i16 d16, d16, d17
298 vadd.i16 d17, d18, d19
300 vrshrn.u16 d16, q8, #6
302 vld1.32 {d20[0]}, [lr,:32], r2
303 vld1.32 {d20[1]}, [lr,:32], r2
304 vrhadd.u8 d16, d16, d20
306 vld1.64 {d6}, [r1], r2
307 vext.8 d7, d6, d7, #1
310 vst1.32 {d16[0]}, [r0,:32], r2
311 vst1.32 {d16[1]}, [r0,:32], r2
@ NOTE(review): incomplete fragment — .endm terminators and surrounding
@ lines missing from this view; code kept byte-identical.
@ h264_loop_filter_start: the and/ands fold a packed per-edge value in ip
@ against shifted copies of itself — presumably combining the four tc0
@ bytes and setting flags for an early-out; TODO confirm (setup missing).
326 /* H.264 loop filter */
328 .macro h264_loop_filter_start
334 and ip, ip, ip, lsl #16
336 ands ip, ip, ip, lsl #8
@ align_push_regs / align_pop_regs: spill and reload the callee-saved NEON
@ registers d8-d15 through a 16-byte-aligned stack area ([sp,:128]).
340 .macro align_push_regs
344 vst1.64 {d12-d15}, [sp,:128]
346 vst1.64 {d8-d11}, [sp,:128]
349 .macro align_pop_regs
350 vld1.64 {d8-d11}, [sp,:128]!
351 vld1.64 {d12-d15}, [sp,:128], ip
@ NOTE(review): incomplete fragment — many interior lines and the .endm are
@ missing from this view; code kept byte-identical.
@ Core luma deblocking filter for one 16-pixel edge. Register layout per
@ the existing comments: q8=p0 q9=p1 q10=p2, q0=q0 q1=q1 q2=q2,
@ r2=alpha, r3=beta, q12 presumably holds the replicated tc0 values.
354 .macro h264_loop_filter_luma
355 vdup.8 q11, r2 @ alpha
357 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
359 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
361 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
362 vsli.32 q12, q12, #16 @ replicate tc0 across lanes (with the missing lines)
363 vclt.u8 q6, q6, q11 @ < alpha
364 vdup.8 q11, r3 @ beta
366 vclt.u8 q14, q14, q11 @ < beta
367 vclt.u8 q15, q15, q11 @ < beta
369 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
371 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
372 vclt.u8 q4, q4, q11 @ < beta
374 vclt.u8 q5, q5, q11 @ < beta
@ Strong-filter terms: (p0+q0+1)>>1 then halving adds toward p2/q2.
378 vrhadd.u8 q14, q8, q0
381 vhadd.u8 q10, q10, q14
383 vhadd.u8 q14, q2, q14
@ Clip p1'/q1' against tc (saturating subtract + max = clamp).
385 vqsub.u8 q11, q9, q12
388 vqsub.u8 q11, q1, q12
391 vmax.u8 q14, q14, q11
@ delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3, built in widened q10/q2.
394 vsubw.u8 q10, q10, d17
396 vshl.i16 q10, q10, #2
398 vaddw.u8 q10, q10, d19
400 vsubw.u8 q10, q10, d3
401 vrshrn.i16 d4, q2, #3
402 vrshrn.i16 d5, q10, #3
@ Apply the signed clipped delta to p0/q0.
412 vaddw.s8 q14, q14, d4
414 vsubw.s8 q11, q11, d4
415 vsubw.s8 q12, q12, d5
@ NOTE(review): incomplete fragment — endfunc and some interior lines are
@ missing from this view; code kept byte-identical.
@ Vertical-edge luma deblocking: r0 = pixel pointer, r1 = stride.
@ Loads three rows below and three rows above the edge, runs the shared
@ h264_loop_filter_luma macro, then writes back the filtered p1/p0/q0/q1.
422 function ff_h264_v_loop_filter_luma_neon, export=1
423 h264_loop_filter_start
425 vld1.64 {d0, d1}, [r0,:128], r1
426 vld1.64 {d2, d3}, [r0,:128], r1
427 vld1.64 {d4, d5}, [r0,:128], r1
@ Step back 6 rows (r1*4 + r1*2) to reach p2.
428 sub r0, r0, r1, lsl #2
429 sub r0, r0, r1, lsl #1
430 vld1.64 {d20,d21}, [r0,:128], r1
431 vld1.64 {d18,d19}, [r0,:128], r1
432 vld1.64 {d16,d17}, [r0,:128], r1
436 h264_loop_filter_luma
438 sub r0, r0, r1, lsl #1
439 vst1.64 {d8, d9}, [r0,:128], r1
440 vst1.64 {d16,d17}, [r0,:128], r1
441 vst1.64 {d0, d1}, [r0,:128], r1
442 vst1.64 {d10,d11}, [r0,:128]
@ NOTE(review): incomplete fragment — endfunc and several interior lines
@ missing from this view; code kept byte-identical.
@ Horizontal-edge luma deblocking: gathers 16 rows of 8 pixels, transposes
@ so columns become register rows, filters with h264_loop_filter_luma,
@ transposes back and scatters the 16 rows to memory.
448 function ff_h264_h_loop_filter_luma_neon, export=1
449 h264_loop_filter_start
452 vld1.64 {d6}, [r0], r1
453 vld1.64 {d20}, [r0], r1
454 vld1.64 {d18}, [r0], r1
455 vld1.64 {d16}, [r0], r1
456 vld1.64 {d0}, [r0], r1
457 vld1.64 {d2}, [r0], r1
458 vld1.64 {d4}, [r0], r1
459 vld1.64 {d26}, [r0], r1
460 vld1.64 {d7}, [r0], r1
461 vld1.64 {d21}, [r0], r1
462 vld1.64 {d19}, [r0], r1
463 vld1.64 {d17}, [r0], r1
464 vld1.64 {d1}, [r0], r1
465 vld1.64 {d3}, [r0], r1
466 vld1.64 {d5}, [r0], r1
467 vld1.64 {d27}, [r0], r1
469 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
@ Spill q2/q10 so the filter macro can use q4-q6 scratch (align_push_regs
@ presumably sits in the missing lines between these stores).
473 vst1.64 {d4, d5}, [sp,:128]
475 vst1.64 {d20,d21}, [sp,:128]
477 h264_loop_filter_luma
479 vld1.64 {d20,d21}, [sp,:128]!
480 vld1.64 {d4, d5}, [sp,:128]!
482 transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13
@ Rewind 16 rows and store the transposed, filtered columns back.
484 sub r0, r0, r1, lsl #4
485 vst1.64 {d6}, [r0], r1
486 vst1.64 {d20}, [r0], r1
487 vst1.64 {d8}, [r0], r1
488 vst1.64 {d16}, [r0], r1
489 vst1.64 {d0}, [r0], r1
490 vst1.64 {d10}, [r0], r1
491 vst1.64 {d4}, [r0], r1
492 vst1.64 {d26}, [r0], r1
493 vst1.64 {d7}, [r0], r1
494 vst1.64 {d21}, [r0], r1
495 vst1.64 {d9}, [r0], r1
496 vst1.64 {d17}, [r0], r1
497 vst1.64 {d1}, [r0], r1
498 vst1.64 {d11}, [r0], r1
499 vst1.64 {d5}, [r0], r1
500 vst1.64 {d27}, [r0], r1
@ NOTE(review): incomplete fragment — .endm and interior lines missing from
@ this view; code kept byte-identical.
@ Chroma deblocking filter for one edge, d-register (8-pixel) width.
@ Layout per existing comments: d16=p0, d18=p1, d0=q0, d2=q1;
@ r2=alpha, r3=beta. Delta computation lines are largely missing.
506 .macro h264_loop_filter_chroma
507 vdup.8 d22, r2 @ alpha
509 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
511 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
515 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
517 vclt.u8 d26, d26, d22 @ < alpha
519 vdup.8 d22, r3 @ beta
521 vrshrn.i16 d4, q2, #3 @ delta = (acc + 4) >> 3
522 vclt.u8 d28, d28, d22 @ < beta
524 vclt.u8 d30, d30, d22 @ < beta
@ Apply signed clipped delta to p0 (add) and q0 (subtract).
533 vaddw.s8 q14, q14, d4
534 vsubw.s8 q11, q11, d4
@ NOTE(review): incomplete fragment — endfunc missing from this view;
@ code kept byte-identical.
@ Vertical-edge chroma deblocking: load p1,p0,q0,q1 rows around the edge,
@ filter, and write back p0/q0. r0 = pixel pointer, r1 = stride.
539 function ff_h264_v_loop_filter_chroma_neon, export=1
540 h264_loop_filter_start
542 sub r0, r0, r1, lsl #1
543 vld1.64 {d18}, [r0,:64], r1
544 vld1.64 {d16}, [r0,:64], r1
545 vld1.64 {d0}, [r0,:64], r1
546 vld1.64 {d2}, [r0,:64]
548 h264_loop_filter_chroma
550 sub r0, r0, r1, lsl #1
551 vst1.64 {d16}, [r0,:64], r1
552 vst1.64 {d0}, [r0,:64], r1
@ NOTE(review): incomplete fragment — endfunc and the transpose lines
@ between load/filter/store are missing from this view; code byte-identical.
@ Horizontal-edge chroma deblocking: gather 8 rows of 4 pixels via 32-bit
@ lane loads (two rows per d register), filter, and scatter back.
557 function ff_h264_h_loop_filter_chroma_neon, export=1
558 h264_loop_filter_start
561 vld1.32 {d18[0]}, [r0], r1
562 vld1.32 {d16[0]}, [r0], r1
563 vld1.32 {d0[0]}, [r0], r1
564 vld1.32 {d2[0]}, [r0], r1
565 vld1.32 {d18[1]}, [r0], r1
566 vld1.32 {d16[1]}, [r0], r1
567 vld1.32 {d0[1]}, [r0], r1
568 vld1.32 {d2[1]}, [r0], r1
575 h264_loop_filter_chroma
@ Rewind 8 rows and write the filtered columns back.
582 sub r0, r0, r1, lsl #3
583 vst1.32 {d18[0]}, [r0], r1
584 vst1.32 {d16[0]}, [r0], r1
585 vst1.32 {d0[0]}, [r0], r1
586 vst1.32 {d2[0]}, [r0], r1
587 vst1.32 {d18[1]}, [r0], r1
588 vst1.32 {d16[1]}, [r0], r1
589 vst1.32 {d0[1]}, [r0], r1
590 vst1.32 {d2[1]}, [r0], r1
@ NOTE(review): incomplete fragment — .endm terminators, the lowpass_const
@ body (which presumably loads the 6-tap coefficients 20 and 5 into d6),
@ and several interior lines are missing from this view; code byte-identical.
@ H.264 6-tap qpel lowpass: out = clip((E-5F+20G+20H-5I+J + 16) >> 5).
@ vext.8 #1..#5 shift the source window to form the six taps; vmla/vmls by
@ d6[1]/d6[0] apply the 20/-5 weights — tap values not visible, TODO confirm.
597 .macro lowpass_const r
@ lowpass_8: filter two 8-pixel rows (\r0\r1 and \r2\r3) into \d0 and \d1.
603 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
611 vext.8 d2, \r0, \r1, #2
612 vext.8 d3, \r0, \r1, #3
614 vext.8 d4, \r0, \r1, #1
615 vext.8 d5, \r0, \r1, #4
617 vext.8 d30, \r0, \r1, #5
618 vaddl.u8 t0, \r0, d30
619 vext.8 d18, \r2, \r3, #2
620 vmla.i16 t0, q1, d6[1]
621 vext.8 d19, \r2, \r3, #3
622 vaddl.u8 q9, d18, d19
623 vext.8 d20, \r2, \r3, #1
624 vmls.i16 t0, q2, d6[0]
625 vext.8 d21, \r2, \r3, #4
626 vaddl.u8 q10, d20, d21
627 vext.8 d31, \r2, \r3, #5
628 vaddl.u8 t1, \r2, d31
629 vmla.i16 t1, q9, d6[1]
630 vmls.i16 t1, q10, d6[0]
@ narrow=1: saturating rounding narrow back to u8 with >>5.
632 vqrshrun.s16 \d0, t0, #5
633 vqrshrun.s16 \d1, t1, #5
@ lowpass_8_1: single-row variant of the same filter.
639 .macro lowpass_8_1 r0, r1, d0, narrow=1
645 vext.8 d2, \r0, \r1, #2
646 vext.8 d3, \r0, \r1, #3
648 vext.8 d4, \r0, \r1, #1
649 vext.8 d5, \r0, \r1, #4
651 vext.8 d30, \r0, \r1, #5
652 vaddl.u8 t0, \r0, d30
653 vmla.i16 t0, q1, d6[1]
654 vmls.i16 t0, q2, d6[0]
656 vqrshrun.s16 \d0, t0, #5
@ lowpass_8.16: 16-bit-input variant used by the 2-D (hv) path; operates on
@ intermediate 16-bit rows and narrows with >>10 (5+5 filter passes).
661 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
662 vext.16 q1, \r0, \r1, #2
663 vext.16 q0, \r0, \r1, #3
665 vext.16 q2, \r0, \r1, #1
667 vext.16 q3, \r0, \r1, #4
668 vaddl.s16 q10, d4, d6
669 vext.16 \r1, \r0, \r1, #5
671 vaddl.s16 q0, \h0, \h1
672 vaddl.s16 q8, \l0, \l1
@ x*20 computed as x + (x<<2)*... — shift/add sequence partly missing.
676 vshl.i32 q15, q10, #2
678 vadd.i32 q10, q10, q15
692 vrshrn.s32 d18, q9, #10
693 vrshrn.s32 d19, q1, #10
@ NOTE(review): incomplete fragment — endfunc markers, loop counters and
@ branch-back lines are missing from this view; code kept byte-identical.
@ Horizontal 6-tap lowpass family. The 16-wide versions call the 8-wide
@ worker for each quadrant, rewinding pointers (lsl #4 = 16 rows) between
@ calls; the final call is a tail-call via plain b.
698 function put_h264_qpel16_h_lowpass_neon_packed
702 bl put_h264_qpel8_h_lowpass_neon
703 sub r1, r1, r2, lsl #4
707 b put_h264_qpel8_h_lowpass_neon
710 function put_h264_qpel16_h_lowpass_neon
713 bl put_h264_qpel8_h_lowpass_neon
714 sub r0, r0, r3, lsl #4
715 sub r1, r1, r2, lsl #4
@ 8-wide worker: two rows per iteration through lowpass_8.
@ r0=dst (r3 = dst stride), r1=src (r2 = src stride).
722 function put_h264_qpel8_h_lowpass_neon
723 1: vld1.64 {d0, d1}, [r1], r2
724 vld1.64 {d16,d17}, [r1], r2
726 lowpass_8 d0, d1, d16, d17, d0, d16
727 vst1.64 {d0}, [r0,:64], r3
728 vst1.64 {d16}, [r0,:64], r3
@ l2 variants additionally average the filtered result with a second
@ source ([r3]) via vrhadd.u8 — used for quarter-pel positions.
733 function put_h264_qpel16_h_lowpass_l2_neon
736 bl put_h264_qpel8_h_lowpass_l2_neon
737 sub r0, r0, r2, lsl #4
738 sub r1, r1, r2, lsl #4
739 sub r3, r3, r2, lsl #4
747 function put_h264_qpel8_h_lowpass_l2_neon
748 1: vld1.64 {d0, d1}, [r1], r2
749 vld1.64 {d16,d17}, [r1], r2
750 vld1.64 {d28}, [r3], r2
751 vld1.64 {d29}, [r3], r2
753 lowpass_8 d0, d1, d16, d17, d0, d1
754 vrhadd.u8 q0, q0, q14
755 vst1.64 {d0}, [r0,:64], r2
756 vst1.64 {d1}, [r0,:64], r2
@ NOTE(review): incomplete fragment — endfunc markers and some interior
@ lines missing from this view; code kept byte-identical.
@ Vertical 6-tap lowpass. The 16-wide drivers call the 8-wide worker four
@ times, rewinding src (r1, stride r3) and dst (r0, stride r2) between
@ quadrants; last call is a tail-call.
761 function put_h264_qpel16_v_lowpass_neon_packed
764 bl put_h264_qpel8_v_lowpass_neon
765 sub r1, r1, r3, lsl #2
766 bl put_h264_qpel8_v_lowpass_neon
767 sub r1, r1, r3, lsl #4
768 sub r1, r1, r3, lsl #2
770 bl put_h264_qpel8_v_lowpass_neon
771 sub r1, r1, r3, lsl #2
773 b put_h264_qpel8_v_lowpass_neon
776 function put_h264_qpel16_v_lowpass_neon
778 bl put_h264_qpel8_v_lowpass_neon
779 sub r1, r1, r3, lsl #2
780 bl put_h264_qpel8_v_lowpass_neon
781 sub r0, r0, r2, lsl #4
783 sub r1, r1, r3, lsl #4
784 sub r1, r1, r3, lsl #2
786 bl put_h264_qpel8_v_lowpass_neon
787 sub r1, r1, r3, lsl #2
@ 8-wide worker: loads 12+ rows, transposes so the vertical filter becomes
@ a horizontal lowpass_8, then transposes back and stores 8 rows.
791 function put_h264_qpel8_v_lowpass_neon
792 vld1.64 {d8}, [r1], r3
793 vld1.64 {d10}, [r1], r3
794 vld1.64 {d12}, [r1], r3
795 vld1.64 {d14}, [r1], r3
796 vld1.64 {d22}, [r1], r3
797 vld1.64 {d24}, [r1], r3
798 vld1.64 {d26}, [r1], r3
799 vld1.64 {d28}, [r1], r3
800 vld1.64 {d9}, [r1], r3
801 vld1.64 {d11}, [r1], r3
802 vld1.64 {d13}, [r1], r3
803 vld1.64 {d15}, [r1], r3
806 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
807 lowpass_8 d8, d9, d10, d11, d8, d10
808 lowpass_8 d12, d13, d14, d15, d12, d14
809 lowpass_8 d22, d23, d24, d25, d22, d24
810 lowpass_8 d26, d27, d28, d29, d26, d28
811 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
813 vst1.64 {d8}, [r0,:64], r2
814 vst1.64 {d10}, [r0,:64], r2
815 vst1.64 {d12}, [r0,:64], r2
816 vst1.64 {d14}, [r0,:64], r2
817 vst1.64 {d22}, [r0,:64], r2
818 vst1.64 {d24}, [r0,:64], r2
819 vst1.64 {d26}, [r0,:64], r2
820 vst1.64 {d28}, [r0,:64], r2
@ NOTE(review): incomplete fragment — endfunc markers and interior lines
@ missing from this view; code kept byte-identical.
@ Vertical lowpass with averaging against a second source ([ip], stride r2)
@ — the l2 (quarter-pel) variant of the block above.
825 function put_h264_qpel16_v_lowpass_l2_neon
827 bl put_h264_qpel8_v_lowpass_l2_neon
828 sub r1, r1, r3, lsl #2
829 bl put_h264_qpel8_v_lowpass_l2_neon
830 sub r0, r0, r3, lsl #4
831 sub ip, ip, r2, lsl #4
834 sub r1, r1, r3, lsl #4
835 sub r1, r1, r3, lsl #2
837 bl put_h264_qpel8_v_lowpass_l2_neon
838 sub r1, r1, r3, lsl #2
842 function put_h264_qpel8_v_lowpass_l2_neon
843 vld1.64 {d8}, [r1], r3
844 vld1.64 {d10}, [r1], r3
845 vld1.64 {d12}, [r1], r3
846 vld1.64 {d14}, [r1], r3
847 vld1.64 {d22}, [r1], r3
848 vld1.64 {d24}, [r1], r3
849 vld1.64 {d26}, [r1], r3
850 vld1.64 {d28}, [r1], r3
851 vld1.64 {d9}, [r1], r3
852 vld1.64 {d11}, [r1], r3
853 vld1.64 {d13}, [r1], r3
854 vld1.64 {d15}, [r1], r3
857 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
858 lowpass_8 d8, d9, d10, d11, d8, d9
859 lowpass_8 d12, d13, d14, d15, d12, d13
860 lowpass_8 d22, d23, d24, d25, d22, d23
861 lowpass_8 d26, d27, d28, d29, d26, d27
862 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
@ Load the second prediction and round-average into the filtered rows.
864 vld1.64 {d0}, [ip], r2
865 vld1.64 {d1}, [ip], r2
866 vld1.64 {d2}, [ip], r2
867 vld1.64 {d3}, [ip], r2
868 vld1.64 {d4}, [ip], r2
870 vld1.64 {d5}, [ip], r2
872 vld1.64 {d10}, [ip], r2
873 vrhadd.u8 q2, q2, q11
874 vld1.64 {d11}, [ip], r2
876 vst1.64 {d0}, [r0,:64], r3
877 vst1.64 {d1}, [r0,:64], r3
878 vrhadd.u8 q5, q5, q13
879 vst1.64 {d2}, [r0,:64], r3
880 vst1.64 {d3}, [r0,:64], r3
881 vst1.64 {d4}, [r0,:64], r3
882 vst1.64 {d5}, [r0,:64], r3
883 vst1.64 {d10}, [r0,:64], r3
884 vst1.64 {d11}, [r0,:64], r3
@ NOTE(review): incomplete fragment — endfunc, loop control and several
@ interior lines missing from this view; code kept byte-identical.
@ 2-D (horizontal+vertical) lowpass core: first pass filters rows
@ horizontally into a 16-bit scratch buffer at [r4] (without narrowing),
@ then the buffer is reloaded, transposed via swap4/transpose16_4x4, and
@ filtered vertically with lowpass_8.16 (>>10 total). Results end up in
@ d8-d15 for the callers below. ip appears to be a negative row stride
@ into the scratch buffer — TODO confirm (setup lines missing).
889 function put_h264_qpel8_hv_lowpass_neon_top
892 1: vld1.64 {d0, d1}, [r1], r3
893 vld1.64 {d16,d17}, [r1], r3
895 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
896 vst1.64 {d22-d25}, [r4,:128]!
899 vld1.64 {d0, d1}, [r1]
900 lowpass_8_1 d0, d1, q12, narrow=0
@ Reload the 12 intermediate 16-bit rows (bottom-up via ip).
904 vld1.64 {d30,d31}, [r4,:128], ip
905 vld1.64 {d20,d21}, [r4,:128], ip
906 vld1.64 {d18,d19}, [r4,:128], ip
907 vld1.64 {d16,d17}, [r4,:128], ip
908 vld1.64 {d14,d15}, [r4,:128], ip
909 vld1.64 {d12,d13}, [r4,:128], ip
910 vld1.64 {d10,d11}, [r4,:128], ip
911 vld1.64 {d8, d9}, [r4,:128], ip
912 vld1.64 {d6, d7}, [r4,:128], ip
913 vld1.64 {d4, d5}, [r4,:128], ip
914 vld1.64 {d2, d3}, [r4,:128], ip
915 vld1.64 {d0, d1}, [r4,:128]
917 swap4 d1, d3, d5, d7, d8, d10, d12, d14
918 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
920 swap4 d17, d19, d21, d31, d24, d26, d28, d22
921 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
@ Store the transposed upper half back to scratch for the second batch.
923 vst1.64 {d30,d31}, [r4,:128]!
924 vst1.64 {d6, d7}, [r4,:128]!
925 vst1.64 {d20,d21}, [r4,:128]!
926 vst1.64 {d4, d5}, [r4,:128]!
927 vst1.64 {d18,d19}, [r4,:128]!
928 vst1.64 {d2, d3}, [r4,:128]!
929 vst1.64 {d16,d17}, [r4,:128]!
930 vst1.64 {d0, d1}, [r4,:128]
@ Vertical 6-tap pass over the 16-bit intermediates.
932 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
933 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
934 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
935 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
937 vld1.64 {d16,d17}, [r4,:128], ip
938 vld1.64 {d30,d31}, [r4,:128], ip
939 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
940 vld1.64 {d16,d17}, [r4,:128], ip
941 vld1.64 {d30,d31}, [r4,:128], ip
942 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
943 vld1.64 {d16,d17}, [r4,:128], ip
944 vld1.64 {d30,d31}, [r4,:128], ip
945 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
946 vld1.64 {d16,d17}, [r4,:128], ip
947 vld1.64 {d30,d31}, [r4,:128]
948 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
950 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
@ NOTE(review): incomplete fragment — endfunc markers and interior lines
@ missing from this view; code kept byte-identical.
@ hv lowpass wrappers: the _top core leaves the 8x8 result in d12-d15 and
@ d8-d11; the plain version stores it, the _l2 version first averages with
@ a second prediction read from [r2].
955 function put_h264_qpel8_hv_lowpass_neon
957 bl put_h264_qpel8_hv_lowpass_neon_top
958 vst1.64 {d12}, [r0,:64], r2
959 vst1.64 {d13}, [r0,:64], r2
960 vst1.64 {d14}, [r0,:64], r2
961 vst1.64 {d15}, [r0,:64], r2
962 vst1.64 {d8}, [r0,:64], r2
963 vst1.64 {d9}, [r0,:64], r2
964 vst1.64 {d10}, [r0,:64], r2
965 vst1.64 {d11}, [r0,:64], r2
971 function put_h264_qpel8_hv_lowpass_l2_neon
973 bl put_h264_qpel8_hv_lowpass_neon_top
975 vld1.64 {d0, d1}, [r2,:128]!
976 vld1.64 {d2, d3}, [r2,:128]!
978 vld1.64 {d4, d5}, [r2,:128]!
980 vld1.64 {d6, d7}, [r2,:128]!
983 vst1.64 {d0}, [r0,:64], r3
985 vst1.64 {d1}, [r0,:64], r3
986 vst1.64 {d2}, [r0,:64], r3
987 vst1.64 {d3}, [r0,:64], r3
988 vst1.64 {d4}, [r0,:64], r3
989 vst1.64 {d5}, [r0,:64], r3
990 vst1.64 {d6}, [r0,:64], r3
991 vst1.64 {d7}, [r0,:64], r3
@ 16-wide drivers: four 8x8 quadrants, rewinding src/dst between calls.
997 function put_h264_qpel16_hv_lowpass_neon
999 bl put_h264_qpel8_hv_lowpass_neon
1000 sub r1, r1, r3, lsl #2
1001 bl put_h264_qpel8_hv_lowpass_neon
1002 sub r1, r1, r3, lsl #4
1003 sub r1, r1, r3, lsl #2
1005 sub r0, r0, r2, lsl #4
1007 bl put_h264_qpel8_hv_lowpass_neon
1008 sub r1, r1, r3, lsl #2
1010 b put_h264_qpel8_hv_lowpass_neon
1013 function put_h264_qpel16_hv_lowpass_l2_neon
1016 bl put_h264_qpel8_hv_lowpass_l2_neon
1017 sub r1, r1, r3, lsl #2
1018 bl put_h264_qpel8_hv_lowpass_l2_neon
1019 sub r1, r1, r3, lsl #4
1020 sub r1, r1, r3, lsl #2
1022 sub r0, r0, r3, lsl #4
1024 bl put_h264_qpel8_hv_lowpass_l2_neon
1025 sub r1, r1, r3, lsl #2
1027 b put_h264_qpel8_hv_lowpass_l2_neon
@ NOTE(review): incomplete fragment — endfunc markers, register setup and
@ stack-buffer address computations are missing from this view; code kept
@ byte-identical.
@ 8x8 quarter-pel MC entry points, one per (x,y) sub-pel position mcXY.
@ Each sets up pointers/strides (missing lines) and dispatches to the
@ h/v/hv lowpass workers; several positions tail-branch into a sibling's
@ shared internal label (put_h264_qpel8_mc01/mc11/mc12/mc21) with a
@ different src offset pushed beforehand.
1030 function ff_put_h264_qpel8_mc10_neon, export=1
1035 b put_h264_qpel8_h_lowpass_l2_neon
1038 function ff_put_h264_qpel8_mc20_neon, export=1
1043 b put_h264_qpel8_h_lowpass_neon
1046 function ff_put_h264_qpel8_mc30_neon, export=1
1051 b put_h264_qpel8_h_lowpass_l2_neon
1054 function ff_put_h264_qpel8_mc01_neon, export=1
1057 put_h264_qpel8_mc01:
@ Step src back two rows so the 6-tap vertical filter is centered.
1060 sub r1, r1, r2, lsl #1
1062 bl put_h264_qpel8_v_lowpass_l2_neon
1067 function ff_put_h264_qpel8_mc11_neon, export=1
1068 push {r0, r1, r2, lr}
1069 put_h264_qpel8_mc11:
@ Diagonal position: horizontal pass into a temp, then vertical+average.
1077 bl put_h264_qpel8_h_lowpass_neon
1081 sub r1, r1, r2, lsl #1
1083 bl put_h264_qpel8_v_lowpass_l2_neon
1089 function ff_put_h264_qpel8_mc21_neon, export=1
1090 push {r0, r1, r4, r10, r11, lr}
1091 put_h264_qpel8_mc21:
@ Scratch: 8x8 bytes for the h result + 16*12 halfwords for the hv core.
1095 sub sp, sp, #(8*8+16*12)
1101 bl put_h264_qpel8_h_lowpass_neon
1104 sub r1, r1, r2, lsl #1
1108 bl put_h264_qpel8_hv_lowpass_l2_neon
1111 pop {r4, r10, r11, pc}
1114 function ff_put_h264_qpel8_mc31_neon, export=1
1116 push {r0, r1, r2, lr}
1118 b put_h264_qpel8_mc11
1121 function ff_put_h264_qpel8_mc02_neon, export=1
1124 sub r1, r1, r2, lsl #1
1127 bl put_h264_qpel8_v_lowpass_neon
1132 function ff_put_h264_qpel8_mc12_neon, export=1
1133 push {r0, r1, r4, r10, r11, lr}
1134 put_h264_qpel8_mc12:
1138 sub sp, sp, #(8*8+16*12)
1139 sub r1, r1, r2, lsl #1
1144 bl put_h264_qpel8_v_lowpass_neon
1147 sub r1, r1, r3, lsl #1
1150 bl put_h264_qpel8_hv_lowpass_l2_neon
1153 pop {r4, r10, r11, pc}
1156 function ff_put_h264_qpel8_mc22_neon, export=1
1157 push {r4, r10, r11, lr}
1160 sub r1, r1, r2, lsl #1
1163 sub sp, sp, #(16*12)
1166 bl put_h264_qpel8_hv_lowpass_neon
1169 pop {r4, r10, r11, pc}
1172 function ff_put_h264_qpel8_mc32_neon, export=1
1173 push {r0, r1, r4, r10, r11, lr}
1175 b put_h264_qpel8_mc12
1178 function ff_put_h264_qpel8_mc03_neon, export=1
1181 b put_h264_qpel8_mc01
1184 function ff_put_h264_qpel8_mc13_neon, export=1
1185 push {r0, r1, r2, lr}
1187 b put_h264_qpel8_mc11
1190 function ff_put_h264_qpel8_mc23_neon, export=1
1191 push {r0, r1, r4, r10, r11, lr}
1193 b put_h264_qpel8_mc21
1196 function ff_put_h264_qpel8_mc33_neon, export=1
1198 push {r0, r1, r2, lr}
1201 b put_h264_qpel8_mc11
@ NOTE(review): incomplete fragment — endfunc markers and setup lines are
@ missing from this view; code kept byte-identical.
@ 16x16 quarter-pel MC entry points, mirroring the 8x8 set above but with
@ 16-wide lowpass workers and larger stack scratch (16*16 bytes + 16*12
@ halfwords for the hv core).
1204 function ff_put_h264_qpel16_mc10_neon, export=1
1208 b put_h264_qpel16_h_lowpass_l2_neon
1211 function ff_put_h264_qpel16_mc20_neon, export=1
1215 b put_h264_qpel16_h_lowpass_neon
1218 function ff_put_h264_qpel16_mc30_neon, export=1
1222 b put_h264_qpel16_h_lowpass_l2_neon
1225 function ff_put_h264_qpel16_mc01_neon, export=1
1228 put_h264_qpel16_mc01:
1231 sub r1, r1, r2, lsl #1
1233 bl put_h264_qpel16_v_lowpass_l2_neon
1238 function ff_put_h264_qpel16_mc11_neon, export=1
1239 push {r0, r1, r4, lr}
1240 put_h264_qpel16_mc11:
1247 bl put_h264_qpel16_h_lowpass_neon
1252 sub r1, r1, r2, lsl #1
1254 bl put_h264_qpel16_v_lowpass_l2_neon
@ Release the 256-byte temp (plus 8 bytes alignment/saved word).
1256 add sp, sp, #(256+8)
1260 function ff_put_h264_qpel16_mc21_neon, export=1
1261 push {r0, r1, r4-r5, r9-r11, lr}
1262 put_h264_qpel16_mc21:
1266 sub sp, sp, #(16*16+16*12)
1270 bl put_h264_qpel16_h_lowpass_neon_packed
1273 sub r1, r1, r2, lsl #1
1276 bl put_h264_qpel16_hv_lowpass_l2_neon
1279 pop {r4-r5, r9-r11, pc}
1282 function ff_put_h264_qpel16_mc31_neon, export=1
1284 push {r0, r1, r4, lr}
1286 b put_h264_qpel16_mc11
1289 function ff_put_h264_qpel16_mc02_neon, export=1
1292 sub r1, r1, r2, lsl #1
1295 bl put_h264_qpel16_v_lowpass_neon
1300 function ff_put_h264_qpel16_mc12_neon, export=1
1301 push {r0, r1, r4-r5, r9-r11, lr}
1302 put_h264_qpel16_mc12:
1306 sub sp, sp, #(16*16+16*12)
1307 sub r1, r1, r2, lsl #1
1311 bl put_h264_qpel16_v_lowpass_neon_packed
1314 sub r1, r1, r3, lsl #1
1317 bl put_h264_qpel16_hv_lowpass_l2_neon
1320 pop {r4-r5, r9-r11, pc}
1323 function ff_put_h264_qpel16_mc22_neon, export=1
1324 push {r4, r9-r11, lr}
1328 sub r1, r1, r2, lsl #1
1331 sub sp, sp, #(16*12)
1334 bl put_h264_qpel16_hv_lowpass_neon
1337 pop {r4, r9-r11, pc}
1340 function ff_put_h264_qpel16_mc32_neon, export=1
1341 push {r0, r1, r4-r5, r9-r11, lr}
1343 b put_h264_qpel16_mc12
1346 function ff_put_h264_qpel16_mc03_neon, export=1
1349 b put_h264_qpel16_mc01
1352 function ff_put_h264_qpel16_mc13_neon, export=1
1353 push {r0, r1, r4, lr}
1355 b put_h264_qpel16_mc11
1358 function ff_put_h264_qpel16_mc23_neon, export=1
1359 push {r0, r1, r4-r5, r9-r11, lr}
1361 b put_h264_qpel16_mc21
1364 function ff_put_h264_qpel16_mc33_neon, export=1
1366 push {r0, r1, r4, lr}
1369 b put_h264_qpel16_mc11
@ NOTE(review): incomplete fragment — .endm/endfunc terminators and the
@ multiply-accumulate bodies are missing from this view; code byte-identical.
@ Biweighted (bidirectional weighted) prediction. The biweight_N macros
@ take the two widening-MAC mnemonics (\macs/\macd — vmlal/vmlsl chosen by
@ the weight signs) as arguments, accumulate w0*src0 + w1*src1 in 16 bits,
@ shift by the log2 denominator held in q9 (vshl.s16 by a register is a
@ signed, possibly-negative shift), saturate-narrow and store.
1372 @ Biweighted prediction
1374 .macro biweight_16 macs, macd
1380 vld1.8 {d20-d21},[r0,:128], r2
1384 vld1.8 {d22-d23},[r1,:128], r2
1389 vld1.8 {d28-d29},[r0,:128], r2
1394 vld1.8 {d30-d31},[r1,:128], r2
1402 vshl.s16 q12, q12, q9
1403 vshl.s16 q13, q13, q9
1404 vqmovun.s16 d24, q12
1405 vqmovun.s16 d25, q13
1407 vst1.8 {d4- d5}, [r6,:128], r2
1409 vst1.8 {d24-d25},[r6,:128], r2
1414 .macro biweight_8 macs, macd
1420 vld1.8 {d4},[r0,:64], r2
1423 vld1.8 {d5},[r1,:64], r2
1426 vld1.8 {d6},[r0,:64], r2
1429 vld1.8 {d7},[r1,:64], r2
1434 vshl.s16 q10, q10, q9
1437 vst1.8 {d2},[r6,:64], r2
1439 vst1.8 {d4},[r6,:64], r2
1444 .macro biweight_4 macs, macd
1450 vld1.32 {d4[0]},[r0,:32], r2
1451 vld1.32 {d4[1]},[r0,:32], r2
1454 vld1.32 {d5[0]},[r1,:32], r2
1455 vld1.32 {d5[1]},[r1,:32], r2
1459 vld1.32 {d6[0]},[r0,:32], r2
1460 vld1.32 {d6[1]},[r0,:32], r2
1463 vld1.32 {d7[0]},[r1,:32], r2
1464 vld1.32 {d7[1]},[r1,:32], r2
1469 vshl.s16 q10, q10, q9
1472 vst1.32 {d2[0]},[r6,:32], r2
1473 vst1.32 {d2[1]},[r6,:32], r2
1475 vst1.32 {d4[0]},[r6,:32], r2
1476 vst1.32 {d4[1]},[r6,:32], r2
@ Label 2: odd-height tail handling two remaining rows.
1479 2: vshl.s16 q1, q1, q9
1481 vst1.32 {d2[0]},[r6,:32], r2
1482 vst1.32 {d2[1]},[r6,:32], r2
@ Dispatcher: the eors on the weight sign bits (r5 >> 30) selects one of
@ four macro instantiations covering the (+,+)/(+,-)/(-,-)/(-,+) sign
@ combinations of the two weights.
1486 .macro biweight_func w
1487 function biweight_h264_pixels_\w\()_neon
1493 eors lr, lr, r5, lsr #30
1506 10: biweight_\w vmlal.u8, vmlal.u8
1508 biweight_\w vmlal.u8, vmlsl.u8
1511 biweight_\w vmlsl.u8, vmlsl.u8
1513 biweight_\w vmlsl.u8, vmlal.u8
@ Exported WxH entry points; b=0 presumably means "fall through"/no branch
@ setup for the matching square size — TODO confirm (macro body missing).
1517 .macro biweight_entry w, h, b=1
1518 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1521 b biweight_h264_pixels_\w\()_neon
1526 biweight_entry 16, 8
1527 biweight_entry 16, 16, b=0
1530 biweight_entry 8, 16
1532 biweight_entry 8, 8, b=0
1537 biweight_entry 4, 4, b=0
@ NOTE(review): incomplete fragment — .endm/endfunc terminators and the
@ multiply bodies are missing from this view; code kept byte-identical.
@ Unidirectional weighted prediction: same structure as the biweight
@ macros above but with a single source ([r0], stride r1) and a single
@ MAC mnemonic \mac; q9 again holds the (signed) denominator shift.
1540 @ Weighted prediction
1542 .macro weight_16 mac
1547 vld1.8 {d20-d21},[r0,:128], r1
1552 vld1.8 {d28-d29},[r0,:128], r1
1561 vshl.s16 q12, q12, q9
1562 vshl.s16 q13, q13, q9
1563 vqmovun.s16 d24, q12
1564 vqmovun.s16 d25, q13
1566 vst1.8 {d4- d5}, [r4,:128], r1
1568 vst1.8 {d24-d25},[r4,:128], r1
@ weight_8 body (macro head missing from this view).
1578 vld1.8 {d4},[r0,:64], r1
1581 vld1.8 {d6},[r0,:64], r1
1586 vshl.s16 q10, q10, q9
1589 vst1.8 {d2},[r4,:64], r1
1591 vst1.8 {d4},[r4,:64], r1
@ weight_4 body (macro head missing from this view).
1601 vld1.32 {d4[0]},[r0,:32], r1
1602 vld1.32 {d4[1]},[r0,:32], r1
1606 vld1.32 {d6[0]},[r0,:32], r1
1607 vld1.32 {d6[1]},[r0,:32], r1
1612 vshl.s16 q10, q10, q9
1615 vst1.32 {d2[0]},[r4,:32], r1
1616 vst1.32 {d2[1]},[r4,:32], r1
1618 vst1.32 {d4[0]},[r4,:32], r1
1619 vst1.32 {d4[1]},[r4,:32], r1
@ Label 2: odd-height tail.
1622 2: vshl.s16 q1, q1, q9
1624 vst1.32 {d2[0]},[r4,:32], r1
1625 vst1.32 {d2[1]},[r4,:32], r1
1629 .macro weight_func w
1630 function weight_h264_pixels_\w\()_neon
1638 addge r4, r4, lr, lsl r2
1649 .macro weight_entry w, h, b=1
1650 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1653 b weight_h264_pixels_\w\()_neon
1659 weight_entry 16, 16, b=0
1664 weight_entry 8, 8, b=0
1669 weight_entry 4, 4, b=0