2 * ARM NEON optimised DSP functions
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/*
 * Row loop for 16-byte-wide blocks.  The enclosing .macro line is not part of
 * this excerpt; presumably this is the body of the pixels16 macro that the
 * pixfunc instantiations refer to (TODO confirm).
 * Visible register use: r1 = source pointer, r0 = destination pointer,
 * r2 = byte step applied after every load/store.
 */
/* Load four 16-byte rows from [r1] into q0-q3; r1 += r2 after each load. */
32 1: vld1.64 {d0, d1}, [r1], r2
33 vld1.64 {d2, d3}, [r1], r2
34 vld1.64 {d4, d5}, [r1], r2
36 vld1.64 {d6, d7}, [r1], r2
/*
 * Load four more 16-byte rows through ip into q8-q11.  How ip is initialised
 * and how q8-q11 are combined with q0-q3 is not visible here.
 */
41 vld1.64 {d16,d17}, [ip], r2
43 vld1.64 {d18,d19}, [ip], r2
45 vld1.64 {d20,d21}, [ip], r2
47 vld1.64 {d22,d23}, [ip], r2
/* Store q0-q3 to [r0]; the :128 qualifier asserts a 16-byte-aligned address. */
51 vst1.64 {d0, d1}, [r0,:128], r2
52 vst1.64 {d2, d3}, [r0,:128], r2
53 vst1.64 {d4, d5}, [r0,:128], r2
54 vst1.64 {d6, d7}, [r0,:128], r2
/*
 * pixels16_x2: macro processing two 16-byte-wide rows per pass of label 1.
 * Parameter vhadd names the halving-add instruction and defaults to the
 * rounding form vrhadd.u8; its use is on lines not present in this excerpt.
 */
59 .macro pixels16_x2 vhadd=vrhadd.u8
/* Load 24 bytes per row (d0-d2, d4-d6) from [r1]; r1 += r2 after each load. */
60 1: vld1.64 {d0-d2}, [r1], r2
61 vld1.64 {d4-d6}, [r1], r2
/* Store one 16-byte result row from q0 and one from q2 to the aligned destination. */
69 vst1.64 {d0, d1}, [r0,:128], r2
70 vst1.64 {d4, d5}, [r0,:128], r2
/*
 * pixels16_y2: macro reading 16-byte rows through two source pointers
 * (r1 and ip), each advanced by lr after every load.  The setup of ip and lr
 * and the use of the vhadd parameter (default vrhadd.u8) are not visible in
 * this excerpt.
 */
75 .macro pixels16_y2 vhadd=vrhadd.u8
/* One row via r1 into q0 and one row via ip into q1. */
79 vld1.64 {d0, d1}, [r1], lr
80 vld1.64 {d2, d3}, [ip], lr
/* Reload q0 and q1 with the next rows from the same two pointers. */
83 vld1.64 {d0, d1}, [r1], lr
85 vld1.64 {d2, d3}, [ip], lr
/* Store the two result rows held in q2 and q3; r0 += r2 after each store. */
88 vst1.64 {d4, d5}, [r0,:128], r2
89 vst1.64 {d6, d7}, [r0,:128], r2
/*
 * pixels16_xy2: macro working on 16-byte-wide rows with 16-bit intermediate
 * sums.  Parameters: vshrn = narrowing-shift instruction (default vrshrn.u16),
 * no_rnd = flag (default 0); the lines that use them are not in this excerpt.
 * Rows are read alternately through r1 and ip, both stepped by lr.
 */
94 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
/* Load 24 bytes per row so that a one-byte-shifted 16-byte vector can be formed. */
98 vld1.64 {d0-d2}, [r1], lr
99 vld1.64 {d4-d6}, [ip], lr
/* q1 = bytes 1..16 of the row held in q0:d2; q3 likewise for the row in q2:d6. */
105 vext.8 q1, q0, q1, #1
106 vext.8 q3, q2, q3, #1
112 vld1.64 {d0-d2}, [r1], lr
/* Add the 16-bit partial sums in q12 and q13, lane by lane. */
116 vadd.u16 q12, q12, q13
/* q15 = freshly loaded row shifted by one byte. */
118 vext.8 q15, q0, q1, #1
119 vadd.u16 q1 , q10, q11
/* This load overwrites d2-d4, i.e. q1 and the low half of q2. */
126 vld1.64 {d2-d4}, [ip], lr
/* Widening add: u8 lanes of d1 and d31 summed into u16 lanes of q10. */
127 vaddl.u8 q10, d1, d31
128 vst1.64 {d28,d29}, [r0,:128], r2
132 vadd.u16 q12, q12, q13
/* q2 = row loaded into q1:d4, shifted by one byte. */
134 vext.8 q2, q1, q2, #1
135 vadd.u16 q0, q10, q11
143 vst1.64 {d30,d31}, [r0,:128], r2
/*
 * Row loop for 8-byte-wide blocks.  The enclosing .macro line is not part of
 * this excerpt; presumably this is the pixels8 macro (TODO confirm).
 * r1 = source, r0 = destination, r2 = byte step after every access.
 */
/* Load four 8-byte rows from [r1] into d0-d3. */
149 1: vld1.64 {d0}, [r1], r2
150 vld1.64 {d1}, [r1], r2
151 vld1.64 {d2}, [r1], r2
153 vld1.64 {d3}, [r1], r2
/* Store d0-d3 to [r0]; the :64 qualifier asserts an 8-byte-aligned address. */
158 vst1.64 {d0}, [r0,:64], r2
159 vst1.64 {d1}, [r0,:64], r2
160 vst1.64 {d2}, [r0,:64], r2
161 vst1.64 {d3}, [r0,:64], r2
/*
 * pixels8_x2: macro processing two 8-byte-wide rows per pass of label 1.
 * Parameter vhadd (default vrhadd.u8) names the halving-add instruction; the
 * line applying it is not present in this excerpt.
 */
166 .macro pixels8_x2 vhadd=vrhadd.u8
/* Load 16 bytes of the row, then set d1 = bytes 1..8 (row shifted by one byte). */
167 1: vld1.64 {d0, d1}, [r1], r2
168 vext.8 d1, d0, d1, #1
/* Same for the second row: d3 = bytes 1..8 of the data in d2:d3. */
169 vld1.64 {d2, d3}, [r1], r2
170 vext.8 d3, d2, d3, #1
/* Store the two 8-byte result rows from d0 and d1. */
176 vst1.64 {d0}, [r0,:64], r2
177 vst1.64 {d1}, [r0,:64], r2
/*
 * pixels8_y2: macro reading 8-byte rows through two source pointers (r1 and
 * ip), each advanced by lr after every load.  The setup of ip and lr and the
 * use of the vhadd parameter (default vrhadd.u8) are not visible in this
 * excerpt.
 */
182 .macro pixels8_y2 vhadd=vrhadd.u8
/* One row via r1 into d0 and one row via ip into d1. */
186 vld1.64 {d0}, [r1], lr
187 vld1.64 {d1}, [ip], lr
/* Reload d0 and d1 with the next rows from the same two pointers. */
190 vld1.64 {d0}, [r1], lr
192 vld1.64 {d1}, [ip], lr
/* Store the two 8-byte result rows held in d4 and d5. */
195 vst1.64 {d4}, [r0,:64], r2
196 vst1.64 {d5}, [r0,:64], r2
/*
 * pixels8_xy2: macro working on 8-byte-wide rows with 16-bit intermediate
 * sums.  Parameters: vshrn = narrowing-shift instruction (default vrshrn.u16),
 * no_rnd = flag (default 0); the lines that use them are not in this excerpt.
 * Rows are read alternately through r1 and ip, both stepped by lr.
 */
201 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
/* Load 16 bytes per row so that a one-byte-shifted 8-byte vector can be formed. */
205 vld1.64 {d0, d1}, [r1], lr
206 vld1.64 {d2, d3}, [ip], lr
/* d4 = bytes 1..8 of the row in d0:d1; d6 = bytes 1..8 of the row in d2:d3. */
212 vext.8 d4, d0, d1, #1
213 vext.8 d6, d2, d3, #1
217 vld1.64 {d0, d1}, [r1], lr
220 vext.8 d4, d0, d1, #1
/* Add the 16-bit partial sums in q10 and q11, lane by lane. */
222 vadd.u16 q10, q10, q11
226 vld1.64 {d2, d3}, [ip], lr
230 vadd.u16 q10, q10, q11
/* Store an 8-byte result row from d5. */
232 vst1.64 {d5}, [r0,:64], r2
234 vext.8 d6, d2, d3, #1
/* Store an 8-byte result row from d7. */
236 vst1.64 {d7}, [r0,:64], r2
/*
 * pixfunc: opens an exported function whose symbol is built from the macro
 * arguments as ff_<pfx><name><suf>_neon.  The "\()" after \suf ends the
 * argument name so that "_neon" is appended literally.
 * rnd_op and the variadic args are not used on the lines shown here.
 */
241 .macro pixfunc pfx name suf rnd_op args:vararg
242 function ff_\pfx\name\suf\()_neon, export=1
/*
 * pixfunc2: wrapper around pixfunc.  The visible line forwards pfx, name and
 * all remaining arguments, so the first extra argument becomes pixfunc's suf.
 * Any other expansion lines of this macro are not part of this excerpt.
 */
247 .macro pixfunc2 pfx name args:vararg
249 pixfunc \pfx \name \args
/*
 * Instantiation of the exported pixel routines.  The bodies of the
 * ff_*_h264_qpel*_mc00_neon entry points are not part of this excerpt.
 */
252 function ff_put_h264_qpel16_mc00_neon, export=1
/* Opens ff_put_pixels16_neon (pfx=put_, name=pixels16, no suffix). */
256 pixfunc put_ pixels16
/*
 * Via pixfunc2 these open the ff_put_pixels16_{x2,y2,xy2}_no_rnd_neon symbols;
 * the trailing arguments select the non-rounding instruction (vhadd.u8 or
 * vshrn.u16) and, for xy2, pass an additional 1.
 */
257 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
258 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
259 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
261 function ff_avg_h264_qpel16_mc00_neon, export=1
/* Opens ff_avg_pixels16_neon: the suffix is left empty and the next argument is 1. */
265 pixfunc avg_ pixels16,, 1
267 function ff_put_h264_qpel8_mc00_neon, export=1
/* 8-pixel-wide counterparts: ff_put_pixels8_{x2,y2,xy2}_no_rnd_neon. */
272 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
273 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
274 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
/*
 * ff_float_to_int16_neon: converts a stream of 32-bit floats to 16-bit
 * integers.
 *   r0 = destination, written 16 bytes at a time with writeback (:128 aligned)
 *   r1 = source, read 16 bytes (four floats) at a time with writeback (:128 aligned)
 * Each value goes through two steps:
 *   vcvt.s32.f32 ..., #16  float -> signed 32-bit fixed point with 16
 *                          fraction bits
 *   vshrn.s32 ..., #16     keep the upper 16 bits, narrowing to 16-bit lanes
 * Loads, conversions and stores for neighbouring groups are interleaved.
 * The length argument, loop counter and branches are not part of this excerpt.
 */
276 function ff_float_to_int16_neon, export=1
/* Prime the pipeline: first eight floats converted into q8/q9. */
278 vld1.64 {d0-d1}, [r1,:128]!
279 vcvt.s32.f32 q8, q0, #16
280 vld1.64 {d2-d3}, [r1,:128]!
281 vcvt.s32.f32 q9, q1, #16
/* Narrow q8/q9 into q2 (d4, d5) while the next eight floats are loaded and converted. */
286 vshrn.s32 d4, q8, #16
287 vld1.64 {d0-d1}, [r1,:128]!
288 vcvt.s32.f32 q0, q0, #16
289 vshrn.s32 d5, q9, #16
290 vld1.64 {d2-d3}, [r1,:128]!
291 vcvt.s32.f32 q1, q1, #16
292 vshrn.s32 d6, q0, #16
293 vst1.64 {d4-d5}, [r0,:128]!
294 vshrn.s32 d7, q1, #16
/* Refill q8/q9 for the following pass before storing q3. */
295 vld1.64 {d16-d17},[r1,:128]!
296 vcvt.s32.f32 q8, q8, #16
297 vld1.64 {d18-d19},[r1,:128]!
298 vcvt.s32.f32 q9, q9, #16
299 vst1.64 {d6-d7}, [r0,:128]!
/* Label 2: same sequence without refilling q8/q9; writes sixteen results. */
303 2: vld1.64 {d0-d1}, [r1,:128]!
304 vshrn.s32 d4, q8, #16
305 vcvt.s32.f32 q0, q0, #16
306 vld1.64 {d2-d3}, [r1,:128]!
307 vshrn.s32 d5, q9, #16
308 vcvt.s32.f32 q1, q1, #16
309 vshrn.s32 d6, q0, #16
310 vst1.64 {d4-d5}, [r0,:128]!
311 vshrn.s32 d7, q1, #16
312 vst1.64 {d6-d7}, [r0,:128]!
/* Label 3: narrow and store the eight values still pending in q8/q9. */
314 3: vshrn.s32 d4, q8, #16
315 vshrn.s32 d5, q9, #16
316 vst1.64 {d4-d5}, [r0,:128]!
/*
 * ff_float_to_int16_interleave_neon: float -> int16 conversion that also
 * interleaves several input streams into one output.
 * Every sample is converted with vcvt.s32.f32 ..., #16 (signed 32-bit fixed
 * point, 16 fraction bits), so the wanted 16-bit value is the upper halfword
 * of each 32-bit lane.
 * The visible code falls into four parts:
 *   - a conditional branch to ff_float_to_int16_neon (the compare that sets
 *     the flags is not in this excerpt);
 *   - a two-stream path reading through r3 and r1 and writing linearly to r0;
 *   - a path that fetches four stream pointers at a time (ldmia r1!, {r4-r7})
 *     and writes through r8 with step ip, followed by a two-stream-at-a-time
 *     variant using r4/r5;
 *   - a one-stream-at-a-time path reading through r4 and writing single
 *     halfwords through r5 with step ip.
 * Counters, branches and the setup of r8/ip/r5 are on lines not shown here.
 */
320 function ff_float_to_int16_interleave_neon, export=1
/* "Less than" case: hand over to the non-interleaving routine. */
323 blt ff_float_to_int16_neon
/* Two-stream path: eight samples via r3 into q8/q9, eight via r1 into q10/q11. */
330 vld1.64 {d0-d1}, [r3,:128]!
331 vcvt.s32.f32 q8, q0, #16
332 vld1.64 {d2-d3}, [r3,:128]!
333 vcvt.s32.f32 q9, q1, #16
334 vld1.64 {d20-d21},[r1,:128]!
335 vcvt.s32.f32 q10, q10, #16
336 vld1.64 {d22-d23},[r1,:128]!
337 vcvt.s32.f32 q11, q11, #16
/* Next group: r3 stream into q0/q1, r1 stream into q12/q13, overlapped with stores. */
342 vld1.64 {d0-d1}, [r3,:128]!
343 vcvt.s32.f32 q0, q0, #16
345 vld1.64 {d2-d3}, [r3,:128]!
346 vcvt.s32.f32 q1, q1, #16
347 vld1.64 {d24-d25},[r1,:128]!
348 vcvt.s32.f32 q12, q12, #16
349 vld1.64 {d26-d27},[r1,:128]!
351 vst1.64 {d20-d21},[r0,:128]!
352 vcvt.s32.f32 q13, q13, #16
353 vst1.64 {d22-d23},[r0,:128]!
355 vld1.64 {d16-d17},[r3,:128]!
357 vst1.64 {d24-d25},[r0,:128]!
358 vcvt.s32.f32 q8, q8, #16
359 vld1.64 {d18-d19},[r3,:128]!
360 vcvt.s32.f32 q9, q9, #16
361 vld1.64 {d20-d21},[r1,:128]!
362 vcvt.s32.f32 q10, q10, #16
363 vld1.64 {d22-d23},[r1,:128]!
364 vcvt.s32.f32 q11, q11, #16
365 vst1.64 {d26-d27},[r0,:128]!
/*
 * vsri.32 q10, q8, #16 inserts q8 >> 16 into the low halfword of each 32-bit
 * lane of q10 and keeps q10's upper halfword.  Each lane then holds one 16-bit
 * sample pair: r3 stream in the low half, r1 stream in the high half.
 */
369 2: vsri.32 q10, q8, #16
370 vld1.64 {d0-d1}, [r3,:128]!
371 vcvt.s32.f32 q0, q0, #16
372 vld1.64 {d2-d3}, [r3,:128]!
373 vcvt.s32.f32 q1, q1, #16
374 vld1.64 {d24-d25},[r1,:128]!
375 vcvt.s32.f32 q12, q12, #16
377 vld1.64 {d26-d27},[r1,:128]!
378 vcvt.s32.f32 q13, q13, #16
379 vst1.64 {d20-d21},[r0,:128]!
381 vst1.64 {d22-d23},[r0,:128]!
383 vst1.64 {d24-d27},[r0,:128]!
/* Label 3: merge and store the pairs still pending in q8-q11. */
385 3: vsri.32 q10, q8, #16
387 vst1.64 {d20-d23},[r0,:128]!
/* Label 5: fetch the next four stream pointers from the array at r1 (with writeback). */
396 5: ldmia r1!, {r4-r7}
/* Four floats from each of the four streams into q8-q11. */
399 vld1.64 {d16-d17},[r4,:128]!
400 vcvt.s32.f32 q8, q8, #16
401 vld1.64 {d18-d19},[r5,:128]!
402 vcvt.s32.f32 q9, q9, #16
403 vld1.64 {d20-d21},[r6,:128]!
404 vcvt.s32.f32 q10, q10, #16
405 vld1.64 {d22-d23},[r7,:128]!
406 vcvt.s32.f32 q11, q11, #16
/* Next four floats per stream into q0-q3 while the previous group is merged. */
408 vld1.64 {d0-d1}, [r4,:128]!
409 vcvt.s32.f32 q0, q0, #16
411 vld1.64 {d2-d3}, [r5,:128]!
412 vcvt.s32.f32 q1, q1, #16
/* Pair the r6 stream (low half) with the r7 stream (high half) in q11. */
413 vsri.32 q11, q10, #16
414 vld1.64 {d4-d5}, [r6,:128]!
415 vcvt.s32.f32 q2, q2, #16
417 vld1.64 {d6-d7}, [r7,:128]!
418 vcvt.s32.f32 q3, q3, #16
/*
 * Write 8-byte groups through r8, advancing by ip after each store.
 * d18/d19 presumably hold the merged r4/r5 pair by now (the merging
 * instruction is not in this excerpt) - TODO confirm.
 */
420 vst1.64 {d18}, [r8], ip
422 vst1.64 {d22}, [r8], ip
424 vst1.64 {d19}, [r8], ip
426 vst1.64 {d23}, [r8], ip
/* Reload q8-q11 for the next pass, interleaved with the stores of q1/q3. */
429 vld1.64 {d16-d17},[r4,:128]!
430 vcvt.s32.f32 q8, q8, #16
431 vst1.64 {d2}, [r8], ip
432 vld1.64 {d18-d19},[r5,:128]!
433 vcvt.s32.f32 q9, q9, #16
434 vst1.64 {d6}, [r8], ip
435 vld1.64 {d20-d21},[r6,:128]!
436 vcvt.s32.f32 q10, q10, #16
437 vst1.64 {d3}, [r8], ip
438 vld1.64 {d22-d23},[r7,:128]!
439 vcvt.s32.f32 q11, q11, #16
440 vst1.64 {d7}, [r8], ip
/* Label 7: store the q1/q3 halves without starting another load. */
442 7: vst1.64 {d2}, [r8], ip
443 vst1.64 {d6}, [r8], ip
444 vst1.64 {d3}, [r8], ip
445 vst1.64 {d7}, [r8], ip
/* Two streams at a time (r4, r5): eight floats from each into q8/q10 and q9/q11. */
459 vld1.64 {d16-d17},[r4,:128]!
460 vcvt.s32.f32 q8, q8, #16
461 vld1.64 {d18-d19},[r5,:128]!
462 vcvt.s32.f32 q9, q9, #16
463 vld1.64 {d20-d21},[r4,:128]!
464 vcvt.s32.f32 q10, q10, #16
465 vld1.64 {d22-d23},[r5,:128]!
466 vcvt.s32.f32 q11, q11, #16
/* Merge: each 32-bit lane of q9 = r4 sample (low half) + r5 sample (high half). */
470 vsri.32 d18, d16, #16
471 vsri.32 d19, d17, #16
472 vld1.64 {d16-d17},[r4,:128]!
473 vcvt.s32.f32 q8, q8, #16
/* Write one 32-bit sample pair per store through r8, advancing by ip. */
474 vst1.32 {d18[0]}, [r8], ip
475 vsri.32 d22, d20, #16
476 vst1.32 {d18[1]}, [r8], ip
477 vsri.32 d23, d21, #16
478 vst1.32 {d19[0]}, [r8], ip
479 vst1.32 {d19[1]}, [r8], ip
480 vld1.64 {d18-d19},[r5,:128]!
481 vcvt.s32.f32 q9, q9, #16
482 vst1.32 {d22[0]}, [r8], ip
483 vst1.32 {d22[1]}, [r8], ip
484 vld1.64 {d20-d21},[r4,:128]!
485 vcvt.s32.f32 q10, q10, #16
486 vst1.32 {d23[0]}, [r8], ip
487 vst1.32 {d23[1]}, [r8], ip
488 vld1.64 {d22-d23},[r5,:128]!
489 vcvt.s32.f32 q11, q11, #16
/* Second register set (q0-q3) for the following eight samples per stream. */
491 vld1.64 {d0-d1}, [r4,:128]!
492 vcvt.s32.f32 q0, q0, #16
493 vsri.32 d18, d16, #16
494 vld1.64 {d2-d3}, [r5,:128]!
495 vcvt.s32.f32 q1, q1, #16
496 vsri.32 d19, d17, #16
497 vld1.64 {d4-d5}, [r4,:128]!
498 vcvt.s32.f32 q2, q2, #16
499 vld1.64 {d6-d7}, [r5,:128]!
500 vcvt.s32.f32 q3, q3, #16
501 vst1.32 {d18[0]}, [r8], ip
502 vsri.32 d22, d20, #16
503 vst1.32 {d18[1]}, [r8], ip
504 vsri.32 d23, d21, #16
505 vst1.32 {d19[0]}, [r8], ip
507 vst1.32 {d19[1]}, [r8], ip
509 vst1.32 {d22[0]}, [r8], ip
511 vst1.32 {d22[1]}, [r8], ip
513 vst1.32 {d23[0]}, [r8], ip
514 vst1.32 {d23[1]}, [r8], ip
/* Refill q8-q11 while the pairs held in q1/q3 are written out. */
516 vld1.64 {d16-d17},[r4,:128]!
517 vcvt.s32.f32 q8, q8, #16
518 vst1.32 {d2[0]}, [r8], ip
519 vst1.32 {d2[1]}, [r8], ip
520 vld1.64 {d18-d19},[r5,:128]!
521 vcvt.s32.f32 q9, q9, #16
522 vst1.32 {d3[0]}, [r8], ip
523 vst1.32 {d3[1]}, [r8], ip
524 vld1.64 {d20-d21},[r4,:128]!
525 vcvt.s32.f32 q10, q10, #16
526 vst1.32 {d6[0]}, [r8], ip
527 vst1.32 {d6[1]}, [r8], ip
528 vld1.64 {d22-d23},[r5,:128]!
529 vcvt.s32.f32 q11, q11, #16
530 vst1.32 {d7[0]}, [r8], ip
531 vst1.32 {d7[1]}, [r8], ip
/* Label 6: write out the pairs in q1/q3 with no further loads. */
533 6: vst1.32 {d2[0]}, [r8], ip
534 vst1.32 {d2[1]}, [r8], ip
535 vst1.32 {d3[0]}, [r8], ip
536 vst1.32 {d3[1]}, [r8], ip
537 vst1.32 {d6[0]}, [r8], ip
538 vst1.32 {d6[1]}, [r8], ip
539 vst1.32 {d7[0]}, [r8], ip
540 vst1.32 {d7[1]}, [r8], ip
/* Label 7: merge and write out the pairs still pending in q8-q11. */
542 7: vsri.32 d18, d16, #16
543 vsri.32 d19, d17, #16
544 vst1.32 {d18[0]}, [r8], ip
545 vsri.32 d22, d20, #16
546 vst1.32 {d18[1]}, [r8], ip
547 vsri.32 d23, d21, #16
548 vst1.32 {d19[0]}, [r8], ip
549 vst1.32 {d19[1]}, [r8], ip
550 vst1.32 {d22[0]}, [r8], ip
551 vst1.32 {d22[1]}, [r8], ip
552 vst1.32 {d23[0]}, [r8], ip
553 vst1.32 {d23[1]}, [r8], ip
/* One stream at a time: eight floats via r4 into q0/q1. */
563 vld1.64 {d0-d1}, [r4,:128]!
564 vcvt.s32.f32 q0, q0, #16
565 vld1.64 {d2-d3}, [r4,:128]!
566 vcvt.s32.f32 q1, q1, #16
569 vld1.64 {d4-d5}, [r4,:128]!
570 vcvt.s32.f32 q2, q2, #16
571 vld1.64 {d6-d7}, [r4,:128]!
572 vcvt.s32.f32 q3, q3, #16
/*
 * Store only the odd 16-bit lanes ([1] and [3]), i.e. the upper halfword of
 * each 32-bit result, one sample per store through r5 with step ip.
 */
573 vst1.16 {d0[1]}, [r5,:16], ip
574 vst1.16 {d0[3]}, [r5,:16], ip
575 vst1.16 {d1[1]}, [r5,:16], ip
576 vst1.16 {d1[3]}, [r5,:16], ip
577 vst1.16 {d2[1]}, [r5,:16], ip
578 vst1.16 {d2[3]}, [r5,:16], ip
579 vst1.16 {d3[1]}, [r5,:16], ip
580 vst1.16 {d3[3]}, [r5,:16], ip
582 vld1.64 {d0-d1}, [r4,:128]!
583 vcvt.s32.f32 q0, q0, #16
584 vld1.64 {d2-d3}, [r4,:128]!
585 vcvt.s32.f32 q1, q1, #16
/* Label 7: write out the eight samples held in q2/q3. */
586 7: vst1.16 {d4[1]}, [r5,:16], ip
587 vst1.16 {d4[3]}, [r5,:16], ip
588 vst1.16 {d5[1]}, [r5,:16], ip
589 vst1.16 {d5[3]}, [r5,:16], ip
590 vst1.16 {d6[1]}, [r5,:16], ip
591 vst1.16 {d6[3]}, [r5,:16], ip
592 vst1.16 {d7[1]}, [r5,:16], ip
593 vst1.16 {d7[3]}, [r5,:16], ip
/* Write out the eight samples held in q0/q1. */
597 vst1.16 {d0[1]}, [r5,:16], ip
598 vst1.16 {d0[3]}, [r5,:16], ip
599 vst1.16 {d1[1]}, [r5,:16], ip
600 vst1.16 {d1[3]}, [r5,:16], ip
601 vst1.16 {d2[1]}, [r5,:16], ip
602 vst1.16 {d2[3]}, [r5,:16], ip
603 vst1.16 {d3[1]}, [r5,:16], ip
604 vst1.16 {d3[3]}, [r5,:16], ip
/* Load and convert eight floats via r4 into q0/q1 again. */
606 vld1.64 {d0-d1}, [r4,:128]!
607 vcvt.s32.f32 q0, q0, #16
608 vld1.64 {d2-d3}, [r4,:128]!
609 vcvt.s32.f32 q1, q1, #16
/*
 * ff_vector_fmul_neon: reads two 16-byte-aligned streams through r0 and r1
 * and writes results through r3, all with pointer writeback.
 * The arithmetic instructions, the setup of r3, the element count and the
 * branches are not part of this excerpt; going by the name, the operation is
 * presumably an element-wise float multiply (TODO confirm).
 */
613 function ff_vector_fmul_neon, export=1
/* First 32 bytes (eight 32-bit elements) from each input: q0/q1 and q2/q3. */
616 vld1.64 {d0-d3}, [r0,:128]!
617 vld1.64 {d4-d7}, [r1,:128]!
/* The following loads are issued 16 bytes at a time. */
624 vld1.64 {d0-d1}, [r0,:128]!
625 vld1.64 {d4-d5}, [r1,:128]!
627 vld1.64 {d2-d3}, [r0,:128]!
628 vld1.64 {d6-d7}, [r1,:128]!
/* Store the results held in q8/q9. */
630 vst1.64 {d16-d19},[r3,:128]!
631 vld1.64 {d0-d1}, [r0,:128]!
632 vld1.64 {d4-d5}, [r1,:128]!
634 vld1.64 {d2-d3}, [r0,:128]!
635 vld1.64 {d6-d7}, [r1,:128]!
/* Store the results held in q10/q11. */
637 vst1.64 {d20-d23},[r3,:128]!
/* Label 2: same load pattern, with q8 and q9 stored separately between the loads. */
641 2: vld1.64 {d0-d1}, [r0,:128]!
642 vld1.64 {d4-d5}, [r1,:128]!
643 vst1.64 {d16-d17},[r3,:128]!
645 vld1.64 {d2-d3}, [r0,:128]!
646 vld1.64 {d6-d7}, [r1,:128]!
647 vst1.64 {d18-d19},[r3,:128]!
/* Label 3: store the results held in q8/q9. */
649 3: vst1.64 {d16-d19},[r3,:128]!
/*
 * ff_vector_fmul_window_neon: reads four 16-byte-aligned streams (r1, r2, r3,
 * r4) and writes two (r0, ip).  r1, r3 and r0 advance with plain writeback;
 * r2, r4 and ip are stepped by the value in r5.
 * Most of the arithmetic, the origin of r5 and the loop control are on lines
 * not present in this excerpt.
 */
653 function ff_vector_fmul_window_neon, export=1
/*
 * Load the 32-bit word at [sp] and replicate it into every lane of q8.  With
 * nothing pushed yet, this is presumably the first stack-passed argument
 * (TODO confirm against the C prototype).
 */
654 vld1.32 {d16[],d17[]}, [sp,:32]
/* r2 += r5 * 4;  r4 = r3 + r5 * 8;  ip = r0 + r5 * 8. */
659 add r2, r2, r5, lsl #2
660 add r4, r3, r5, lsl #3
661 add ip, r0, r5, lsl #3
/*
 * r5 is used as the post-index step for r2, r4 and ip below.  Any instruction
 * that changes r5 between the adds above and these loads is not visible here.
 */
663 vld1.64 {d0,d1}, [r1,:128]!
664 vld1.64 {d2,d3}, [r2,:128], r5
665 vld1.64 {d4,d5}, [r3,:128]!
666 vld1.64 {d6,d7}, [r4,:128], r5
/* Next set of inputs; the r2 and r3 streams land in q9 and q12 this time. */
678 vld1.64 {d0,d1}, [r1,:128]!
680 vld1.64 {d18,d19},[r2,:128], r5
682 vld1.64 {d24,d25},[r3,:128]!
684 vld1.64 {d6,d7}, [r4,:128], r5
/* q10 is written through r0, q11 through ip (stepped by r5). */
689 vst1.64 {d20,d21},[r0,:128]!
690 vst1.64 {d22,d23},[ip,:128], r5
/* Label 2: d22 += d3 * d7 (two single-precision lanes). */
692 2: vmla.f32 d22, d3, d7
698 vst1.64 {d20,d21},[r0,:128]!
699 vst1.64 {d22,d23},[ip,:128], r5