/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

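@ Transpose an 8x8 block of bytes held in eight d (or four q)
@ registers, using vtrn at 32-, 16- and 8-bit granularity.  Used to
@ turn rows into columns for the horizontal loop filter and the
@ vertical qpel filter.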
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

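@ Exchange the contents of two groups of four registers, used to
@ prepare 16-bit data for transpose16_4x4 below.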
        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

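@ Transpose 4x4 blocks of 16-bit elements across two groups of four
@ q registers; combined with swap4 this yields a full transpose of
@ the 16-bit intermediates in the 2D qpel filter.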
        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
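@ H.264 chroma MC is a bilinear filter with eighth-pel weights:
@     A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy
@     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+s] + D*src[i+s+1] + 32) >> 6
@ Below, r7 = D, r6 = C, ip = B and r4 = A.  The muls also sets the
@ flags, so the cheaper one-dimensional cases (x == 0 or y == 0) can
@ branch off when D is zero.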
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
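@ Same scheme as chroma_mc8 but only four pixels wide: two rows are
@ packed into each d register with vtrn.32 and the weights are
@ interleaved to match, so each q accumulator holds both rows and its
@ halves are summed before narrowing.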
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg

        /* H.264 loop filter */

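@ Common prologue for the loop filter functions.  The tc0 array is
@ the fifth argument, passed on the stack; its four bytes are loaded
@ into d24[0].  The function returns early if alpha or beta is zero,
@ or if all four tc0 values are negative (nothing to filter).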
        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

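@ Spill the callee-saved registers d8-d15 to a 16-byte-aligned area
@ on the stack.  ip keeps the total adjustment so align_pop_regs can
@ restore sp exactly.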
        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

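@ Filter one 16-pixel-wide luma edge.  On entry q8 = p0, q9 = p1,
@ q10 = p2, q0 = q0, q1 = q1, q2 = q2, and d24 holds the four tc0
@ bytes (duplicated to one byte per pixel).  This is the bS < 4
@ filter: q6 collects the |p0-q0| < alpha, |p1-p0| < beta and
@ |q1-q0| < beta conditions, q4/q5 gate the optional p1/q1 updates
@ (each of which also extends the clipping range by one), and the
@ clipped delta is applied to p0/q0.  Results: p1' in q4, p0' in q8,
@ q0' in q0, q1' in q5.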
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

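@ The v variant filters a horizontal edge, so whole rows can be
@ loaded directly; the h variant below loads 8-pixel row fragments
@ around a vertical edge, transposes them into the same register
@ layout, and transposes back after filtering.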
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
        .endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs
        sub             sp,  sp,  #16
        vst1.64         {d4, d5},  [sp,:128]
        sub             sp,  sp,  #16
        vst1.64         {d20,d21}, [sp,:128]

        h264_loop_filter_luma

        vld1.64         {d20,d21}, [sp,:128]!
        vld1.64         {d4, d5},  [sp,:128]!

        transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13

        sub             r0,  r0,  r1, lsl #4
        vst1.64         {d6},  [r0], r1
        vst1.64         {d20}, [r0], r1
        vst1.64         {d8},  [r0], r1
        vst1.64         {d16}, [r0], r1
        vst1.64         {d0},  [r0], r1
        vst1.64         {d10}, [r0], r1
        vst1.64         {d4},  [r0], r1
        vst1.64         {d26}, [r0], r1
        vst1.64         {d7},  [r0], r1
        vst1.64         {d21}, [r0], r1
        vst1.64         {d9},  [r0], r1
        vst1.64         {d17}, [r0], r1
        vst1.64         {d1},  [r0], r1
        vst1.64         {d11}, [r0], r1
        vst1.64         {d5},  [r0], r1
        vst1.64         {d27}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc

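@ Filter one 8-pixel-wide chroma edge.  Same alpha/beta conditions as
@ the luma filter, but only p0/q0 are modified and no p2/q2 terms
@ enter; the delta is clipped against the per-pixel tc values packed
@ into d24.  d16 = p0, d18 = p1, d0 = q0, d2 = q1.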
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4,  d4,  d24
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc

        /* H.264 qpel MC */

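@ The luma qpel functions use the 6-tap half-pel filter
@ (1, -5, 20, 20, -5, 1).  lowpass_const places the constants 5 and
@ 20 in the 16-bit lanes d6[0] and d6[1] so they can be used as
@ scalar operands to vmla/vmls.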
        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm

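@ Apply the 6-tap filter horizontally to two 8-pixel rows; the 16
@ loaded pixels of each row sit in \r0:\r1 and \r2:\r3.  With
@ narrow=1 the result is rounded, shifted down by 5 and saturated to
@ bytes in \d0/\d1; with narrow=0 the 16-bit intermediates are kept
@ in \d0/\d1 (q registers) for a second filter pass.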
        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm

        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
        .endm

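@ Second pass of the 2D filter, operating on 16-bit intermediates
@ from a narrow=0 first pass.  The multiplies are built from shifts
@ (20*x = (x << 4) + (x << 2), 5*x = (x << 2) + x) and the 32-bit
@ sums are rounded down by 10 before saturating to 8 bytes in \d.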
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

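@ The 16-wide and 16-tall variants below run the 8-wide core two or
@ four times, rewinding the source and stepping the destination
@ between calls; the _packed variants use a destination stride of 8
@ so the result lands as contiguous 8-wide strips in a scratch
@ buffer for the 2D (hv) cases.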
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

function put_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
        .endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endfunc

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
        .endfunc

function put_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function put_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5,  q5,  q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc

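@ Core of the 2D (hv) filter: run the horizontal 6-tap over the 13
@ required input rows without narrowing, store the 16-bit
@ intermediates to the scratch buffer at r4, transpose them with
@ swap4/transpose16_4x4, then apply the vertical pass with
@ lowpass_8.16.  The 8x8 result is left in d8-d15.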
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4

        vst1.64         {d0},      [r0,:64], r3
        vrhadd.u8       q3,  q3,  q5
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_neon
        .endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_l2_neon
        .endfunc

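@ Exported quarter-pel entry points.  ff_put_h264_qpel8_mcXY handles
@ fractional position (X, Y) in quarter-sample units: half-sample
@ positions use the 6-tap filter directly, and quarter-sample
@ positions average the filtered plane with the nearest integer- or
@ half-sample plane via the _l2 helpers (e.g. mc10 averages with the
@ source itself, mc30 with the source shifted one pixel right).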
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1
put_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #64
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [sp, #128]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #76
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
        .endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel8_mc12
        .endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2
        b               put_h264_qpel8_mc01
        .endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc21
        .endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               put_h264_qpel16_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip,  r1
put_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #256
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        add             r0,  sp,  #256
        ldrd            r0,  [r0, #64]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #(256+8)
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel16_mc12
        .endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2
        b               put_h264_qpel16_mc01
        .endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc21
        .endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc

@ Biweighted prediction

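@ Per pixel: out = sat((a*w0 + b*w1 + rnd) >> (log2_denom + 1)) with
@ rnd = ((offset + 1) | 1) << log2_denom.  r0/r1 point to the two
@ prediction blocks, r2 is the stride, r3 = log2_denom, and the two
@ weights and the offset are on the stack.  biweight_func branches on
@ the signs of the weights so the inner loops can use unsigned
@ vmlal/vmlsl with the absolute values; the right shift is done as
@ vshl.s16 by the negated amount held in q9.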
        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
        .endfunc
        .endm

        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4

@ Weighted prediction

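@ Unidirectional weighting: out = sat((block*w + offs) >> log2_denom)
@ with offs = (offset << log2_denom) + (1 << (log2_denom - 1)) when
@ log2_denom > 0.  r0 = block, r1 = stride, r2 = log2_denom,
@ r3 = weight, offset on the stack; a negative weight dispatches to
@ the vmlsl variant with the weight negated.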
        .macro  weight_16 mac
        vdup.8          d0,  r3
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        \mac            q2,  d0,  d20
        pld             [r0]
        \mac            q3,  d0,  d21
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r1
        vmov            q13, q8
        \mac            q12, d0,  d28
        pld             [r0]
        \mac            q13, d0,  d29
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r4,:128], r1
        vmov            q2,  q8
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_8 mac
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        \mac            q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        \mac            q10, d0,  d6
        pld             [r0]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r4,:64], r1
        vmov            q1,  q8
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_4 mac
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        \mac            q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        \mac            q10, d0,  d6
        pld             [r0]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]
        vdup.16         q9,  r2
        mov             lr,  #1
        lsl             r4,  r4,  r2
        subs            r2,  r2,  #1
        vneg.s16        q9,  q9
        addge           r4,  r4,  lr,  lsl r2
        cmp             r3,  #0
        vdup.16         q8,  r4
        mov             r4,  r0
        blt             10f
        weight_\w       vmlal.u8
10:     rsb             r3,  r3,  #0
        weight_\w       vmlsl.u8
        .endfunc
        .endm

        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4