@ Source: frescor/ffmpeg.git -- libavcodec/arm/dsputil_neon.S
@ blob f16293db031565028ea4397eaa53764d28dc2774
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

22 #include "asm.S"
23
24         preserve8
25         .fpu neon
26         .text
27
28         .macro pixels16 avg=0
29 .if \avg
30         mov             ip,  r0
31 .endif
32 1:      vld1.64         {d0, d1},  [r1], r2
33         vld1.64         {d2, d3},  [r1], r2
34         vld1.64         {d4, d5},  [r1], r2
35         pld             [r1, r2, lsl #2]
36         vld1.64         {d6, d7},  [r1], r2
37         pld             [r1]
38         pld             [r1, r2]
39         pld             [r1, r2, lsl #1]
40 .if \avg
41         vld1.64         {d16,d17}, [ip,:128], r2
42         vrhadd.u8       q0,  q0,  q8
43         vld1.64         {d18,d19}, [ip,:128], r2
44         vrhadd.u8       q1,  q1,  q9
45         vld1.64         {d20,d21}, [ip,:128], r2
46         vrhadd.u8       q2,  q2,  q10
47         vld1.64         {d22,d23}, [ip,:128], r2
48         vrhadd.u8       q3,  q3,  q11
49 .endif
50         subs            r3,  r3,  #4
51         vst1.64         {d0, d1},  [r0,:128], r2
52         vst1.64         {d2, d3},  [r0,:128], r2
53         vst1.64         {d4, d5},  [r0,:128], r2
54         vst1.64         {d6, d7},  [r0,:128], r2
55         bne             1b
56         bx              lr
57         .endm
58
59         .macro pixels16_x2 vhadd=vrhadd.u8
60 1:      vld1.64         {d0-d2},   [r1], r2
61         vld1.64         {d4-d6},   [r1], r2
62         pld             [r1]
63         pld             [r1, r2]
64         subs            r3,  r3,  #2
65         vext.8          q1,  q0,  q1,  #1
66         \vhadd          q0,  q0,  q1
67         vext.8          q3,  q2,  q3,  #1
68         \vhadd          q2,  q2,  q3
69         vst1.64         {d0, d1},  [r0,:128], r2
70         vst1.64         {d4, d5},  [r0,:128], r2
71         bne             1b
72         bx              lr
73         .endm
74
75         .macro pixels16_y2 vhadd=vrhadd.u8
76         push            {lr}
77         add             ip,  r1,  r2
78         lsl             lr,  r2,  #1
79         vld1.64         {d0, d1},  [r1], lr
80         vld1.64         {d2, d3},  [ip], lr
81 1:      subs            r3,  r3,  #2
82         \vhadd          q2,  q0,  q1
83         vld1.64         {d0, d1},  [r1],      lr
84         \vhadd          q3,  q0,  q1
85         vld1.64         {d2, d3},  [ip],      lr
86         pld             [r1]
87         pld             [ip]
88         vst1.64         {d4, d5},  [r0,:128], r2
89         vst1.64         {d6, d7},  [r0,:128], r2
90         bne             1b
91         pop             {pc}
92         .endm
93
94         .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
95         push            {lr}
96         lsl             lr,  r2,  #1
97         add             ip,  r1,  r2
98         vld1.64         {d0-d2},   [r1], lr
99         vld1.64         {d4-d6},   [ip], lr
100 .if \no_rnd
101         vmov.i16        q13, #1
102 .endif
103         pld             [r1]
104         pld             [ip]
105         vext.8          q1,  q0,  q1,  #1
106         vext.8          q3,  q2,  q3,  #1
107         vaddl.u8        q8,  d0,  d2
108         vaddl.u8        q10, d1,  d3
109         vaddl.u8        q9,  d4,  d6
110         vaddl.u8        q11, d5,  d7
111 1:      subs            r3,  r3,  #2
112         vld1.64         {d0-d2},   [r1], lr
113         vadd.u16        q12, q8,  q9
114         pld             [r1]
115 .if \no_rnd
116         vadd.u16        q12, q12, q13
117 .endif
118         vext.8          q15, q0,  q1,  #1
119         vadd.u16        q1 , q10, q11
120         \vshrn          d28, q12, #2
121 .if \no_rnd
122         vadd.u16        q1,  q1,  q13
123 .endif
124         \vshrn          d29, q1,  #2
125         vaddl.u8        q8,  d0,  d30
126         vld1.64         {d2-d4},   [ip], lr
127         vaddl.u8        q10, d1,  d31
128         vst1.64         {d28,d29}, [r0,:128], r2
129         vadd.u16        q12, q8,  q9
130         pld             [ip]
131 .if \no_rnd
132         vadd.u16        q12, q12, q13
133 .endif
134         vext.8          q2,  q1,  q2,  #1
135         vadd.u16        q0,  q10, q11
136         \vshrn          d30, q12, #2
137 .if \no_rnd
138         vadd.u16        q0,  q0,  q13
139 .endif
140         \vshrn          d31, q0,  #2
141         vaddl.u8        q9,  d2,  d4
142         vaddl.u8        q11, d3,  d5
143         vst1.64         {d30,d31}, [r0,:128], r2
144         bgt             1b
145         pop             {pc}
146         .endm
147
148         .macro pixels8
149 1:      vld1.64         {d0}, [r1], r2
150         vld1.64         {d1}, [r1], r2
151         vld1.64         {d2}, [r1], r2
152         pld             [r1, r2, lsl #2]
153         vld1.64         {d3}, [r1], r2
154         pld             [r1]
155         pld             [r1, r2]
156         pld             [r1, r2, lsl #1]
157         subs            r3,  r3,  #4
158         vst1.64         {d0}, [r0,:64], r2
159         vst1.64         {d1}, [r0,:64], r2
160         vst1.64         {d2}, [r0,:64], r2
161         vst1.64         {d3}, [r0,:64], r2
162         bne             1b
163         bx              lr
164         .endm
165
166         .macro pixels8_x2 vhadd=vrhadd.u8
167 1:      vld1.64         {d0, d1},  [r1], r2
168         vext.8          d1,  d0,  d1,  #1
169         vld1.64         {d2, d3},  [r1], r2
170         vext.8          d3,  d2,  d3,  #1
171         pld             [r1]
172         pld             [r1, r2]
173         subs            r3,  r3,  #2
174         vswp            d1,  d2
175         \vhadd          q0,  q0,  q1
176         vst1.64         {d0},      [r0,:64], r2
177         vst1.64         {d1},      [r0,:64], r2
178         bne             1b
179         bx              lr
180         .endm
181
182         .macro pixels8_y2 vhadd=vrhadd.u8
183         push            {lr}
184         add             ip,  r1,  r2
185         lsl             lr,  r2,  #1
186         vld1.64         {d0},      [r1], lr
187         vld1.64         {d1},      [ip], lr
188 1:      subs            r3,  r3,  #2
189         \vhadd          d4,  d0,  d1
190         vld1.64         {d0},      [r1],     lr
191         \vhadd          d5,  d0,  d1
192         vld1.64         {d1},      [ip],     lr
193         pld             [r1]
194         pld             [ip]
195         vst1.64         {d4},      [r0,:64], r2
196         vst1.64         {d5},      [r0,:64], r2
197         bne             1b
198         pop             {pc}
199         .endm
200
201         .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
202         push            {lr}
203         lsl             lr,  r2,  #1
204         add             ip,  r1,  r2
205         vld1.64         {d0, d1},  [r1], lr
206         vld1.64         {d2, d3},  [ip], lr
207 .if \no_rnd
208         vmov.i16        q11, #1
209 .endif
210         pld             [r1]
211         pld             [ip]
212         vext.8          d4,  d0,  d1,  #1
213         vext.8          d6,  d2,  d3,  #1
214         vaddl.u8        q8,  d0,  d4
215         vaddl.u8        q9,  d2,  d6
216 1:      subs            r3,  r3,  #2
217         vld1.64         {d0, d1},  [r1], lr
218         pld             [r1]
219         vadd.u16        q10, q8,  q9
220         vext.8          d4,  d0,  d1,  #1
221 .if \no_rnd
222         vadd.u16        q10, q10, q11
223 .endif
224         vaddl.u8        q8,  d0,  d4
225         \vshrn          d5,  q10, #2
226         vld1.64         {d2, d3},  [ip], lr
227         vadd.u16        q10, q8,  q9
228         pld             [ip]
229 .if \no_rnd
230         vadd.u16        q10, q10, q11
231 .endif
232         vst1.64         {d5},      [r0,:64], r2
233         \vshrn          d7,  q10, #2
234         vext.8          d6,  d2,  d3,  #1
235         vaddl.u8        q9,  d2,  d6
236         vst1.64         {d7},      [r0,:64], r2
237         bgt             1b
238         pop             {pc}
239         .endm
240
241         .macro pixfunc pfx name suf rnd_op args:vararg
242 function ff_\pfx\name\suf\()_neon, export=1
243         \name \rnd_op \args
244         .endfunc
245         .endm
246
247         .macro pixfunc2 pfx name args:vararg
248         pixfunc \pfx \name
249         pixfunc \pfx \name \args
250         .endm
251
252 function ff_put_h264_qpel16_mc00_neon, export=1
253         mov   r3, #16
254         .endfunc
255
256         pixfunc  put_ pixels16
257         pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
258         pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
259         pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
260
261 function ff_avg_h264_qpel16_mc00_neon, export=1
262         mov   r3, #16
263         .endfunc
264
265         pixfunc  avg_ pixels16,, 1
266
267 function ff_put_h264_qpel8_mc00_neon, export=1
268         mov   r3, #8
269         .endfunc
270
271         pixfunc  put_ pixels8
272         pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
273         pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
274         pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
275
276 function ff_put_signed_pixels_clamped_neon, export=1
277         vmov.u8         d31, #128
278         vld1.64         {d16-d17}, [r0,:128]!
279         vqmovn.s16      d0, q8
280         vld1.64         {d18-d19}, [r0,:128]!
281         vqmovn.s16      d1, q9
282         vld1.64         {d16-d17}, [r0,:128]!
283         vqmovn.s16      d2, q8
284         vld1.64         {d18-d19}, [r0,:128]!
285         vadd.u8         d0, d0, d31
286         vld1.64         {d20-d21}, [r0,:128]!
287         vadd.u8         d1, d1, d31
288         vld1.64         {d22-d23}, [r0,:128]!
289         vadd.u8         d2, d2, d31
290         vst1.64         {d0},      [r1,:64], r2
291         vqmovn.s16      d3, q9
292         vst1.64         {d1},      [r1,:64], r2
293         vqmovn.s16      d4, q10
294         vst1.64         {d2},      [r1,:64], r2
295         vqmovn.s16      d5, q11
296         vld1.64         {d24-d25}, [r0,:128]!
297         vadd.u8         d3, d3, d31
298         vld1.64         {d26-d27}, [r0,:128]!
299         vadd.u8         d4, d4, d31
300         vadd.u8         d5, d5, d31
301         vst1.64         {d3},      [r1,:64], r2
302         vqmovn.s16      d6, q12
303         vst1.64         {d4},      [r1,:64], r2
304         vqmovn.s16      d7, q13
305         vst1.64         {d5},      [r1,:64], r2
306         vadd.u8         d6, d6, d31
307         vadd.u8         d7, d7, d31
308         vst1.64         {d6},      [r1,:64], r2
309         vst1.64         {d7},      [r1,:64], r2
310         bx              lr
311         .endfunc
312
313 function ff_add_pixels_clamped_neon, export=1
314         mov             r3, r1
315         vld1.64         {d16},   [r1,:64], r2
316         vld1.64         {d0-d1}, [r0,:128]!
317         vaddw.u8        q0, q0, d16
318         vld1.64         {d17},   [r1,:64], r2
319         vld1.64         {d2-d3}, [r0,:128]!
320         vqmovun.s16     d0, q0
321         vld1.64         {d18},   [r1,:64], r2
322         vaddw.u8        q1, q1, d17
323         vld1.64         {d4-d5}, [r0,:128]!
324         vaddw.u8        q2, q2, d18
325         vst1.64         {d0},    [r3,:64], r2
326         vqmovun.s16     d2, q1
327         vld1.64         {d19},   [r1,:64], r2
328         vld1.64         {d6-d7}, [r0,:128]!
329         vaddw.u8        q3, q3, d19
330         vqmovun.s16     d4, q2
331         vst1.64         {d2},    [r3,:64], r2
332         vld1.64         {d16},   [r1,:64], r2
333         vqmovun.s16     d6, q3
334         vld1.64         {d0-d1}, [r0,:128]!
335         vaddw.u8        q0, q0, d16
336         vst1.64         {d4},    [r3,:64], r2
337         vld1.64         {d17},   [r1,:64], r2
338         vld1.64         {d2-d3}, [r0,:128]!
339         vaddw.u8        q1, q1, d17
340         vst1.64         {d6},    [r3,:64], r2
341         vqmovun.s16     d0, q0
342         vld1.64         {d18},   [r1,:64], r2
343         vld1.64         {d4-d5}, [r0,:128]!
344         vaddw.u8        q2, q2, d18
345         vst1.64         {d0},    [r3,:64], r2
346         vqmovun.s16     d2, q1
347         vld1.64         {d19},   [r1,:64], r2
348         vqmovun.s16     d4, q2
349         vld1.64         {d6-d7}, [r0,:128]!
350         vaddw.u8        q3, q3, d19
351         vst1.64         {d2},    [r3,:64], r2
352         vqmovun.s16     d6, q3
353         vst1.64         {d4},    [r3,:64], r2
354         vst1.64         {d6},    [r3,:64], r2
355         bx              lr
356         .endfunc
357
358 function ff_float_to_int16_neon, export=1
359         subs            r2,  r2,  #8
360         vld1.64         {d0-d1},  [r1,:128]!
361         vcvt.s32.f32    q8,  q0,  #16
362         vld1.64         {d2-d3},  [r1,:128]!
363         vcvt.s32.f32    q9,  q1,  #16
364         beq             3f
365         bics            ip,  r2,  #15
366         beq             2f
367 1:      subs            ip,  ip,  #16
368         vshrn.s32       d4,  q8,  #16
369         vld1.64         {d0-d1},  [r1,:128]!
370         vcvt.s32.f32    q0,  q0,  #16
371         vshrn.s32       d5,  q9,  #16
372         vld1.64         {d2-d3},  [r1,:128]!
373         vcvt.s32.f32    q1,  q1,  #16
374         vshrn.s32       d6,  q0,  #16
375         vst1.64         {d4-d5},  [r0,:128]!
376         vshrn.s32       d7,  q1,  #16
377         vld1.64         {d16-d17},[r1,:128]!
378         vcvt.s32.f32    q8,  q8,  #16
379         vld1.64         {d18-d19},[r1,:128]!
380         vcvt.s32.f32    q9,  q9,  #16
381         vst1.64         {d6-d7},  [r0,:128]!
382         bne             1b
383         ands            r2,  r2,  #15
384         beq             3f
385 2:      vld1.64         {d0-d1},  [r1,:128]!
386         vshrn.s32       d4,  q8,  #16
387         vcvt.s32.f32    q0,  q0,  #16
388         vld1.64         {d2-d3},  [r1,:128]!
389         vshrn.s32       d5,  q9,  #16
390         vcvt.s32.f32    q1,  q1,  #16
391         vshrn.s32       d6,  q0,  #16
392         vst1.64         {d4-d5},  [r0,:128]!
393         vshrn.s32       d7,  q1,  #16
394         vst1.64         {d6-d7},  [r0,:128]!
395         bx              lr
396 3:      vshrn.s32       d4,  q8,  #16
397         vshrn.s32       d5,  q9,  #16
398         vst1.64         {d4-d5},  [r0,:128]!
399         bx              lr
400         .endfunc
401
402 function ff_float_to_int16_interleave_neon, export=1
403         cmp             r3, #2
404         ldrlt           r1, [r1]
405         blt             ff_float_to_int16_neon
406         bne             4f
407
408         ldr             r3, [r1]
409         ldr             r1, [r1, #4]
410
411         subs            r2,  r2,  #8
412         vld1.64         {d0-d1},  [r3,:128]!
413         vcvt.s32.f32    q8,  q0,  #16
414         vld1.64         {d2-d3},  [r3,:128]!
415         vcvt.s32.f32    q9,  q1,  #16
416         vld1.64         {d20-d21},[r1,:128]!
417         vcvt.s32.f32    q10, q10, #16
418         vld1.64         {d22-d23},[r1,:128]!
419         vcvt.s32.f32    q11, q11, #16
420         beq             3f
421         bics            ip,  r2,  #15
422         beq             2f
423 1:      subs            ip,  ip,  #16
424         vld1.64         {d0-d1},  [r3,:128]!
425         vcvt.s32.f32    q0,  q0,  #16
426         vsri.32         q10, q8,  #16
427         vld1.64         {d2-d3},  [r3,:128]!
428         vcvt.s32.f32    q1,  q1,  #16
429         vld1.64         {d24-d25},[r1,:128]!
430         vcvt.s32.f32    q12, q12, #16
431         vld1.64         {d26-d27},[r1,:128]!
432         vsri.32         q11, q9,  #16
433         vst1.64         {d20-d21},[r0,:128]!
434         vcvt.s32.f32    q13, q13, #16
435         vst1.64         {d22-d23},[r0,:128]!
436         vsri.32         q12, q0,  #16
437         vld1.64         {d16-d17},[r3,:128]!
438         vsri.32         q13, q1,  #16
439         vst1.64         {d24-d25},[r0,:128]!
440         vcvt.s32.f32    q8,  q8,  #16
441         vld1.64         {d18-d19},[r3,:128]!
442         vcvt.s32.f32    q9,  q9,  #16
443         vld1.64         {d20-d21},[r1,:128]!
444         vcvt.s32.f32    q10, q10, #16
445         vld1.64         {d22-d23},[r1,:128]!
446         vcvt.s32.f32    q11, q11, #16
447         vst1.64         {d26-d27},[r0,:128]!
448         bne             1b
449         ands            r2,  r2,  #15
450         beq             3f
451 2:      vsri.32         q10, q8,  #16
452         vld1.64         {d0-d1},  [r3,:128]!
453         vcvt.s32.f32    q0,  q0,  #16
454         vld1.64         {d2-d3},  [r3,:128]!
455         vcvt.s32.f32    q1,  q1,  #16
456         vld1.64         {d24-d25},[r1,:128]!
457         vcvt.s32.f32    q12, q12, #16
458         vsri.32         q11, q9,  #16
459         vld1.64         {d26-d27},[r1,:128]!
460         vcvt.s32.f32    q13, q13, #16
461         vst1.64         {d20-d21},[r0,:128]!
462         vsri.32         q12, q0,  #16
463         vst1.64         {d22-d23},[r0,:128]!
464         vsri.32         q13, q1,  #16
465         vst1.64         {d24-d27},[r0,:128]!
466         bx              lr
467 3:      vsri.32         q10, q8,  #16
468         vsri.32         q11, q9,  #16
469         vst1.64         {d20-d23},[r0,:128]!
470         bx              lr
471
472 4:      push            {r4-r8,lr}
473         cmp             r3,  #4
474         lsl             ip,  r3,  #1
475         blt             4f
476
477         @ 4 channels
478 5:      ldmia           r1!, {r4-r7}
479         mov             lr,  r2
480         mov             r8,  r0
481         vld1.64         {d16-d17},[r4,:128]!
482         vcvt.s32.f32    q8,  q8,  #16
483         vld1.64         {d18-d19},[r5,:128]!
484         vcvt.s32.f32    q9,  q9,  #16
485         vld1.64         {d20-d21},[r6,:128]!
486         vcvt.s32.f32    q10, q10, #16
487         vld1.64         {d22-d23},[r7,:128]!
488         vcvt.s32.f32    q11, q11, #16
489 6:      subs            lr,  lr,  #8
490         vld1.64         {d0-d1},  [r4,:128]!
491         vcvt.s32.f32    q0,  q0,  #16
492         vsri.32         q9,  q8,  #16
493         vld1.64         {d2-d3},  [r5,:128]!
494         vcvt.s32.f32    q1,  q1,  #16
495         vsri.32         q11, q10, #16
496         vld1.64         {d4-d5},  [r6,:128]!
497         vcvt.s32.f32    q2,  q2,  #16
498         vzip.32         d18, d22
499         vld1.64         {d6-d7},  [r7,:128]!
500         vcvt.s32.f32    q3,  q3,  #16
501         vzip.32         d19, d23
502         vst1.64         {d18},    [r8], ip
503         vsri.32         q1,  q0,  #16
504         vst1.64         {d22},    [r8], ip
505         vsri.32         q3,  q2,  #16
506         vst1.64         {d19},    [r8], ip
507         vzip.32         d2,  d6
508         vst1.64         {d23},    [r8], ip
509         vzip.32         d3,  d7
510         beq             7f
511         vld1.64         {d16-d17},[r4,:128]!
512         vcvt.s32.f32    q8,  q8,  #16
513         vst1.64         {d2},     [r8], ip
514         vld1.64         {d18-d19},[r5,:128]!
515         vcvt.s32.f32    q9,  q9,  #16
516         vst1.64         {d6},     [r8], ip
517         vld1.64         {d20-d21},[r6,:128]!
518         vcvt.s32.f32    q10, q10, #16
519         vst1.64         {d3},     [r8], ip
520         vld1.64         {d22-d23},[r7,:128]!
521         vcvt.s32.f32    q11, q11, #16
522         vst1.64         {d7},     [r8], ip
523         b               6b
524 7:      vst1.64         {d2},     [r8], ip
525         vst1.64         {d6},     [r8], ip
526         vst1.64         {d3},     [r8], ip
527         vst1.64         {d7},     [r8], ip
528         subs            r3,  r3,  #4
529         popeq           {r4-r8,pc}
530         cmp             r3,  #4
531         add             r0,  r0,  #8
532         bge             5b
533
534         @ 2 channels
535 4:      cmp             r3,  #2
536         blt             4f
537         ldmia           r1!, {r4-r5}
538         mov             lr,  r2
539         mov             r8,  r0
540         tst             lr,  #8
541         vld1.64         {d16-d17},[r4,:128]!
542         vcvt.s32.f32    q8,  q8,  #16
543         vld1.64         {d18-d19},[r5,:128]!
544         vcvt.s32.f32    q9,  q9,  #16
545         vld1.64         {d20-d21},[r4,:128]!
546         vcvt.s32.f32    q10, q10, #16
547         vld1.64         {d22-d23},[r5,:128]!
548         vcvt.s32.f32    q11, q11, #16
549         beq             6f
550         subs            lr,  lr,  #8
551         beq             7f
552         vsri.32         d18, d16, #16
553         vsri.32         d19, d17, #16
554         vld1.64         {d16-d17},[r4,:128]!
555         vcvt.s32.f32    q8,  q8,  #16
556         vst1.32         {d18[0]}, [r8], ip
557         vsri.32         d22, d20, #16
558         vst1.32         {d18[1]}, [r8], ip
559         vsri.32         d23, d21, #16
560         vst1.32         {d19[0]}, [r8], ip
561         vst1.32         {d19[1]}, [r8], ip
562         vld1.64         {d18-d19},[r5,:128]!
563         vcvt.s32.f32    q9,  q9,  #16
564         vst1.32         {d22[0]}, [r8], ip
565         vst1.32         {d22[1]}, [r8], ip
566         vld1.64         {d20-d21},[r4,:128]!
567         vcvt.s32.f32    q10, q10, #16
568         vst1.32         {d23[0]}, [r8], ip
569         vst1.32         {d23[1]}, [r8], ip
570         vld1.64         {d22-d23},[r5,:128]!
571         vcvt.s32.f32    q11, q11, #16
572 6:      subs            lr,  lr,  #16
573         vld1.64         {d0-d1},  [r4,:128]!
574         vcvt.s32.f32    q0,  q0,  #16
575         vsri.32         d18, d16, #16
576         vld1.64         {d2-d3},  [r5,:128]!
577         vcvt.s32.f32    q1,  q1,  #16
578         vsri.32         d19, d17, #16
579         vld1.64         {d4-d5},  [r4,:128]!
580         vcvt.s32.f32    q2,  q2,  #16
581         vld1.64         {d6-d7},  [r5,:128]!
582         vcvt.s32.f32    q3,  q3,  #16
583         vst1.32         {d18[0]}, [r8], ip
584         vsri.32         d22, d20, #16
585         vst1.32         {d18[1]}, [r8], ip
586         vsri.32         d23, d21, #16
587         vst1.32         {d19[0]}, [r8], ip
588         vsri.32         d2,  d0,  #16
589         vst1.32         {d19[1]}, [r8], ip
590         vsri.32         d3,  d1,  #16
591         vst1.32         {d22[0]}, [r8], ip
592         vsri.32         d6,  d4,  #16
593         vst1.32         {d22[1]}, [r8], ip
594         vsri.32         d7,  d5,  #16
595         vst1.32         {d23[0]}, [r8], ip
596         vst1.32         {d23[1]}, [r8], ip
597         beq             6f
598         vld1.64         {d16-d17},[r4,:128]!
599         vcvt.s32.f32    q8,  q8,  #16
600         vst1.32         {d2[0]},  [r8], ip
601         vst1.32         {d2[1]},  [r8], ip
602         vld1.64         {d18-d19},[r5,:128]!
603         vcvt.s32.f32    q9,  q9,  #16
604         vst1.32         {d3[0]},  [r8], ip
605         vst1.32         {d3[1]},  [r8], ip
606         vld1.64         {d20-d21},[r4,:128]!
607         vcvt.s32.f32    q10, q10, #16
608         vst1.32         {d6[0]},  [r8], ip
609         vst1.32         {d6[1]},  [r8], ip
610         vld1.64         {d22-d23},[r5,:128]!
611         vcvt.s32.f32    q11, q11, #16
612         vst1.32         {d7[0]},  [r8], ip
613         vst1.32         {d7[1]},  [r8], ip
614         bgt             6b
615 6:      vst1.32         {d2[0]},  [r8], ip
616         vst1.32         {d2[1]},  [r8], ip
617         vst1.32         {d3[0]},  [r8], ip
618         vst1.32         {d3[1]},  [r8], ip
619         vst1.32         {d6[0]},  [r8], ip
620         vst1.32         {d6[1]},  [r8], ip
621         vst1.32         {d7[0]},  [r8], ip
622         vst1.32         {d7[1]},  [r8], ip
623         b               8f
624 7:      vsri.32         d18, d16, #16
625         vsri.32         d19, d17, #16
626         vst1.32         {d18[0]}, [r8], ip
627         vsri.32         d22, d20, #16
628         vst1.32         {d18[1]}, [r8], ip
629         vsri.32         d23, d21, #16
630         vst1.32         {d19[0]}, [r8], ip
631         vst1.32         {d19[1]}, [r8], ip
632         vst1.32         {d22[0]}, [r8], ip
633         vst1.32         {d22[1]}, [r8], ip
634         vst1.32         {d23[0]}, [r8], ip
635         vst1.32         {d23[1]}, [r8], ip
636 8:      subs            r3,  r3,  #2
637         add             r0,  r0,  #4
638         popeq           {r4-r8,pc}
639
640         @ 1 channel
641 4:      ldr             r4,  [r1],#4
642         tst             r2,  #8
643         mov             lr,  r2
644         mov             r5,  r0
645         vld1.64         {d0-d1},  [r4,:128]!
646         vcvt.s32.f32    q0,  q0,  #16
647         vld1.64         {d2-d3},  [r4,:128]!
648         vcvt.s32.f32    q1,  q1,  #16
649         bne             8f
650 6:      subs            lr,  lr,  #16
651         vld1.64         {d4-d5},  [r4,:128]!
652         vcvt.s32.f32    q2,  q2,  #16
653         vld1.64         {d6-d7},  [r4,:128]!
654         vcvt.s32.f32    q3,  q3,  #16
655         vst1.16         {d0[1]},  [r5,:16], ip
656         vst1.16         {d0[3]},  [r5,:16], ip
657         vst1.16         {d1[1]},  [r5,:16], ip
658         vst1.16         {d1[3]},  [r5,:16], ip
659         vst1.16         {d2[1]},  [r5,:16], ip
660         vst1.16         {d2[3]},  [r5,:16], ip
661         vst1.16         {d3[1]},  [r5,:16], ip
662         vst1.16         {d3[3]},  [r5,:16], ip
663         beq             7f
664         vld1.64         {d0-d1},  [r4,:128]!
665         vcvt.s32.f32    q0,  q0,  #16
666         vld1.64         {d2-d3},  [r4,:128]!
667         vcvt.s32.f32    q1,  q1,  #16
668 7:      vst1.16         {d4[1]},  [r5,:16], ip
669         vst1.16         {d4[3]},  [r5,:16], ip
670         vst1.16         {d5[1]},  [r5,:16], ip
671         vst1.16         {d5[3]},  [r5,:16], ip
672         vst1.16         {d6[1]},  [r5,:16], ip
673         vst1.16         {d6[3]},  [r5,:16], ip
674         vst1.16         {d7[1]},  [r5,:16], ip
675         vst1.16         {d7[3]},  [r5,:16], ip
676         bgt             6b
677         pop             {r4-r8,pc}
678 8:      subs            lr,  lr,  #8
679         vst1.16         {d0[1]},  [r5,:16], ip
680         vst1.16         {d0[3]},  [r5,:16], ip
681         vst1.16         {d1[1]},  [r5,:16], ip
682         vst1.16         {d1[3]},  [r5,:16], ip
683         vst1.16         {d2[1]},  [r5,:16], ip
684         vst1.16         {d2[3]},  [r5,:16], ip
685         vst1.16         {d3[1]},  [r5,:16], ip
686         vst1.16         {d3[3]},  [r5,:16], ip
687         popeq           {r4-r8,pc}
688         vld1.64         {d0-d1},  [r4,:128]!
689         vcvt.s32.f32    q0,  q0,  #16
690         vld1.64         {d2-d3},  [r4,:128]!
691         vcvt.s32.f32    q1,  q1,  #16
692         b               6b
693         .endfunc
694
695 function ff_vector_fmul_neon, export=1
696         mov             r3,  r0
697         subs            r2,  r2,  #8
698         vld1.64         {d0-d3},  [r0,:128]!
699         vld1.64         {d4-d7},  [r1,:128]!
700         vmul.f32        q8,  q0,  q2
701         vmul.f32        q9,  q1,  q3
702         beq             3f
703         bics            ip,  r2,  #15
704         beq             2f
705 1:      subs            ip,  ip,  #16
706         vld1.64         {d0-d1},  [r0,:128]!
707         vld1.64         {d4-d5},  [r1,:128]!
708         vmul.f32        q10, q0,  q2
709         vld1.64         {d2-d3},  [r0,:128]!
710         vld1.64         {d6-d7},  [r1,:128]!
711         vmul.f32        q11, q1,  q3
712         vst1.64         {d16-d19},[r3,:128]!
713         vld1.64         {d0-d1},  [r0,:128]!
714         vld1.64         {d4-d5},  [r1,:128]!
715         vmul.f32        q8,  q0,  q2
716         vld1.64         {d2-d3},  [r0,:128]!
717         vld1.64         {d6-d7},  [r1,:128]!
718         vmul.f32        q9,  q1,  q3
719         vst1.64         {d20-d23},[r3,:128]!
720         bne             1b
721         ands            r2,  r2,  #15
722         beq             3f
723 2:      vld1.64         {d0-d1},  [r0,:128]!
724         vld1.64         {d4-d5},  [r1,:128]!
725         vst1.64         {d16-d17},[r3,:128]!
726         vmul.f32        q8,  q0,  q2
727         vld1.64         {d2-d3},  [r0,:128]!
728         vld1.64         {d6-d7},  [r1,:128]!
729         vst1.64         {d18-d19},[r3,:128]!
730         vmul.f32        q9,  q1,  q3
731 3:      vst1.64         {d16-d19},[r3,:128]!
732         bx              lr
733         .endfunc
734
735 function ff_vector_fmul_window_neon, export=1
736         vld1.32         {d16[],d17[]}, [sp,:32]
737         push            {r4,r5,lr}
738         ldr             lr,  [sp, #16]
739         sub             r2,  r2,  #8
740         sub             r5,  lr,  #2
741         add             r2,  r2,  r5, lsl #2
742         add             r4,  r3,  r5, lsl #3
743         add             ip,  r0,  r5, lsl #3
744         mov             r5,  #-16
745         vld1.64         {d0,d1},  [r1,:128]!
746         vld1.64         {d2,d3},  [r2,:128], r5
747         vld1.64         {d4,d5},  [r3,:128]!
748         vld1.64         {d6,d7},  [r4,:128], r5
749 1:      subs            lr,  lr,  #4
750         vmov            q11, q8
751         vmla.f32        d22, d0,  d4
752         vmov            q10, q8
753         vmla.f32        d23, d1,  d5
754         vrev64.32       q3,  q3
755         vmla.f32        d20, d0,  d7
756         vrev64.32       q1,  q1
757         vmla.f32        d21, d1,  d6
758         beq             2f
759         vmla.f32        d22, d3,  d7
760         vld1.64         {d0,d1},  [r1,:128]!
761         vmla.f32        d23, d2,  d6
762         vld1.64         {d18,d19},[r2,:128], r5
763         vmls.f32        d20, d3,  d4
764         vld1.64         {d24,d25},[r3,:128]!
765         vmls.f32        d21, d2,  d5
766         vld1.64         {d6,d7},  [r4,:128], r5
767         vmov            q1,  q9
768         vrev64.32       q11, q11
769         vmov            q2,  q12
770         vswp            d22, d23
771         vst1.64         {d20,d21},[r0,:128]!
772         vst1.64         {d22,d23},[ip,:128], r5
773         b               1b
774 2:      vmla.f32        d22, d3,  d7
775         vmla.f32        d23, d2,  d6
776         vmls.f32        d20, d3,  d4
777         vmls.f32        d21, d2,  d5
778         vrev64.32       q11, q11
779         vswp            d22, d23
780         vst1.64         {d20,d21},[r0,:128]!
781         vst1.64         {d22,d23},[ip,:128], r5
782         pop             {r4,r5,pc}
783         .endfunc