]> rtime.felk.cvut.cz Git - frescor/ffmpeg.git/blob - libavcodec/arm/dsputil_neon_s.S
3b39d2e75e4bd237f8af3bbb709e0aca49854543
[frescor/ffmpeg.git] / libavcodec / arm / dsputil_neon_s.S
1 /*
2  * ARM NEON optimised DSP functions
3  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "asm.S"
23
/* File-wide assembler setup for every routine below. */
24         preserve8                            /* not defined in this file; presumably an asm.S helper tagging 8-byte stack alignment - TODO confirm */
25         .fpu neon                            /* accept NEON mnemonics */
26         .text                                /* emit into the code section */
27
/*
 * pixels16 [avg]: copy (avg=0) or average into (avg=1) a 16-byte-wide block.
 *
 * Register use, as read from the body:
 *   r0  destination; 16 bytes are stored per row with a 128-bit alignment hint
 *   r1  source; 16 bytes are loaded per row with no alignment hint
 *   r2  byte step applied to both pointers after every row
 *   r3  row count; four rows are handled per pass and the loop only exits
 *       when r3 reaches exactly zero, so it must be a positive multiple of 4
 *   ip  (avg only) read cursor over the destination rows
 *
 * With avg set, every stored byte is the rounding halving add (vrhadd.u8)
 * of the source byte and the byte previously at the destination.
 * The destination re-reads through ip carry the same 128-bit alignment
 * hint as the stores: they touch exactly the addresses stored through r0
 * in the same pass, so no additional alignment requirement is introduced.
 *
 * Ends with bx lr, so the expansion is a complete leaf-function body.
 */
28         .macro pixels16 avg=0
29 .if \avg
30         mov             ip,  r0                      /* ip walks the rows r0 is about to overwrite */
31 .endif
32 1:      vld1.64         {d0, d1},  [r1], r2          /* four source rows -> q0..q3 */
33         vld1.64         {d2, d3},  [r1], r2
34         vld1.64         {d4, d5},  [r1], r2
35         pld             [r1, r2, lsl #2]             /* prefetch hints for the rows of the next pass */
36         vld1.64         {d6, d7},  [r1], r2
37         pld             [r1]
38         pld             [r1, r2]
39         pld             [r1, r2, lsl #1]
40 .if \avg
41         vld1.64         {d16,d17}, [ip,:128], r2     /* current destination row */
42         vrhadd.u8       q0,  q0,  q8                 /* (src + dst + 1) >> 1 per byte */
43         vld1.64         {d18,d19}, [ip,:128], r2
44         vrhadd.u8       q1,  q1,  q9
45         vld1.64         {d20,d21}, [ip,:128], r2
46         vrhadd.u8       q2,  q2,  q10
47         vld1.64         {d22,d23}, [ip,:128], r2
48         vrhadd.u8       q3,  q3,  q11
49 .endif
50         subs            r3,  r3,  #4                 /* four rows done; flags consumed by bne below */
51         vst1.64         {d0, d1},  [r0,:128], r2
52         vst1.64         {d2, d3},  [r0,:128], r2
53         vst1.64         {d4, d5},  [r0,:128], r2
54         vst1.64         {d6, d7},  [r0,:128], r2
55         bne             1b
56         bx              lr
57         .endm
58
/*
 * pixels16_x2: 16-byte-wide horizontal half-pel filter, two rows per pass.
 * Every output byte is the halving add of a source byte and its right-hand
 * neighbour.
 *   vhadd  halving-add instruction; the default vrhadd.u8 rounds, and the
 *          _no_rnd instantiations further down pass vhadd.u8 instead
 * Register use, as read from the body:
 *   r0  destination (128-bit aligned stores)     r2  row step for r0 and r1
 *   r1  source; 24 bytes are loaded per row      r3  row count, must reach
 *                                                    exactly zero in steps of 2
 */
59         .macro pixels16_x2 vhadd=vrhadd.u8
60 1:      vld1.64         {d0-d2},   [r1], r2          /* row A: bytes 0..23, only 0..16 are used */
61         vld1.64         {d4-d6},   [r1], r2          /* row B */
62         pld             [r1]
63         pld             [r1, r2]
64         subs            r3,  r3,  #2                 /* flags stay live until the bne below */
65         vext.8          q1,  q0,  q1,  #1            /* q1 = row A bytes 1..16 */
66         \vhadd          q0,  q0,  q1                 /* average each byte with its right neighbour */
67         vext.8          q3,  q2,  q3,  #1            /* q3 = row B bytes 1..16 */
68         \vhadd          q2,  q2,  q3
69         vst1.64         {d0, d1},  [r0,:128], r2
70         vst1.64         {d4, d5},  [r0,:128], r2
71         bne             1b
72         bx              lr
73         .endm
74
/*
 * pixels16_y2: 16-byte-wide vertical half-pel filter, two rows per pass.
 * Output row n is the halving add of source rows n and n+1.
 *   vhadd  halving-add instruction (default vrhadd.u8, rounding)
 * Register use, as read from the body:
 *   r0  destination (128-bit aligned stores), stepped by r2
 *   r1  cursor over even source rows, stepped by lr = 2 * r2
 *   ip  cursor over odd source rows (starts at r1 + r2), stepped by lr
 *   r3  row count, must reach exactly zero in steps of 2
 * lr serves as a scratch register, hence push {lr} on entry and pop {pc}
 * as the return.
 * Note: each pass loads two rows ahead, so the last pass fetches one row
 * that is never consumed.
 */
75         .macro pixels16_y2 vhadd=vrhadd.u8
76         push            {lr}
77         add             ip,  r1,  r2                 /* ip -> row 1 */
78         lsl             lr,  r2,  #1                 /* lr = two-row step */
79         vld1.64         {d0, d1},  [r1], lr          /* q0 = row 0 */
80         vld1.64         {d2, d3},  [ip], lr          /* q1 = row 1 */
81 1:      subs            r3,  r3,  #2
82         \vhadd          q2,  q0,  q1                 /* out n   = avg(row n,   row n+1) */
83         vld1.64         {d0, d1},  [r1],      lr     /* q0 = row n+2 */
84         \vhadd          q3,  q0,  q1                 /* out n+1 = avg(row n+2, row n+1) */
85         vld1.64         {d2, d3},  [ip],      lr     /* q1 = row n+3, used by the next pass */
86         pld             [r1]
87         pld             [ip]
88         vst1.64         {d4, d5},  [r0,:128], r2
89         vst1.64         {d6, d7},  [r0,:128], r2
90         bne             1b
91         pop             {pc}
92         .endm
93
/*
 * pixels16_xy2: 16-byte-wide 2x2 average, two output rows per pass.
 * Each output byte is built from a pixel, its right neighbour and the same
 * two pixels one row further down: the four are summed in 16 bits and then
 * narrowed with a shift right by 2.
 *   vshrn   narrowing shift (default vrshrn.u16, which rounds)
 *   no_rnd  when non-zero, 1 is added to every 16-bit sum before the shift;
 *           the _no_rnd instantiations pair this with the truncating
 *           vshrn.u16
 * Register use, as read from the body:
 *   r0  destination (128-bit aligned stores), stepped by r2
 *   r1  cursor over even source rows, ip over odd rows, both stepped by lr
 *   r3  row count; the loop continues while it is still > 0 after
 *       subtracting 2
 *   q8/q10   horizontal pair sums (low/high 8 pixels) of the latest r1 row
 *   q9/q11   horizontal pair sums (low/high 8 pixels) of the latest ip row
 *   q13      constant 1 per 16-bit lane (no_rnd only)
 * lr is scratch, hence push {lr} / pop {pc}.
 */
94         .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
95         push            {lr}
96         lsl             lr,  r2,  #1                 /* lr = two-row step */
97         add             ip,  r1,  r2                 /* ip -> row 1 */
98         vld1.64         {d0-d2},   [r1], lr          /* row 0, 24 bytes */
99         vld1.64         {d4-d6},   [ip], lr          /* row 1, 24 bytes */
100 .if \no_rnd
101         vmov.i16        q13, #1
102 .endif
103         pld             [r1]
104         pld             [ip]
105         vext.8          q1,  q0,  q1,  #1            /* row 0 shifted left by one pixel */
106         vext.8          q3,  q2,  q3,  #1            /* row 1 shifted left by one pixel */
107         vaddl.u8        q8,  d0,  d2                 /* row 0 pair sums, pixels 0..7 */
108         vaddl.u8        q10, d1,  d3                 /* row 0 pair sums, pixels 8..15 */
109         vaddl.u8        q9,  d4,  d6                 /* row 1 pair sums, pixels 0..7 */
110         vaddl.u8        q11, d5,  d7                 /* row 1 pair sums, pixels 8..15 */
111 1:      subs            r3,  r3,  #2                 /* flags stay live until the bgt at the bottom */
112         vld1.64         {d0-d2},   [r1], lr          /* next even row */
113         vadd.u16        q12, q8,  q9                 /* 2x2 sums for the first output row, low half */
114         pld             [r1]
115 .if \no_rnd
116         vadd.u16        q12, q12, q13
117 .endif
118         vext.8          q15, q0,  q1,  #1            /* new even row shifted by one pixel */
119         vadd.u16        q1 , q10, q11                /* 2x2 sums, high half (q1 is free again here) */
120         \vshrn          d28, q12, #2
121 .if \no_rnd
122         vadd.u16        q1,  q1,  q13
123 .endif
124         \vshrn          d29, q1,  #2
125         vaddl.u8        q8,  d0,  d30                /* refresh even-row pair sums */
126         vld1.64         {d2-d4},   [ip], lr          /* next odd row lands in d2..d4 */
127         vaddl.u8        q10, d1,  d31
128         vst1.64         {d28,d29}, [r0,:128], r2     /* first output row of this pass */
129         vadd.u16        q12, q8,  q9                 /* 2x2 sums for the second output row, low half */
130         pld             [ip]
131 .if \no_rnd
132         vadd.u16        q12, q12, q13
133 .endif
134         vext.8          q2,  q1,  q2,  #1            /* new odd row shifted by one pixel */
135         vadd.u16        q0,  q10, q11                /* 2x2 sums, high half */
136         \vshrn          d30, q12, #2
137 .if \no_rnd
138         vadd.u16        q0,  q0,  q13
139 .endif
140         \vshrn          d31, q0,  #2
141         vaddl.u8        q9,  d2,  d4                 /* refresh odd-row pair sums */
142         vaddl.u8        q11, d3,  d5
143         vst1.64         {d30,d31}, [r0,:128], r2     /* second output row of this pass */
144         bgt             1b
145         pop             {pc}
146         .endm
147
/*
 * pixels8: copy an 8-byte-wide block, four rows per pass.
 * Register use, as read from the body:
 *   r0  destination (64-bit aligned stores)      r2  row step for r0 and r1
 *   r1  source (no alignment hint)               r3  row count, must reach
 *                                                    exactly zero in steps of 4
 * Ends with bx lr, so the expansion is a complete leaf-function body.
 */
148         .macro pixels8
149 1:      vld1.64         {d0}, [r1], r2
150         vld1.64         {d1}, [r1], r2
151         vld1.64         {d2}, [r1], r2
152         pld             [r1, r2, lsl #2]             /* prefetch hints for the rows of the next pass */
153         vld1.64         {d3}, [r1], r2
154         pld             [r1]
155         pld             [r1, r2]
156         pld             [r1, r2, lsl #1]
157         subs            r3,  r3,  #4                 /* flags consumed by the bne below */
158         vst1.64         {d0}, [r0,:64], r2
159         vst1.64         {d1}, [r0,:64], r2
160         vst1.64         {d2}, [r0,:64], r2
161         vst1.64         {d3}, [r0,:64], r2
162         bne             1b
163         bx              lr
164         .endm
165
/*
 * pixels8_x2: 8-byte-wide horizontal half-pel filter, two rows per pass.
 * Every output byte is the halving add of a source byte and its right-hand
 * neighbour. Both rows are packed into one q register so that a single
 * halving add covers them.
 *   vhadd  halving-add instruction (default vrhadd.u8, rounding)
 * Register use, as read from the body:
 *   r0  destination (64-bit aligned stores)      r2  row step for r0 and r1
 *   r1  source; 16 bytes are loaded per row      r3  row count, must reach
 *                                                    exactly zero in steps of 2
 */
166         .macro pixels8_x2 vhadd=vrhadd.u8
167 1:      vld1.64         {d0, d1},  [r1], r2          /* row A bytes 0..15 */
168         vext.8          d1,  d0,  d1,  #1            /* d1 = row A bytes 1..8 */
169         vld1.64         {d2, d3},  [r1], r2          /* row B bytes 0..15 */
170         vext.8          d3,  d2,  d3,  #1            /* d3 = row B bytes 1..8 */
171         pld             [r1]
172         pld             [r1, r2]
173         subs            r3,  r3,  #2
174         vswp            d1,  d2                      /* q0 = A:B unshifted, q1 = A:B shifted */
175         \vhadd          q0,  q0,  q1
176         vst1.64         {d0},      [r0,:64], r2      /* filtered row A */
177         vst1.64         {d1},      [r0,:64], r2      /* filtered row B */
178         bne             1b
179         bx              lr
180         .endm
181
/*
 * pixels8_y2: 8-byte-wide vertical half-pel filter, two rows per pass.
 * Output row n is the halving add of source rows n and n+1.
 *   vhadd  halving-add instruction (default vrhadd.u8, rounding)
 * Register use, as read from the body:
 *   r0  destination (64-bit aligned stores), stepped by r2
 *   r1  cursor over even source rows, stepped by lr = 2 * r2
 *   ip  cursor over odd source rows (starts at r1 + r2), stepped by lr
 *   r3  row count, must reach exactly zero in steps of 2
 * lr is scratch, hence push {lr} / pop {pc}.
 * Note: each pass loads two rows ahead, so the last pass fetches one row
 * that is never consumed.
 */
182         .macro pixels8_y2 vhadd=vrhadd.u8
183         push            {lr}
184         add             ip,  r1,  r2                 /* ip -> row 1 */
185         lsl             lr,  r2,  #1                 /* lr = two-row step */
186         vld1.64         {d0},      [r1], lr          /* d0 = row 0 */
187         vld1.64         {d1},      [ip], lr          /* d1 = row 1 */
188 1:      subs            r3,  r3,  #2
189         \vhadd          d4,  d0,  d1                 /* out n   = avg(row n,   row n+1) */
190         vld1.64         {d0},      [r1],     lr      /* d0 = row n+2 */
191         \vhadd          d5,  d0,  d1                 /* out n+1 = avg(row n+2, row n+1) */
192         vld1.64         {d1},      [ip],     lr      /* d1 = row n+3, used by the next pass */
193         pld             [r1]
194         pld             [ip]
195         vst1.64         {d4},      [r0,:64], r2
196         vst1.64         {d5},      [r0,:64], r2
197         bne             1b
198         pop             {pc}
199         .endm
200
/*
 * pixels8_xy2: 8-byte-wide 2x2 average, two output rows per pass.
 * Each output byte is built from a pixel, its right neighbour and the same
 * two pixels one row further down: the four are summed in 16 bits and then
 * narrowed with a shift right by 2.
 *   vshrn   narrowing shift (default vrshrn.u16, which rounds)
 *   no_rnd  when non-zero, 1 is added to every 16-bit sum before the shift
 * Register use, as read from the body:
 *   r0  destination (64-bit aligned stores), stepped by r2
 *   r1  cursor over even source rows, ip over odd rows, both stepped by lr
 *   r3  row count; the loop continues while it is still > 0 after
 *       subtracting 2
 *   q8   horizontal pair sums of the latest r1 row
 *   q9   horizontal pair sums of the latest ip row
 *   q11  constant 1 per 16-bit lane (no_rnd only)
 * lr is scratch, hence push {lr} / pop {pc}.
 */
201         .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
202         push            {lr}
203         lsl             lr,  r2,  #1                 /* lr = two-row step */
204         add             ip,  r1,  r2                 /* ip -> row 1 */
205         vld1.64         {d0, d1},  [r1], lr          /* row 0, 16 bytes */
206         vld1.64         {d2, d3},  [ip], lr          /* row 1, 16 bytes */
207 .if \no_rnd
208         vmov.i16        q11, #1
209 .endif
210         pld             [r1]
211         pld             [ip]
212         vext.8          d4,  d0,  d1,  #1            /* row 0 bytes 1..8 */
213         vext.8          d6,  d2,  d3,  #1            /* row 1 bytes 1..8 */
214         vaddl.u8        q8,  d0,  d4                 /* row 0 pair sums */
215         vaddl.u8        q9,  d2,  d6                 /* row 1 pair sums */
216 1:      subs            r3,  r3,  #2                 /* flags stay live until the bgt at the bottom */
217         vld1.64         {d0, d1},  [r1], lr          /* next even row */
218         pld             [r1]
219         vadd.u16        q10, q8,  q9                 /* 2x2 sums for the first output row */
220         vext.8          d4,  d0,  d1,  #1
221 .if \no_rnd
222         vadd.u16        q10, q10, q11
223 .endif
224         vaddl.u8        q8,  d0,  d4                 /* refresh even-row pair sums */
225         \vshrn          d5,  q10, #2
226         vld1.64         {d2, d3},  [ip], lr          /* next odd row */
227         vadd.u16        q10, q8,  q9                 /* 2x2 sums for the second output row */
228         pld             [ip]
229 .if \no_rnd
230         vadd.u16        q10, q10, q11
231 .endif
232         vst1.64         {d5},      [r0,:64], r2      /* first output row of this pass */
233         \vshrn          d7,  q10, #2
234         vext.8          d6,  d2,  d3,  #1
235         vaddl.u8        q9,  d2,  d6                 /* refresh odd-row pair sums */
236         vst1.64         {d7},      [r0,:64], r2      /* second output row of this pass */
237         bgt             1b
238         pop             {pc}
239         .endm
240
/*
 * pixfunc pfx name suf rnd_op args...
 * Emits one exported function called ff_<pfx><name><suf>_neon whose whole
 * body is the expansion of the macro <name>, invoked with <rnd_op> followed
 * by any remaining arguments.
 * The function / .endfunc wrappers are not defined in this file
 * (presumably supplied by asm.S - TODO confirm).
 */
241         .macro pixfunc pfx name suf rnd_op args:vararg
242 function ff_\pfx\name\suf\()_neon, export=1
243         \name \rnd_op \args
244         .endfunc
245         .endm
246
/*
 * pixfunc2 pfx name args...
 * Emits two functions from the same body macro: first ff_<pfx><name>_neon
 * with the body macro's default arguments, then a second variant in which
 * args supplies the name suffix and the operands forwarded by pixfunc.
 */
247         .macro pixfunc2 pfx name args:vararg
248         pixfunc \pfx \name
249         pixfunc \pfx \name \args
250         .endm
251
/*
 * ff_put_h264_qpel16_mc00_neon: sets the row count r3 to 16 and contains
 * no return or branch, so execution runs straight on into the function
 * emitted next (ff_put_pixels16_neon). That instantiation therefore has to
 * stay directly below this stub.
 */
252 function ff_put_h264_qpel16_mc00_neon, export=1
253         mov   r3, #16
254         .endfunc
255
/*
 * 16-wide put family. Each pixfunc2 line emits the default (rounding)
 * variant plus a _no_rnd variant built with the truncating instruction;
 * for xy2 the trailing 1 sets no_rnd so that the +1 bias is applied.
 */
256         pixfunc  put_ pixels16
257         pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
258         pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
259         pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
260
/*
 * ff_avg_h264_qpel16_mc00_neon: sets the row count r3 to 16 and, having no
 * return of its own, falls through into ff_avg_pixels16_neon, which is
 * emitted immediately below and must stay there.
 */
261 function ff_avg_h264_qpel16_mc00_neon, export=1
262         mov   r3, #16
263         .endfunc
264
/* The suffix is left empty; the 1 lands in rnd_op and so becomes avg=1 for pixels16. */
265         pixfunc  avg_ pixels16,, 1
266
/*
 * ff_put_h264_qpel8_mc00_neon: sets the row count r3 to 8 and, having no
 * return of its own, falls through into ff_put_pixels8_neon, which is
 * emitted immediately below and must stay there.
 */
267 function ff_put_h264_qpel8_mc00_neon, export=1
268         mov   r3, #8
269         .endfunc
270
/*
 * 8-wide put family. Each pixfunc2 line emits the default (rounding)
 * variant plus a _no_rnd variant built with the truncating instruction;
 * for xy2 the trailing 1 sets no_rnd so that the +1 bias is applied.
 */
271         pixfunc  put_ pixels8
272         pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
273         pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
274         pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
275
/*
 * ff_float_to_int16_neon: convert an array of floats to 16-bit integers.
 *
 * Register use, as read from the body:
 *   r0  destination, written 16 bytes at a time with a 128-bit alignment hint
 *   r1  source floats, read 16 bytes at a time with a 128-bit alignment hint
 *   r2  element count; the code works in groups of 8 and tests for exact
 *       zero, so it must be a positive multiple of 8
 *   ip  scratch loop counter
 *
 * Each value is converted by vcvt.s32.f32 with 16 fraction bits; the
 * narrowing shift by 16 then keeps the upper halfword of every 32-bit lane.
 * The code is software pipelined: on arrival at labels 1, 2 and 3, q8/q9
 * hold eight converted values that have not been stored yet.
 */
276 function ff_float_to_int16_neon, export=1
277         subs            r2,  r2,  #8                 /* account for the eight values primed below */
278         vld1.64         {d0-d1},  [r1,:128]!
279         vcvt.s32.f32    q8,  q0,  #16
280         vld1.64         {d2-d3},  [r1,:128]!
281         vcvt.s32.f32    q9,  q1,  #16
282         beq             3f                           /* exactly 8 elements: just flush */
283         bics            ip,  r2,  #15                /* ip = remaining count rounded down to 16 */
284         beq             2f                           /* fewer than 16 left: single tail group */
/* Main loop: emits 16 results and primes 8 more per pass. */
285 1:      subs            ip,  ip,  #16
286         vshrn.s32       d4,  q8,  #16
287         vld1.64         {d0-d1},  [r1,:128]!
288         vcvt.s32.f32    q0,  q0,  #16
289         vshrn.s32       d5,  q9,  #16
290         vld1.64         {d2-d3},  [r1,:128]!
291         vcvt.s32.f32    q1,  q1,  #16
292         vshrn.s32       d6,  q0,  #16
293         vst1.64         {d4-d5},  [r0,:128]!
294         vshrn.s32       d7,  q1,  #16
295         vld1.64         {d16-d17},[r1,:128]!         /* prime the next pending group */
296         vcvt.s32.f32    q8,  q8,  #16
297         vld1.64         {d18-d19},[r1,:128]!
298         vcvt.s32.f32    q9,  q9,  #16
299         vst1.64         {d6-d7},  [r0,:128]!
300         bne             1b
301         ands            r2,  r2,  #15                /* any group of 8 left beyond the pending one? */
302         beq             3f
/* Tail: pending 8 plus one further group of 8. */
303 2:      vld1.64         {d0-d1},  [r1,:128]!
304         vshrn.s32       d4,  q8,  #16
305         vcvt.s32.f32    q0,  q0,  #16
306         vld1.64         {d2-d3},  [r1,:128]!
307         vshrn.s32       d5,  q9,  #16
308         vcvt.s32.f32    q1,  q1,  #16
309         vshrn.s32       d6,  q0,  #16
310         vst1.64         {d4-d5},  [r0,:128]!
311         vshrn.s32       d7,  q1,  #16
312         vst1.64         {d6-d7},  [r0,:128]!
313         bx              lr
/* Flush only the pending 8. */
314 3:      vshrn.s32       d4,  q8,  #16
315         vshrn.s32       d5,  q9,  #16
316         vst1.64         {d4-d5},  [r0,:128]!
317         bx              lr
318         .endfunc
319
/*
 * ff_float_to_int16_interleave_neon: convert several float channels to
 * 16-bit integers and write them interleaved, one sample of every channel
 * after another.
 *
 * Register use on entry, as read from the body:
 *   r0  destination
 *   r1  pointer to an array of per-channel source pointers
 *   r2  samples per channel; the loops count in steps of 8 or 16 and test
 *       for zero, so it must be a positive multiple of 8
 *   r3  channel count
 * All source loads carry a 128-bit alignment hint.
 *
 * Dispatch:
 *   fewer than 2 channels  tail-branch to ff_float_to_int16_neon with the
 *                          single source pointer
 *   exactly 2 channels     dedicated path with contiguous 16-byte stores
 *   anything else          generic path: channels are consumed in groups
 *                          of 4, then 2, then 1; every group rescans all
 *                          samples and writes with a stride of
 *                          2 * channels bytes (ip)
 *
 * Conversion: vcvt.s32.f32 with 16 fraction bits leaves the 16-bit result
 * in the upper halfword of each 32-bit lane. vsri.32 X, Y, #16 then packs
 * two channels into one lane: low halfword from Y, high halfword from X.
 */
320 function ff_float_to_int16_interleave_neon, export=1
321         cmp             r3, #2
322         ldrlt           r1, [r1]                     /* mono: fetch the only source pointer */
323         blt             ff_float_to_int16_neon       /* branch without link: that routine returns to our caller */
324         bne             4f                           /* not exactly 2 channels: generic path */
325
/* ---- exactly 2 channels: r3 = channel 0 source, r1 = channel 1 source ---- */
326         ldr             r3, [r1]
327         ldr             r1, [r1, #4]
328
329         subs            r2,  r2,  #8                 /* eight samples per channel are primed below */
330         vld1.64         {d0-d1},  [r3,:128]!
331         vcvt.s32.f32    q8,  q0,  #16                /* q8/q9   = pending channel 0 */
332         vld1.64         {d2-d3},  [r3,:128]!
333         vcvt.s32.f32    q9,  q1,  #16
334         vld1.64         {d20-d21},[r1,:128]!
335         vcvt.s32.f32    q10, q10, #16                /* q10/q11 = pending channel 1 */
336         vld1.64         {d22-d23},[r1,:128]!
337         vcvt.s32.f32    q11, q11, #16
338         beq             3f                           /* exactly 8 samples: just flush */
339         bics            ip,  r2,  #15                /* ip = remaining samples rounded down to 16 */
340         beq             2f
/* Main loop: stores 16 sample pairs and primes 8 more per pass. */
341 1:      subs            ip,  ip,  #16
342         vld1.64         {d0-d1},  [r3,:128]!
343         vcvt.s32.f32    q0,  q0,  #16
344         vsri.32         q10, q8,  #16                /* lane = channel 0 low, channel 1 high */
345         vld1.64         {d2-d3},  [r3,:128]!
346         vcvt.s32.f32    q1,  q1,  #16
347         vld1.64         {d24-d25},[r1,:128]!
348         vcvt.s32.f32    q12, q12, #16
349         vld1.64         {d26-d27},[r1,:128]!
350         vsri.32         q11, q9,  #16
351         vst1.64         {d20-d21},[r0,:128]!
352         vcvt.s32.f32    q13, q13, #16
353         vst1.64         {d22-d23},[r0,:128]!
354         vsri.32         q12, q0,  #16
355         vld1.64         {d16-d17},[r3,:128]!         /* prime the next pending group */
356         vsri.32         q13, q1,  #16
357         vst1.64         {d24-d25},[r0,:128]!
358         vcvt.s32.f32    q8,  q8,  #16
359         vld1.64         {d18-d19},[r3,:128]!
360         vcvt.s32.f32    q9,  q9,  #16
361         vld1.64         {d20-d21},[r1,:128]!
362         vcvt.s32.f32    q10, q10, #16
363         vld1.64         {d22-d23},[r1,:128]!
364         vcvt.s32.f32    q11, q11, #16
365         vst1.64         {d26-d27},[r0,:128]!
366         bne             1b
367         ands            r2,  r2,  #15                /* any group of 8 left beyond the pending one? */
368         beq             3f
/* Tail: pending 8 pairs plus one further group of 8. */
369 2:      vsri.32         q10, q8,  #16
370         vld1.64         {d0-d1},  [r3,:128]!
371         vcvt.s32.f32    q0,  q0,  #16
372         vld1.64         {d2-d3},  [r3,:128]!
373         vcvt.s32.f32    q1,  q1,  #16
374         vld1.64         {d24-d25},[r1,:128]!
375         vcvt.s32.f32    q12, q12, #16
376         vsri.32         q11, q9,  #16
377         vld1.64         {d26-d27},[r1,:128]!
378         vcvt.s32.f32    q13, q13, #16
379         vst1.64         {d20-d21},[r0,:128]!
380         vsri.32         q12, q0,  #16
381         vst1.64         {d22-d23},[r0,:128]!
382         vsri.32         q13, q1,  #16
383         vst1.64         {d24-d27},[r0,:128]!
384         bx              lr
/* Flush only the pending 8 pairs. */
385 3:      vsri.32         q10, q8,  #16
386         vsri.32         q11, q9,  #16
387         vst1.64         {d20-d23},[r0,:128]!
388         bx              lr
389
/* ---- generic path: r4-r8 and lr are needed as extra scratch ---- */
390 4:      push            {r4-r8,lr}
391         cmp             r3,  #4
392         lsl             ip,  r3,  #1                 /* ip = output stride: 2 bytes * channel count (add/lsl leave flags alone) */
393         blt             4f                           /* fewer than 4 channels: go to the 2-channel group */
394
395         @ 4 channels
/* r4-r7 = four source cursors, lr = samples left, r8 = write cursor. */
396 5:      ldmia           r1!, {r4-r7}
397         mov             lr,  r2
398         mov             r8,  r0
399         vld1.64         {d16-d17},[r4,:128]!         /* prime four samples of each channel */
400         vcvt.s32.f32    q8,  q8,  #16
401         vld1.64         {d18-d19},[r5,:128]!
402         vcvt.s32.f32    q9,  q9,  #16
403         vld1.64         {d20-d21},[r6,:128]!
404         vcvt.s32.f32    q10, q10, #16
405         vld1.64         {d22-d23},[r7,:128]!
406         vcvt.s32.f32    q11, q11, #16
/* Each pass handles eight samples per channel; NEON ops leave the flags from subs intact. */
407 6:      subs            lr,  lr,  #8
408         vld1.64         {d0-d1},  [r4,:128]!
409         vcvt.s32.f32    q0,  q0,  #16
410         vsri.32         q9,  q8,  #16                /* q9  lanes = channels 0,1 of four samples */
411         vld1.64         {d2-d3},  [r5,:128]!
412         vcvt.s32.f32    q1,  q1,  #16
413         vsri.32         q11, q10, #16                /* q11 lanes = channels 2,3 of four samples */
414         vld1.64         {d4-d5},  [r6,:128]!
415         vcvt.s32.f32    q2,  q2,  #16
416         vzip.32         d18, d22                     /* d18 = sample 0 (4 channels), d22 = sample 1 */
417         vld1.64         {d6-d7},  [r7,:128]!
418         vcvt.s32.f32    q3,  q3,  #16
419         vzip.32         d19, d23                     /* d19 = sample 2, d23 = sample 3 */
420         vst1.64         {d18},    [r8], ip
421         vsri.32         q1,  q0,  #16
422         vst1.64         {d22},    [r8], ip
423         vsri.32         q3,  q2,  #16
424         vst1.64         {d19},    [r8], ip
425         vzip.32         d2,  d6
426         vst1.64         {d23},    [r8], ip
427         vzip.32         d3,  d7
428         beq             7f                           /* no samples left: flush the last four and finish the group */
429         vld1.64         {d16-d17},[r4,:128]!         /* prime the next four while storing */
430         vcvt.s32.f32    q8,  q8,  #16
431         vst1.64         {d2},     [r8], ip
432         vld1.64         {d18-d19},[r5,:128]!
433         vcvt.s32.f32    q9,  q9,  #16
434         vst1.64         {d6},     [r8], ip
435         vld1.64         {d20-d21},[r6,:128]!
436         vcvt.s32.f32    q10, q10, #16
437         vst1.64         {d3},     [r8], ip
438         vld1.64         {d22-d23},[r7,:128]!
439         vcvt.s32.f32    q11, q11, #16
440         vst1.64         {d7},     [r8], ip
441         b               6b
442 7:      vst1.64         {d2},     [r8], ip
443         vst1.64         {d6},     [r8], ip
444         vst1.64         {d3},     [r8], ip
445         vst1.64         {d7},     [r8], ip
446         subs            r3,  r3,  #4                 /* four channels finished */
447         popeq           {r4-r8,pc}                   /* none left: return */
448         cmp             r3,  #4
449         add             r0,  r0,  #8                 /* skip the four 16-bit slots just filled (flags untouched) */
450         bge             5b                           /* another full group of four */
451
452         @ 2 channels
/* r4/r5 = two source cursors, lr = samples left, r8 = write cursor. */
453 4:      cmp             r3,  #2
454         blt             4f                           /* a single channel remains */
455         ldmia           r1!, {r4-r5}
456         mov             lr,  r2
457         mov             r8,  r0
458         tst             lr,  #8                      /* odd number of 8-sample groups? flags are used by the beq below */
459         vld1.64         {d16-d17},[r4,:128]!         /* prime eight samples of each channel */
460         vcvt.s32.f32    q8,  q8,  #16
461         vld1.64         {d18-d19},[r5,:128]!
462         vcvt.s32.f32    q9,  q9,  #16
463         vld1.64         {d20-d21},[r4,:128]!
464         vcvt.s32.f32    q10, q10, #16
465         vld1.64         {d22-d23},[r5,:128]!
466         vcvt.s32.f32    q11, q11, #16
467         beq             6f                           /* count is a multiple of 16: straight to the 16-sample loop */
468         subs            lr,  lr,  #8
469         beq             7f                           /* exactly 8 samples: flush and finish */
/* Peel one group of 8 so that the loop below always sees a multiple of 16. */
470         vsri.32         d18, d16, #16
471         vsri.32         d19, d17, #16
472         vld1.64         {d16-d17},[r4,:128]!
473         vcvt.s32.f32    q8,  q8,  #16
474         vst1.32         {d18[0]}, [r8], ip           /* one 32-bit store = one sample of both channels */
475         vsri.32         d22, d20, #16
476         vst1.32         {d18[1]}, [r8], ip
477         vsri.32         d23, d21, #16
478         vst1.32         {d19[0]}, [r8], ip
479         vst1.32         {d19[1]}, [r8], ip
480         vld1.64         {d18-d19},[r5,:128]!
481         vcvt.s32.f32    q9,  q9,  #16
482         vst1.32         {d22[0]}, [r8], ip
483         vst1.32         {d22[1]}, [r8], ip
484         vld1.64         {d20-d21},[r4,:128]!
485         vcvt.s32.f32    q10, q10, #16
486         vst1.32         {d23[0]}, [r8], ip
487         vst1.32         {d23[1]}, [r8], ip
488         vld1.64         {d22-d23},[r5,:128]!
489         vcvt.s32.f32    q11, q11, #16
/* 16 samples per pass: the pending 8 are stored while 8 new ones are converted. */
490 6:      subs            lr,  lr,  #16
491         vld1.64         {d0-d1},  [r4,:128]!
492         vcvt.s32.f32    q0,  q0,  #16
493         vsri.32         d18, d16, #16
494         vld1.64         {d2-d3},  [r5,:128]!
495         vcvt.s32.f32    q1,  q1,  #16
496         vsri.32         d19, d17, #16
497         vld1.64         {d4-d5},  [r4,:128]!
498         vcvt.s32.f32    q2,  q2,  #16
499         vld1.64         {d6-d7},  [r5,:128]!
500         vcvt.s32.f32    q3,  q3,  #16
501         vst1.32         {d18[0]}, [r8], ip
502         vsri.32         d22, d20, #16
503         vst1.32         {d18[1]}, [r8], ip
504         vsri.32         d23, d21, #16
505         vst1.32         {d19[0]}, [r8], ip
506         vsri.32         d2,  d0,  #16
507         vst1.32         {d19[1]}, [r8], ip
508         vsri.32         d3,  d1,  #16
509         vst1.32         {d22[0]}, [r8], ip
510         vsri.32         d6,  d4,  #16
511         vst1.32         {d22[1]}, [r8], ip
512         vsri.32         d7,  d5,  #16
513         vst1.32         {d23[0]}, [r8], ip
514         vst1.32         {d23[1]}, [r8], ip
515         beq             6f                           /* no samples left: flush d2/d3/d6/d7 below */
516         vld1.64         {d16-d17},[r4,:128]!         /* prime the next eight while storing */
517         vcvt.s32.f32    q8,  q8,  #16
518         vst1.32         {d2[0]},  [r8], ip
519         vst1.32         {d2[1]},  [r8], ip
520         vld1.64         {d18-d19},[r5,:128]!
521         vcvt.s32.f32    q9,  q9,  #16
522         vst1.32         {d3[0]},  [r8], ip
523         vst1.32         {d3[1]},  [r8], ip
524         vld1.64         {d20-d21},[r4,:128]!
525         vcvt.s32.f32    q10, q10, #16
526         vst1.32         {d6[0]},  [r8], ip
527         vst1.32         {d6[1]},  [r8], ip
528         vld1.64         {d22-d23},[r5,:128]!
529         vcvt.s32.f32    q11, q11, #16
530         vst1.32         {d7[0]},  [r8], ip
531         vst1.32         {d7[1]},  [r8], ip
532         bgt             6b                           /* flags still come from the subs at the loop head */
533 6:      vst1.32         {d2[0]},  [r8], ip
534         vst1.32         {d2[1]},  [r8], ip
535         vst1.32         {d3[0]},  [r8], ip
536         vst1.32         {d3[1]},  [r8], ip
537         vst1.32         {d6[0]},  [r8], ip
538         vst1.32         {d6[1]},  [r8], ip
539         vst1.32         {d7[0]},  [r8], ip
540         vst1.32         {d7[1]},  [r8], ip
541         b               8f
/* Exactly 8 samples in total: pack and store the primed data. */
542 7:      vsri.32         d18, d16, #16
543         vsri.32         d19, d17, #16
544         vst1.32         {d18[0]}, [r8], ip
545         vsri.32         d22, d20, #16
546         vst1.32         {d18[1]}, [r8], ip
547         vsri.32         d23, d21, #16
548         vst1.32         {d19[0]}, [r8], ip
549         vst1.32         {d19[1]}, [r8], ip
550         vst1.32         {d22[0]}, [r8], ip
551         vst1.32         {d22[1]}, [r8], ip
552         vst1.32         {d23[0]}, [r8], ip
553         vst1.32         {d23[1]}, [r8], ip
554 8:      subs            r3,  r3,  #2                 /* two channels finished */
555         add             r0,  r0,  #4                 /* skip the two 16-bit slots just filled (flags untouched) */
556         popeq           {r4-r8,pc}                   /* none left: return */
557
558         @ 1 channel
/* r4 = source cursor, lr = samples left, r5 = write cursor. The odd 16-bit lanes stored below are the upper halfwords of the 32-bit results. */
559 4:      ldr             r4,  [r1],#4
560         tst             r2,  #8                      /* odd number of 8-sample groups? flags are used by the bne below */
561         mov             lr,  r2
562         mov             r5,  r0
563         vld1.64         {d0-d1},  [r4,:128]!         /* prime eight samples */
564         vcvt.s32.f32    q0,  q0,  #16
565         vld1.64         {d2-d3},  [r4,:128]!
566         vcvt.s32.f32    q1,  q1,  #16
567         bne             8f                           /* peel one group of 8 first */
/* 16 samples per pass. */
568 6:      subs            lr,  lr,  #16
569         vld1.64         {d4-d5},  [r4,:128]!
570         vcvt.s32.f32    q2,  q2,  #16
571         vld1.64         {d6-d7},  [r4,:128]!
572         vcvt.s32.f32    q3,  q3,  #16
573         vst1.16         {d0[1]},  [r5,:16], ip
574         vst1.16         {d0[3]},  [r5,:16], ip
575         vst1.16         {d1[1]},  [r5,:16], ip
576         vst1.16         {d1[3]},  [r5,:16], ip
577         vst1.16         {d2[1]},  [r5,:16], ip
578         vst1.16         {d2[3]},  [r5,:16], ip
579         vst1.16         {d3[1]},  [r5,:16], ip
580         vst1.16         {d3[3]},  [r5,:16], ip
581         beq             7f                           /* nothing further to prime */
582         vld1.64         {d0-d1},  [r4,:128]!
583         vcvt.s32.f32    q0,  q0,  #16
584         vld1.64         {d2-d3},  [r4,:128]!
585         vcvt.s32.f32    q1,  q1,  #16
586 7:      vst1.16         {d4[1]},  [r5,:16], ip
587         vst1.16         {d4[3]},  [r5,:16], ip
588         vst1.16         {d5[1]},  [r5,:16], ip
589         vst1.16         {d5[3]},  [r5,:16], ip
590         vst1.16         {d6[1]},  [r5,:16], ip
591         vst1.16         {d6[3]},  [r5,:16], ip
592         vst1.16         {d7[1]},  [r5,:16], ip
593         vst1.16         {d7[3]},  [r5,:16], ip
594         bgt             6b                           /* flags still come from the subs at the loop head */
595         pop             {r4-r8,pc}
/* Peeled group of 8 for counts that are an odd multiple of 8. */
596 8:      subs            lr,  lr,  #8
597         vst1.16         {d0[1]},  [r5,:16], ip
598         vst1.16         {d0[3]},  [r5,:16], ip
599         vst1.16         {d1[1]},  [r5,:16], ip
600         vst1.16         {d1[3]},  [r5,:16], ip
601         vst1.16         {d2[1]},  [r5,:16], ip
602         vst1.16         {d2[3]},  [r5,:16], ip
603         vst1.16         {d3[1]},  [r5,:16], ip
604         vst1.16         {d3[3]},  [r5,:16], ip
605         popeq           {r4-r8,pc}                   /* exactly 8 samples: done */
606         vld1.64         {d0-d1},  [r4,:128]!
607         vcvt.s32.f32    q0,  q0,  #16
608         vld1.64         {d2-d3},  [r4,:128]!
609         vcvt.s32.f32    q1,  q1,  #16
610         b               6b
611         .endfunc
612
/*
 * ff_vector_fmul_neon: element-wise float multiply, in place.
 * The products of the r0 and r1 arrays are written back over the r0 array.
 *
 * Register use, as read from the body:
 *   r0  first operand, read cursor (128-bit aligned loads)
 *   r1  second operand, read cursor (128-bit aligned loads)
 *   r2  element count; the code works in groups of 8 and tests for exact
 *       zero, so it must be a positive multiple of 8
 *   r3  write cursor, starts at the original r0 (128-bit aligned stores)
 *   ip  scratch loop counter
 * Software pipelined: on arrival at labels 1, 2 and 3, q8/q9 hold eight
 * products that have not been stored yet.
 */
613 function ff_vector_fmul_neon, export=1
614         mov             r3,  r0                      /* results overwrite the first operand */
615         subs            r2,  r2,  #8                 /* account for the eight products primed below */
616         vld1.64         {d0-d3},  [r0,:128]!
617         vld1.64         {d4-d7},  [r1,:128]!
618         vmul.f32        q8,  q0,  q2
619         vmul.f32        q9,  q1,  q3
620         beq             3f                           /* exactly 8 elements: just flush */
621         bics            ip,  r2,  #15                /* ip = remaining count rounded down to 16 */
622         beq             2f
/* Main loop: stores 16 products and primes 8 more per pass. */
623 1:      subs            ip,  ip,  #16
624         vld1.64         {d0-d1},  [r0,:128]!
625         vld1.64         {d4-d5},  [r1,:128]!
626         vmul.f32        q10, q0,  q2
627         vld1.64         {d2-d3},  [r0,:128]!
628         vld1.64         {d6-d7},  [r1,:128]!
629         vmul.f32        q11, q1,  q3
630         vst1.64         {d16-d19},[r3,:128]!         /* store the pending eight */
631         vld1.64         {d0-d1},  [r0,:128]!
632         vld1.64         {d4-d5},  [r1,:128]!
633         vmul.f32        q8,  q0,  q2                 /* prime the next pending eight */
634         vld1.64         {d2-d3},  [r0,:128]!
635         vld1.64         {d6-d7},  [r1,:128]!
636         vmul.f32        q9,  q1,  q3
637         vst1.64         {d20-d23},[r3,:128]!
638         bne             1b
639         ands            r2,  r2,  #15                /* any group of 8 left beyond the pending one? */
640         beq             3f
/* Tail: store the pending eight while computing one further group, then fall into the flush. */
641 2:      vld1.64         {d0-d1},  [r0,:128]!
642         vld1.64         {d4-d5},  [r1,:128]!
643         vst1.64         {d16-d17},[r3,:128]!
644         vmul.f32        q8,  q0,  q2
645         vld1.64         {d2-d3},  [r0,:128]!
646         vld1.64         {d6-d7},  [r1,:128]!
647         vst1.64         {d18-d19},[r3,:128]!
648         vmul.f32        q9,  q1,  q3
649 3:      vst1.64         {d16-d19},[r3,:128]!         /* flush the pending eight */
650         bx              lr
651         .endfunc
652
/*
 * ff_vector_fmul_window_neon: windowed combine of two float arrays with an
 * additive bias, producing one output run written forwards and one written
 * backwards.
 *
 * Inputs, as read from the body (n = second stack word):
 *   r0        output; written forwards from r0 and backwards from r0 + 8n - 16
 *   r1        first input (s0), read forwards
 *   r2        second input (s1), read backwards starting at r2 + 4n - 16
 *   r3        window, read forwards (wf) and backwards from r3 + 8n - 16 (wb)
 *   [sp]      32-bit value broadcast to all lanes of q8 and used as the
 *             starting value of every accumulator (presumably the float
 *             bias argument - TODO confirm against the C prototype)
 *   [sp, #4]  n, the loop count; four elements per pass, loop exits on
 *             exactly zero
 * All vector accesses carry a 128-bit alignment hint.
 *
 * Per group of four elements (rev = element order reversed):
 *   forward output   = bias + s0 * rev(wb) - rev(s1) * wf
 *   backward output  = rev(bias + s0 * wf + rev(s1) * rev(wb))
 * The four-element reversal is obtained from vrev64.32 on a q register
 * combined with using its two d halves in swapped order.
 */
653 function ff_vector_fmul_window_neon, export=1
654         vld1.32         {d16[],d17[]}, [sp,:32]      /* q8 = first stack word in every lane (read before the push moves sp) */
655         push            {r4,r5,lr}
656         ldr             lr,  [sp, #16]               /* second stack word: 12 bytes were just pushed */
657         sub             r2,  r2,  #8
658         sub             r5,  lr,  #2
659         add             r2,  r2,  r5, lsl #2         /* r2 -> last four floats of s1 */
660         add             r4,  r3,  r5, lsl #3         /* r4 -> backward window cursor */
661         add             ip,  r0,  r5, lsl #3         /* ip -> backward output cursor */
662         mov             r5,  #-16                    /* step for the backward cursors */
663         vld1.64         {d0,d1},  [r1,:128]!         /* q0 = s0 */
664         vld1.64         {d2,d3},  [r2,:128], r5      /* q1 = s1 (still in memory order) */
665         vld1.64         {d4,d5},  [r3,:128]!         /* q2 = wf */
666         vld1.64         {d6,d7},  [r4,:128], r5      /* q3 = wb (still in memory order) */
667 1:      subs            lr,  lr,  #4                 /* NEON ops leave these flags intact for the beq below */
668         vmov            q11, q8                      /* q11 accumulates the backward output */
669         vmla.f32        d22, d0,  d4
670         vmov            q10, q8                      /* q10 accumulates the forward output */
671         vmla.f32        d23, d1,  d5
672         vrev64.32       q3,  q3
673         vmla.f32        d20, d0,  d7
674         vrev64.32       q1,  q1
675         vmla.f32        d21, d1,  d6
676         beq             2f                           /* last group: finish without further loads */
677         vmla.f32        d22, d3,  d7
678         vld1.64         {d0,d1},  [r1,:128]!         /* next s0 */
679         vmla.f32        d23, d2,  d6
680         vld1.64         {d18,d19},[r2,:128], r5      /* next s1, parked in q9 while q1 is still in use */
681         vmls.f32        d20, d3,  d4
682         vld1.64         {d24,d25},[r3,:128]!         /* next wf, parked in q12 while q2 is still in use */
683         vmls.f32        d21, d2,  d5
684         vld1.64         {d6,d7},  [r4,:128], r5      /* next wb */
685         vmov            q1,  q9
686         vrev64.32       q11, q11                     /* reverse the four results for the backward store */
687         vmov            q2,  q12
688         vswp            d22, d23
689         vst1.64         {d20,d21},[r0,:128]!
690         vst1.64         {d22,d23},[ip,:128], r5
691         b               1b
692 2:      vmla.f32        d22, d3,  d7
693         vmla.f32        d23, d2,  d6
694         vmls.f32        d20, d3,  d4
695         vmls.f32        d21, d2,  d5
696         vrev64.32       q11, q11
697         vswp            d22, d23
698         vst1.64         {d20,d21},[r0,:128]!
699         vst1.64         {d22,d23},[ip,:128], r5
700         pop             {r4,r5,pc}
701         .endfunc