rtime.felk.cvut.cz Git - frescor/ffmpeg.git/blob - libavcodec/arm/dsputil_neon_s.S
9027353c4a67bbe4b287a2f3485eb51b9f459150
[frescor/ffmpeg.git] / libavcodec / arm / dsputil_neon_s.S
1 /*
2  * ARM NEON optimised DSP functions
3  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "asm.S"
23
24         preserve8                    @ not defined in this file; presumably a helper macro from asm.S - confirm
25         .fpu neon                    @ allow NEON instructions in this file
26         .text                        @ everything below goes into the code section
27
28         .macro pixels16 avg=0
        @ Body of a 16-byte-wide row copy, four rows per loop iteration.
        @ With avg set, each row is instead combined with the bytes already
        @ at the destination using a rounding halving add (vrhadd.u8).
        @   r0 = store address, advanced by r2 per row (stores carry a :128 hint)
        @   r1 = load address, advanced by r2 per row
        @   r2 = byte step between rows
        @   r3 = row counter, reduced by 4 per iteration; loop ends at zero
        @ Uses q0-q3, and additionally ip and q8-q11 when avg is set.
        @ Ends with bx lr, so the macro is a complete function body.
29 .if \avg
30         mov             ip,  r0                 @ second cursor for reading the destination rows
31 .endif
32 1:      vld1.64         {d0, d1},  [r1], r2
33         vld1.64         {d2, d3},  [r1], r2
34         vld1.64         {d4, d5},  [r1], r2
35         pld             [r1, r2, lsl #2]        @ prefetch further down the source
36         vld1.64         {d6, d7},  [r1], r2
37         pld             [r1]
38         pld             [r1, r2]
39         pld             [r1, r2, lsl #1]
40 .if \avg
        @ Read the four destination rows through ip and average them in.
41         vld1.64         {d16,d17}, [ip], r2
42         vrhadd.u8       q0,  q0,  q8
43         vld1.64         {d18,d19}, [ip], r2
44         vrhadd.u8       q1,  q1,  q9
45         vld1.64         {d20,d21}, [ip], r2
46         vrhadd.u8       q2,  q2,  q10
47         vld1.64         {d22,d23}, [ip], r2
48         vrhadd.u8       q3,  q3,  q11
49 .endif
50         subs            r3,  r3,  #4            @ sets the flags tested by bne; the stores below leave them alone
51         vst1.64         {d0, d1},  [r0,:128], r2
52         vst1.64         {d2, d3},  [r0,:128], r2
53         vst1.64         {d4, d5},  [r0,:128], r2
54         vst1.64         {d6, d7},  [r0,:128], r2
55         bne             1b
56         bx              lr
57         .endm
58
59         .macro pixels16_x2 vhadd=vrhadd.u8
        @ Body of a 16-byte-wide horizontal half-sample filter: every output
        @ byte is the halving add of a source byte and its right-hand
        @ neighbour. Two rows are produced per loop iteration.
        @   r0 = store address (:128 hint), r1 = load address,
        @   r2 = byte step between rows, r3 = row counter (reduced by 2).
        @ The vhadd argument selects the halving add instruction; the
        @ default vrhadd.u8 rounds. Uses q0-q3. Ends with bx lr.
60 1:      vld1.64         {d0-d2},   [r1], r2     @ 24 bytes: the row plus the bytes to its right
61         vld1.64         {d4-d6},   [r1], r2
62         pld             [r1]
63         pld             [r1, r2]
64         subs            r3,  r3,  #2
65         vext.8          q1,  q0,  q1,  #1       @ q1 = the same row shifted left by one byte
66         \vhadd          q0,  q0,  q1
67         vext.8          q3,  q2,  q3,  #1
68         \vhadd          q2,  q2,  q3
69         vst1.64         {d0, d1},  [r0,:128], r2
70         vst1.64         {d4, d5},  [r0,:128], r2
71         bne             1b
72         bx              lr
73         .endm
74
75         .macro pixels16_y2 vhadd=vrhadd.u8
        @ Body of a 16-byte-wide vertical half-sample filter: every output
        @ row is the halving add of two consecutive source rows. Two rows
        @ are produced per loop iteration.
        @   r0 = store address (:128 hint), r1 = load address,
        @   r2 = byte step between rows, r3 = row counter (reduced by 2).
        @ r1 and ip read alternating source rows, each stepping by
        @ lr = 2 * r2. lr is used as scratch, hence the push and the
        @ return through pop {pc}. Uses q0-q3.
76         push            {lr}
77         add             ip,  r1,  r2            @ ip starts one row below r1
78         lsl             lr,  r2,  #1            @ both cursors step two rows at a time
79         vld1.64         {d0, d1},  [r1], lr
80         vld1.64         {d2, d3},  [ip], lr
81 1:      subs            r3,  r3,  #2
82         \vhadd          q2,  q0,  q1            @ first output row from the row pair currently held
83         vld1.64         {d0, d1},  [r1],      lr
84         \vhadd          q3,  q0,  q1            @ second output row: q1 paired with the newly loaded q0
85         vld1.64         {d2, d3},  [ip],      lr @ loaded for the next iteration; unused if the loop exits
86         pld             [r1]
87         pld             [ip]
88         vst1.64         {d4, d5},  [r0,:128], r2
89         vst1.64         {d6, d7},  [r0,:128], r2
90         bne             1b
91         pop             {pc}
92         .endm
93
94         .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        @ Body of a 16-byte-wide 2x2 filter: every output byte is the sum of
        @ a source byte, its right-hand neighbour and the two bytes below
        @ them, narrowed with a right shift by 2. Two rows per iteration.
        @   r0 = store address (:128 hint), r1 = load address,
        @   r2 = byte step between rows, r3 = row counter (reduced by 2,
        @   loop continues while the result is greater than zero).
        @ The vshrn argument is the narrowing shift; the default
        @ vrshrn.u16 rounds. With no_rnd set, 1 is added to every 16-bit
        @ sum (q13) before the narrowing shift.
        @ Register roles inside the loop:
        @   q8 / q10  = horizontal pair sums (low / high 8 columns) of the row read via r1
        @   q9 / q11  = the same for the row read via ip
        @ r1 and ip read alternating rows and step by lr = 2 * r2.
        @ lr is scratch, hence push {lr} and the return via pop {pc}.
95         push            {lr}
96         lsl             lr,  r2,  #1
97         add             ip,  r1,  r2
98         vld1.64         {d0-d2},   [r1], lr
99         vld1.64         {d4-d6},   [ip], lr
100 .if \no_rnd
101         vmov.i16        q13, #1
102 .endif
103         pld             [r1]
104         pld             [ip]
105         vext.8          q1,  q0,  q1,  #1       @ rows shifted left by one byte
106         vext.8          q3,  q2,  q3,  #1
107         vaddl.u8        q8,  d0,  d2            @ widen to 16 bit while adding byte and neighbour
108         vaddl.u8        q10, d1,  d3
109         vaddl.u8        q9,  d4,  d6
110         vaddl.u8        q11, d5,  d7
111 1:      subs            r3,  r3,  #2
112         vld1.64         {d0-d2},   [r1], lr
113         vadd.u16        q12, q8,  q9            @ vertical add of the two held rows, low half
114         pld             [r1]
115 .if \no_rnd
116         vadd.u16        q12, q12, q13
117 .endif
118         vext.8          q15, q0,  q1,  #1
119         vadd.u16        q1 , q10, q11           @ vertical add, high half
120         \vshrn          d28, q12, #2
121 .if \no_rnd
122         vadd.u16        q1,  q1,  q13
123 .endif
124         \vshrn          d29, q1,  #2
125         vaddl.u8        q8,  d0,  d30           @ pair sums of the row just loaded via r1
126         vld1.64         {d2-d4},   [ip], lr
127         vaddl.u8        q10, d1,  d31
128         vst1.64         {d28,d29}, [r0,:128], r2
129         vadd.u16        q12, q8,  q9            @ second output row: new r1 row plus held ip row
130         pld             [ip]
131 .if \no_rnd
132         vadd.u16        q12, q12, q13
133 .endif
134         vext.8          q2,  q1,  q2,  #1
135         vadd.u16        q0,  q10, q11
136         \vshrn          d30, q12, #2
137 .if \no_rnd
138         vadd.u16        q0,  q0,  q13
139 .endif
140         \vshrn          d31, q0,  #2
141         vaddl.u8        q9,  d2,  d4            @ pair sums of the new ip row, for the next iteration
142         vaddl.u8        q11, d3,  d5
143         vst1.64         {d30,d31}, [r0,:128], r2
144         bgt             1b
145         pop             {pc}
146         .endm
147
148         .macro pixels8
        @ Body of an 8-byte-wide row copy, four rows per loop iteration.
        @   r0 = store address, advanced by r2 per row (stores carry a :64 hint)
        @   r1 = load address, advanced by r2 per row
        @   r2 = byte step between rows
        @   r3 = row counter, reduced by 4 per iteration; loop ends at zero
        @ Uses d0-d3. Ends with bx lr.
149 1:      vld1.64         {d0}, [r1], r2
150         vld1.64         {d1}, [r1], r2
151         vld1.64         {d2}, [r1], r2
152         pld             [r1, r2, lsl #2]        @ prefetch further down the source
153         vld1.64         {d3}, [r1], r2
154         pld             [r1]
155         pld             [r1, r2]
156         pld             [r1, r2, lsl #1]
157         subs            r3,  r3,  #4            @ flags are tested by bne after the stores
158         vst1.64         {d0}, [r0,:64], r2
159         vst1.64         {d1}, [r0,:64], r2
160         vst1.64         {d2}, [r0,:64], r2
161         vst1.64         {d3}, [r0,:64], r2
162         bne             1b
163         bx              lr
164         .endm
165
166         .macro pixels8_x2 vhadd=vrhadd.u8
        @ Body of an 8-byte-wide horizontal half-sample filter: every output
        @ byte is the halving add of a source byte and its right-hand
        @ neighbour. Two rows are produced per loop iteration.
        @   r0 = store address (:64 hint), r1 = load address,
        @   r2 = byte step between rows, r3 = row counter (reduced by 2).
        @ The vhadd argument selects the halving add; the default rounds.
        @ Uses q0-q1. Ends with bx lr.
167 1:      vld1.64         {d0, d1},  [r1], r2     @ 16 bytes: the row plus the bytes to its right
168         vext.8          d1,  d0,  d1,  #1       @ d1 = first row shifted left by one byte
169         vld1.64         {d2, d3},  [r1], r2
170         vext.8          d3,  d2,  d3,  #1       @ d3 = second row shifted left by one byte
171         pld             [r1]
172         pld             [r1, r2]
173         subs            r3,  r3,  #2
174         vswp            d1,  d2                 @ now q0 = both rows, q1 = both shifted rows
175         \vhadd          q0,  q0,  q1            @ one 128-bit operation filters both rows
176         vst1.64         {d0},      [r0,:64], r2
177         vst1.64         {d1},      [r0,:64], r2
178         bne             1b
179         bx              lr
180         .endm
181
182         .macro pixels8_y2 vhadd=vrhadd.u8
        @ Body of an 8-byte-wide vertical half-sample filter: every output
        @ row is the halving add of two consecutive source rows. Two rows
        @ are produced per loop iteration.
        @   r0 = store address (:64 hint), r1 = load address,
        @   r2 = byte step between rows, r3 = row counter (reduced by 2).
        @ r1 and ip read alternating source rows, each stepping by
        @ lr = 2 * r2. lr is scratch, hence push {lr} and pop {pc}.
        @ Uses d0, d1, d4, d5.
183         push            {lr}
184         add             ip,  r1,  r2            @ ip starts one row below r1
185         lsl             lr,  r2,  #1
186         vld1.64         {d0},      [r1], lr
187         vld1.64         {d1},      [ip], lr
188 1:      subs            r3,  r3,  #2
189         \vhadd          d4,  d0,  d1            @ first output row from the row pair currently held
190         vld1.64         {d0},      [r1],     lr
191         \vhadd          d5,  d0,  d1            @ second output row: d1 paired with the newly loaded d0
192         vld1.64         {d1},      [ip],     lr @ loaded for the next iteration; unused if the loop exits
193         pld             [r1]
194         pld             [ip]
195         vst1.64         {d4},      [r0,:64], r2
196         vst1.64         {d5},      [r0,:64], r2
197         bne             1b
198         pop             {pc}
199         .endm
200
201         .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        @ Body of an 8-byte-wide 2x2 filter: every output byte is the sum of
        @ a source byte, its right-hand neighbour and the two bytes below
        @ them, narrowed with a right shift by 2. Two rows per iteration.
        @   r0 = store address (:64 hint), r1 = load address,
        @   r2 = byte step between rows, r3 = row counter (reduced by 2,
        @   loop continues while the result is greater than zero).
        @ The vshrn argument is the narrowing shift; the default
        @ vrshrn.u16 rounds. With no_rnd set, 1 is added to every 16-bit
        @ sum (q11) before the narrowing shift.
        @ q8 holds the horizontal pair sums of the row read via r1, q9 those
        @ of the row read via ip; both cursors step by lr = 2 * r2.
        @ lr is scratch, hence push {lr} and the return via pop {pc}.
202         push            {lr}
203         lsl             lr,  r2,  #1
204         add             ip,  r1,  r2
205         vld1.64         {d0, d1},  [r1], lr
206         vld1.64         {d2, d3},  [ip], lr
207 .if \no_rnd
208         vmov.i16        q11, #1
209 .endif
210         pld             [r1]
211         pld             [ip]
212         vext.8          d4,  d0,  d1,  #1       @ rows shifted left by one byte
213         vext.8          d6,  d2,  d3,  #1
214         vaddl.u8        q8,  d0,  d4            @ widen to 16 bit while adding byte and neighbour
215         vaddl.u8        q9,  d2,  d6
216 1:      subs            r3,  r3,  #2
217         vld1.64         {d0, d1},  [r1], lr
218         pld             [r1]
219         vadd.u16        q10, q8,  q9            @ vertical add of the two held rows
220         vext.8          d4,  d0,  d1,  #1
221 .if \no_rnd
222         vadd.u16        q10, q10, q11
223 .endif
224         vaddl.u8        q8,  d0,  d4            @ pair sums of the row just loaded via r1
225         \vshrn          d5,  q10, #2
226         vld1.64         {d2, d3},  [ip], lr
227         vadd.u16        q10, q8,  q9            @ second output row: new r1 row plus held ip row
228         pld             [ip]
229 .if \no_rnd
230         vadd.u16        q10, q10, q11
231 .endif
232         vst1.64         {d5},      [r0,:64], r2
233         \vshrn          d7,  q10, #2
234         vext.8          d6,  d2,  d3,  #1
235         vaddl.u8        q9,  d2,  d6            @ pair sums of the new ip row, for the next iteration
236         vst1.64         {d7},      [r0,:64], r2
237         bgt             1b
238         pop             {pc}
239         .endm
240
241         .macro pixfunc pfx name suf rnd_op args:vararg
        @ Emits one exported function named ff_<pfx><name><suf>_neon whose
        @ body is the macro called <name>, invoked with rnd_op followed by
        @ any remaining arguments.
        @ NOTE(review): the function helper used below is not defined in
        @ this file; presumably it comes from asm.S - confirm.
242 function ff_\pfx\name\suf\()_neon, export=1
243         \name \rnd_op \args
244         .endfunc
245         .endm
246
247         .macro pixfunc2 pfx name args:vararg
        @ Emits two functions from the same body macro: first the default
        @ variant (no suffix, default macro arguments), then a second one
        @ built from args, which supplies the name suffix and the arguments
        @ handed on to the body macro.
248         pixfunc \pfx \name
249         pixfunc \pfx \name \args
250         .endm
251
252 function ff_put_h264_qpel16_mc00_neon, export=1
        @ Sets the row counter r3 to 16. There is no return instruction, so
        @ execution continues into the code assembled next, which is the
        @ 16-wide copy body emitted by the first pixfunc line below.
        @ NOTE(review): this relies on the function and .endfunc helpers
        @ placing nothing between the two bodies - confirm in asm.S.
253         mov   r3, #16
254         .endfunc
255
        @ 16-wide put functions. The plain pixfunc line emits
        @ ff_put_pixels16_neon. Each pixfunc2 line emits the default variant
        @ and a _no_rnd variant that uses the non-rounding instruction named
        @ on the line (the xy2 line also sets no_rnd to 1).
256         pixfunc  put_ pixels16
257         pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
258         pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
259         pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
260
261 function ff_avg_h264_qpel16_mc00_neon, export=1
        @ Sets the row counter r3 to 16. There is no return instruction, so
        @ execution continues into the code assembled next, which is the
        @ averaging 16-wide body emitted by the pixfunc line below.
        @ NOTE(review): this relies on the function and .endfunc helpers
        @ placing nothing between the two bodies - confirm in asm.S.
262         mov   r3, #16
263         .endfunc
264
        @ Emits ff_avg_pixels16_neon: the suffix is left empty and the value
        @ 1 lands in the avg argument of the pixels16 macro.
265         pixfunc  avg_ pixels16,, 1
266
267 function ff_put_h264_qpel8_mc00_neon, export=1
        @ Sets the row counter r3 to 8. There is no return instruction, so
        @ execution continues into the code assembled next, which is the
        @ 8-wide copy body emitted by the first pixfunc line below.
        @ NOTE(review): this relies on the function and .endfunc helpers
        @ placing nothing between the two bodies - confirm in asm.S.
268         mov   r3, #8
269         .endfunc
270
        @ 8-wide put functions. The plain pixfunc line emits
        @ ff_put_pixels8_neon. Each pixfunc2 line emits the default variant
        @ and a _no_rnd variant that uses the non-rounding instruction named
        @ on the line (the xy2 line also sets no_rnd to 1).
271         pixfunc  put_ pixels8
272         pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
273         pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
274         pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
275
276 function ff_float_to_int16_neon, export=1
        @ Converts 32-bit floats to 16-bit integers.
        @   r0 = output address (128-bit aligned stores, post-incremented)
        @   r1 = input address (128-bit aligned loads, post-incremented)
        @   r2 = element count
        @ Each float is converted to signed 32-bit fixed point with 16
        @ fraction bits (vcvt), then the narrowing shift by 16 keeps the upper
        @ half of every lane as the 16-bit result.
        @ The code is software pipelined: q8/q9 always hold eight converted
        @ values that have not been stored yet.
        @ NOTE(review): the control flow appears to assume that r2 is a
        @ non-zero multiple of 8 - confirm with the callers.
        @ Uses ip, q0-q3, q8, q9.
277         subs            r2,  r2,  #8            @ the first eight are loaded right here
278         vld1.64         {d0-d1},  [r1,:128]!
279         vcvt.s32.f32    q8,  q0,  #16
280         vld1.64         {d2-d3},  [r1,:128]!
281         vcvt.s32.f32    q9,  q1,  #16
282         beq             3f                      @ exactly eight elements: only the final store is left
283         bics            ip,  r2,  #15           @ ip = remaining count rounded down to a multiple of 16
284         beq             2f                      @ fewer than 16 remain: skip the main loop
        @ Main loop: stores 16 results per pass and leaves the next eight
        @ converted values pending in q8/q9.
285 1:      subs            ip,  ip,  #16
286         vshrn.s32       d4,  q8,  #16
287         vld1.64         {d0-d1},  [r1,:128]!
288         vcvt.s32.f32    q0,  q0,  #16
289         vshrn.s32       d5,  q9,  #16
290         vld1.64         {d2-d3},  [r1,:128]!
291         vcvt.s32.f32    q1,  q1,  #16
292         vshrn.s32       d6,  q0,  #16
293         vst1.64         {d4-d5},  [r0,:128]!
294         vshrn.s32       d7,  q1,  #16
295         vld1.64         {d16-d17},[r1,:128]!
296         vcvt.s32.f32    q8,  q8,  #16
297         vld1.64         {d18-d19},[r1,:128]!
298         vcvt.s32.f32    q9,  q9,  #16
299         vst1.64         {d6-d7},  [r0,:128]!
300         bne             1b
301         ands            r2,  r2,  #15           @ anything left beyond the pending eight?
302         beq             3f
        @ Tail: load and convert eight more, then store the pending eight and
        @ these eight.
303 2:      vld1.64         {d0-d1},  [r1,:128]!
304         vshrn.s32       d4,  q8,  #16
305         vcvt.s32.f32    q0,  q0,  #16
306         vld1.64         {d2-d3},  [r1,:128]!
307         vshrn.s32       d5,  q9,  #16
308         vcvt.s32.f32    q1,  q1,  #16
309         vshrn.s32       d6,  q0,  #16
310         vst1.64         {d4-d5},  [r0,:128]!
311         vshrn.s32       d7,  q1,  #16
312         vst1.64         {d6-d7},  [r0,:128]!
313         bx              lr
        @ Tail: only the pending eight in q8/q9 remain.
314 3:      vshrn.s32       d4,  q8,  #16
315         vshrn.s32       d5,  q9,  #16
316         vst1.64         {d4-d5},  [r0,:128]!
317         bx              lr
318         .endfunc
319
320 function ff_float_to_int16_interleave_neon, export=1
        @ Converts several float arrays to 16-bit integers and writes them
        @ interleaved, one 16-bit value per channel per output frame.
        @   r0 = output address
        @   r1 = address of a table of input pointers, one per channel
        @   r2 = element count per channel
        @   r3 = channel count
        @ Conversion is the same as in ff_float_to_int16_neon: vcvt to
        @ signed 32-bit fixed point with 16 fraction bits, of which only the
        @ upper 16 bits of every lane are written out. vsri.32 by 16 merges
        @ the upper halves of two channels into one 32-bit lane.
        @ Paths:
        @   fewer than 2 channels -> branch to ff_float_to_int16_neon
        @   exactly 2 channels    -> dedicated contiguous-store path
        @   otherwise             -> channels are consumed in groups of 4,
        @                            then 2, then 1, each group writing with
        @                            an output step of ip = 2 * r3 bytes
        @ NOTE(review): the loops appear to assume that r2 is a non-zero
        @ multiple of 8 - confirm with the callers.
321         cmp             r3, #2
322         ldrlt           r1, [r1]                @ single channel: fetch its input pointer
323         blt             ff_float_to_int16_neon  @ plain branch, so that routine returns to our caller
324         bne             4f                      @ more than two channels: generic path
325
        @ ---- exactly two channels: r3 and r1 become the two input cursors ----
326         ldr             r3, [r1]
327         ldr             r1, [r1, #4]
328
        @ Pipelined like ff_float_to_int16_neon: q8/q9 hold eight pending
        @ values of the first channel, q10/q11 those of the second.
329         subs            r2,  r2,  #8
330         vld1.64         {d0-d1},  [r3,:128]!
331         vcvt.s32.f32    q8,  q0,  #16
332         vld1.64         {d2-d3},  [r3,:128]!
333         vcvt.s32.f32    q9,  q1,  #16
334         vld1.64         {d20-d21},[r1,:128]!
335         vcvt.s32.f32    q10, q10, #16
336         vld1.64         {d22-d23},[r1,:128]!
337         vcvt.s32.f32    q11, q11, #16
338         beq             3f                      @ exactly eight elements per channel
339         bics            ip,  r2,  #15
340         beq             2f
        @ Main loop: 16 frames per pass.
341 1:      subs            ip,  ip,  #16
342         vld1.64         {d0-d1},  [r3,:128]!
343         vcvt.s32.f32    q0,  q0,  #16
344         vsri.32         q10, q8,  #16           @ lane = first channel in the low half, second in the high half
345         vld1.64         {d2-d3},  [r3,:128]!
346         vcvt.s32.f32    q1,  q1,  #16
347         vld1.64         {d24-d25},[r1,:128]!
348         vcvt.s32.f32    q12, q12, #16
349         vld1.64         {d26-d27},[r1,:128]!
350         vsri.32         q11, q9,  #16
351         vst1.64         {d20-d21},[r0,:128]!
352         vcvt.s32.f32    q13, q13, #16
353         vst1.64         {d22-d23},[r0,:128]!
354         vsri.32         q12, q0,  #16
355         vld1.64         {d16-d17},[r3,:128]!
356         vsri.32         q13, q1,  #16
357         vst1.64         {d24-d25},[r0,:128]!
358         vcvt.s32.f32    q8,  q8,  #16
359         vld1.64         {d18-d19},[r3,:128]!
360         vcvt.s32.f32    q9,  q9,  #16
361         vld1.64         {d20-d21},[r1,:128]!
362         vcvt.s32.f32    q10, q10, #16
363         vld1.64         {d22-d23},[r1,:128]!
364         vcvt.s32.f32    q11, q11, #16
365         vst1.64         {d26-d27},[r0,:128]!
366         bne             1b
367         ands            r2,  r2,  #15
368         beq             3f
        @ Tail: eight more frames beyond the pending eight.
369 2:      vsri.32         q10, q8,  #16
370         vld1.64         {d0-d1},  [r3,:128]!
371         vcvt.s32.f32    q0,  q0,  #16
372         vld1.64         {d2-d3},  [r3,:128]!
373         vcvt.s32.f32    q1,  q1,  #16
374         vld1.64         {d24-d25},[r1,:128]!
375         vcvt.s32.f32    q12, q12, #16
376         vsri.32         q11, q9,  #16
377         vld1.64         {d26-d27},[r1,:128]!
378         vcvt.s32.f32    q13, q13, #16
379         vst1.64         {d20-d21},[r0,:128]!
380         vsri.32         q12, q0,  #16
381         vst1.64         {d22-d23},[r0,:128]!
382         vsri.32         q13, q1,  #16
383         vst1.64         {d24-d27},[r0,:128]!
384         bx              lr
        @ Tail: only the pending eight frames remain.
385 3:      vsri.32         q10, q8,  #16
386         vsri.32         q11, q9,  #16
387         vst1.64         {d20-d23},[r0,:128]!
388         bx              lr
389
        @ ---- generic path for more than two channels ----
        @ ip = bytes per output frame (two bytes per channel). r0 is the
        @ start of the current channel group inside the first frame.
390 4:      push            {r4-r8,lr}
391         cmp             r3,  #4
392         lsl             ip,  r3,  #1
393         blt             4f                      @ fewer than four channels: go to the 2 channel group
394
395         @ 4 channels
        @ r4-r7 = input cursors of the next four channels, lr = elements
        @ left, r8 = output cursor. Every 64-bit store writes the four
        @ 16-bit values of one frame, then r8 moves on by one frame (ip).
396 5:      ldmia           r1!, {r4-r7}
397         mov             lr,  r2
398         mov             r8,  r0
399         vld1.64         {d16-d17},[r4,:128]!
400         vcvt.s32.f32    q8,  q8,  #16
401         vld1.64         {d18-d19},[r5,:128]!
402         vcvt.s32.f32    q9,  q9,  #16
403         vld1.64         {d20-d21},[r6,:128]!
404         vcvt.s32.f32    q10, q10, #16
405         vld1.64         {d22-d23},[r7,:128]!
406         vcvt.s32.f32    q11, q11, #16
407 6:      subs            lr,  lr,  #8            @ eight frames per pass; flags are tested by beq below
408         vld1.64         {d0-d1},  [r4,:128]!
409         vcvt.s32.f32    q0,  q0,  #16
410         vsri.32         q9,  q8,  #16           @ q9 = first and second channel paired per frame
411         vld1.64         {d2-d3},  [r5,:128]!
412         vcvt.s32.f32    q1,  q1,  #16
413         vsri.32         q11, q10, #16           @ q11 = third and fourth channel paired per frame
414         vld1.64         {d4-d5},  [r6,:128]!
415         vcvt.s32.f32    q2,  q2,  #16
416         vzip.32         d18, d22                @ bring both pairs of the same frame into one d register
417         vld1.64         {d6-d7},  [r7,:128]!
418         vcvt.s32.f32    q3,  q3,  #16
419         vzip.32         d19, d23
420         vst1.64         {d18},    [r8], ip
421         vsri.32         q1,  q0,  #16
422         vst1.64         {d22},    [r8], ip
423         vsri.32         q3,  q2,  #16
424         vst1.64         {d19},    [r8], ip
425         vzip.32         d2,  d6
426         vst1.64         {d23},    [r8], ip
427         vzip.32         d3,  d7
428         beq             7f                      @ no input left: just store the last four frames
429         vld1.64         {d16-d17},[r4,:128]!
430         vcvt.s32.f32    q8,  q8,  #16
431         vst1.64         {d2},     [r8], ip
432         vld1.64         {d18-d19},[r5,:128]!
433         vcvt.s32.f32    q9,  q9,  #16
434         vst1.64         {d6},     [r8], ip
435         vld1.64         {d20-d21},[r6,:128]!
436         vcvt.s32.f32    q10, q10, #16
437         vst1.64         {d3},     [r8], ip
438         vld1.64         {d22-d23},[r7,:128]!
439         vcvt.s32.f32    q11, q11, #16
440         vst1.64         {d7},     [r8], ip
441         b               6b
442 7:      vst1.64         {d2},     [r8], ip
443         vst1.64         {d6},     [r8], ip
444         vst1.64         {d3},     [r8], ip
445         vst1.64         {d7},     [r8], ip
446         subs            r3,  r3,  #4            @ four channels done
447         popeq           {r4-r8,pc}
448         cmp             r3,  #4
449         add             r0,  r0,  #8            @ next group starts 4 * 2 bytes further into the frame
450         bge             5b
451
452         @ 2 channels
        @ r4/r5 = input cursors, lr = elements left, r8 = output cursor.
        @ Every 32-bit lane store writes the two 16-bit values of one frame.
        @ If bit 3 of the count is set, eight frames are handled first so
        @ that the main loop can work in steps of 16.
453 4:      cmp             r3,  #2
454         blt             4f                      @ only one channel left
455         ldmia           r1!, {r4-r5}
456         mov             lr,  r2
457         mov             r8,  r0
458         tst             lr,  #8                 @ result is consumed by the beq after the loads
459         vld1.64         {d16-d17},[r4,:128]!
460         vcvt.s32.f32    q8,  q8,  #16
461         vld1.64         {d18-d19},[r5,:128]!
462         vcvt.s32.f32    q9,  q9,  #16
463         vld1.64         {d20-d21},[r4,:128]!
464         vcvt.s32.f32    q10, q10, #16
465         vld1.64         {d22-d23},[r5,:128]!
466         vcvt.s32.f32    q11, q11, #16
467         beq             6f                      @ bit 3 clear: straight to the 16 frame loop
468         subs            lr,  lr,  #8
469         beq             7f                      @ the count was exactly eight
470         vsri.32         d18, d16, #16
471         vsri.32         d19, d17, #16
472         vld1.64         {d16-d17},[r4,:128]!
473         vcvt.s32.f32    q8,  q8,  #16
474         vst1.32         {d18[0]}, [r8], ip
475         vsri.32         d22, d20, #16
476         vst1.32         {d18[1]}, [r8], ip
477         vsri.32         d23, d21, #16
478         vst1.32         {d19[0]}, [r8], ip
479         vst1.32         {d19[1]}, [r8], ip
480         vld1.64         {d18-d19},[r5,:128]!
481         vcvt.s32.f32    q9,  q9,  #16
482         vst1.32         {d22[0]}, [r8], ip
483         vst1.32         {d22[1]}, [r8], ip
484         vld1.64         {d20-d21},[r4,:128]!
485         vcvt.s32.f32    q10, q10, #16
486         vst1.32         {d23[0]}, [r8], ip
487         vst1.32         {d23[1]}, [r8], ip
488         vld1.64         {d22-d23},[r5,:128]!
489         vcvt.s32.f32    q11, q11, #16
        @ Main loop: 16 frames per pass. q8-q11 hold eight pending frames on
        @ entry, q0-q3 receive the following eight.
490 6:      subs            lr,  lr,  #16
491         vld1.64         {d0-d1},  [r4,:128]!
492         vcvt.s32.f32    q0,  q0,  #16
493         vsri.32         d18, d16, #16
494         vld1.64         {d2-d3},  [r5,:128]!
495         vcvt.s32.f32    q1,  q1,  #16
496         vsri.32         d19, d17, #16
497         vld1.64         {d4-d5},  [r4,:128]!
498         vcvt.s32.f32    q2,  q2,  #16
499         vld1.64         {d6-d7},  [r5,:128]!
500         vcvt.s32.f32    q3,  q3,  #16
501         vst1.32         {d18[0]}, [r8], ip
502         vsri.32         d22, d20, #16
503         vst1.32         {d18[1]}, [r8], ip
504         vsri.32         d23, d21, #16
505         vst1.32         {d19[0]}, [r8], ip
506         vsri.32         d2,  d0,  #16
507         vst1.32         {d19[1]}, [r8], ip
508         vsri.32         d3,  d1,  #16
509         vst1.32         {d22[0]}, [r8], ip
510         vsri.32         d6,  d4,  #16
511         vst1.32         {d22[1]}, [r8], ip
512         vsri.32         d7,  d5,  #16
513         vst1.32         {d23[0]}, [r8], ip
514         vst1.32         {d23[1]}, [r8], ip
515         beq             6f                      @ no input left: store the last eight frames below
516         vld1.64         {d16-d17},[r4,:128]!
517         vcvt.s32.f32    q8,  q8,  #16
518         vst1.32         {d2[0]},  [r8], ip
519         vst1.32         {d2[1]},  [r8], ip
520         vld1.64         {d18-d19},[r5,:128]!
521         vcvt.s32.f32    q9,  q9,  #16
522         vst1.32         {d3[0]},  [r8], ip
523         vst1.32         {d3[1]},  [r8], ip
524         vld1.64         {d20-d21},[r4,:128]!
525         vcvt.s32.f32    q10, q10, #16
526         vst1.32         {d6[0]},  [r8], ip
527         vst1.32         {d6[1]},  [r8], ip
528         vld1.64         {d22-d23},[r5,:128]!
529         vcvt.s32.f32    q11, q11, #16
530         vst1.32         {d7[0]},  [r8], ip
531         vst1.32         {d7[1]},  [r8], ip
532         bgt             6b
        @ Loop exit: store the final eight frames held in d2, d3, d6, d7.
533 6:      vst1.32         {d2[0]},  [r8], ip
534         vst1.32         {d2[1]},  [r8], ip
535         vst1.32         {d3[0]},  [r8], ip
536         vst1.32         {d3[1]},  [r8], ip
537         vst1.32         {d6[0]},  [r8], ip
538         vst1.32         {d6[1]},  [r8], ip
539         vst1.32         {d7[0]},  [r8], ip
540         vst1.32         {d7[1]},  [r8], ip
541         b               8f
        @ Count was exactly eight: pair up and store the eight loaded frames.
542 7:      vsri.32         d18, d16, #16
543         vsri.32         d19, d17, #16
544         vst1.32         {d18[0]}, [r8], ip
545         vsri.32         d22, d20, #16
546         vst1.32         {d18[1]}, [r8], ip
547         vsri.32         d23, d21, #16
548         vst1.32         {d19[0]}, [r8], ip
549         vst1.32         {d19[1]}, [r8], ip
550         vst1.32         {d22[0]}, [r8], ip
551         vst1.32         {d22[1]}, [r8], ip
552         vst1.32         {d23[0]}, [r8], ip
553         vst1.32         {d23[1]}, [r8], ip
554 8:      subs            r3,  r3,  #2            @ two channels done
555         add             r0,  r0,  #4            @ add leaves the flags alone, so popeq still sees the subs result
556         popeq           {r4-r8,pc}
557
558         @ 1 channel
        @ r4 = input cursor, lr = elements left, r5 = output cursor.
        @ 16-bit lanes 1 and 3 of each d register are the upper halves of
        @ its two 32-bit lanes, that is the converted results.
        @ If bit 3 of the count is set, eight elements are written first
        @ (label 8) before entering the 16 element loop.
559 4:      ldr             r4,  [r1],#4
560         tst             r2,  #8
561         mov             lr,  r2
562         mov             r5,  r0
563         vld1.64         {d0-d1},  [r4,:128]!
564         vcvt.s32.f32    q0,  q0,  #16
565         vld1.64         {d2-d3},  [r4,:128]!
566         vcvt.s32.f32    q1,  q1,  #16
567         bne             8f
568 6:      subs            lr,  lr,  #16           @ flags are tested by beq and bgt below
569         vld1.64         {d4-d5},  [r4,:128]!
570         vcvt.s32.f32    q2,  q2,  #16
571         vld1.64         {d6-d7},  [r4,:128]!
572         vcvt.s32.f32    q3,  q3,  #16
573         vst1.16         {d0[1]},  [r5,:16], ip
574         vst1.16         {d0[3]},  [r5,:16], ip
575         vst1.16         {d1[1]},  [r5,:16], ip
576         vst1.16         {d1[3]},  [r5,:16], ip
577         vst1.16         {d2[1]},  [r5,:16], ip
578         vst1.16         {d2[3]},  [r5,:16], ip
579         vst1.16         {d3[1]},  [r5,:16], ip
580         vst1.16         {d3[3]},  [r5,:16], ip
581         beq             7f                      @ no input left: skip loading ahead
582         vld1.64         {d0-d1},  [r4,:128]!
583         vcvt.s32.f32    q0,  q0,  #16
584         vld1.64         {d2-d3},  [r4,:128]!
585         vcvt.s32.f32    q1,  q1,  #16
586 7:      vst1.16         {d4[1]},  [r5,:16], ip
587         vst1.16         {d4[3]},  [r5,:16], ip
588         vst1.16         {d5[1]},  [r5,:16], ip
589         vst1.16         {d5[3]},  [r5,:16], ip
590         vst1.16         {d6[1]},  [r5,:16], ip
591         vst1.16         {d6[3]},  [r5,:16], ip
592         vst1.16         {d7[1]},  [r5,:16], ip
593         vst1.16         {d7[3]},  [r5,:16], ip
594         bgt             6b
595         pop             {r4-r8,pc}
        @ Leading block of eight for counts with bit 3 set.
596 8:      subs            lr,  lr,  #8
597         vst1.16         {d0[1]},  [r5,:16], ip
598         vst1.16         {d0[3]},  [r5,:16], ip
599         vst1.16         {d1[1]},  [r5,:16], ip
600         vst1.16         {d1[3]},  [r5,:16], ip
601         vst1.16         {d2[1]},  [r5,:16], ip
602         vst1.16         {d2[3]},  [r5,:16], ip
603         vst1.16         {d3[1]},  [r5,:16], ip
604         vst1.16         {d3[3]},  [r5,:16], ip
605         popeq           {r4-r8,pc}              @ the count was exactly eight
606         vld1.64         {d0-d1},  [r4,:128]!
607         vcvt.s32.f32    q0,  q0,  #16
608         vld1.64         {d2-d3},  [r4,:128]!
609         vcvt.s32.f32    q1,  q1,  #16
610         b               6b
611         .endfunc
612
613 function ff_vector_fmul_neon, export=1
        @ Element-wise multiplication of two float arrays, written back over
        @ the first one.
        @   r0 = address of the first array; it is both read and overwritten
        @   r1 = address of the second array (read only)
        @   r2 = element count
        @ r0 is the read cursor and r3, a copy of the initial r0, is the
        @ write cursor that trails behind it. All accesses carry a 128-bit
        @ alignment hint. q8/q9 always hold eight products that have not
        @ been stored yet.
        @ NOTE(review): the control flow appears to assume that r2 is a
        @ non-zero multiple of 8 - confirm with the callers.
        @ Uses ip, q0-q3, q8-q11.
614         mov             r3,  r0
615         subs            r2,  r2,  #8            @ the first eight are handled right here
616         vld1.64         {d0-d3},  [r0,:128]!
617         vld1.64         {d4-d7},  [r1,:128]!
618         vmul.f32        q8,  q0,  q2
619         vmul.f32        q9,  q1,  q3
620         beq             3f                      @ exactly eight elements: only the final store is left
621         bics            ip,  r2,  #15           @ ip = remaining count rounded down to a multiple of 16
622         beq             2f
        @ Main loop: stores 16 products per pass and leaves eight pending.
623 1:      subs            ip,  ip,  #16
624         vld1.64         {d0-d1},  [r0,:128]!
625         vld1.64         {d4-d5},  [r1,:128]!
626         vmul.f32        q10, q0,  q2
627         vld1.64         {d2-d3},  [r0,:128]!
628         vld1.64         {d6-d7},  [r1,:128]!
629         vmul.f32        q11, q1,  q3
630         vst1.64         {d16-d19},[r3,:128]!
631         vld1.64         {d0-d1},  [r0,:128]!
632         vld1.64         {d4-d5},  [r1,:128]!
633         vmul.f32        q8,  q0,  q2
634         vld1.64         {d2-d3},  [r0,:128]!
635         vld1.64         {d6-d7},  [r1,:128]!
636         vmul.f32        q9,  q1,  q3
637         vst1.64         {d20-d23},[r3,:128]!
638         bne             1b
639         ands            r2,  r2,  #15           @ anything left beyond the pending eight?
640         beq             3f
        @ Tail: store the pending eight while computing eight more, then fall
        @ into the final store.
641 2:      vld1.64         {d0-d1},  [r0,:128]!
642         vld1.64         {d4-d5},  [r1,:128]!
643         vst1.64         {d16-d17},[r3,:128]!
644         vmul.f32        q8,  q0,  q2
645         vld1.64         {d2-d3},  [r0,:128]!
646         vld1.64         {d6-d7},  [r1,:128]!
647         vst1.64         {d18-d19},[r3,:128]!
648         vmul.f32        q9,  q1,  q3
649 3:      vst1.64         {d16-d19},[r3,:128]!
650         bx              lr
651         .endfunc