/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text

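@ pixels16: copy a 16-byte-wide block.  r0 = dst, r1 = src, r2 = line
@ stride, r3 = height (assumed a multiple of 4, since four rows are
@ handled per iteration).  With avg=1 the source is instead combined
@ with the existing destination by rounded averaging (vrhadd.u8),
@ giving the avg_pixels16 operation.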
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

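@ pixels16_x2: half-pel interpolation in x.  Each output byte is the
@ average of two horizontally adjacent source bytes, using the
@ rounding vrhadd.u8 by default; the _no_rnd variant passes the
@ truncating vhadd.u8 instead.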
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

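@ pixels16_y2: half-pel interpolation in y, averaging each source row
@ with the row below it.  Two output rows per iteration; rounding is
@ selected through the vhadd argument exactly as in pixels16_x2.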
        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

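@ pixels16_xy2: half-pel interpolation in both x and y.  Each output
@ byte is (a+b+c+d+2)>>2 over a 2x2 source neighbourhood: byte pairs
@ are widened and summed (vaddl.u8, vadd.u16), then narrowed with the
@ rounding shift vrshrn.u16.  The no_rnd variant adds a bias of 1 and
@ narrows with the truncating vshrn.u16, yielding (a+b+c+d+1)>>2.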
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
        .endm

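@ The pixels8* macros below are the 8-byte-wide counterparts of the
@ pixels16* macros above, working in d registers with 64-bit aligned
@ stores.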
        .macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0},      [r1], r2
        vld1.64         {d1},      [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0},      [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1},      [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4},      [r0,:64], r2
        vst1.64         {d5},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        bx              lr
        .endm

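@ pixfunc expands one of the macros above into an exported function
@ named ff_<pfx><name><suf>_neon; pixfunc2 emits both the default
@ (rounding) version and the _no_rnd variant in one invocation.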
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
        .endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

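@ The h264 qpel mc00 (full-pel) entry points only set the block height
@ in r3 and then fall through, deliberately without returning, into
@ the put/avg function emitted immediately below them.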
function ff_put_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov   r3, #8
        .endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

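@ ff_put_pixels_clamped_neon: r0 = 8x8 block of int16 coefficients,
@ r1 = destination pixels, r2 = line stride.  Each coefficient is
@ narrowed to a byte with unsigned saturation (vqmovun.s16); roughly,
@ in C: pixels[i*line_size + j] = av_clip_uint8(block[i*8 + j]).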
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
        .endfunc

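@ ff_put_signed_pixels_clamped_neon: as above, but the coefficients
@ are saturated to [-128,127] (vqmovn.s16) and rebiased by +128,
@ which is equivalent to av_clip_uint8(block[i*8 + j] + 128).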
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
        .endfunc

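@ ff_add_pixels_clamped_neon: adds each int16 coefficient in r0 to
@ the byte already present in the destination (vaddw.u8) and stores
@ the sum with unsigned saturation (vqmovun.s16), over an 8x8 block
@ with line stride r2.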
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
        .endfunc

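@ ff_float_to_int16_neon: r0 = int16 dst, r1 = float src, r2 = length.
@ Floats are converted to 16.16 fixed point (vcvt.s32.f32 ..., #16)
@ and the integer part is extracted with a narrowing 16-bit right
@ shift (vshrn.s32).  The length is assumed to be a multiple of 8;
@ the main loop converts 16 samples per iteration, with tail blocks
@ for the remainder.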
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
        .endfunc

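@ ff_float_to_int16_interleave_neon: r0 = int16 dst, r1 = array of
@ per-channel float src pointers, r2 = length, r3 = channel count.
@ One channel tail-calls ff_float_to_int16_neon.  For two channels,
@ vsri.32 merges a pair of 16.16 fixed-point samples into one 32-bit
@ word (first channel in the low half), so whole words of interleaved
@ output can be stored at once.  Larger counts are processed four,
@ then two, then one channel at a time, storing single elements with
@ stride ip = 2*channels bytes.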
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
        .endfunc

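@ ff_vector_fmul_neon: elementwise single-precision multiply,
@ dst[i] *= src[i], with r0 = dst, r1 = src, r2 = length (assumed a
@ multiple of 8; the unrolled main loop covers 16 floats per
@ iteration).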
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
        .endfunc

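@ ff_vector_fmul_window_neon: windowed overlap-add as used in MDCT
@ reconstruction.  r0 = dst, r1/r2 = the two input vectors, r3 = the
@ 2*len-point window, with the bias (broadcast into q8 from the
@ stack) added to every output and len also taken from the stack.
@ The buffers are walked from both ends simultaneously (r5 = -16 is
@ the descending stride), producing four samples per iteration at
@ each end of dst.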
function ff_vector_fmul_window_neon, export=1
        vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
        ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
        .endfunc