2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 //#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
/* Abort when ptr is NOT 16-byte aligned.  The previous form,
 * assert(((unsigned long)ptr&0x0000000F)), was inverted: it fired
 * precisely when the pointer WAS aligned.  Negate the test so the
 * assertion trips only on a misaligned pointer (low 4 bits set). */
#define ASSERT_ALIGNED(ptr) assert(!(((unsigned long)ptr)&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
28 /* this code assumes that stride % 16 == 0 */
/*
 * NOTE(review): one output row of the full bilinear 8-pixel chroma MC
 * kernel.  Widens the two second-row samples to 16 bit, accumulates
 *   psum = A*src00 + B*src01 + C*src10 + D*src11 + 32,
 * shifts right by 6, packs back to bytes, merges the 8 computed bytes
 * with the untouched half of the 16-byte dst vector through fperm
 * (OP_U8_ALTIVEC selects put vs. avg), and stores the full vector.
 * Expects vA..vD, v32ss, v6us, zero_u8v, fperm, dst and OP_U8_ALTIVEC
 * in scope at the expansion site.  Some continuation lines of this
 * macro (e.g. the per-row src/dst pointer advance and the reuse of the
 * widened row for the next iteration) appear to be missing from this
 * view of the file -- do not assume the macro body is complete here.
 */
30 #define CHROMA_MC8_ALTIVEC_CORE \
31 vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
32 vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
34 psum = vec_mladd(vA, vsrc0ssH, v32ss);\
35 psum = vec_mladd(vB, vsrc1ssH, psum);\
36 psum = vec_mladd(vC, vsrc2ssH, psum);\
37 psum = vec_mladd(vD, vsrc3ssH, psum);\
38 psum = vec_sr(psum, v6us);\
40 vdst = vec_ld(0, dst);\
41 ppsum = (vec_u8)vec_pack(psum, psum);\
42 vfdst = vec_perm(vdst, ppsum, fperm);\
44 OP_U8_ALTIVEC(fsum, vfdst, vdst);\
46 vec_st(fsum, 0, dst);\
/*
 * NOTE(review): one output row of the degenerate two-tap chroma kernel,
 * used when x == 0 or y == 0 so only two of the four bilinear weights
 * are non-zero:
 *   psum = (A*s0 + E*s1 + 32) >> 6, with vE = vB + vC built at the call
 * site.  Same pack / fperm-merge / OP_U8_ALTIVEC / store tail as
 * CHROMA_MC8_ALTIVEC_CORE.  Continuation lines (the per-row pointer
 * advance) appear to be missing from this view of the file.
 */
54 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
56 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
57 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
59 psum = vec_mladd(vA, vsrc0ssH, v32ss);\
60 psum = vec_mladd(vE, vsrc1ssH, psum);\
61 psum = vec_sr(psum, v6us);\
63 vdst = vec_ld(0, dst);\
64 ppsum = (vec_u8)vec_pack(psum, psum);\
65 vfdst = vec_perm(vdst, ppsum, fperm);\
67 OP_U8_ALTIVEC(fsum, vfdst, vdst);\
69 vec_st(fsum, 0, dst);\
/*
 * NOTE(review): 8-wide H.264 chroma motion compensation with bilinear
 * interpolation at fractional position (x, y); h rows are processed.
 * Per pixel: (A*s00 + B*s01 + C*s10 + D*s11 + 32) >> 6.  The ABCD
 * weight initializer (presumably A=(8-x)(8-y), B=x(8-y), C=(8-x)y,
 * D=xy -- TODO confirm, its lines are not visible in this fragment),
 * several else keywords, and the closing braces are missing from this
 * view of the file.  Assumes stride % 16 == 0 (see file comment).
 */
74 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
75 int stride, int h, int x, int y) {
76 POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
77 DECLARE_ALIGNED_16(signed int, ABCD[4]) =
/* splat each 16-bit weight (odd halves of the 32-bit ABCD entries,
 * big-endian layout) into its own vector */
84 const vec_s32 vABCD = vec_ld(0, ABCD);
85 const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
86 const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
87 const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
88 const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
/* v32ss = rounding constant 32 (1<<5); v6us = final shift count 6 */
90 const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
91 const vec_u16 v6us = vec_splat_u16(6);
/* does the 9-byte source window cross into a second 16-byte line? */
92 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
93 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
95 vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
96 vec_u8 vsrc0uc, vsrc1uc;
97 vec_s16 vsrc0ssH, vsrc1ssH;
98 vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
99 vec_s16 vsrc2ssH, vsrc3ssH, psum;
100 vec_u8 vdst, ppsum, vfdst, fsum;
102 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
/* fperm merges the 8 computed bytes with the 8 untouched bytes of the
 * 16-byte dst vector; which half is kept depends on dst alignment
 * (NOTE(review): the else keyword of this branch is missing here) */
104 if (((unsigned long)dst) % 16 == 0) {
105 fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
106 0x14, 0x15, 0x16, 0x17,
107 0x08, 0x09, 0x0A, 0x0B,
108 0x0C, 0x0D, 0x0E, 0x0F};
110 fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
111 0x04, 0x05, 0x06, 0x07,
112 0x18, 0x19, 0x1A, 0x1B,
113 0x1C, 0x1D, 0x1E, 0x1F};
/* load and align the first source row: vsrc0uc = pixels 0..15 starting
 * at src, vsrc1uc = the same shifted by one byte (src+1) */
116 vsrcAuc = vec_ld(0, src);
119 vsrcBuc = vec_ld(16, src);
120 vsrcperm0 = vec_lvsl(0, src);
121 vsrcperm1 = vec_lvsl(1, src);
123 vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
127 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
/* widen the first row's samples (s00 / s01 taps) to 16 bit */
129 vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
130 vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
/* full bilinear path -- all four taps used; one load per row when the
 * window fits a single 16-byte line, two loads otherwise */
133 if (!loadSecond) {// -> !reallyBadAlign
134 for (i = 0 ; i < h ; i++) {
135 vsrcCuc = vec_ld(stride + 0, src);
136 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
137 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
139 CHROMA_MC8_ALTIVEC_CORE
143 for (i = 0 ; i < h ; i++) {
144 vsrcCuc = vec_ld(stride + 0, src);
145 vsrcDuc = vec_ld(stride + 16, src);
146 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
150 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
152 CHROMA_MC8_ALTIVEC_CORE
/* degenerate cases: x == 0 or y == 0, reduced to two taps vA and
 * vE = vB + vC (the enclosing else for this region is not visible) */
156 const vec_s16 vE = vec_add(vB, vC);
157 if (ABCD[2]) { // x == 0 B == 0
158 if (!loadSecond) {// -> !reallyBadAlign
159 for (i = 0 ; i < h ; i++) {
160 vsrcCuc = vec_ld(stride + 0, src);
161 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
162 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
168 for (i = 0 ; i < h ; i++) {
169 vsrcCuc = vec_ld(stride + 0, src);
170 vsrcDuc = vec_ld(stride + 15, src);
171 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
172 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
177 } else { // y == 0 C == 0
178 if (!loadSecond) {// -> !reallyBadAlign
179 for (i = 0 ; i < h ; i++) {
180 vsrcCuc = vec_ld(0, src);
181 vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
182 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
184 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
188 for (i = 0 ; i < h ; i++) {
189 vsrcCuc = vec_ld(0, src);
190 vsrcDuc = vec_ld(15, src);
191 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
195 vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
197 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
202 POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
205 #undef CHROMA_MC8_ALTIVEC_CORE
207 /* this code assumes stride % 16 == 0 */
/*
 * NOTE(review): 16-wide horizontal 6-tap H.264 luma half-pel filter.
 * For each of 16 rows, each output pixel is
 *   clip(((p0+p1)*20 - (m1+p2)*5 + (m2+p3) + 16) >> 5)
 * where m2..p3 are the six horizontal taps around the pixel.  The
 * align-based dispatch below trades vec_perm sources (srcR1/srcR2 vs.
 * an extra srcR3 load) depending on where the 21-byte window falls in
 * the 16-byte lines; its case labels / else-if lines, the per-row
 * dst/src pointer advance and the closing braces are missing from this
 * view of the file.
 */
208 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
209 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
/* alignment-compensating permute vectors for the six tap positions */
213 const vec_u8 permM2 = vec_lvsl(-2, src);
214 const vec_u8 permM1 = vec_lvsl(-1, src);
215 const vec_u8 permP0 = vec_lvsl(+0, src);
216 const vec_u8 permP1 = vec_lvsl(+1, src);
217 const vec_u8 permP2 = vec_lvsl(+2, src);
218 const vec_u8 permP3 = vec_lvsl(+3, src);
/* filter constants: 5, 20 = 5<<2, rounding 16 = 1<<4, shift 5 */
219 const vec_s16 v5ss = vec_splat_s16(5);
220 const vec_u16 v5us = vec_splat_u16(5);
221 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
222 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
224 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
226 register int align = ((((unsigned long)src) - 2) % 16);
228 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
229 srcP2A, srcP2B, srcP3A, srcP3B,
230 srcM1A, srcM1B, srcM2A, srcM2B,
231 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
232 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
233 psumA, psumB, sumA, sumB;
235 vec_u8 sum, vdst, fsum;
237 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
/* one iteration per output row; the align dispatch selecting between
 * the perm groups below is not visible in this fragment */
239 for (i = 0 ; i < 16 ; i ++) {
240 vec_u8 srcR1 = vec_ld(-2, src);
241 vec_u8 srcR2 = vec_ld(14, src);
245 srcM2 = vec_perm(srcR1, srcR2, permM2);
246 srcM1 = vec_perm(srcR1, srcR2, permM1);
247 srcP0 = vec_perm(srcR1, srcR2, permP0);
248 srcP1 = vec_perm(srcR1, srcR2, permP1);
249 srcP2 = vec_perm(srcR1, srcR2, permP2);
250 srcP3 = vec_perm(srcR1, srcR2, permP3);
253 srcM2 = vec_perm(srcR1, srcR2, permM2);
254 srcM1 = vec_perm(srcR1, srcR2, permM1);
255 srcP0 = vec_perm(srcR1, srcR2, permP0);
256 srcP1 = vec_perm(srcR1, srcR2, permP1);
257 srcP2 = vec_perm(srcR1, srcR2, permP2);
261 vec_u8 srcR3 = vec_ld(30, src);
262 srcM2 = vec_perm(srcR1, srcR2, permM2);
263 srcM1 = vec_perm(srcR1, srcR2, permM1);
264 srcP0 = vec_perm(srcR1, srcR2, permP0);
265 srcP1 = vec_perm(srcR1, srcR2, permP1);
267 srcP3 = vec_perm(srcR2, srcR3, permP3);
270 vec_u8 srcR3 = vec_ld(30, src);
271 srcM2 = vec_perm(srcR1, srcR2, permM2);
272 srcM1 = vec_perm(srcR1, srcR2, permM1);
273 srcP0 = vec_perm(srcR1, srcR2, permP0);
275 srcP2 = vec_perm(srcR2, srcR3, permP2);
276 srcP3 = vec_perm(srcR2, srcR3, permP3);
279 vec_u8 srcR3 = vec_ld(30, src);
280 srcM2 = vec_perm(srcR1, srcR2, permM2);
281 srcM1 = vec_perm(srcR1, srcR2, permM1);
283 srcP1 = vec_perm(srcR2, srcR3, permP1);
284 srcP2 = vec_perm(srcR2, srcR3, permP2);
285 srcP3 = vec_perm(srcR2, srcR3, permP3);
288 vec_u8 srcR3 = vec_ld(30, src);
289 srcM2 = vec_perm(srcR1, srcR2, permM2);
291 srcP0 = vec_perm(srcR2, srcR3, permP0);
292 srcP1 = vec_perm(srcR2, srcR3, permP1);
293 srcP2 = vec_perm(srcR2, srcR3, permP2);
294 srcP3 = vec_perm(srcR2, srcR3, permP3);
/* widen all six taps to 16-bit: A = low 8 pixels, B = high 8 pixels */
298 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
299 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
300 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
301 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
303 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
304 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
305 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
306 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
308 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
309 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
310 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
311 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
/* the three symmetric tap pair sums: (p0+p1), (m1+p2), (m2+p3) */
313 sum1A = vec_adds(srcP0A, srcP1A);
314 sum1B = vec_adds(srcP0B, srcP1B);
315 sum2A = vec_adds(srcM1A, srcP2A);
316 sum2B = vec_adds(srcM1B, srcP2B);
317 sum3A = vec_adds(srcM2A, srcP3A);
318 sum3B = vec_adds(srcM2B, srcP3B);
/* sum1*20 + 16 + sum3 - sum2*5, then >> 5 and saturating pack */
320 pp1A = vec_mladd(sum1A, v20ss, v16ss);
321 pp1B = vec_mladd(sum1B, v20ss, v16ss);
323 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
324 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
326 pp3A = vec_add(sum3A, pp1A);
327 pp3B = vec_add(sum3B, pp1B);
329 psumA = vec_sub(pp3A, pp2A);
330 psumB = vec_sub(pp3B, pp2B);
332 sumA = vec_sra(psumA, v5us);
333 sumB = vec_sra(psumB, v5us);
335 sum = vec_packsu(sumA, sumB);
/* combine with existing dst (put or avg via OP_U8_ALTIVEC) and store */
338 vdst = vec_ld(0, dst);
340 OP_U8_ALTIVEC(fsum, sum, vdst);
342 vec_st(fsum, 0, dst);
347 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
350 /* this code assumes stride % 16 == 0 */
/*
 * NOTE(review): 16-wide vertical 6-tap H.264 luma half-pel filter --
 * same kernel as the horizontal version but taken down a column:
 *   clip(((p0+p1)*20 - (m1+p2)*5 + (m2+p3) + 16) >> 5)
 * with taps two rows above through three rows below.  Five rows are
 * pre-loaded and widened before the loop; each iteration loads only
 * the new bottom row (srcP3).  The register-roll statements that shift
 * srcM1..srcP2 down by one row each iteration, the dst advance, and
 * the closing braces are missing from this view of the file.
 */
351 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
352 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
/* one permute suffices: all rows share src's column alignment because
 * srcStride is a multiple of 16 */
357 const vec_u8 perm = vec_lvsl(0, src);
358 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
359 const vec_u16 v5us = vec_splat_u16(5);
360 const vec_s16 v5ss = vec_splat_s16(5);
361 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
/* start two rows above the block */
363 uint8_t *srcbis = src - (srcStride * 2);
365 const vec_u8 srcM2a = vec_ld(0, srcbis);
366 const vec_u8 srcM2b = vec_ld(16, srcbis);
367 const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
368 //srcbis += srcStride;
369 const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
370 const vec_u8 srcM1b = vec_ld(16, srcbis);
371 const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
372 //srcbis += srcStride;
373 const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
374 const vec_u8 srcP0b = vec_ld(16, srcbis);
375 const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
376 //srcbis += srcStride;
377 const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
378 const vec_u8 srcP1b = vec_ld(16, srcbis);
379 const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
380 //srcbis += srcStride;
381 const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
382 const vec_u8 srcP2b = vec_ld(16, srcbis);
383 const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
384 //srcbis += srcStride;
/* widen the five pre-loaded rows; A = low 8 pixels, B = high 8 */
386 vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
387 vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
388 vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
389 vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
390 vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
391 vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
392 vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
393 vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
394 vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
395 vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
397 vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
398 psumA, psumB, sumA, sumB,
400 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
402 vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
404 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
406 for (i = 0 ; i < 16 ; i++) {
/* only the newest row (three below the output row) needs loading */
407 srcP3a = vec_ld(0, srcbis += srcStride);
408 srcP3b = vec_ld(16, srcbis);
409 srcP3 = vec_perm(srcP3a, srcP3b, perm);
410 srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
411 srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
412 //srcbis += srcStride;
/* symmetric tap pair sums down the column */
414 sum1A = vec_adds(srcP0ssA, srcP1ssA);
415 sum1B = vec_adds(srcP0ssB, srcP1ssB);
416 sum2A = vec_adds(srcM1ssA, srcP2ssA);
417 sum2B = vec_adds(srcM1ssB, srcP2ssB);
418 sum3A = vec_adds(srcM2ssA, srcP3ssA);
419 sum3B = vec_adds(srcM2ssB, srcP3ssB);
/* sum1*20 + 16 + sum3 - sum2*5, >> 5, saturating pack to bytes */
432 pp1A = vec_mladd(sum1A, v20ss, v16ss);
433 pp1B = vec_mladd(sum1B, v20ss, v16ss);
435 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
436 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
438 pp3A = vec_add(sum3A, pp1A);
439 pp3B = vec_add(sum3B, pp1B);
441 psumA = vec_sub(pp3A, pp2A);
442 psumB = vec_sub(pp3B, pp2B);
444 sumA = vec_sra(psumA, v5us);
445 sumB = vec_sra(psumB, v5us);
447 sum = vec_packsu(sumA, sumB);
/* combine with dst (put/avg via OP_U8_ALTIVEC) and store full vector */
450 vdst = vec_ld(0, dst);
452 OP_U8_ALTIVEC(fsum, sum, vdst);
454 vec_st(fsum, 0, dst);
458 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
461 /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
/*
 * NOTE(review): 16-wide 2-D (horizontal-then-vertical) 6-tap H.264
 * luma filter.  Pass 1 runs the horizontal kernel WITHOUT rounding or
 * shifting over 21 rows, storing full-precision 16-bit intermediates
 * into tmp.  Pass 2 applies the vertical kernel to tmp in 32-bit
 * precision (vec_mule/vec_mulo split into even/odd products), adds the
 * combined rounding constant 512 and shifts right by 10 (5 bits per
 * pass), then re-interleaves and packs.  Several structural lines --
 * the align dispatch labels in pass 1, the per-row src advance, the
 * tmpbis += tmpStride steps between the pre-loads, the register roll
 * in pass 2, and closing braces -- are missing from this view.
 */
462 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
463 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
/* alignment-compensating permutes for the six horizontal taps */
466 const vec_u8 permM2 = vec_lvsl(-2, src);
467 const vec_u8 permM1 = vec_lvsl(-1, src);
468 const vec_u8 permP0 = vec_lvsl(+0, src);
469 const vec_u8 permP1 = vec_lvsl(+1, src);
470 const vec_u8 permP2 = vec_lvsl(+2, src);
471 const vec_u8 permP3 = vec_lvsl(+3, src);
/* constants: 20, 5, 1; rounding 512 = 1<<9; v16ui = 16 (sra amount to
 * pull even 16-bit terms out of 32-bit lanes); final shift 10 */
472 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
473 const vec_u32 v10ui = vec_splat_u32(10);
474 const vec_s16 v5ss = vec_splat_s16(5);
475 const vec_s16 v1ss = vec_splat_s16(1);
476 const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
477 const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
479 register int align = ((((unsigned long)src) - 2) % 16);
481 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
482 srcP2A, srcP2B, srcP3A, srcP3B,
483 srcM1A, srcM1B, srcM2A, srcM2B,
484 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
485 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
/* mperm re-interleaves the even/odd result halves into pixel order */
487 const vec_u8 mperm = (const vec_u8)
488 {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
489 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
490 int16_t *tmpbis = tmp;
492 vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
493 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
496 vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
497 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
498 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
499 ssumAe, ssumAo, ssumBe, ssumBo;
500 vec_u8 fsum, sumv, sum, vdst;
501 vec_s16 ssume, ssumo;
503 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
/* pass 1: 16 output rows need 16+5 = 21 intermediate rows, starting
 * two rows above the block */
504 src -= (2 * srcStride);
505 for (i = 0 ; i < 21 ; i ++) {
506 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
507 vec_u8 srcR1 = vec_ld(-2, src);
508 vec_u8 srcR2 = vec_ld(14, src);
512 srcM2 = vec_perm(srcR1, srcR2, permM2);
513 srcM1 = vec_perm(srcR1, srcR2, permM1);
514 srcP0 = vec_perm(srcR1, srcR2, permP0);
515 srcP1 = vec_perm(srcR1, srcR2, permP1);
516 srcP2 = vec_perm(srcR1, srcR2, permP2);
517 srcP3 = vec_perm(srcR1, srcR2, permP3);
520 srcM2 = vec_perm(srcR1, srcR2, permM2);
521 srcM1 = vec_perm(srcR1, srcR2, permM1);
522 srcP0 = vec_perm(srcR1, srcR2, permP0);
523 srcP1 = vec_perm(srcR1, srcR2, permP1);
524 srcP2 = vec_perm(srcR1, srcR2, permP2);
528 vec_u8 srcR3 = vec_ld(30, src);
529 srcM2 = vec_perm(srcR1, srcR2, permM2);
530 srcM1 = vec_perm(srcR1, srcR2, permM1);
531 srcP0 = vec_perm(srcR1, srcR2, permP0);
532 srcP1 = vec_perm(srcR1, srcR2, permP1);
534 srcP3 = vec_perm(srcR2, srcR3, permP3);
537 vec_u8 srcR3 = vec_ld(30, src);
538 srcM2 = vec_perm(srcR1, srcR2, permM2);
539 srcM1 = vec_perm(srcR1, srcR2, permM1);
540 srcP0 = vec_perm(srcR1, srcR2, permP0);
542 srcP2 = vec_perm(srcR2, srcR3, permP2);
543 srcP3 = vec_perm(srcR2, srcR3, permP3);
546 vec_u8 srcR3 = vec_ld(30, src);
547 srcM2 = vec_perm(srcR1, srcR2, permM2);
548 srcM1 = vec_perm(srcR1, srcR2, permM1);
550 srcP1 = vec_perm(srcR2, srcR3, permP1);
551 srcP2 = vec_perm(srcR2, srcR3, permP2);
552 srcP3 = vec_perm(srcR2, srcR3, permP3);
555 vec_u8 srcR3 = vec_ld(30, src);
556 srcM2 = vec_perm(srcR1, srcR2, permM2);
558 srcP0 = vec_perm(srcR2, srcR3, permP0);
559 srcP1 = vec_perm(srcR2, srcR3, permP1);
560 srcP2 = vec_perm(srcR2, srcR3, permP2);
561 srcP3 = vec_perm(srcR2, srcR3, permP3);
/* widen taps; A = low 8 pixels, B = high 8 pixels */
565 srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
566 srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
567 srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
568 srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
570 srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
571 srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
572 srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
573 srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
575 srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
576 srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
577 srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
578 srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
580 sum1A = vec_adds(srcP0A, srcP1A);
581 sum1B = vec_adds(srcP0B, srcP1B);
582 sum2A = vec_adds(srcM1A, srcP2A);
583 sum2B = vec_adds(srcM1B, srcP2B);
584 sum3A = vec_adds(srcM2A, srcP3A);
585 sum3B = vec_adds(srcM2B, srcP3B);
/* intermediate row = sum1*20 + sum3 - sum2*5 -- no rounding/shift,
 * that happens once in pass 2 */
587 pp1A = vec_mladd(sum1A, v20ss, sum3A);
588 pp1B = vec_mladd(sum1B, v20ss, sum3B);
590 pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
591 pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
593 psumA = vec_sub(pp1A, pp2A);
594 psumB = vec_sub(pp1B, pp2B);
596 vec_st(psumA, 0, tmp);
597 vec_st(psumB, 16, tmp);
600 tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
/* pre-load the first five intermediate rows for pass 2 (the
 * tmpbis += tmpStride advances between these loads are not visible) */
603 tmpM2ssA = vec_ld(0, tmpbis);
604 tmpM2ssB = vec_ld(16, tmpbis);
606 tmpM1ssA = vec_ld(0, tmpbis);
607 tmpM1ssB = vec_ld(16, tmpbis);
609 tmpP0ssA = vec_ld(0, tmpbis);
610 tmpP0ssB = vec_ld(16, tmpbis);
612 tmpP1ssA = vec_ld(0, tmpbis);
613 tmpP1ssB = vec_ld(16, tmpbis);
615 tmpP2ssA = vec_ld(0, tmpbis);
616 tmpP2ssB = vec_ld(16, tmpbis);
/* pass 2: vertical kernel on the 16-bit intermediates, in 32 bits */
619 for (i = 0 ; i < 16 ; i++) {
620 const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
621 const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
623 const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
624 const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
625 const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
626 const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
627 const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
628 const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
/* even/odd 16x16->32 products keep full precision: sum1*20, sum2*5 */
643 pp1Ae = vec_mule(sum1A, v20ss);
644 pp1Ao = vec_mulo(sum1A, v20ss);
645 pp1Be = vec_mule(sum1B, v20ss);
646 pp1Bo = vec_mulo(sum1B, v20ss);
648 pp2Ae = vec_mule(sum2A, v5ss);
649 pp2Ao = vec_mulo(sum2A, v5ss);
650 pp2Be = vec_mule(sum2B, v5ss);
651 pp2Bo = vec_mulo(sum2B, v5ss);
/* sum3 widened to 32 bits: sra by 16 sign-extends the even-position
 * 16-bit terms (big-endian lane layout); mulo by 1 yields the odd ones */
653 pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
654 pp3Ao = vec_mulo(sum3A, v1ss);
655 pp3Be = vec_sra((vec_s32)sum3B, v16ui);
656 pp3Bo = vec_mulo(sum3B, v1ss);
/* total = sum1*20 + 512 + sum3 - sum2*5, then >> 10 */
658 pp1cAe = vec_add(pp1Ae, v512si);
659 pp1cAo = vec_add(pp1Ao, v512si);
660 pp1cBe = vec_add(pp1Be, v512si);
661 pp1cBo = vec_add(pp1Bo, v512si);
663 pp32Ae = vec_sub(pp3Ae, pp2Ae);
664 pp32Ao = vec_sub(pp3Ao, pp2Ao);
665 pp32Be = vec_sub(pp3Be, pp2Be);
666 pp32Bo = vec_sub(pp3Bo, pp2Bo);
668 sumAe = vec_add(pp1cAe, pp32Ae);
669 sumAo = vec_add(pp1cAo, pp32Ao);
670 sumBe = vec_add(pp1cBe, pp32Be);
671 sumBo = vec_add(pp1cBo, pp32Bo);
673 ssumAe = vec_sra(sumAe, v10ui);
674 ssumAo = vec_sra(sumAo, v10ui);
675 ssumBe = vec_sra(sumBe, v10ui);
676 ssumBo = vec_sra(sumBo, v10ui);
/* pack even/odd halves, then mperm restores pixel order */
678 ssume = vec_packs(ssumAe, ssumBe);
679 ssumo = vec_packs(ssumAo, ssumBo);
681 sumv = vec_packsu(ssume, ssumo);
682 sum = vec_perm(sumv, sumv, mperm);
/* combine with dst (put/avg via OP_U8_ALTIVEC) and store */
685 vdst = vec_ld(0, dst);
687 OP_U8_ALTIVEC(fsum, sum, vdst);
689 vec_st(fsum, 0, dst);
693 POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);