3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
/* forward spatial wavelet transform -- presumably defined in snow.c (TODO confirm) */
39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* NOTE(review): looks like this is defined in vorbis.c -- confirm against build */
42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* clipping table; zero here, presumably filled at runtime by the dsputil init code (TODO confirm) */
44 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* table of squares (offset by 256 at use sites); filled at runtime -- TODO confirm initializer */
45 uint32_t ff_squareTbl[512] = {0, };
/* Classic zigzag scan order (row-major coefficient index at each scan
 * position), as used by JPEG/MPEG-1/2/4. The closing brace/semicolon was
 * missing, leaving the table unterminated. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 2-4-8 IDCT. NOTE that unlike the
   specification, we interleave the fields. The closing brace/semicolon
   was missing, leaving the table unterminated. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
71 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* zero here; presumably filled at runtime by the init code -- TODO confirm */
72 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal scan order (interlaced material). The closing
 * brace/semicolon was missing, leaving the table unterminated. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (interlaced material). The closing
 * brace/semicolon was missing, leaving the table unterminated. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (entry b is ceil(2^32 / b); entries 0 and 1 are placeholders and do not
 * satisfy the property). The closing brace/semicolon was missing, leaving
 * the table unterminated. */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx (a permutation of 0..63).
 * The closing brace/semicolon was missing, leaving the table unterminated. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum of the 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes (>= 16)
 * @return sum of all pixel values
 * (Reconstructed: the accumulator declarations, per-pixel additions,
 * pointer advances and return were missing from this block.)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;  /* rewind to start of next row */
    }
    return s;
}
/**
 * Sum of squares of the 256 pixels of a 16x16 block (L2 "norm").
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes (>= 16)
 * @return sum of pix[i]^2 over the block
 *
 * Rewritten as a plain per-byte multiply: the previous version read rows
 * through type-punned uint32_t/uint64_t loads (a strict-aliasing and
 * alignment hazard) and indexed the runtime-initialized ff_squareTbl; the
 * result is identical and the function is now self-contained.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}
/**
 * Byte-swap w 32-bit words from src into dst (may be the same buffer).
 * Unrolled by 8; a scalar tail loop handles w not divisible by 8.
 * (Reconstructed: the index declaration, the tail-loop header and the
 * closing braces were missing from this block.)
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w;i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
/**
 * Sum of squared errors over a 4-pixel-wide block of height h.
 * @param v unused context pointer (kept for the me_cmp function signature)
 * Rewritten with a direct multiply instead of the runtime-initialized
 * ff_squareTbl lookup (identical result, self-contained); the pointer
 * advances and return were missing from this block.
 */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of squared errors over an 8-pixel-wide block of height h.
 * @param v unused context pointer (kept for the me_cmp function signature)
 * Rewritten with a direct multiply instead of the runtime-initialized
 * ff_squareTbl lookup (identical result, self-contained); the pointer
 * advances and return were missing from this block.
 */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of squared errors over a 16-pixel-wide block of height h.
 * @param v unused context pointer (kept for the me_cmp function signature)
 * Rewritten with a direct multiply instead of the runtime-initialized
 * ff_squareTbl lookup (identical result, self-contained); the pointer
 * advances and return were missing from this block.
 */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
301 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/*
 * w_c(): wavelet-domain distortion metric -- DWT the pixel difference,
 * then accumulate weighted absolute subband coefficients.
 * NOTE(review): several lines of this function (local declarations, most
 * of the scale[] initializer, loop closers, the final return) are missing
 * from this view -- verify against the complete file before editing.
 */
302 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
/* 3 decomposition levels for 8x8 blocks, 4 otherwise */
304 const int dec_count= w==8 ? 3 : 4;
/* per-subband weights, indexed [type][dec_count-3][level][ori]; mostly elided here */
307 static const int scale[2][2][4][4]={
311 {268, 239, 239, 213},
315 // 9/7 16x16 or 32x32 dec=4
316 {344, 310, 310, 280},
324 {275, 245, 245, 218},
328 // 5/3 16x16 or 32x32 dec=4
329 {352, 317, 317, 286},
/* load the pixel difference, scaled by 16, into a stride-32 temp buffer */
337 for (i = 0; i < h; i++) {
338 for (j = 0; j < w; j+=4) {
339 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
340 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
341 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
342 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
/* forward spatial DWT in place (presumably defined in snow.c -- see #ifdef above) */
348 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* walk each decomposition level and orientation (LL only at level 0) */
352 for(level=0; level<dec_count; level++){
353 for(ori= level ? 1 : 0; ori<4; ori++){
354 int size= w>>(dec_count-level);
355 int sx= (ori&1) ? size : 0;
356 int stride= 32<<(dec_count-level);
357 int sy= (ori&2) ? stride>>1 : 0;
359 for(i=0; i<size; i++){
360 for(j=0; j<size; j++){
361 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Thin wrappers binding w_c() to the me_cmp signature: 5/3 (type=1) and
 * 9/7 (type=0) wavelets at block widths 8/16/32. The closing braces were
 * missing from every wrapper. The 32-wide variants are non-static (used
 * from the snow encoder). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
396 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
400 /* read the pixels */
402 block[0] = pixels[0];
403 block[1] = pixels[1];
404 block[2] = pixels[2];
405 block[3] = pixels[3];
406 block[4] = pixels[4];
407 block[5] = pixels[5];
408 block[6] = pixels[6];
409 block[7] = pixels[7];
415 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
416 const uint8_t *s2, int stride){
419 /* read the pixels */
421 block[0] = s1[0] - s2[0];
422 block[1] = s1[1] - s2[1];
423 block[2] = s1[2] - s2[2];
424 block[3] = s1[3] - s2[3];
425 block[4] = s1[4] - s2[4];
426 block[5] = s1[5] - s2[5];
427 block[6] = s1[6] - s2[6];
428 block[7] = s1[7] - s2[7];
436 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
440 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
442 /* read the pixels */
444 pixels[0] = cm[block[0]];
445 pixels[1] = cm[block[1]];
446 pixels[2] = cm[block[2]];
447 pixels[3] = cm[block[3]];
448 pixels[4] = cm[block[4]];
449 pixels[5] = cm[block[5]];
450 pixels[6] = cm[block[6]];
451 pixels[7] = cm[block[7]];
458 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
462 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
464 /* read the pixels */
466 pixels[0] = cm[block[0]];
467 pixels[1] = cm[block[1]];
468 pixels[2] = cm[block[2]];
469 pixels[3] = cm[block[3]];
476 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
480 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
482 /* read the pixels */
484 pixels[0] = cm[block[0]];
485 pixels[1] = cm[block[1]];
492 static void put_signed_pixels_clamped_c(const DCTELEM *block,
493 uint8_t *restrict pixels,
498 for (i = 0; i < 8; i++) {
499 for (j = 0; j < 8; j++) {
502 else if (*block > 127)
505 *pixels = (uint8_t)(*block + 128);
509 pixels += (line_size - 8);
513 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
517 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
519 /* read the pixels */
521 pixels[0] = cm[pixels[0] + block[0]];
522 pixels[1] = cm[pixels[1] + block[1]];
523 pixels[2] = cm[pixels[2] + block[2]];
524 pixels[3] = cm[pixels[3] + block[3]];
525 pixels[4] = cm[pixels[4] + block[4]];
526 pixels[5] = cm[pixels[5] + block[5]];
527 pixels[6] = cm[pixels[6] + block[6]];
528 pixels[7] = cm[pixels[7] + block[7]];
534 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
538 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
540 /* read the pixels */
542 pixels[0] = cm[pixels[0] + block[0]];
543 pixels[1] = cm[pixels[1] + block[1]];
544 pixels[2] = cm[pixels[2] + block[2]];
545 pixels[3] = cm[pixels[3] + block[3]];
551 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
555 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
557 /* read the pixels */
559 pixels[0] = cm[pixels[0] + block[0]];
560 pixels[1] = cm[pixels[1] + block[1]];
566 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
570 pixels[0] += block[0];
571 pixels[1] += block[1];
572 pixels[2] += block[2];
573 pixels[3] += block[3];
574 pixels[4] += block[4];
575 pixels[5] += block[5];
576 pixels[6] += block[6];
577 pixels[7] += block[7];
583 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
587 pixels[0] += block[0];
588 pixels[1] += block[1];
589 pixels[2] += block[2];
590 pixels[3] += block[3];
596 static int sum_abs_dctelem_c(DCTELEM *block)
600 sum+= FFABS(block[i]);
/*
 * 64-bit PIXOP2 variant: put/avg pixel primitives that process whole
 * 8-byte rows with AV_RN64 loads and bit-parallel rounding averages
 * ((a|b) - ((a^b & 0xFE..)>>1) rounds up, (a&b) + (...) rounds down).
 * NOTE(review): several continuation lines of this macro (declarations
 * such as loop indices, pointer advances, closing braces) are missing
 * from this view; the enclosing #if for this branch is also not visible.
 * Text kept byte-identical -- verify against the complete file.
 */
606 #define PIXOP2(OPNAME, OP) \
607 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
611 OP(*((uint64_t*)block), AV_RN64(pixels));\
617 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
621 const uint64_t a= AV_RN64(pixels );\
622 const uint64_t b= AV_RN64(pixels+1);\
623 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
629 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
633 const uint64_t a= AV_RN64(pixels );\
634 const uint64_t b= AV_RN64(pixels+1);\
635 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
641 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
645 const uint64_t a= AV_RN64(pixels );\
646 const uint64_t b= AV_RN64(pixels+line_size);\
647 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
653 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
657 const uint64_t a= AV_RN64(pixels );\
658 const uint64_t b= AV_RN64(pixels+line_size);\
659 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
665 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
668 const uint64_t a= AV_RN64(pixels );\
669 const uint64_t b= AV_RN64(pixels+1);\
670 uint64_t l0= (a&0x0303030303030303ULL)\
671 + (b&0x0303030303030303ULL)\
672 + 0x0202020202020202ULL;\
673 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
674 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
678 for(i=0; i<h; i+=2){\
679 uint64_t a= AV_RN64(pixels );\
680 uint64_t b= AV_RN64(pixels+1);\
681 l1= (a&0x0303030303030303ULL)\
682 + (b&0x0303030303030303ULL);\
683 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
684 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
685 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
688 a= AV_RN64(pixels );\
689 b= AV_RN64(pixels+1);\
690 l0= (a&0x0303030303030303ULL)\
691 + (b&0x0303030303030303ULL)\
692 + 0x0202020202020202ULL;\
693 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
694 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
695 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
701 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
704 const uint64_t a= AV_RN64(pixels );\
705 const uint64_t b= AV_RN64(pixels+1);\
706 uint64_t l0= (a&0x0303030303030303ULL)\
707 + (b&0x0303030303030303ULL)\
708 + 0x0101010101010101ULL;\
709 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
710 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
714 for(i=0; i<h; i+=2){\
715 uint64_t a= AV_RN64(pixels );\
716 uint64_t b= AV_RN64(pixels+1);\
717 l1= (a&0x0303030303030303ULL)\
718 + (b&0x0303030303030303ULL);\
719 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
720 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
721 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
724 a= AV_RN64(pixels );\
725 b= AV_RN64(pixels+1);\
726 l0= (a&0x0303030303030303ULL)\
727 + (b&0x0303030303030303ULL)\
728 + 0x0101010101010101ULL;\
729 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
730 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
731 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
737 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
738 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
739 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
740 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
741 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
742 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
743 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
745 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
746 #else // 64 bit variant
/*
 * 32-bit PIXOP2 variant: the same put/avg/halfpel primitives built from
 * 16/32-bit loads (AV_RN16/AV_RN32) and rnd_avg32/no_rnd_avg32 helpers.
 * NOTE(review): many continuation lines of this macro (loop headers,
 * declarations, pointer advances, closing braces, parts of the l0/l1
 * initializers) are missing from this view, and the final visible line
 * still ends in a continuation backslash. Text kept byte-identical --
 * verify against the complete file before editing.
 */
748 #define PIXOP2(OPNAME, OP) \
749 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
752 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
757 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
760 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
765 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
768 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
769 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
774 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
775 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
778 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
779 int src_stride1, int src_stride2, int h){\
783 a= AV_RN32(&src1[i*src_stride1 ]);\
784 b= AV_RN32(&src2[i*src_stride2 ]);\
785 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
786 a= AV_RN32(&src1[i*src_stride1+4]);\
787 b= AV_RN32(&src2[i*src_stride2+4]);\
788 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
792 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793 int src_stride1, int src_stride2, int h){\
797 a= AV_RN32(&src1[i*src_stride1 ]);\
798 b= AV_RN32(&src2[i*src_stride2 ]);\
799 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
800 a= AV_RN32(&src1[i*src_stride1+4]);\
801 b= AV_RN32(&src2[i*src_stride2+4]);\
802 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
806 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
807 int src_stride1, int src_stride2, int h){\
811 a= AV_RN32(&src1[i*src_stride1 ]);\
812 b= AV_RN32(&src2[i*src_stride2 ]);\
813 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
817 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
818 int src_stride1, int src_stride2, int h){\
822 a= AV_RN16(&src1[i*src_stride1 ]);\
823 b= AV_RN16(&src2[i*src_stride2 ]);\
824 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
828 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
829 int src_stride1, int src_stride2, int h){\
830 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
831 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
834 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
835 int src_stride1, int src_stride2, int h){\
836 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
837 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
840 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
841 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
844 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
845 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
848 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
849 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
852 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
853 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
856 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
857 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
860 uint32_t a, b, c, d, l0, l1, h0, h1;\
861 a= AV_RN32(&src1[i*src_stride1]);\
862 b= AV_RN32(&src2[i*src_stride2]);\
863 c= AV_RN32(&src3[i*src_stride3]);\
864 d= AV_RN32(&src4[i*src_stride4]);\
865 l0= (a&0x03030303UL)\
868 h0= ((a&0xFCFCFCFCUL)>>2)\
869 + ((b&0xFCFCFCFCUL)>>2);\
870 l1= (c&0x03030303UL)\
872 h1= ((c&0xFCFCFCFCUL)>>2)\
873 + ((d&0xFCFCFCFCUL)>>2);\
874 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
875 a= AV_RN32(&src1[i*src_stride1+4]);\
876 b= AV_RN32(&src2[i*src_stride2+4]);\
877 c= AV_RN32(&src3[i*src_stride3+4]);\
878 d= AV_RN32(&src4[i*src_stride4+4]);\
879 l0= (a&0x03030303UL)\
882 h0= ((a&0xFCFCFCFCUL)>>2)\
883 + ((b&0xFCFCFCFCUL)>>2);\
884 l1= (c&0x03030303UL)\
886 h1= ((c&0xFCFCFCFCUL)>>2)\
887 + ((d&0xFCFCFCFCUL)>>2);\
888 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
892 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
893 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
896 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
897 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
900 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
901 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
904 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
905 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
908 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
909 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
912 uint32_t a, b, c, d, l0, l1, h0, h1;\
913 a= AV_RN32(&src1[i*src_stride1]);\
914 b= AV_RN32(&src2[i*src_stride2]);\
915 c= AV_RN32(&src3[i*src_stride3]);\
916 d= AV_RN32(&src4[i*src_stride4]);\
917 l0= (a&0x03030303UL)\
920 h0= ((a&0xFCFCFCFCUL)>>2)\
921 + ((b&0xFCFCFCFCUL)>>2);\
922 l1= (c&0x03030303UL)\
924 h1= ((c&0xFCFCFCFCUL)>>2)\
925 + ((d&0xFCFCFCFCUL)>>2);\
926 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
927 a= AV_RN32(&src1[i*src_stride1+4]);\
928 b= AV_RN32(&src2[i*src_stride2+4]);\
929 c= AV_RN32(&src3[i*src_stride3+4]);\
930 d= AV_RN32(&src4[i*src_stride4+4]);\
931 l0= (a&0x03030303UL)\
934 h0= ((a&0xFCFCFCFCUL)>>2)\
935 + ((b&0xFCFCFCFCUL)>>2);\
936 l1= (c&0x03030303UL)\
938 h1= ((c&0xFCFCFCFCUL)>>2)\
939 + ((d&0xFCFCFCFCUL)>>2);\
940 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
943 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
944 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
945 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
946 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
948 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
949 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
950 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
951 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
954 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
956 int i, a0, b0, a1, b1;\
963 for(i=0; i<h; i+=2){\
969 block[0]= (a1+a0)>>2; /* FIXME non put */\
970 block[1]= (b1+b0)>>2;\
980 block[0]= (a1+a0)>>2;\
981 block[1]= (b1+b0)>>2;\
987 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
990 const uint32_t a= AV_RN32(pixels );\
991 const uint32_t b= AV_RN32(pixels+1);\
992 uint32_t l0= (a&0x03030303UL)\
995 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
1000 for(i=0; i<h; i+=2){\
1001 uint32_t a= AV_RN32(pixels );\
1002 uint32_t b= AV_RN32(pixels+1);\
1003 l1= (a&0x03030303UL)\
1004 + (b&0x03030303UL);\
1005 h1= ((a&0xFCFCFCFCUL)>>2)\
1006 + ((b&0xFCFCFCFCUL)>>2);\
1007 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1010 a= AV_RN32(pixels );\
1011 b= AV_RN32(pixels+1);\
1012 l0= (a&0x03030303UL)\
1015 h0= ((a&0xFCFCFCFCUL)>>2)\
1016 + ((b&0xFCFCFCFCUL)>>2);\
1017 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1023 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1026 for(j=0; j<2; j++){\
1028 const uint32_t a= AV_RN32(pixels );\
1029 const uint32_t b= AV_RN32(pixels+1);\
1030 uint32_t l0= (a&0x03030303UL)\
1033 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1034 + ((b&0xFCFCFCFCUL)>>2);\
1038 for(i=0; i<h; i+=2){\
1039 uint32_t a= AV_RN32(pixels );\
1040 uint32_t b= AV_RN32(pixels+1);\
1041 l1= (a&0x03030303UL)\
1042 + (b&0x03030303UL);\
1043 h1= ((a&0xFCFCFCFCUL)>>2)\
1044 + ((b&0xFCFCFCFCUL)>>2);\
1045 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1048 a= AV_RN32(pixels );\
1049 b= AV_RN32(pixels+1);\
1050 l0= (a&0x03030303UL)\
1053 h0= ((a&0xFCFCFCFCUL)>>2)\
1054 + ((b&0xFCFCFCFCUL)>>2);\
1055 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1059 pixels+=4-line_size*(h+1);\
1060 block +=4-line_size*h;\
1064 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1067 for(j=0; j<2; j++){\
1069 const uint32_t a= AV_RN32(pixels );\
1070 const uint32_t b= AV_RN32(pixels+1);\
1071 uint32_t l0= (a&0x03030303UL)\
1074 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1075 + ((b&0xFCFCFCFCUL)>>2);\
1079 for(i=0; i<h; i+=2){\
1080 uint32_t a= AV_RN32(pixels );\
1081 uint32_t b= AV_RN32(pixels+1);\
1082 l1= (a&0x03030303UL)\
1083 + (b&0x03030303UL);\
1084 h1= ((a&0xFCFCFCFCUL)>>2)\
1085 + ((b&0xFCFCFCFCUL)>>2);\
1086 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1089 a= AV_RN32(pixels );\
1090 b= AV_RN32(pixels+1);\
1091 l0= (a&0x03030303UL)\
1094 h0= ((a&0xFCFCFCFCUL)>>2)\
1095 + ((b&0xFCFCFCFCUL)>>2);\
1096 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100 pixels+=4-line_size*(h+1);\
1101 block +=4-line_size*h;\
1105 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1106 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1107 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1108 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1109 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1110 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1111 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1112 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1114 #define op_avg(a, b) a = rnd_avg32(a, b)
/* op_avg: rounding byte-parallel average into the destination; op_put: plain store */
1116 #define op_put(a, b) a = b
/* 2- and 4-tap rounding averages used by the halfpel helpers below */
1123 #define avg2(a,b) ((a+b+1)>>1)
1124 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Average two 16-wide sources into dst (no-rounding variant) with a single
 * common stride. The closing brace was missing from this wrapper. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* Average two 8-wide sources into dst (no-rounding variant) with a single
 * common stride. The closing brace was missing from this wrapper. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * 1/16-pel bilinear interpolation of an 8-wide block (simple GMC case).
 * @param x16,y16 fractional position in 1/16 pel (0..15)
 * @param rounder rounding constant added before the >>8
 * (Reconstructed: the loop header, dst/src advances and closing braces
 * were missing from this block.)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Global motion compensation with an affine per-pixel displacement.
 * (ox,oy) is the 16.16 position of the first pixel; (dxx,dxy,dyx,dyy) are
 * the affine增ments; shift selects sub-pel precision (s = 1<<shift);
 * r is the rounding constant; width/height bound the source.
 * Out-of-bounds samples are edge-clamped via av_clip().
 * (Reconstructed: loop headers, the vx/vy accumulators and their updates,
 * and closing braces were missing from this block.)
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, x;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int vx= ox;
        int vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically out of range: clamp y, interpolate x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally out of range: clamp x, interpolate y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* fully outside: nearest edge pixel */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
        dst += stride;
    }
}
/**
 * Third-pel MC, fractional offset (0,0): plain copy, dispatched by width.
 * (Reconstructed: the switch header and closing braces were missing.)
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, phase (1/3,0): dst ~ (2*cur + right)/3.
 * 683/2048 approximates 1/3 (683*3 = 2049). */
1224 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1226 for (i=0; i < height; i++) {
1227 for (j=0; j < width; j++) {
1228 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel MC, phase (2/3,0): dst ~ (cur + 2*right)/3. */
1235 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1237 for (i=0; i < height; i++) {
1238 for (j=0; j < width; j++) {
1239 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Third-pel MC, phase (0,1/3): dst ~ (2*cur + below)/3. */
1246 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1248 for (i=0; i < height; i++) {
1249 for (j=0; j < width; j++) {
1250 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Third-pel MC, phase (1/3,1/3): 2x2 weights (4,3,3,2)/12.
 * 2731/32768 approximates 1/12 (2731*12 = 32772). */
1257 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1259 for (i=0; i < height; i++) {
1260 for (j=0; j < width; j++) {
1261 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, phase (1/3,2/3): 2x2 weights (3,2,4,3)/12. */
1268 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1270 for (i=0; i < height; i++) {
1271 for (j=0; j < width; j++) {
1272 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, phase (0,2/3): dst ~ (cur + 2*below)/3. */
1279 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Third-pel MC, phase (2/3,1/3): 2x2 weights (3,4,2,3)/12. */
1290 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, phase (2/3,2/3): 2x2 weights (2,3,3,4)/12. */
1301 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303 for (i=0; i < height; i++) {
1304 for (j=0; j < width; j++) {
1305 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging third-pel MC, integer phase (0,0): dst = avg(dst, copy),
 * dispatched on block width. */
1312 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1314 case 2: avg_pixels2_c (dst, src, stride, height); break;
1315 case 4: avg_pixels4_c (dst, src, stride, height); break;
1316 case 8: avg_pixels8_c (dst, src, stride, height); break;
1317 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Averaging variant of mc10: interpolate (2*cur + right)/3, then
 * round-average with the existing dst pixel. */
1321 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 for (i=0; i < height; i++) {
1324 for (j=0; j < width; j++) {
1325 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc20: (cur + 2*right)/3, averaged into dst. */
1332 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 for (i=0; i < height; i++) {
1335 for (j=0; j < width; j++) {
1336 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc01: (2*cur + below)/3, averaged into dst. */
1343 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1345 for (i=0; i < height; i++) {
1346 for (j=0; j < width; j++) {
1347 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc11: weights (4,3,3,2)/12, averaged into dst. */
1354 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1356 for (i=0; i < height; i++) {
1357 for (j=0; j < width; j++) {
1358 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc12: weights (3,2,4,3)/12, averaged into dst. */
1365 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1367 for (i=0; i < height; i++) {
1368 for (j=0; j < width; j++) {
1369 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc02: (cur + 2*below)/3, averaged into dst. */
1376 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 for (i=0; i < height; i++) {
1379 for (j=0; j < width; j++) {
1380 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc21: weights (3,4,2,3)/12, averaged into dst. */
1387 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1389 for (i=0; i < height; i++) {
1390 for (j=0; j < width; j++) {
1391 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc22: weights (2,3,3,4)/12, averaged into dst. */
1398 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1400 for (i=0; i < height; i++) {
1401 for (j=0; j < width; j++) {
1402 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Width-specialized third-pel MC wrappers.
 * BUG FIX: the original wrapper bodies read
 *     void put_tpel_pixels_mcXX_c(dst, src, stride, width, height);
 * which C parses as a (useless) block-scope function *declaration*, not a
 * call -- every generated wrapper silently did nothing.  Removing the stray
 * 'void' turns each body into the intended forwarding call to the generic
 * put_tpel_pixels_mcXX_c() above with the width baked in. */
1409 #define TPEL_WIDTH(width)\
1410 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1411 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1412 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1413 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1414 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1415 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1416 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1417 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1418 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1419 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1420 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1421 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1422 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1423 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1424 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1425 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1426 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1427 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H.264 chroma MC template for 2-, 4- and 8-pixel-wide blocks: bilinear
 * interpolation with eighth-pel weights A..D derived from (x,y); OP
 * supplies the rounding/averaging step (see op_put/op_avg below). */
1430 #define H264_CHROMA_MC(OPNAME, OP)\
1431 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
/* 2x2 bilinear weights; A+B+C+D == 64 */\
1432 const int A=(8-x)*(8-y);\
1433 const int B=( x)*(8-y);\
1434 const int C=(8-x)*( y);\
1435 const int D=( x)*( y);\
1438 assert(x<8 && y<8 && x>=0 && y>=0);\
1442 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1443 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1449 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1450 const int A=(8-x)*(8-y);\
1451 const int B=( x)*(8-y);\
1452 const int C=(8-x)*( y);\
1453 const int D=( x)*( y);\
1456 assert(x<8 && y<8 && x>=0 && y>=0);\
1460 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1461 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1462 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1463 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1469 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1470 const int A=(8-x)*(8-y);\
1471 const int B=( x)*(8-y);\
1472 const int C=(8-x)*( y);\
1473 const int D=( x)*( y);\
1476 assert(x<8 && y<8 && x>=0 && y>=0);\
1480 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1481 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1482 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1483 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1484 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1485 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1486 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1487 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Rounding ops for the chroma template: (+32)>>6 undoes the 8*8 weight
 * scale with round-to-nearest; op_avg additionally round-averages with
 * the previous dst value. */
1493 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1494 #define op_put(a, b) a = (((b) + 32)>>6)
/* instantiate put_/avg_ h264_chroma_mc{2,4,8}_c */
1496 H264_CHROMA_MC(put_ , op_put)
1497 H264_CHROMA_MC(avg_ , op_avg)
/* "No rounding" 8-wide bilinear chroma MC (VC-1 style): identical weights
 * to the H264_CHROMA_MC template, but rounds with +28 (32-4) instead of
 * +32, so it cannot be generated from the template's op_put. */
1501 static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1502 const int A=(8-x)*(8-y);
1503 const int B=( x)*(8-y);
1504 const int C=(8-x)*( y);
1505 const int D=( x)*( y);
1508 assert(x<8 && y<8 && x>=0 && y>=0);
1512 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1513 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1514 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1515 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1516 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1517 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1518 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1519 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/* MPEG-4 quarter-pel MC template, instantiated below for put/avg and
 * rounding/no-rounding via (OPNAME, RND, OP).  The half-pel filter taps
 * are (20,-6,3,-1); indices past the 8-pixel block edge are mirrored
 * (note src[8] reused, then src[7], src[6]...) per MPEG-4 edge handling.
 * OP clips through cm[] and applies the (+16)>>5 (or +15) rounding. */
1525 #define QPEL_MC(r, OPNAME, RND, OP) \
1526 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1527 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1531 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1532 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1533 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1534 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1535 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1536 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1537 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1538 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* Vertical 8-tap half-pel filter: same (20,-6,3,-1) taps as the
 * horizontal version, applied down a column of 9 source rows with the
 * same mirrored edge handling at the bottom. */\
1544 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1546 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
/* load the 9 column samples once */\
1550 const int src0= src[0*srcStride];\
1551 const int src1= src[1*srcStride];\
1552 const int src2= src[2*srcStride];\
1553 const int src3= src[3*srcStride];\
1554 const int src4= src[4*srcStride];\
1555 const int src5= src[5*srcStride];\
1556 const int src6= src[6*srcStride];\
1557 const int src7= src[7*srcStride];\
1558 const int src8= src[8*srcStride];\
1559 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1560 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1561 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1562 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1563 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1564 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1565 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1566 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal half-pel filter: same taps as the 8-wide version,
 * over 17 source samples, with mirrored indices at the right edge. */\
1572 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1573 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1578 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1579 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1580 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1581 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1582 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1583 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1584 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1585 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1586 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1587 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1588 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1589 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1590 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1591 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1592 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1593 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-tall vertical half-pel filter over 17 source rows; bottom edge
 * indices mirrored like the horizontal variant. */\
1599 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1600 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1605 const int src0= src[0*srcStride];\
1606 const int src1= src[1*srcStride];\
1607 const int src2= src[2*srcStride];\
1608 const int src3= src[3*srcStride];\
1609 const int src4= src[4*srcStride];\
1610 const int src5= src[5*srcStride];\
1611 const int src6= src[6*srcStride];\
1612 const int src7= src[7*srcStride];\
1613 const int src8= src[8*srcStride];\
1614 const int src9= src[9*srcStride];\
1615 const int src10= src[10*srcStride];\
1616 const int src11= src[11*srcStride];\
1617 const int src12= src[12*srcStride];\
1618 const int src13= src[13*srcStride];\
1619 const int src14= src[14*srcStride];\
1620 const int src15= src[15*srcStride];\
1621 const int src16= src[16*srcStride];\
1622 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1623 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1624 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1625 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1626 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1627 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1628 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1629 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1630 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1631 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1632 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1633 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1634 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1635 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1636 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1637 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 quarter-pel positions mcXY: X,Y in {0..3} are quarter-pel phases. \
 * Quarter-pel samples are built by averaging (pixels*_l2/_l4) full-pel  \
 * data with half-pel intermediates from the lowpass filters above.     \
 * The ff_*_old_c variants keep the older 4-way-average formulation.    */\
1643 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1644 OPNAME ## pixels8_c(dst, src, stride, 8);\
1647 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1649 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1650 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1653 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1654 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1657 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1659 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1660 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
/* vertical-only positions use a 16-wide scratch 'full' copy (9 rows) */\
1663 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1664 uint8_t full[16*9];\
1666 copy_block9(full, src, 16, stride, 9);\
1667 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1668 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1671 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1672 uint8_t full[16*9];\
1673 copy_block9(full, src, 16, stride, 9);\
1674 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1677 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1678 uint8_t full[16*9];\
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1682 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1684 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1685 uint8_t full[16*9];\
1688 uint8_t halfHV[64];\
1689 copy_block9(full, src, 16, stride, 9);\
1690 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1691 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1692 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1693 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1695 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1696 uint8_t full[16*9];\
1698 uint8_t halfHV[64];\
1699 copy_block9(full, src, 16, stride, 9);\
1700 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1701 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1702 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1703 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1705 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1706 uint8_t full[16*9];\
1709 uint8_t halfHV[64];\
1710 copy_block9(full, src, 16, stride, 9);\
1711 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1712 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1713 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1714 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1716 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[16*9];\
1719 uint8_t halfHV[64];\
1720 copy_block9(full, src, 16, stride, 9);\
1721 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1723 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1726 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727 uint8_t full[16*9];\
1730 uint8_t halfHV[64];\
1731 copy_block9(full, src, 16, stride, 9);\
1732 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1737 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t full[16*9];\
1740 uint8_t halfHV[64];\
1741 copy_block9(full, src, 16, stride, 9);\
1742 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1747 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[16*9];\
1751 uint8_t halfHV[64];\
1752 copy_block9(full, src, 16, stride, 9);\
1753 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1758 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1759 uint8_t full[16*9];\
1761 uint8_t halfHV[64];\
1762 copy_block9(full, src, 16, stride, 9);\
1763 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* center-column positions (x==2) need no 'full' copy: filter directly */\
1768 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t halfHV[64];\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1775 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t halfHV[64];\
1778 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1782 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1783 uint8_t full[16*9];\
1786 uint8_t halfHV[64];\
1787 copy_block9(full, src, 16, stride, 9);\
1788 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1790 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1793 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1794 uint8_t full[16*9];\
1796 copy_block9(full, src, 16, stride, 9);\
1797 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1798 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1799 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1801 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1812 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1813 uint8_t full[16*9];\
1815 copy_block9(full, src, 16, stride, 9);\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1818 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1820 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1822 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 quarter-pel positions: identical structure to the 8x8 set above \
 * with doubled block size (24-wide scratch, 17 rows, 272/256 buffers).  */\
1825 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1826 OPNAME ## pixels16_c(dst, src, stride, 16);\
1829 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1831 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1832 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1835 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1836 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1839 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1841 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1842 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1845 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1846 uint8_t full[24*17];\
1848 copy_block17(full, src, 24, stride, 17);\
1849 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1850 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1853 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[24*17];\
1855 copy_block17(full, src, 24, stride, 17);\
1856 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1859 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[24*17];\
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1864 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1866 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1867 uint8_t full[24*17];\
1868 uint8_t halfH[272];\
1869 uint8_t halfV[256];\
1870 uint8_t halfHV[256];\
1871 copy_block17(full, src, 24, stride, 17);\
1872 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1873 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1874 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1875 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1877 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1878 uint8_t full[24*17];\
1879 uint8_t halfH[272];\
1880 uint8_t halfHV[256];\
1881 copy_block17(full, src, 24, stride, 17);\
1882 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1883 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1884 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1885 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1887 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1888 uint8_t full[24*17];\
1889 uint8_t halfH[272];\
1890 uint8_t halfV[256];\
1891 uint8_t halfHV[256];\
1892 copy_block17(full, src, 24, stride, 17);\
1893 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1894 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1895 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1896 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1898 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[24*17];\
1900 uint8_t halfH[272];\
1901 uint8_t halfHV[256];\
1902 copy_block17(full, src, 24, stride, 17);\
1903 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1905 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1908 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909 uint8_t full[24*17];\
1910 uint8_t halfH[272];\
1911 uint8_t halfV[256];\
1912 uint8_t halfHV[256];\
1913 copy_block17(full, src, 24, stride, 17);\
1914 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[24*17];\
1921 uint8_t halfH[272];\
1922 uint8_t halfHV[256];\
1923 copy_block17(full, src, 24, stride, 17);\
1924 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1929 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[24*17];\
1931 uint8_t halfH[272];\
1932 uint8_t halfV[256];\
1933 uint8_t halfHV[256];\
1934 copy_block17(full, src, 24, stride, 17);\
1935 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1941 uint8_t full[24*17];\
1942 uint8_t halfH[272];\
1943 uint8_t halfHV[256];\
1944 copy_block17(full, src, 24, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1950 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1951 uint8_t halfH[272];\
1952 uint8_t halfHV[256];\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1957 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t halfH[272];\
1959 uint8_t halfHV[256];\
1960 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1964 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1965 uint8_t full[24*17];\
1966 uint8_t halfH[272];\
1967 uint8_t halfV[256];\
1968 uint8_t halfHV[256];\
1969 copy_block17(full, src, 24, stride, 17);\
1970 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1972 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1975 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1976 uint8_t full[24*17];\
1977 uint8_t halfH[272];\
1978 copy_block17(full, src, 24, stride, 17);\
1979 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1980 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1981 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1983 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfV[256];\
1987 uint8_t halfHV[256];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1991 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1994 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t full[24*17];\
1996 uint8_t halfH[272];\
1997 copy_block17(full, src, 24, stride, 17);\
1998 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1999 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2000 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2002 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2003 uint8_t halfH[272];\
2004 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store ops for the MPEG-4 quarter-pel filters.  'b' is a filter sum scaled
 * by 32, so (+16)>>5 rounds and (+15)>>5 is the no-rounding variant; 'cm'
 * (ff_cropTbl + MAX_NEG_CROP) clips to 0..255.  op_avg additionally averages
 * with the existing destination pixel. */
2008 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2009 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2010 #define op_put(a, b) a = cm[((b) + 16)>>5]
2011 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel MC function families: put, put_no_rnd and avg. */
2013 QPEL_MC(0, put_ , _ , op_put)
2014 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2015 QPEL_MC(0, avg_ , _ , op_avg)
2016 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* NOTE(review): extraction gap — the #undef lines for op_avg/op_put are not
 * visible in this chunk; only the *_no_rnd undefs appear.  Confirm against
 * the full file. */
2018 #undef op_avg_no_rnd
2020 #undef op_put_no_rnd
/* H264_LOWPASS: generates the H.264 quarter-pel interpolation helpers for
 * 2x2, 4x4, 8x8 and 16x16 blocks, each in a horizontal (_h_), vertical (_v_)
 * and combined (_hv_) variant.  All use the 6-tap (1,-5,20,20,-5,1) filter;
 * OP stores a sum scaled by 32 ((+16)>>5 in the op macros below), OP2 stores
 * the doubly-filtered hv sum scaled by 1024 ((+512)>>10).  The _hv_ variants
 * first filter horizontally into an int16_t 'tmp' plane, then vertically out
 * of it.  NOTE(review): extraction gaps — several continuation lines of this
 * macro (loop headers, closing braces, 'h' constants) are missing from this
 * chunk; the visible code is left byte-identical. */
2023 #define H264_LOWPASS(OPNAME, OP, OP2) \
2024 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2026 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2030 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2031 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2037 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2039 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2043 const int srcB= src[-2*srcStride];\
2044 const int srcA= src[-1*srcStride];\
2045 const int src0= src[0 *srcStride];\
2046 const int src1= src[1 *srcStride];\
2047 const int src2= src[2 *srcStride];\
2048 const int src3= src[3 *srcStride];\
2049 const int src4= src[4 *srcStride];\
2050 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2051 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2057 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2060 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2062 src -= 2*srcStride;\
2063 for(i=0; i<h+5; i++)\
2065 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2066 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2070 tmp -= tmpStride*(h+5-2);\
2073 const int tmpB= tmp[-2*tmpStride];\
2074 const int tmpA= tmp[-1*tmpStride];\
2075 const int tmp0= tmp[0 *tmpStride];\
2076 const int tmp1= tmp[1 *tmpStride];\
2077 const int tmp2= tmp[2 *tmpStride];\
2078 const int tmp3= tmp[3 *tmpStride];\
2079 const int tmp4= tmp[4 *tmpStride];\
2080 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2081 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* 4x4 variants */\
2086 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2088 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2092 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2093 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2094 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2095 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2101 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2103 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2107 const int srcB= src[-2*srcStride];\
2108 const int srcA= src[-1*srcStride];\
2109 const int src0= src[0 *srcStride];\
2110 const int src1= src[1 *srcStride];\
2111 const int src2= src[2 *srcStride];\
2112 const int src3= src[3 *srcStride];\
2113 const int src4= src[4 *srcStride];\
2114 const int src5= src[5 *srcStride];\
2115 const int src6= src[6 *srcStride];\
2116 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2117 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2118 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2119 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2125 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2128 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2130 src -= 2*srcStride;\
2131 for(i=0; i<h+5; i++)\
2133 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2134 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2135 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2136 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2140 tmp -= tmpStride*(h+5-2);\
2143 const int tmpB= tmp[-2*tmpStride];\
2144 const int tmpA= tmp[-1*tmpStride];\
2145 const int tmp0= tmp[0 *tmpStride];\
2146 const int tmp1= tmp[1 *tmpStride];\
2147 const int tmp2= tmp[2 *tmpStride];\
2148 const int tmp3= tmp[3 *tmpStride];\
2149 const int tmp4= tmp[4 *tmpStride];\
2150 const int tmp5= tmp[5 *tmpStride];\
2151 const int tmp6= tmp[6 *tmpStride];\
2152 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2153 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2154 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2155 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8x8 variants */\
2161 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2163 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2167 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2168 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2169 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2170 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2171 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2172 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2173 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2174 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2180 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2182 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2186 const int srcB= src[-2*srcStride];\
2187 const int srcA= src[-1*srcStride];\
2188 const int src0= src[0 *srcStride];\
2189 const int src1= src[1 *srcStride];\
2190 const int src2= src[2 *srcStride];\
2191 const int src3= src[3 *srcStride];\
2192 const int src4= src[4 *srcStride];\
2193 const int src5= src[5 *srcStride];\
2194 const int src6= src[6 *srcStride];\
2195 const int src7= src[7 *srcStride];\
2196 const int src8= src[8 *srcStride];\
2197 const int src9= src[9 *srcStride];\
2198 const int src10=src[10*srcStride];\
2199 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2200 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2201 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2202 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2203 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2204 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2205 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2206 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2212 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2215 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2217 src -= 2*srcStride;\
2218 for(i=0; i<h+5; i++)\
2220 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2221 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2222 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2223 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2224 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2225 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2226 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2227 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2231 tmp -= tmpStride*(h+5-2);\
2234 const int tmpB= tmp[-2*tmpStride];\
2235 const int tmpA= tmp[-1*tmpStride];\
2236 const int tmp0= tmp[0 *tmpStride];\
2237 const int tmp1= tmp[1 *tmpStride];\
2238 const int tmp2= tmp[2 *tmpStride];\
2239 const int tmp3= tmp[3 *tmpStride];\
2240 const int tmp4= tmp[4 *tmpStride];\
2241 const int tmp5= tmp[5 *tmpStride];\
2242 const int tmp6= tmp[6 *tmpStride];\
2243 const int tmp7= tmp[7 *tmpStride];\
2244 const int tmp8= tmp[8 *tmpStride];\
2245 const int tmp9= tmp[9 *tmpStride];\
2246 const int tmp10=tmp[10*tmpStride];\
2247 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2248 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2249 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2250 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2251 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2252 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2253 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2254 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16x16 variants: each is four 8x8 calls covering the four quadrants */\
2260 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2262 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2263 src += 8*srcStride;\
2264 dst += 8*dstStride;\
2265 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2266 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2269 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2270 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2271 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2272 src += 8*srcStride;\
2273 dst += 8*dstStride;\
2274 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2275 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2278 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2279 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2280 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2281 src += 8*srcStride;\
2282 dst += 8*dstStride;\
2283 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2284 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC: generates the 16 quarter-pel motion-compensation entry points
 * _mcXY_c for a given block SIZE, where X/Y are the quarter-pel fractional
 * offsets (0..3) in x and y.  Full-pel (mc00) is a plain copy; half-pel
 * positions run the _h_/_v_/_hv_ lowpass helpers; quarter-pel positions
 * average (pixelsN_l2) two neighbouring half-pel planes.  'full' holds a
 * copied SIZE x (SIZE+5) source block so the 6-tap vertical filter can read
 * two rows above and three below.  NOTE(review): extraction gaps — some
 * continuation lines of this macro (closing braces, and possibly further
 * cases after mc32) are missing from this chunk; code left byte-identical. */
2287 #define H264_MC(OPNAME, SIZE) \
2288 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2289 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2292 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2293 uint8_t half[SIZE*SIZE];\
2294 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2295 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2298 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2299 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2302 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2303 uint8_t half[SIZE*SIZE];\
2304 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2305 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2308 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2309 uint8_t full[SIZE*(SIZE+5)];\
2310 uint8_t * const full_mid= full + SIZE*2;\
2311 uint8_t half[SIZE*SIZE];\
2312 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2313 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2314 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2317 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2318 uint8_t full[SIZE*(SIZE+5)];\
2319 uint8_t * const full_mid= full + SIZE*2;\
2320 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2321 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2324 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2325 uint8_t full[SIZE*(SIZE+5)];\
2326 uint8_t * const full_mid= full + SIZE*2;\
2327 uint8_t half[SIZE*SIZE];\
2328 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2329 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2330 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2333 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2334 uint8_t full[SIZE*(SIZE+5)];\
2335 uint8_t * const full_mid= full + SIZE*2;\
2336 uint8_t halfH[SIZE*SIZE];\
2337 uint8_t halfV[SIZE*SIZE];\
2338 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2339 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2340 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2341 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2344 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2345 uint8_t full[SIZE*(SIZE+5)];\
2346 uint8_t * const full_mid= full + SIZE*2;\
2347 uint8_t halfH[SIZE*SIZE];\
2348 uint8_t halfV[SIZE*SIZE];\
2349 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2350 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2351 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2352 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2355 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2356 uint8_t full[SIZE*(SIZE+5)];\
2357 uint8_t * const full_mid= full + SIZE*2;\
2358 uint8_t halfH[SIZE*SIZE];\
2359 uint8_t halfV[SIZE*SIZE];\
2360 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2361 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2362 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2363 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2366 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2367 uint8_t full[SIZE*(SIZE+5)];\
2368 uint8_t * const full_mid= full + SIZE*2;\
2369 uint8_t halfH[SIZE*SIZE];\
2370 uint8_t halfV[SIZE*SIZE];\
2371 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2372 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2373 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2374 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2377 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2378 int16_t tmp[SIZE*(SIZE+5)];\
2379 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2382 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2383 int16_t tmp[SIZE*(SIZE+5)];\
2384 uint8_t halfH[SIZE*SIZE];\
2385 uint8_t halfHV[SIZE*SIZE];\
2386 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2387 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2388 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2391 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2392 int16_t tmp[SIZE*(SIZE+5)];\
2393 uint8_t halfH[SIZE*SIZE];\
2394 uint8_t halfHV[SIZE*SIZE];\
2395 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2396 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2397 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2400 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2401 uint8_t full[SIZE*(SIZE+5)];\
2402 uint8_t * const full_mid= full + SIZE*2;\
2403 int16_t tmp[SIZE*(SIZE+5)];\
2404 uint8_t halfV[SIZE*SIZE];\
2405 uint8_t halfHV[SIZE*SIZE];\
2406 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2407 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2408 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2409 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2412 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2413 uint8_t full[SIZE*(SIZE+5)];\
2414 uint8_t * const full_mid= full + SIZE*2;\
2415 int16_t tmp[SIZE*(SIZE+5)];\
2416 uint8_t halfV[SIZE*SIZE];\
2417 uint8_t halfHV[SIZE*SIZE];\
2418 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2419 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2420 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2421 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store ops for the H.264 lowpass filters: op_* handle single-pass sums
 * (scaled by 32, rounded by (+16)>>5); op2_* handle the double-filtered hv
 * sums (scaled by 1024, rounded by (+512)>>10).  'cm' clips to 0..255. */
2424 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2425 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2426 #define op_put(a, b) a = cm[((b) + 16)>>5]
2427 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2428 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate put_ and avg_ variants of all h264 qpel lowpass helpers. */
2430 H264_LOWPASS(put_ , op_put, op2_put)
2431 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 explicit weighted prediction.
 * op_scale1: in-place  block[x] = clip((block[x]*weight + offset) >> log2_denom)
 * op_scale2: bi-pred   dst[x] = clip((src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
 * H264_WEIGHT(W,H) generates weight/biweight functions for a WxH block; the
 * 'if(W==n) continue;' lines make the fixed unrolled body cover all widths.
 * NOTE(review): extraction gaps — the op_scale1/op_scale2 application lines
 * and closing braces of this macro are missing from this chunk; code left
 * byte-identical. */
2446 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2447 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2448 #define H264_WEIGHT(W,H) \
2449 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2451 offset <<= log2_denom; \
2452 if(log2_denom) offset += 1<<(log2_denom-1); \
2453 for(y=0; y<H; y++, block += stride){ \
2456 if(W==2) continue; \
2459 if(W==4) continue; \
2464 if(W==8) continue; \
2475 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2477 offset = ((offset + 1) | 1) << log2_denom; \
2478 for(y=0; y<H; y++, dst += stride, src += stride){ \
2481 if(W==2) continue; \
2484 if(W==4) continue; \
2489 if(W==8) continue; \
/* WMV2 mspel horizontal filter: 4-tap (-1,9,9,-1) with +8 rounding and >>4
 * normalization, clipped through 'cm'.  Produces one 8-wide row per pass.
 * NOTE(review): extraction gaps — the row loop header, dst/src stride
 * advance and closing braces are missing from this chunk. */
2516 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2517 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2521 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2522 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2523 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2524 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2525 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2526 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2527 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2528 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2534 #ifdef CONFIG_CAVS_DECODER
/* CAVS full-pel motion compensation: mc00 is a plain copy (put) or average
 * (avg) of an 8x8 or 16x16 block; real qpel cases live in cavsdsp. */
2536 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2538 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2539 put_pixels8_c(dst, src, stride, 8);
2541 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2542 avg_pixels8_c(dst, src, stride, 8);
2544 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2545 put_pixels16_c(dst, src, stride, 16);
2547 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2548 avg_pixels16_c(dst, src, stride, 16);
2550 #endif /* CONFIG_CAVS_DECODER */
2552 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1/WMV3 full-pel mspel: plain 8x8 copy; 'rnd' is unused at full-pel. */
2554 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2556 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2557 put_pixels8_c(dst, src, stride, 8);
2559 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
/* Forward declaration only; implementation lives in h264dspenc. */
2561 #if defined(CONFIG_H264_ENCODER)
2563 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2564 #endif /* CONFIG_H264_ENCODER */
/* WMV2 mspel vertical filter: same 4-tap (-1,9,9,-1)/16 kernel as the
 * horizontal variant, applied down each column; writes 8 rows per column.
 * NOTE(review): extraction gaps — the column loop header, pointer advance
 * and closing braces are missing from this chunk. */
2566 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2567 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2571 const int src_1= src[ -srcStride];
2572 const int src0 = src[0 ];
2573 const int src1 = src[ srcStride];
2574 const int src2 = src[2*srcStride];
2575 const int src3 = src[3*srcStride];
2576 const int src4 = src[4*srcStride];
2577 const int src5 = src[5*srcStride];
2578 const int src6 = src[6*srcStride];
2579 const int src7 = src[7*srcStride];
2580 const int src8 = src[8*srcStride];
2581 const int src9 = src[9*srcStride];
2582 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2583 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2584 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2585 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2586 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2587 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2588 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2589 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
/* mspel full-pel (0,0): plain 8x8 copy. */
2595 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2596 put_pixels8_c(dst, src, stride, 8);
/* mspel (1,0): average of source and the horizontally filtered plane.
 * NOTE(review): the 'half' buffer declaration is missing from this chunk. */
2599 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2601 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2602 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* mspel (2,0): horizontal half-pel, filter directly into dst. */
2605 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2606 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mspel (3,0): average of src+1 and the horizontally filtered plane.
 * NOTE(review): the 'half' buffer declaration is missing from this chunk. */
2609 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2611 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2612 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* mspel (0,2): vertical half-pel, filter directly into dst. */
2615 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2616 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mspel (1,2): average of the vertical half-pel plane and the h-then-v
 * filtered plane (halfH starts one row above src for the v-filter margin).
 * NOTE(review): halfH/halfV/halfHV buffer declarations are missing from
 * this chunk. */
2619 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2623 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2624 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2625 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2626 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (3,2): like mc12 but the vertical plane is taken from src+1.
 * NOTE(review): halfH/halfV/halfHV buffer declarations are missing from
 * this chunk. */
2628 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2632 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2633 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2634 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2635 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (2,2): horizontal then vertical filter straight into dst.
 * NOTE(review): the halfH buffer declaration is missing from this chunk. */
2637 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2639 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2640 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 vertical deblocking filter (filters across a horizontal block edge):
 * for each column x, p0..p3 are the two pixels above and below the edge.
 * 'd' measures the step across the edge; d1 is the standard up/down ramp
 * limited by 'strength' (from the qscale-indexed table), applied to p1/p2;
 * d2 additionally smooths the outer pair p0/p3.  The (p&256) checks clamp
 * p1/p2 to 0/255 after adding/subtracting d1 (branchless overflow clip).
 * NOTE(review): extraction gaps — the column loop header, the d1
 * application to p1/p2 and several braces are missing from this chunk. */
2643 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2644 if(ENABLE_ANY_H263) {
2646 const int strength= ff_h263_loop_filter_strength[qscale];
2650 int p0= src[x-2*stride];
2651 int p1= src[x-1*stride];
2652 int p2= src[x+0*stride];
2653 int p3= src[x+1*stride];
2654 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2656 if (d<-2*strength) d1= 0;
2657 else if(d<-  strength) d1=-2*strength - d;
2658 else if(d<   strength) d1= d;
2659 else if(d< 2*strength) d1= 2*strength - d;
2664 if(p1&256) p1= ~(p1>>31);
2665 if(p2&256) p2= ~(p2>>31);
2667 src[x-1*stride] = p1;
2668 src[x+0*stride] = p2;
2672 d2= av_clip((p0-p3)/4, -ad1, ad1);
2674 src[x-2*stride] = p0 - d2;
2675 src[x+  stride] = p3 + d2;
/* H.263 horizontal deblocking filter (filters across a vertical block edge):
 * transposed twin of h263_v_loop_filter_c — p0..p3 are the two pixels left
 * and right of the edge for each row y; same d/d1/d2 ramp logic.
 * NOTE(review): extraction gaps — the row loop header, the d1 application
 * to p1/p2 and several braces are missing from this chunk. */
2680 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2681 if(ENABLE_ANY_H263) {
2683 const int strength= ff_h263_loop_filter_strength[qscale];
2687 int p0= src[y*stride-2];
2688 int p1= src[y*stride-1];
2689 int p2= src[y*stride+0];
2690 int p3= src[y*stride+1];
2691 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2693 if (d<-2*strength) d1= 0;
2694 else if(d<-  strength) d1=-2*strength - d;
2695 else if(d<   strength) d1= d;
2696 else if(d< 2*strength) d1= 2*strength - d;
2701 if(p1&256) p1= ~(p1>>31);
2702 if(p2&256) p2= ~(p2>>31);
2704 src[y*stride-1] = p1;
2705 src[y*stride+0] = p2;
2709 d2= av_clip((p0-p3)/4, -ad1, ad1);
2711 src[y*stride-2] = p0 - d2;
2712 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via a
 * 'temp' plane — vertical pass into temp (edge rows copied scaled by 4),
 * then horizontal pass back into src (edge columns only re-rounded).
 * NOTE(review): extraction gaps — loop headers, the 'temp'/'xy'/'yz'
 * declarations and braces are missing from this chunk. */
2717 static void h261_loop_filter_c(uint8_t *src, int stride){
2722 temp[x      ] = 4*src[x           ];
2723 temp[x + 7*8] = 4*src[x + 7*stride];
2727 xy = y * stride + x;
2729 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2734 src[  y*stride] = (temp[  y*8] + 2)>>2;
2735 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2737 xy = y * stride + x;
2739 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (non-intra) luma deblocking for one 16-sample edge, split
 * into 4 groups of 4 lines, each with its own tc0[i] threshold.  For each
 * line: skip unless |p0-q0|<alpha and |p1-p0|,|q1-q0|<beta; optionally
 * adjust p1/q1 when the secondary |p2-p0|/|q2-q0|<beta tests pass (each
 * such adjustment is expected to widen tc — NOTE(review): the tc
 * initialization/increment lines are missing from this chunk); finally
 * shift p0/q0 toward each other by i_delta clipped to +-tc.
 * xstride steps across the edge, ystride along it. */
2744 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2747 for( i = 0; i < 4; i++ ) {
2752 for( d = 0; d < 4; d++ ) {
2753 const int p0 = pix[-1*xstride];
2754 const int p1 = pix[-2*xstride];
2755 const int p2 = pix[-3*xstride];
2756 const int q0 = pix[0];
2757 const int q1 = pix[1*xstride];
2758 const int q2 = pix[2*xstride];
2760 if( FFABS( p0 - q0 ) < alpha &&
2761 FFABS( p1 - p0 ) < beta &&
2762 FFABS( q1 - q0 ) < beta ) {
2767 if( FFABS( p2 - p0 ) < beta ) {
2768 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2771 if( FFABS( q2 - q0 ) < beta ) {
2772 pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2776 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2777 pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
2778 pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
/* Vertical luma deblock: edge is horizontal, so xstride=stride, ystride=1. */
2784 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2786 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
/* Horizontal luma deblock: edge is vertical, so xstride=1, ystride=stride. */
2788 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2790 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal chroma deblocking for one 8-sample edge: 4 groups of 2
 * lines, each using tc = tc0[i] (plus the usual chroma +1 — NOTE(review):
 * a tc adjustment line appears to be missing from this chunk; confirm).
 * Same alpha/beta gating as luma, but only p0/q0 are modified. */
2793 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2796 for( i = 0; i < 4; i++ ) {
2797 const int tc = tc0[i];
2802 for( d = 0; d < 2; d++ ) {
2803 const int p0 = pix[-1*xstride];
2804 const int p1 = pix[-2*xstride];
2805 const int q0 = pix[0];
2806 const int q1 = pix[1*xstride];
2808 if( FFABS( p0 - q0 ) < alpha &&
2809 FFABS( p1 - p0 ) < beta &&
2810 FFABS( q1 - q0 ) < beta ) {
2812 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2814 pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
2815 pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
/* Vertical chroma deblock: xstride=stride, ystride=1. */
2821 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2823 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
/* Horizontal chroma deblock: xstride=1, ystride=stride. */
2825 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2827 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 intra-strength chroma deblocking for one 8-sample edge: no tc0
 * clipping — when the alpha/beta gates pass, p0/q0 are replaced by the
 * fixed (2*p1+p0+q1+2)>>2 / (2*q1+q0+p1+2)>>2 averages. */
2830 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2833 for( d = 0; d < 8; d++ ) {
2834 const int p0 = pix[-1*xstride];
2835 const int p1 = pix[-2*xstride];
2836 const int q0 = pix[0];
2837 const int q1 = pix[1*xstride];
2839 if( FFABS( p0 - q0 ) < alpha &&
2840 FFABS( p1 - p0 ) < beta &&
2841 FFABS( q1 - q0 ) < beta ) {
2843 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
2844 pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
/* Vertical intra chroma deblock: xstride=stride, ystride=1. */
2849 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2851 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
/* Horizontal intra chroma deblock: xstride=1, ystride=stride. */
2853 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2855 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* SAD of a 16-wide block over h rows (unrolled across the row).
 * NOTE(review): extraction gaps — accumulator init, row loop, pointer
 * advance and the return are missing from this chunk. */
2858 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2864 s += abs(pix1[0] - pix2[0]);
2865 s += abs(pix1[1] - pix2[1]);
2866 s += abs(pix1[2] - pix2[2]);
2867 s += abs(pix1[3] - pix2[3]);
2868 s += abs(pix1[4] - pix2[4]);
2869 s += abs(pix1[5] - pix2[5]);
2870 s += abs(pix1[6] - pix2[6]);
2871 s += abs(pix1[7] - pix2[7]);
2872 s += abs(pix1[8] - pix2[8]);
2873 s += abs(pix1[9] - pix2[9]);
2874 s += abs(pix1[10] - pix2[10]);
2875 s += abs(pix1[11] - pix2[11]);
2876 s += abs(pix1[12] - pix2[12]);
2877 s += abs(pix1[13] - pix2[13]);
2878 s += abs(pix1[14] - pix2[14]);
2879 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against the horizontal half-pel interpolation of
 * pix2 (avg2 of each pixel and its right neighbour).  NOTE(review): loop
 * header, pointer advance and return are missing from this chunk. */
2886 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2892 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2893 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2894 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2895 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2896 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2897 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2898 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2899 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2900 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2901 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2902 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2903 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2904 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2905 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2906 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2907 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against the vertical half-pel interpolation of
 * pix2 (avg2 of each pixel and the one below, via pix3 = next row).
 * NOTE(review): loop header, pointer advance and return are missing. */
2914 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2917 uint8_t *pix3 = pix2 + line_size;
2921 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2922 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2923 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2924 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2925 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2926 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2927 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2928 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2929 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2930 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2931 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2932 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2933 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2934 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2935 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2936 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against the diagonal half-pel interpolation of
 * pix2 (avg4 of the 2x2 neighbourhood).  NOTE(review): loop header,
 * pointer advance and return are missing from this chunk. */
2944 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2947 uint8_t *pix3 = pix2 + line_size;
2951 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2952 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2953 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2954 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2955 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2956 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2957 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2958 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2959 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2960 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2961 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2962 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2963 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2964 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2965 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2966 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* SAD of an 8-wide block over h rows.  NOTE(review): accumulator init,
 * row loop, pointer advance and return are missing from this chunk. */
2974 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2980 s += abs(pix1[0] - pix2[0]);
2981 s += abs(pix1[1] - pix2[1]);
2982 s += abs(pix1[2] - pix2[2]);
2983 s += abs(pix1[3] - pix2[3]);
2984 s += abs(pix1[4] - pix2[4]);
2985 s += abs(pix1[5] - pix2[5]);
2986 s += abs(pix1[6] - pix2[6]);
2987 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against the horizontal half-pel interpolation of pix2.
 * NOTE(review): loop header, pointer advance and return are missing. */
2994 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3000 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3001 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3002 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3003 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3004 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3005 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3006 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3007 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* SAD of an 8-wide block against the vertical half-pel reference:
 * each reference sample is avg2() of vertically adjacent samples from the
 * current row (pix2) and the next row (pix3).
 * NOTE(review): sampled listing — loop/accumulator/return not visible. */
3014 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3017 uint8_t *pix3 = pix2 + line_size;
3021 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3022 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3023 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3024 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3025 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3026 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3027 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3028 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD of an 8-wide block against the diagonal (x+y) half-pel reference:
 * avg4() of the 2x2 neighbourhood across pix2/pix3. Reads one sample past
 * column 7 (pix2[8]/pix3[8]).
 * NOTE(review): sampled listing — loop/accumulator/return not visible. */
3036 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3039 uint8_t *pix3 = pix2 + line_size;
3043 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3044 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3045 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3046 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3047 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3048 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3049 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3050 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE for a 16-wide block: score1 is the plain SSE,
 * score2 accumulates the difference in local 2x2 gradients between s1 and
 * s2 (so pure noise that keeps texture costs less). The gradient term is
 * weighted by avctx->nsse_weight, or by 8 when no context is supplied.
 * NOTE(review): sampled listing — the outer loop over rows and the
 * declarations of x/score1/score2 are not visible below. */
3058 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3059 MpegEncContext *c = v;
/* plain sum of squared sample differences over the 16 columns of a row */
3065 for(x=0; x<16; x++){
3066 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
/* difference of horizontal+vertical second differences ("texture") */
3069 for(x=0; x<15; x++){
3070 score2+= FFABS( s1[x ] - s1[x +stride]
3071 - s1[x+1] + s1[x+1+stride])
3072 -FFABS( s2[x ] - s2[x +stride]
3073 - s2[x+1] + s2[x+1+stride]);
3080 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3081 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c: noise-preserving SSE (plain SSE plus a
 * weighted gradient-difference term). See nsse16_c for the idea.
 * NOTE(review): sampled listing — both column loops' headers and the
 * row loop are not visible here; only the loop bodies remain. */
3084 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3085 MpegEncContext *c = v;
3092 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3096 score2+= FFABS( s1[x ] - s1[x +stride]
3097 - s1[x+1] + s1[x+1+stride])
3098 -FFABS( s2[x ] - s2[x +stride]
3099 - s2[x+1] + s2[x+1+stride]);
3106 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3107 else return score1 + FFABS(score2)*8;
/* Evaluates the weighted squared error that would remain if the scaled
 * basis function were added to the residual: for each of the 64
 * coefficients, b = rem + round(basis*scale >> (BASIS_SHIFT-RECON_SHIFT)),
 * accumulated as (w*b)^2 >> 4.
 * NOTE(review): sampled listing — declaration of sum/w and the return of
 * the accumulated sum are not visible. */
3110 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3114 for(i=0; i<8*8; i++){
/* scaled basis contribution, rounded to RECON_SHIFT precision */
3115 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3118 assert(-512<b && b<512);
3120 sum += (w*b)*(w*b)>>4;
/* Adds the scaled 8x8 basis function into the residual in place, using
 * the same rounding/shift as try_8x8basis_c.
 * NOTE(review): sampled listing — declaration of i not visible. */
3125 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3128 for(i=0; i<8*8; i++){
3129 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3134 * permutes an 8x8 block.
3135 * @param block the block which will be permuted according to the given permutation vector
3136 * @param permutation the permutation vector
3137 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3138 * @param scantable the used scantable; this is only used to speed the permutation up, the block is not
3139 * (inverse) permuted to scantable order!
3141 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3147 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* NOTE(review): sampled listing — the pass that copies block[] into the
 * temp[] buffer (and zeroes block[]) between these two loops is not
 * visible; only the scatter-back pass survives below. */
3149 for(i=0; i<=last; i++){
3150 const int j= scantable[i];
/* scatter each coefficient to its permuted position */
3155 for(i=0; i<=last; i++){
3156 const int j= scantable[i];
3157 const int perm_j= permutation[j];
3158 block[perm_j]= temp[j];
/* Dummy compare function matching the me_cmp_func signature.
 * NOTE(review): sampled listing — its body (presumably "return 0;") is
 * not visible on this line. */
3162 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills the 5-entry cmp[] function-pointer array from the DSPContext
 * according to the requested comparison type.
 * NOTE(review): sampled listing — the switch over `type`, the loop over
 * the 5 entries, and several case labels are not visible; only isolated
 * assignments remain below. */
3166 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3169 memset(cmp, 0, sizeof(void*)*5);
3177 cmp[i]= c->hadamard8_diff[i];
3183 cmp[i]= c->dct_sad[i];
3186 cmp[i]= c->dct264_sad[i];
3189 cmp[i]= c->dct_max[i];
3192 cmp[i]= c->quant_psnr[i];
3212 #ifdef CONFIG_SNOW_ENCODER
/* unknown/unsupported type falls through to an error log */
3221 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Zeroes all six 64-coefficient DCT blocks of a macroblock. */
3227 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3229 static void clear_blocks_c(DCTELEM *blocks)
3231 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes; the main loop is unrolled by 8, and a
 * scalar tail loop (header not visible in this sampled listing) handles
 * the remaining w%8 bytes. Byte additions wrap modulo 256. */
3234 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3236 for(i=0; i+7<w; i+=8){
3237 dst[i+0] += src[i+0];
3238 dst[i+1] += src[i+1];
3239 dst[i+2] += src[i+2];
3240 dst[i+3] += src[i+3];
3241 dst[i+4] += src[i+4];
3242 dst[i+5] += src[i+5];
3243 dst[i+6] += src[i+6];
3244 dst[i+7] += src[i+7];
/* tail: one byte at a time */
3247 dst[i+0] += src[i+0];
/* dst[i] = src1[i] - src2[i] for w bytes (modulo-256 difference), used
 * e.g. for prediction residuals; unrolled by 8 with a scalar tail loop
 * whose header is not visible in this sampled listing. */
3250 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3252 for(i=0; i+7<w; i+=8){
3253 dst[i+0] = src1[i+0]-src2[i+0];
3254 dst[i+1] = src1[i+1]-src2[i+1];
3255 dst[i+2] = src1[i+2]-src2[i+2];
3256 dst[i+3] = src1[i+3]-src2[i+3];
3257 dst[i+4] = src1[i+4]-src2[i+4];
3258 dst[i+5] = src1[i+5]-src2[i+5];
3259 dst[i+6] = src1[i+6]-src2[i+6];
3260 dst[i+7] = src1[i+7]-src2[i+7];
/* tail: one byte at a time */
3263 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction subtraction: the predictor is the median of
 * left (l), top (src1[i]) and left+top-topleft (wrapped to a byte).
 * NOTE(review): sampled listing — the loop, the updates of l/lt and the
 * write-back of *left/*left_top are not visible here. */
3266 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3274 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard butterfly helpers used by hadamard8_*_c below:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does the same in place, and BUTTERFLYA yields
 * |x+y| + |x-y| without storing intermediates.
 * NOTE(review): sampled listing — the continuation bodies of BUTTERFLY2
 * and BUTTERFLY1 are not visible; no comments are inserted between the
 * backslash-continued lines to avoid corrupting the macro bodies. */
3284 #define BUTTERFLY2(o1,o2,i1,i2) \
3288 #define BUTTERFLY1(x,y) \
3297 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the difference (src - dst), summing the
 * absolute transformed coefficients. First pass: per-row horizontal
 * butterflies on the differences; second pass: vertical butterflies per
 * column, accumulating via BUTTERFLYA.
 * NOTE(review): sampled listing — the temp[64] declaration, the two loop
 * headers over i, the sum accumulation statement's lead-in and the final
 * return are not visible between the numbered lines. */
3299 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3307 //FIXME try pointer walks
/* horizontal pass, one row i: stage 1 on adjacent pairs */
3308 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3309 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3310 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3311 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* stage 2: distance-2 butterflies */
3313 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3314 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3315 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3316 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* stage 3: distance-4 butterflies */
3318 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3319 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3320 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3321 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass, one column i: stages 1 and 2 */
3325 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3326 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3327 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3328 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3330 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3331 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3332 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3333 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* final stage folded into the absolute-sum accumulation */
3336 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3337 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3338 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3339 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* debug print (appears to be inside a disabled/statistics branch) */
3345 printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but applied to the
 * source block alone (dummy is ignored); the DC term |temp[0]+temp[32]|
 * is subtracted at the end to remove the mean.
 * NOTE(review): sampled listing — temp[] declaration, loop headers, the
 * sum lead-in and the return are not visible between the numbered lines. */
3351 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3359 //FIXME try pointer walks
/* horizontal pass, one row i */
3360 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3361 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3362 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3363 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3365 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3366 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3367 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3368 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3370 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3371 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3372 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3373 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass, one column i */
3377 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3378 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3379 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3380 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3382 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3383 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3384 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3385 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3388 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3389 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3390 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3391 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3394 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCTs the 8x8 pixel difference and returns the
 * sum of absolute coefficients (via dsp.sum_abs_dctelem).
 * NOTE(review): sampled listing — the s->dsp.fdct(temp) call between
 * diff_pixels and sum_abs_dctelem is not visible here. */
3399 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3400 MpegEncContext * const s= (MpegEncContext *)c;
/* 16-byte aligned scratch buffer for 64 DCT coefficients */
3401 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3402 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3406 s->dsp.diff_pixels(temp, src1, src2, stride);
3408 return s->dsp.sum_abs_dctelem(temp);
/* Body of the 1-D 8-point H.264-style integer DCT butterfly (the
 * "#define DCT8_1D" header line is not visible in this sampled listing).
 * s* are pairwise sums, d* pairwise differences; a0..a7 combine them with
 * >>1 half-terms per the H.264 8x8 transform, and DST() emits the eight
 * outputs. Lines 3429 (DST(0,...)) and 3433 (DST(4,...)) are also missing
 * from the listing. No comments are inserted between the
 * backslash-continued lines to avoid corrupting the macro body. */
3413 const int s07 = SRC(0) + SRC(7);\
3414 const int s16 = SRC(1) + SRC(6);\
3415 const int s25 = SRC(2) + SRC(5);\
3416 const int s34 = SRC(3) + SRC(4);\
3417 const int a0 = s07 + s34;\
3418 const int a1 = s16 + s25;\
3419 const int a2 = s07 - s34;\
3420 const int a3 = s16 - s25;\
3421 const int d07 = SRC(0) - SRC(7);\
3422 const int d16 = SRC(1) - SRC(6);\
3423 const int d25 = SRC(2) - SRC(5);\
3424 const int d34 = SRC(3) - SRC(4);\
3425 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3426 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3427 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3428 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3430 DST(1, a4 + (a7>>2)) ;\
3431 DST(2, a2 + (a3>>1)) ;\
3432 DST(3, a5 + (a6>>2)) ;\
3434 DST(5, a6 - (a5>>2)) ;\
3435 DST(6, (a2>>1) - a3 ) ;\
3436 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: applies DCT8_1D to the rows and then to the
 * columns of the 8x8 pixel difference; the column pass redefines DST to
 * accumulate |v| directly into sum.
 * NOTE(review): sampled listing — the dct[8][8] declaration, the #undef
 * lines between the two passes and the final return are not visible. */
3439 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3440 MpegEncContext * const s= (MpegEncContext *)c;
3445 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* row pass: transform each row in place */
3447 #define SRC(x) dct[i][x]
3448 #define DST(x,v) dct[i][x]= v
3449 for( i = 0; i < 8; i++ )
/* column pass: transform each column, summing absolute outputs */
3454 #define SRC(x) dct[x][i]
3455 #define DST(x,v) sum += FFABS(v)
3456 for( i = 0; i < 8; i++ )
/* Returns the maximum absolute DCT coefficient of the 8x8 pixel
 * difference (an estimate of worst-case coefficient magnitude).
 * NOTE(review): sampled listing — the fdct call, the loop header over the
 * 64 coefficients, sum's initialisation and the return are not visible. */
3464 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3465 MpegEncContext * const s= (MpegEncContext *)c;
3466 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3467 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3472 s->dsp.diff_pixels(temp, src1, src2, stride);
3476 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCTs the pixel difference, keeps a copy,
 * quantizes + dequantizes + inverse-transforms it, and returns the SSE
 * between the round-tripped block and the original coefficients.
 * NOTE(review): sampled listing — the fdct call, sum/i declarations, the
 * comparison loop header and the return are not visible. */
3481 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3482 MpegEncContext * const s= (MpegEncContext *)c;
/* one aligned buffer holds both the working block and the backup copy */
3483 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3484 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3485 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3491 s->dsp.diff_pixels(temp, src1, src2, stride);
3493 memcpy(bak, temp, 64*sizeof(DCTELEM));
/* quantize/dequantize round trip to measure quantization error */
3495 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3496 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3497 simple_idct(temp); //FIXME
3500 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion cost of coding one 8x8 block: quantizes the DCT of the
 * difference, estimates the VLC bit cost of the quantized coefficients,
 * reconstructs the block and measures the SSE against src1, returning
 * distortion + lambda-scaled bits. ("distoration" is a historical typo
 * for "distortion" — left untouched in this doc-only pass.)
 * NOTE(review): sampled listing — the fdct call, the intra/inter branch
 * headers, run/level bookkeeping between the visible lines, and several
 * escape-cost branches are not visible. */
3505 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3506 MpegEncContext * const s= (MpegEncContext *)c;
3507 const uint8_t *scantable= s->intra_scantable.permutated;
3508 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
/* VLA-sized backup of the 8 reference rows (stride bytes each) */
3509 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3510 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3511 uint8_t * const bak= (uint8_t*)aligned_bak;
3512 int i, last, run, bits, level, distoration, start_i;
3513 const int esc_length= s->ac_esc_length;
3515 uint8_t * last_length;
/* save the 8 bytes of each reference row as two 32-bit words */
3520 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3521 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3524 s->dsp.diff_pixels(temp, src1, src2, stride);
3526 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: separate DC cost table */
3532 length = s->intra_ac_vlc_length;
3533 last_length= s->intra_ac_vlc_last_length;
3534 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
3537 length = s->inter_ac_vlc_length;
3538 last_length= s->inter_ac_vlc_last_length;
/* bit cost of all but the last nonzero coefficient */
3543 for(i=start_i; i<last; i++){
3544 int j= scantable[i];
/* small levels use the unified AC VLC table, large ones escape */
3549 if((level&(~127)) == 0){
3550 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* cost of the last nonzero coefficient (uses the "last" table) */
3559 level= temp[i] + 64;
3563 if((level&(~127)) == 0){
3564 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct and measure distortion against the source */
3572 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3574 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3577 s->dsp.idct_add(bak, stride, temp);
3579 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
/* lambda ~ qscale^2 * 109/128, rounded */
3581 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Pure rate metric: same quantize + VLC bit-count pipeline as rd8x8_c but
 * returns only the estimated bit cost (no reconstruction/distortion).
 * NOTE(review): sampled listing — the fdct call, intra/inter branch
 * headers, run/level bookkeeping and the final return of bits are not
 * visible between the numbered lines. */
3584 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3585 MpegEncContext * const s= (MpegEncContext *)c;
3586 const uint8_t *scantable= s->intra_scantable.permutated;
3587 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3588 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3589 int i, last, run, bits, level, start_i;
3590 const int esc_length= s->ac_esc_length;
3592 uint8_t * last_length;
3596 s->dsp.diff_pixels(temp, src1, src2, stride);
3598 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: separate DC cost table */
3604 length = s->intra_ac_vlc_length;
3605 last_length= s->intra_ac_vlc_last_length;
3606 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
3609 length = s->inter_ac_vlc_length;
3610 last_length= s->inter_ac_vlc_last_length;
3615 for(i=start_i; i<last; i++){
3616 int j= scantable[i];
3621 if((level&(~127)) == 0){
3622 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* last nonzero coefficient uses the "last" VLC table */
3631 level= temp[i] + 64;
3635 if((level&(~127)) == 0){
3636 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Intra vertical SAD: sums |s[x] - s[x+stride]| over a 16-wide block
 * (dummy is ignored) — a cheap measure of vertical activity.
 * NOTE(review): sampled listing — score init, the row loop over h-1 rows,
 * pointer stepping and the return are not visible. */
3644 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3649 for(x=0; x<16; x+=4){
3650 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3651 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* Inter vertical SAD: absolute difference of the vertical gradients of
 * s1 and s2 over 16 columns.
 * NOTE(review): sampled listing — score init, the row loop, pointer
 * stepping and the return are not visible. */
3659 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3664 for(x=0; x<16; x++){
3665 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ: squares its (parenthesized) argument; argument is evaluated twice,
 * so only side-effect-free expressions may be passed. */
3674 #define SQ(a) ((a)*(a))
/* Intra vertical SSE: squared vertical gradient over a 16-wide block.
 * NOTE(review): sampled listing — score init, row loop and return are
 * not visible. */
3675 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3680 for(x=0; x<16; x+=4){
3681 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3682 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Inter vertical SSE: squared difference of the vertical gradients of
 * s1 and s2 over 16 columns.
 * NOTE(review): sampled listing — score init, row loop and return are
 * not visible. */
3690 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3695 for(x=0; x<16; x++){
3696 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 array and an int16 array of
 * `size` elements.
 * NOTE(review): sampled listing — the second parameter line of the
 * signature, score's declaration/init and the return are not visible. */
3705 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3709 for(i=0; i<size; i++)
3710 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* WARPER8_16_SQ (historical misspelling of "WRAPPER") generates a 16x16
 * comparison function from each 8x8 one by summing the metric over the
 * four 8x8 quadrants. The #ifdef guards around the dct264 instantiation
 * (original lines 3717/3719) are not visible in this sampled listing. */
3714 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3715 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3716 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3718 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3720 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3721 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3722 WARPER8_16_SQ(rd8x8_c, rd16_c)
3723 WARPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise multiply dst[i] *= src[i] for len floats.
 * NOTE(review): sampled listing — the loop body statement and the int i
 * declaration are not visible for vector_fmul_c. */
3725 static void vector_fmul_c(float *dst, const float *src, int len){
3727 for(i=0; i<len; i++)
/* dst[i] = src0[i] * src1 read backwards; the "src1 += len-1" setup line
 * that makes src1[-i] index from the array's end is not visible in this
 * sampled listing — confirm against the full source. */
3731 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3734 for(i=0; i<len; i++)
3735 dst[i] = src0[i] * src1[-i];
/* Strided multiply-accumulate: dst[i*step] = src0[i]*src1[i] + src2[i]
 * + the constant bias src3 (an int, added as a float). */
3738 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3740 for(i=0; i<len; i++)
3741 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* Converts len floats to clamped int16 samples using an IEEE-754 bit
 * trick: the float's bits are read as int32 (NOTE(review): this pointer
 * cast violates strict aliasing; a memcpy-based read would be safer) and
 * compared against the 0x43c0ffff bias constant to saturate, then offset
 * by 0x8000. The branch lines between the visible statements (original
 * 3748/3752-3753) are not part of this sampled listing — the exact
 * clamping control flow should be confirmed against the full source. */
3744 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3746 for(i=0; i<len; i++) {
3747 int_fast32_t tmp = ((int32_t*)src)[i];
3749 tmp = (0x43c0ffff - tmp)>>31;
3750 // is this faster on some gcc/cpu combinations?
3751 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3754 dst[i] = tmp - 0x8000;
3758 /* XXX: these functions should be suppressed ASAP when all IDCTs are
/* Wrappers binding the jpeg-reference IDCT (and its 4/2/1-point lowres
 * variants) to the put/add pixel-store convention of the idct_put /
 * idct_add function pointers.
 * NOTE(review): sampled listing — each wrapper's j_rev_dct*() call and
 * braces (between the signature and the clamped-store line) are not
 * visible. */
3760 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3763 put_pixels_clamped_c(block, dest, line_size);
3765 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3768 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 lowres variants */
3771 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3774 put_pixels_clamped4_c(block, dest, line_size);
3776 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3779 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 lowres variants */
3782 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3785 put_pixels_clamped2_c(block, dest, line_size);
3787 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3790 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 (DC-only) variants: just the rounded, cropped DC term */
3793 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3795 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3797 dest[0] = cm[(block[0] + 4)>>3];
3799 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3801 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3803 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op stub used as the default prefetch implementation (see
 * c->prefetch assignment later in the file); all parameters ignored. */
3806 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3808 /* init static data */
/* One-time initialisation of the global lookup tables: the clamping table
 * ff_cropTbl (identity in [0,255], saturated below/above), the squared-
 * difference table ff_squareTbl, and the inverse zigzag map.
 * NOTE(review): sampled listing — the line zeroing the low MAX_NEG_CROP
 * entries of ff_cropTbl (original 3815) is not visible. */
3809 void dsputil_static_init(void)
3813 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3814 for(i=0;i<MAX_NEG_CROP;i++) {
3816 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3819 for(i=0;i<512;i++) {
3820 ff_squareTbl[i] = (i - 256) * (i - 256);
/* inverse zigzag, 1-based so 0 can mean "unset" */
3823 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verifies that the compiler honours 16-byte stack alignment (needed by
 * SIMD code); logs a one-time warning on MMX/AltiVec builds if not.
 * NOTE(review): sampled listing — the did_fail guard check, the #endif,
 * the did_fail update and the return value are not visible. */
3826 int ff_check_alignment(void){
3827 static int did_fail=0;
3828 DECLARE_ALIGNED_16(int, aligned);
/* low 4 address bits nonzero => stack variable was not 16-byte aligned */
3830 if((long)&aligned & 15){
3832 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3833 av_log(NULL, AV_LOG_ERROR,
3834 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3835 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3836 "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3845 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3849 ff_check_alignment();
3851 #ifdef CONFIG_ENCODERS
3852 if(avctx->dct_algo==FF_DCT_FASTINT) {
3853 c->fdct = fdct_ifast;
3854 c->fdct248 = fdct_ifast248;
3856 else if(avctx->dct_algo==FF_DCT_FAAN) {
3857 c->fdct = ff_faandct;
3858 c->fdct248 = ff_faandct248;
3861 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3862 c->fdct248 = ff_fdct248_islow;
3864 #endif //CONFIG_ENCODERS
3866 if(avctx->lowres==1){
3867 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3868 c->idct_put= ff_jref_idct4_put;
3869 c->idct_add= ff_jref_idct4_add;
3871 c->idct_put= ff_h264_lowres_idct_put_c;
3872 c->idct_add= ff_h264_lowres_idct_add_c;
3874 c->idct = j_rev_dct4;
3875 c->idct_permutation_type= FF_NO_IDCT_PERM;
3876 }else if(avctx->lowres==2){
3877 c->idct_put= ff_jref_idct2_put;
3878 c->idct_add= ff_jref_idct2_add;
3879 c->idct = j_rev_dct2;
3880 c->idct_permutation_type= FF_NO_IDCT_PERM;
3881 }else if(avctx->lowres==3){
3882 c->idct_put= ff_jref_idct1_put;
3883 c->idct_add= ff_jref_idct1_add;
3884 c->idct = j_rev_dct1;
3885 c->idct_permutation_type= FF_NO_IDCT_PERM;
3887 if(avctx->idct_algo==FF_IDCT_INT){
3888 c->idct_put= ff_jref_idct_put;
3889 c->idct_add= ff_jref_idct_add;
3890 c->idct = j_rev_dct;
3891 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3892 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
3893 avctx->idct_algo==FF_IDCT_VP3){
3894 c->idct_put= ff_vp3_idct_put_c;
3895 c->idct_add= ff_vp3_idct_add_c;
3896 c->idct = ff_vp3_idct_c;
3897 c->idct_permutation_type= FF_NO_IDCT_PERM;
3898 }else{ //accurate/default
3899 c->idct_put= simple_idct_put;
3900 c->idct_add= simple_idct_add;
3901 c->idct = simple_idct;
3902 c->idct_permutation_type= FF_NO_IDCT_PERM;
3906 if (ENABLE_H264_DECODER) {
3907 c->h264_idct_add= ff_h264_idct_add_c;
3908 c->h264_idct8_add= ff_h264_idct8_add_c;
3909 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3910 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3913 c->get_pixels = get_pixels_c;
3914 c->diff_pixels = diff_pixels_c;
3915 c->put_pixels_clamped = put_pixels_clamped_c;
3916 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3917 c->add_pixels_clamped = add_pixels_clamped_c;
3918 c->add_pixels8 = add_pixels8_c;
3919 c->add_pixels4 = add_pixels4_c;
3920 c->sum_abs_dctelem = sum_abs_dctelem_c;
3923 c->clear_blocks = clear_blocks_c;
3924 c->pix_sum = pix_sum_c;
3925 c->pix_norm1 = pix_norm1_c;
3927 /* TODO [0] 16 [1] 8 */
3928 c->pix_abs[0][0] = pix_abs16_c;
3929 c->pix_abs[0][1] = pix_abs16_x2_c;
3930 c->pix_abs[0][2] = pix_abs16_y2_c;
3931 c->pix_abs[0][3] = pix_abs16_xy2_c;
3932 c->pix_abs[1][0] = pix_abs8_c;
3933 c->pix_abs[1][1] = pix_abs8_x2_c;
3934 c->pix_abs[1][2] = pix_abs8_y2_c;
3935 c->pix_abs[1][3] = pix_abs8_xy2_c;
3937 #define dspfunc(PFX, IDX, NUM) \
3938 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3939 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3940 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3941 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3943 dspfunc(put, 0, 16);
3944 dspfunc(put_no_rnd, 0, 16);
3946 dspfunc(put_no_rnd, 1, 8);
3950 dspfunc(avg, 0, 16);
3951 dspfunc(avg_no_rnd, 0, 16);
3953 dspfunc(avg_no_rnd, 1, 8);
3958 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3959 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3961 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3962 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3963 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3964 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3965 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3966 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3967 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3968 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3969 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3971 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3972 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3973 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3974 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3975 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3976 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3977 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3978 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3979 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3981 #define dspfunc(PFX, IDX, NUM) \
3982 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3983 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3984 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3985 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3986 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3987 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3988 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3989 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3990 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3991 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3992 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3993 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3994 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3995 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3996 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3997 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3999 dspfunc(put_qpel, 0, 16);
4000 dspfunc(put_no_rnd_qpel, 0, 16);
4002 dspfunc(avg_qpel, 0, 16);
4003 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4005 dspfunc(put_qpel, 1, 8);
4006 dspfunc(put_no_rnd_qpel, 1, 8);
4008 dspfunc(avg_qpel, 1, 8);
4009 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4011 dspfunc(put_h264_qpel, 0, 16);
4012 dspfunc(put_h264_qpel, 1, 8);
4013 dspfunc(put_h264_qpel, 2, 4);
4014 dspfunc(put_h264_qpel, 3, 2);
4015 dspfunc(avg_h264_qpel, 0, 16);
4016 dspfunc(avg_h264_qpel, 1, 8);
4017 dspfunc(avg_h264_qpel, 2, 4);
4020 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4021 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4022 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4023 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4024 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4025 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4026 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4028 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4029 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4030 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4031 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4032 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4033 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4034 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4035 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4036 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4037 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4038 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4039 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4040 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4041 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4042 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4043 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4044 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4045 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4046 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4047 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4049 #ifdef CONFIG_CAVS_DECODER
4050 ff_cavsdsp_init(c,avctx);
4052 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4053 ff_vc1dsp_init(c,avctx);
4055 #if defined(CONFIG_H264_ENCODER)
4056 ff_h264dspenc_init(c,avctx);
4059 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4060 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4061 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4062 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4063 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4064 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4065 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4066 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4068 #define SET_CMP_FUNC(name) \
4069 c->name[0]= name ## 16_c;\
4070 c->name[1]= name ## 8x8_c;
4072 SET_CMP_FUNC(hadamard8_diff)
4073 c->hadamard8_diff[4]= hadamard8_intra16_c;
4074 SET_CMP_FUNC(dct_sad)
4075 SET_CMP_FUNC(dct_max)
4077 SET_CMP_FUNC(dct264_sad)
4079 c->sad[0]= pix_abs16_c;
4080 c->sad[1]= pix_abs8_c;
4084 SET_CMP_FUNC(quant_psnr)
4087 c->vsad[0]= vsad16_c;
4088 c->vsad[4]= vsad_intra16_c;
4089 c->vsse[0]= vsse16_c;
4090 c->vsse[4]= vsse_intra16_c;
4091 c->nsse[0]= nsse16_c;
4092 c->nsse[1]= nsse8_c;
4093 #ifdef CONFIG_SNOW_ENCODER
4094 c->w53[0]= w53_16_c;
4096 c->w97[0]= w97_16_c;
4100 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4102 c->add_bytes= add_bytes_c;
4103 c->diff_bytes= diff_bytes_c;
4104 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4105 c->bswap_buf= bswap_buf;
4107 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4108 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4109 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4110 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4111 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4112 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4113 c->h264_loop_filter_strength= NULL;
4115 if (ENABLE_ANY_H263) {
4116 c->h263_h_loop_filter= h263_h_loop_filter_c;
4117 c->h263_v_loop_filter= h263_v_loop_filter_c;
4120 c->h261_loop_filter= h261_loop_filter_c;
4122 c->try_8x8basis= try_8x8basis_c;
4123 c->add_8x8basis= add_8x8basis_c;
4125 #ifdef CONFIG_SNOW_DECODER
4126 c->vertical_compose97i = ff_snow_vertical_compose97i;
4127 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4128 c->inner_add_yblock = ff_snow_inner_add_yblock;
4131 #ifdef CONFIG_VORBIS_DECODER
4132 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4134 c->vector_fmul = vector_fmul_c;
4135 c->vector_fmul_reverse = vector_fmul_reverse_c;
4136 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4137 c->float_to_int16 = ff_float_to_int16_c;
4139 c->shrink[0]= ff_img_copy_plane;
4140 c->shrink[1]= ff_shrink22;
4141 c->shrink[2]= ff_shrink44;
4142 c->shrink[3]= ff_shrink88;
4144 c->prefetch= just_return;
4146 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4147 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4149 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4150 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4151 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4152 if (ENABLE_SPARC) dsputil_init_vis (c, avctx);
4153 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4154 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4155 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4156 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4157 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4159 for(i=0; i<64; i++){
4160 if(!c->put_2tap_qpel_pixels_tab[0][i])
4161 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4162 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4163 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4166 switch(c->idct_permutation_type){
4167 case FF_NO_IDCT_PERM:
4169 c->idct_permutation[i]= i;
4171 case FF_LIBMPEG2_IDCT_PERM:
4173 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4175 case FF_SIMPLE_IDCT_PERM:
4177 c->idct_permutation[i]= simple_mmx_permutation[i];
4179 case FF_TRANSPOSE_IDCT_PERM:
4181 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4183 case FF_PARTTRANS_IDCT_PERM:
4185 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4188 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");