3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
45 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
48 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
50 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
51 uint32_t ff_squareTbl[512] = {0, };
/* Classic 8x8 zigzag scan: maps scan position -> raster index in the block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for the 2-4-8 IDCT. NOTE that unlike the
   specification, we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
77 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
78 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate (horizontal-first) scan order for interlaced material. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate (vertical-first) scan order for interlaced material. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Fixed-point reciprocal table: a*ff_inverse[b] >> 32 == a/b
 * for all 0<=a<=65536 and 2<=b<=255 (entry b is ceil(2^32/b)). */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for simple_idct_mmx: maps raster coefficient index to
 * the position the MMX IDCT expects (a permutation of 0..63). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Sum the 256 pixel values of a 16x16 block whose rows are line_size apart. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        /* two unrolled groups of 8 per row */
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;   /* advance to the next row */
    }
    return s;
}
172 static int pix_norm1_c(uint8_t * pix, int line_size)
175 uint32_t *sq = ff_squareTbl + 256;
178 for (i = 0; i < 16; i++) {
179 for (j = 0; j < 16; j += 8) {
190 #if LONG_MAX > 2147483647
191 register uint64_t x=*(uint64_t*)pix;
193 s += sq[(x>>8)&0xff];
194 s += sq[(x>>16)&0xff];
195 s += sq[(x>>24)&0xff];
196 s += sq[(x>>32)&0xff];
197 s += sq[(x>>40)&0xff];
198 s += sq[(x>>48)&0xff];
199 s += sq[(x>>56)&0xff];
201 register uint32_t x=*(uint32_t*)pix;
203 s += sq[(x>>8)&0xff];
204 s += sq[(x>>16)&0xff];
205 s += sq[(x>>24)&0xff];
206 x=*(uint32_t*)(pix+4);
208 s += sq[(x>>8)&0xff];
209 s += sq[(x>>16)&0xff];
210 s += sq[(x>>24)&0xff];
215 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (8-way unrolled main loop,
 * scalar tail for the remaining w%8 words). */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
238 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
241 uint32_t *sq = ff_squareTbl + 256;
244 for (i = 0; i < h; i++) {
245 s += sq[pix1[0] - pix2[0]];
246 s += sq[pix1[1] - pix2[1]];
247 s += sq[pix1[2] - pix2[2]];
248 s += sq[pix1[3] - pix2[3]];
255 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
258 uint32_t *sq = ff_squareTbl + 256;
261 for (i = 0; i < h; i++) {
262 s += sq[pix1[0] - pix2[0]];
263 s += sq[pix1[1] - pix2[1]];
264 s += sq[pix1[2] - pix2[2]];
265 s += sq[pix1[3] - pix2[3]];
266 s += sq[pix1[4] - pix2[4]];
267 s += sq[pix1[5] - pix2[5]];
268 s += sq[pix1[6] - pix2[6]];
269 s += sq[pix1[7] - pix2[7]];
276 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
279 uint32_t *sq = ff_squareTbl + 256;
282 for (i = 0; i < h; i++) {
283 s += sq[pix1[ 0] - pix2[ 0]];
284 s += sq[pix1[ 1] - pix2[ 1]];
285 s += sq[pix1[ 2] - pix2[ 2]];
286 s += sq[pix1[ 3] - pix2[ 3]];
287 s += sq[pix1[ 4] - pix2[ 4]];
288 s += sq[pix1[ 5] - pix2[ 5]];
289 s += sq[pix1[ 6] - pix2[ 6]];
290 s += sq[pix1[ 7] - pix2[ 7]];
291 s += sq[pix1[ 8] - pix2[ 8]];
292 s += sq[pix1[ 9] - pix2[ 9]];
293 s += sq[pix1[10] - pix2[10]];
294 s += sq[pix1[11] - pix2[11]];
295 s += sq[pix1[12] - pix2[12]];
296 s += sq[pix1[13] - pix2[13]];
297 s += sq[pix1[14] - pix2[14]];
298 s += sq[pix1[15] - pix2[15]];
/* NOTE(review): snow wavelet distortion metric (w_c). This region of the
 * dump is missing interior lines (the embedded original line numbers jump:
 * scale[] rows, loop tails and the final accumulation are absent), so it is
 * kept byte-identical rather than reconstructed. The visible logic: copy
 * the pix1-pix2 difference (scaled <<4) into a 32-wide tmp buffer, run
 * ff_spatial_dwt on it, then weight each subband by scale[type][...] —
 * type selects 9/7 vs 5/3 wavelet, dec_count the decomposition depth. */
307 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
308 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
310 const int dec_count= w==8 ? 3 : 4;
313 static const int scale[2][2][4][4]={
317 {268, 239, 239, 213},
321 // 9/7 16x16 or 32x32 dec=4
322 {344, 310, 310, 280},
330 {275, 245, 245, 218},
334 // 5/3 16x16 or 32x32 dec=4
335 {352, 317, 317, 286},
343 for (i = 0; i < h; i++) {
344 for (j = 0; j < w; j+=4) {
345 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
346 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
347 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
348 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
354 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
358 for(level=0; level<dec_count; level++){
359 for(ori= level ? 1 : 0; ori<4; ori++){
360 int size= w>>(dec_count-level);
361 int sx= (ori&1) ? size : 0;
362 int stride= 32<<(dec_count-level);
363 int sy= (ori&2) ? stride>>1 : 0;
365 for(i=0; i<size; i++){
366 for(j=0; j<size; j++){
367 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Thin wrappers binding w_c to a fixed block width (8/16/32) and wavelet
 * type (1 = 5/3, 0 = 9/7). The 32-wide pair is non-static (external use). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
402 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
406 /* read the pixels */
408 block[0] = pixels[0];
409 block[1] = pixels[1];
410 block[2] = pixels[2];
411 block[3] = pixels[3];
412 block[4] = pixels[4];
413 block[5] = pixels[5];
414 block[6] = pixels[6];
415 block[7] = pixels[7];
421 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
422 const uint8_t *s2, int stride){
425 /* read the pixels */
427 block[0] = s1[0] - s2[0];
428 block[1] = s1[1] - s2[1];
429 block[2] = s1[2] - s2[2];
430 block[3] = s1[3] - s2[3];
431 block[4] = s1[4] - s2[4];
432 block[5] = s1[5] - s2[5];
433 block[6] = s1[6] - s2[6];
434 block[7] = s1[7] - s2[7];
442 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
446 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
448 /* read the pixels */
450 pixels[0] = cm[block[0]];
451 pixels[1] = cm[block[1]];
452 pixels[2] = cm[block[2]];
453 pixels[3] = cm[block[3]];
454 pixels[4] = cm[block[4]];
455 pixels[5] = cm[block[5]];
456 pixels[6] = cm[block[6]];
457 pixels[7] = cm[block[7]];
464 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
468 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
470 /* read the pixels */
472 pixels[0] = cm[block[0]];
473 pixels[1] = cm[block[1]];
474 pixels[2] = cm[block[2]];
475 pixels[3] = cm[block[3]];
482 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
486 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
488 /* read the pixels */
490 pixels[0] = cm[block[0]];
491 pixels[1] = cm[block[1]];
498 static void put_signed_pixels_clamped_c(const DCTELEM *block,
499 uint8_t *restrict pixels,
504 for (i = 0; i < 8; i++) {
505 for (j = 0; j < 8; j++) {
508 else if (*block > 127)
511 *pixels = (uint8_t)(*block + 128);
515 pixels += (line_size - 8);
519 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
523 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
525 /* read the pixels */
527 pixels[0] = cm[pixels[0] + block[0]];
528 pixels[1] = cm[pixels[1] + block[1]];
529 pixels[2] = cm[pixels[2] + block[2]];
530 pixels[3] = cm[pixels[3] + block[3]];
531 pixels[4] = cm[pixels[4] + block[4]];
532 pixels[5] = cm[pixels[5] + block[5]];
533 pixels[6] = cm[pixels[6] + block[6]];
534 pixels[7] = cm[pixels[7] + block[7]];
540 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
544 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
546 /* read the pixels */
548 pixels[0] = cm[pixels[0] + block[0]];
549 pixels[1] = cm[pixels[1] + block[1]];
550 pixels[2] = cm[pixels[2] + block[2]];
551 pixels[3] = cm[pixels[3] + block[3]];
557 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
561 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
563 /* read the pixels */
565 pixels[0] = cm[pixels[0] + block[0]];
566 pixels[1] = cm[pixels[1] + block[1]];
572 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
576 pixels[0] += block[0];
577 pixels[1] += block[1];
578 pixels[2] += block[2];
579 pixels[3] += block[3];
580 pixels[4] += block[4];
581 pixels[5] += block[5];
582 pixels[6] += block[6];
583 pixels[7] += block[7];
589 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
593 pixels[0] += block[0];
594 pixels[1] += block[1];
595 pixels[2] += block[2];
596 pixels[3] += block[3];
602 static int sum_abs_dctelem_c(DCTELEM *block)
606 sum+= FFABS(block[i]);
/* NOTE(review): 64-bit PIXOP2 template — generates put/avg pixel-copy and
 * half-pel averaging primitives over 8 bytes at a time (x2/y2/xy2 variants
 * use the masked parallel-average trick on packed bytes). This region of
 * the dump is missing interior lines (the embedded original line numbers
 * jump), so the macro body below is an incomplete fragment; kept
 * byte-identical rather than guessed at, since a comment inserted between
 * backslash-continued lines would change the macro. */
612 #define PIXOP2(OPNAME, OP) \
613 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
617 OP(*((uint64_t*)block), AV_RN64(pixels));\
623 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
627 const uint64_t a= AV_RN64(pixels );\
628 const uint64_t b= AV_RN64(pixels+1);\
629 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
635 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
639 const uint64_t a= AV_RN64(pixels );\
640 const uint64_t b= AV_RN64(pixels+1);\
641 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
647 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
651 const uint64_t a= AV_RN64(pixels );\
652 const uint64_t b= AV_RN64(pixels+line_size);\
653 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
659 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
663 const uint64_t a= AV_RN64(pixels );\
664 const uint64_t b= AV_RN64(pixels+line_size);\
665 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
671 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
674 const uint64_t a= AV_RN64(pixels );\
675 const uint64_t b= AV_RN64(pixels+1);\
676 uint64_t l0= (a&0x0303030303030303ULL)\
677 + (b&0x0303030303030303ULL)\
678 + 0x0202020202020202ULL;\
679 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
680 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
684 for(i=0; i<h; i+=2){\
685 uint64_t a= AV_RN64(pixels );\
686 uint64_t b= AV_RN64(pixels+1);\
687 l1= (a&0x0303030303030303ULL)\
688 + (b&0x0303030303030303ULL);\
689 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
690 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
691 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
694 a= AV_RN64(pixels );\
695 b= AV_RN64(pixels+1);\
696 l0= (a&0x0303030303030303ULL)\
697 + (b&0x0303030303030303ULL)\
698 + 0x0202020202020202ULL;\
699 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
700 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
701 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
707 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
710 const uint64_t a= AV_RN64(pixels );\
711 const uint64_t b= AV_RN64(pixels+1);\
712 uint64_t l0= (a&0x0303030303030303ULL)\
713 + (b&0x0303030303030303ULL)\
714 + 0x0101010101010101ULL;\
715 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
716 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
720 for(i=0; i<h; i+=2){\
721 uint64_t a= AV_RN64(pixels );\
722 uint64_t b= AV_RN64(pixels+1);\
723 l1= (a&0x0303030303030303ULL)\
724 + (b&0x0303030303030303ULL);\
725 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
726 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
727 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
730 a= AV_RN64(pixels );\
731 b= AV_RN64(pixels+1);\
732 l0= (a&0x0303030303030303ULL)\
733 + (b&0x0303030303030303ULL)\
734 + 0x0101010101010101ULL;\
735 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
736 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
737 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
743 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
744 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
745 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
746 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
747 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
748 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
749 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit rounding byte-wise average: (a|b) minus half of the differing bits. */
751 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
752 #else // 64 bit variant
/* NOTE(review): 32-bit PIXOP2 template — the default (non-64-bit) variant
 * generating put/avg pixel copy, l2/l4 multi-source averaging, and
 * x2/y2/xy2 half-pel primitives over 16/32-bit words, plus the op_avg /
 * op_put / avg2 / avg4 helper macros at the end. Interior lines are
 * missing from this dump (the embedded original line numbers jump), so
 * the fragment is kept byte-identical; inserting comments between
 * backslash-continued macro lines would alter the macro body. */
754 #define PIXOP2(OPNAME, OP) \
755 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
758 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
763 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
771 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
774 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
775 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
780 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
781 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
784 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
785 int src_stride1, int src_stride2, int h){\
789 a= AV_RN32(&src1[i*src_stride1 ]);\
790 b= AV_RN32(&src2[i*src_stride2 ]);\
791 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
792 a= AV_RN32(&src1[i*src_stride1+4]);\
793 b= AV_RN32(&src2[i*src_stride2+4]);\
794 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
798 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
799 int src_stride1, int src_stride2, int h){\
803 a= AV_RN32(&src1[i*src_stride1 ]);\
804 b= AV_RN32(&src2[i*src_stride2 ]);\
805 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
806 a= AV_RN32(&src1[i*src_stride1+4]);\
807 b= AV_RN32(&src2[i*src_stride2+4]);\
808 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
812 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
813 int src_stride1, int src_stride2, int h){\
817 a= AV_RN32(&src1[i*src_stride1 ]);\
818 b= AV_RN32(&src2[i*src_stride2 ]);\
819 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
823 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
824 int src_stride1, int src_stride2, int h){\
828 a= AV_RN16(&src1[i*src_stride1 ]);\
829 b= AV_RN16(&src2[i*src_stride2 ]);\
830 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
834 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
835 int src_stride1, int src_stride2, int h){\
836 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
837 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
840 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
841 int src_stride1, int src_stride2, int h){\
842 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
843 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
846 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
847 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
850 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
851 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
854 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
855 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
858 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
859 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
862 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
863 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
866 uint32_t a, b, c, d, l0, l1, h0, h1;\
867 a= AV_RN32(&src1[i*src_stride1]);\
868 b= AV_RN32(&src2[i*src_stride2]);\
869 c= AV_RN32(&src3[i*src_stride3]);\
870 d= AV_RN32(&src4[i*src_stride4]);\
871 l0= (a&0x03030303UL)\
874 h0= ((a&0xFCFCFCFCUL)>>2)\
875 + ((b&0xFCFCFCFCUL)>>2);\
876 l1= (c&0x03030303UL)\
878 h1= ((c&0xFCFCFCFCUL)>>2)\
879 + ((d&0xFCFCFCFCUL)>>2);\
880 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
881 a= AV_RN32(&src1[i*src_stride1+4]);\
882 b= AV_RN32(&src2[i*src_stride2+4]);\
883 c= AV_RN32(&src3[i*src_stride3+4]);\
884 d= AV_RN32(&src4[i*src_stride4+4]);\
885 l0= (a&0x03030303UL)\
888 h0= ((a&0xFCFCFCFCUL)>>2)\
889 + ((b&0xFCFCFCFCUL)>>2);\
890 l1= (c&0x03030303UL)\
892 h1= ((c&0xFCFCFCFCUL)>>2)\
893 + ((d&0xFCFCFCFCUL)>>2);\
894 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
898 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
899 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
902 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
903 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
906 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
907 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
910 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
911 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
914 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
915 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
918 uint32_t a, b, c, d, l0, l1, h0, h1;\
919 a= AV_RN32(&src1[i*src_stride1]);\
920 b= AV_RN32(&src2[i*src_stride2]);\
921 c= AV_RN32(&src3[i*src_stride3]);\
922 d= AV_RN32(&src4[i*src_stride4]);\
923 l0= (a&0x03030303UL)\
926 h0= ((a&0xFCFCFCFCUL)>>2)\
927 + ((b&0xFCFCFCFCUL)>>2);\
928 l1= (c&0x03030303UL)\
930 h1= ((c&0xFCFCFCFCUL)>>2)\
931 + ((d&0xFCFCFCFCUL)>>2);\
932 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
933 a= AV_RN32(&src1[i*src_stride1+4]);\
934 b= AV_RN32(&src2[i*src_stride2+4]);\
935 c= AV_RN32(&src3[i*src_stride3+4]);\
936 d= AV_RN32(&src4[i*src_stride4+4]);\
937 l0= (a&0x03030303UL)\
940 h0= ((a&0xFCFCFCFCUL)>>2)\
941 + ((b&0xFCFCFCFCUL)>>2);\
942 l1= (c&0x03030303UL)\
944 h1= ((c&0xFCFCFCFCUL)>>2)\
945 + ((d&0xFCFCFCFCUL)>>2);\
946 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
949 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
950 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
951 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
952 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
954 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
955 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
956 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
957 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
960 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
962 int i, a0, b0, a1, b1;\
969 for(i=0; i<h; i+=2){\
975 block[0]= (a1+a0)>>2; /* FIXME non put */\
976 block[1]= (b1+b0)>>2;\
986 block[0]= (a1+a0)>>2;\
987 block[1]= (b1+b0)>>2;\
993 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
996 const uint32_t a= AV_RN32(pixels );\
997 const uint32_t b= AV_RN32(pixels+1);\
998 uint32_t l0= (a&0x03030303UL)\
1001 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1002 + ((b&0xFCFCFCFCUL)>>2);\
1006 for(i=0; i<h; i+=2){\
1007 uint32_t a= AV_RN32(pixels );\
1008 uint32_t b= AV_RN32(pixels+1);\
1009 l1= (a&0x03030303UL)\
1010 + (b&0x03030303UL);\
1011 h1= ((a&0xFCFCFCFCUL)>>2)\
1012 + ((b&0xFCFCFCFCUL)>>2);\
1013 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1016 a= AV_RN32(pixels );\
1017 b= AV_RN32(pixels+1);\
1018 l0= (a&0x03030303UL)\
1021 h0= ((a&0xFCFCFCFCUL)>>2)\
1022 + ((b&0xFCFCFCFCUL)>>2);\
1023 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1029 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1032 for(j=0; j<2; j++){\
1034 const uint32_t a= AV_RN32(pixels );\
1035 const uint32_t b= AV_RN32(pixels+1);\
1036 uint32_t l0= (a&0x03030303UL)\
1039 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1040 + ((b&0xFCFCFCFCUL)>>2);\
1044 for(i=0; i<h; i+=2){\
1045 uint32_t a= AV_RN32(pixels );\
1046 uint32_t b= AV_RN32(pixels+1);\
1047 l1= (a&0x03030303UL)\
1048 + (b&0x03030303UL);\
1049 h1= ((a&0xFCFCFCFCUL)>>2)\
1050 + ((b&0xFCFCFCFCUL)>>2);\
1051 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1054 a= AV_RN32(pixels );\
1055 b= AV_RN32(pixels+1);\
1056 l0= (a&0x03030303UL)\
1059 h0= ((a&0xFCFCFCFCUL)>>2)\
1060 + ((b&0xFCFCFCFCUL)>>2);\
1061 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1065 pixels+=4-line_size*(h+1);\
1066 block +=4-line_size*h;\
1070 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1073 for(j=0; j<2; j++){\
1075 const uint32_t a= AV_RN32(pixels );\
1076 const uint32_t b= AV_RN32(pixels+1);\
1077 uint32_t l0= (a&0x03030303UL)\
1080 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1081 + ((b&0xFCFCFCFCUL)>>2);\
1085 for(i=0; i<h; i+=2){\
1086 uint32_t a= AV_RN32(pixels );\
1087 uint32_t b= AV_RN32(pixels+1);\
1088 l1= (a&0x03030303UL)\
1089 + (b&0x03030303UL);\
1090 h1= ((a&0xFCFCFCFCUL)>>2)\
1091 + ((b&0xFCFCFCFCUL)>>2);\
1092 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1095 a= AV_RN32(pixels );\
1096 b= AV_RN32(pixels+1);\
1097 l0= (a&0x03030303UL)\
1100 h0= ((a&0xFCFCFCFCUL)>>2)\
1101 + ((b&0xFCFCFCFCUL)>>2);\
1102 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1106 pixels+=4-line_size*(h+1);\
1107 block +=4-line_size*h;\
1111 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1112 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1113 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1114 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1115 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1116 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1117 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1118 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1120 #define op_avg(a, b) a = rnd_avg32(a, b)
1122 #define op_put(a, b) a = b
1129 #define avg2(a,b) ((a+b+1)>>1)
1130 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Equal-stride wrappers around the generated no-rounding l2 averagers. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/* GMC with one motion vector: bilinear interpolation of an 8-wide block
 * with 1/16-pel weights x16, y16 in [0,16]; the four corner weights sum
 * to 256, so (sum + rounder) >> 8 yields the interpolated pixel. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* Global motion compensation with an affine vector field: for each pixel of
 * an 8-wide, h-high block, the source position is (ox,oy) advanced by
 * (dxx,dyx) per column and (dxy,dyy) per row, in 1/(1<<shift) pel units.
 * Positions fully inside the image are bilinearly interpolated; positions
 * past an edge fall back to edge-clamped 1-D interpolation or plain
 * clamped fetch. r is the rounding constant. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;    /* clamp limits are inclusive */
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* outside in both directions: clamped nearest sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* put_tpel_pixels_mc00_c: integer-position copy for third-pel MC;
 * dispatches on block width to the plain pixel-copy helpers.
 * (the switch(width) line itself is elided in this listing) */
1221 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1223 case 2: put_pixels2_c (dst, src, stride, height); break;
1224 case 4: put_pixels4_c (dst, src, stride, height); break;
1225 case 8: put_pixels8_c (dst, src, stride, height); break;
1226 case 16:put_pixels16_c(dst, src, stride, height); break;
/* put_tpel_pixels_mc10_c: horizontal 1/3-pel interpolation.
 * 683 ~= 2048/3, so (683*(2a+b+1))>>11 ~= round((2a+b)/3): the nearer
 * sample gets weight 2.  (i/j declarations and the per-row dst/src
 * advances are elided in this listing) */
1230 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1232 for (i=0; i < height; i++) {
1233 for (j=0; j < width; j++) {
1234 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* put_tpel_pixels_mc20_c: horizontal 2/3-pel interpolation — mirror of
 * mc10 with the weight 2 on the right neighbour (683 ~= 2048/3).
 * (loop variable declarations and pointer advances elided here) */
1241 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1243 for (i=0; i < height; i++) {
1244 for (j=0; j < width; j++) {
1245 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* put_tpel_pixels_mc01_c: vertical 1/3-pel interpolation; weight 2 on
 * the upper sample, using the row below via src[j+stride].
 * (loop variable declarations and pointer advances elided here) */
1252 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1254 for (i=0; i < height; i++) {
1255 for (j=0; j < width; j++) {
1256 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* put_tpel_pixels_mc11_c: diagonal (1/3, 1/3) interpolation.
 * Weights 4/3/3/2 sum to 12 and 2731 ~= 32768/12, so the >>15 divides
 * the rounded weighted sum by 12.
 * (loop variable declarations and pointer advances elided here) */
1263 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1265 for (i=0; i < height; i++) {
1266 for (j=0; j < width; j++) {
1267 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* put_tpel_pixels_mc12_c: (1/3, 2/3) interpolation; weights 3/2/4/3
 * sum to 12, scaled by 2731 ~= 32768/12 and shifted down by 15.
 * (loop variable declarations and pointer advances elided here) */
1274 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1276 for (i=0; i < height; i++) {
1277 for (j=0; j < width; j++) {
1278 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* put_tpel_pixels_mc02_c: vertical 2/3-pel interpolation; weight 2 on
 * the lower sample (683 ~= 2048/3).
 * (loop variable declarations and pointer advances elided here) */
1285 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1287 for (i=0; i < height; i++) {
1288 for (j=0; j < width; j++) {
1289 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* put_tpel_pixels_mc21_c: (2/3, 1/3) interpolation; weights 3/4/2/3
 * sum to 12, scaled by 2731 ~= 32768/12 and shifted down by 15.
 * (loop variable declarations and pointer advances elided here) */
1296 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1298 for (i=0; i < height; i++) {
1299 for (j=0; j < width; j++) {
1300 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* put_tpel_pixels_mc22_c: diagonal (2/3, 2/3) interpolation; weights
 * 2/3/3/4 sum to 12, scaled by 2731 ~= 32768/12, shifted down by 15.
 * (loop variable declarations and pointer advances elided here) */
1307 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1309 for (i=0; i < height; i++) {
1310 for (j=0; j < width; j++) {
1311 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* avg_tpel_pixels_mc00_c: integer-position averaging variant of mc00;
 * dispatches on width to the averaging pixel helpers.
 * (the switch(width) line itself is elided in this listing) */
1318 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1320 case 2: avg_pixels2_c (dst, src, stride, height); break;
1321 case 4: avg_pixels4_c (dst, src, stride, height); break;
1322 case 8: avg_pixels8_c (dst, src, stride, height); break;
1323 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* avg_tpel_pixels_mc10_c: same 1/3-pel horizontal filter as the put
 * variant, then averaged with the existing dst value with rounding.
 * (loop variable declarations and pointer advances elided here) */
1327 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1329 for (i=0; i < height; i++) {
1330 for (j=0; j < width; j++) {
1331 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* avg_tpel_pixels_mc20_c: 2/3-pel horizontal filter averaged with the
 * existing dst value with rounding.
 * (loop variable declarations and pointer advances elided here) */
1338 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1340 for (i=0; i < height; i++) {
1341 for (j=0; j < width; j++) {
1342 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* avg_tpel_pixels_mc01_c: 1/3-pel vertical filter averaged with the
 * existing dst value with rounding.
 * (loop variable declarations and pointer advances elided here) */
1349 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1351 for (i=0; i < height; i++) {
1352 for (j=0; j < width; j++) {
1353 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* avg_tpel_pixels_mc11_c: (1/3, 1/3) diagonal filter (weights 4/3/3/2,
 * 2731 ~= 32768/12) averaged with the existing dst value with rounding.
 * (loop variable declarations and pointer advances elided here) */
1360 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1362 for (i=0; i < height; i++) {
1363 for (j=0; j < width; j++) {
1364 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* avg_tpel_pixels_mc12_c: (1/3, 2/3) filter (weights 3/2/4/3) averaged
 * with the existing dst value with rounding.
 * (loop variable declarations and pointer advances elided here) */
1371 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1373 for (i=0; i < height; i++) {
1374 for (j=0; j < width; j++) {
1375 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* avg_tpel_pixels_mc02_c: 2/3-pel vertical filter averaged with the
 * existing dst value with rounding.
 * (loop variable declarations and pointer advances elided here) */
1382 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1384 for (i=0; i < height; i++) {
1385 for (j=0; j < width; j++) {
1386 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* avg_tpel_pixels_mc21_c: (2/3, 1/3) filter (weights 3/4/2/3) averaged
 * with the existing dst value with rounding.
 * (loop variable declarations and pointer advances elided here) */
1393 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1395 for (i=0; i < height; i++) {
1396 for (j=0; j < width; j++) {
1397 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* avg_tpel_pixels_mc22_c: (2/3, 2/3) diagonal filter (weights 2/3/3/4)
 * averaged with the existing dst value with rounding.
 * (loop variable declarations and pointer advances elided here) */
1404 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1406 for (i=0; i < height; i++) {
1407 for (j=0; j < width; j++) {
1408 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* TPEL_WIDTH: stamps out fixed-width wrappers (put_tpel_pixelsW_mcXY_c)
 * that forward to the generic width-parameterized tpel functions above.
 * NOTE(review): the `void` token before each forwarding call looks like
 * a leftover — as written these lines are declarations, not calls. Since
 * the macro bodies are only checked when the macro is expanded, this is
 * harmless if the macro is unused; confirm before touching it. */
1415 #define TPEL_WIDTH(width)\
1416 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1417 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1418 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1419 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1420 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1421 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1422 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1423 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1424 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1425 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1426 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1427 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1428 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1429 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1430 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1431 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1432 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1433 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC: generates 2/4/8-wide eighth-pel bilinear chroma MC
 * functions.  A..D are the bilinear weights from the 1/8-pel fractional
 * position (x, y); they sum to 64, and the OP macro supplied by the
 * instantiation performs the +32 >> 6 normalization (with optional
 * averaging).  The second loop in each function is the 2-tap fast path
 * taken when one fractional component is zero: E is the single nonzero
 * cross weight and `step` selects row (vertical) vs. column (horizontal)
 * direction.
 * NOTE(review): the `if(D)`/`else` split, the `int i` and E
 * declarations, and the dst/src pointer advances are elided in this
 * listing. */
1436 #define H264_CHROMA_MC(OPNAME, OP)\
1437 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1438 const int A=(8-x)*(8-y);\
1439 const int B=( x)*(8-y);\
1440 const int C=(8-x)*( y);\
1441 const int D=( x)*( y);\
1444 assert(x<8 && y<8 && x>=0 && y>=0);\
/* full 2-D bilinear path (both fractional components nonzero) */\
1447 for(i=0; i<h; i++){\
1448 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1449 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 1-D fast path: step walks down a column or along a row */\
1455 const int step= C ? stride : 1;\
1456 for(i=0; i<h; i++){\
1457 OP(dst[0], (A*src[0] + E*src[step+0]));\
1458 OP(dst[1], (A*src[1] + E*src[step+1]));\
1465 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1466 const int A=(8-x)*(8-y);\
1467 const int B=( x)*(8-y);\
1468 const int C=(8-x)*( y);\
1469 const int D=( x)*( y);\
1472 assert(x<8 && y<8 && x>=0 && y>=0);\
1475 for(i=0; i<h; i++){\
1476 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1477 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1478 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1479 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1485 const int step= C ? stride : 1;\
1486 for(i=0; i<h; i++){\
1487 OP(dst[0], (A*src[0] + E*src[step+0]));\
1488 OP(dst[1], (A*src[1] + E*src[step+1]));\
1489 OP(dst[2], (A*src[2] + E*src[step+2]));\
1490 OP(dst[3], (A*src[3] + E*src[step+3]));\
1497 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1498 const int A=(8-x)*(8-y);\
1499 const int B=( x)*(8-y);\
1500 const int C=(8-x)*( y);\
1501 const int D=( x)*( y);\
1504 assert(x<8 && y<8 && x>=0 && y>=0);\
1507 for(i=0; i<h; i++){\
1508 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1509 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1510 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1511 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1512 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1513 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1514 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1515 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1521 const int step= C ? stride : 1;\
1522 for(i=0; i<h; i++){\
1523 OP(dst[0], (A*src[0] + E*src[step+0]));\
1524 OP(dst[1], (A*src[1] + E*src[step+1]));\
1525 OP(dst[2], (A*src[2] + E*src[step+2]));\
1526 OP(dst[3], (A*src[3] + E*src[step+3]));\
1527 OP(dst[4], (A*src[4] + E*src[step+4]));\
1528 OP(dst[5], (A*src[5] + E*src[step+5]));\
1529 OP(dst[6], (A*src[6] + E*src[step+6]));\
1530 OP(dst[7], (A*src[7] + E*src[step+7]));\
/* OP plug-ins for H264_CHROMA_MC: the bilinear weights sum to 64, so
 * op_put normalizes with +32 >> 6; op_avg additionally averages the
 * result with the existing dst value with rounding. */
1537 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1538 #define op_put(a, b) a = (((b) + 32)>>6)
1540 H264_CHROMA_MC(put_ , op_put)
1541 H264_CHROMA_MC(avg_ , op_avg)
/* put_no_rnd_h264_chroma_mc8_c: 8-wide eighth-pel bilinear chroma MC,
 * no-rounding variant.  Same A..D weights (sum 64) as the macro-built
 * versions, but the rounding constant is reduced (32 - 4 = 28) to bias
 * the result downward for the no-rnd mode.
 * NOTE(review): the h loop, the dst/src advances and the closing braces
 * are elided in this listing. */
1545 static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1546 const int A=(8-x)*(8-y);
1547 const int B=( x)*(8-y);
1548 const int C=(8-x)*( y);
1549 const int D=( x)*( y);
1552 assert(x<8 && y<8 && x>=0 && y>=0);
1556 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1557 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1558 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1559 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1560 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1561 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1562 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1563 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/* QPEL_MC(r, OPNAME, RND, OP): generates the complete family of MPEG-4
 * quarter-pel MC functions for one store policy (put / put_no_rnd /
 * avg).  cm is the clipping table centered at ff_cropTbl+MAX_NEG_CROP.
 *
 * qpel8_h_lowpass: horizontal filter with taps
 * (-1, 3, -6, 20, 20, -6, 3, -1); the taps sum to 32, normalized by the
 * OP macro.  Near the block edges the outer tap indices are folded back
 * inside 0..8 (e.g. src[8] reused) instead of reading outside the row.
 * NOTE(review): the `int i` declaration, the h loop header, the per-row
 * pointer advances and closing braces are elided in this listing. */
1569 #define QPEL_MC(r, OPNAME, RND, OP) \
1570 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1571 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1575 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1576 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1577 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1578 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1579 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1580 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1581 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1582 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* qpel8_v_lowpass: vertical counterpart of qpel8_h_lowpass.  One column
 * of 9 input samples is loaded into src0..src8, then the same
 * (-1,3,-6,20,20,-6,3,-1) filter is applied with the outer tap indices
 * folded back inside 0..8 near the top/bottom edges.
 * NOTE(review): the column loop header, the per-column dst/src advances
 * and closing braces are elided in this listing. */
1588 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1590 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1594 const int src0= src[0*srcStride];\
1595 const int src1= src[1*srcStride];\
1596 const int src2= src[2*srcStride];\
1597 const int src3= src[3*srcStride];\
1598 const int src4= src[4*srcStride];\
1599 const int src5= src[5*srcStride];\
1600 const int src6= src[6*srcStride];\
1601 const int src7= src[7*srcStride];\
1602 const int src8= src[8*srcStride];\
1603 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1604 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1605 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1606 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1607 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1608 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1609 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1610 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* qpel16_h_lowpass: 16-wide version of the horizontal
 * (-1,3,-6,20,20,-6,3,-1) filter; outer tap indices fold back inside
 * 0..16 at both row ends.
 * NOTE(review): loop header, pointer advances and closing braces are
 * elided in this listing. */
1616 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1617 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1622 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1623 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1624 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1625 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1626 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1627 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1628 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1629 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1630 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1631 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1632 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1633 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1634 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1635 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1636 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1637 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* qpel16_v_lowpass: 16-tall vertical version; one column of 17 samples
 * is loaded into src0..src16, then the (-1,3,-6,20,20,-6,3,-1) filter is
 * applied with the outer tap indices folded back inside 0..16 at the
 * top/bottom edges.
 * NOTE(review): column loop header, pointer advances and closing braces
 * are elided in this listing. */
1643 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1644 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1649 const int src0= src[0*srcStride];\
1650 const int src1= src[1*srcStride];\
1651 const int src2= src[2*srcStride];\
1652 const int src3= src[3*srcStride];\
1653 const int src4= src[4*srcStride];\
1654 const int src5= src[5*srcStride];\
1655 const int src6= src[6*srcStride];\
1656 const int src7= src[7*srcStride];\
1657 const int src8= src[8*srcStride];\
1658 const int src9= src[9*srcStride];\
1659 const int src10= src[10*srcStride];\
1660 const int src11= src[11*srcStride];\
1661 const int src12= src[12*srcStride];\
1662 const int src13= src[13*srcStride];\
1663 const int src14= src[14*srcStride];\
1664 const int src15= src[15*srcStride];\
1665 const int src16= src[16*srcStride];\
1666 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1667 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1668 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1669 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1670 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1671 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1672 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1673 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1674 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1675 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1676 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1677 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1678 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1679 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1680 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1681 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* qpel8 full-pel and half-pel positions.  mc00 = plain copy;
 * mc10/mc30 = average of the source (or source+1) with the horizontally
 * filtered half; mc20 = horizontal half-pel only; mc01/mc03 = average of
 * the source row (or the row below, full+16) with the vertically
 * filtered half; mc02 = vertical half-pel only.  copy_block9 stages a
 * 9-row working copy (stride 16) so the vertical filter has its extra
 * row available. */\
1687 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1688 OPNAME ## pixels8_c(dst, src, stride, 8);\
1691 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1693 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1694 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1697 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1698 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1701 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1703 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1704 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1707 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1708 uint8_t full[16*9];\
1710 copy_block9(full, src, 16, stride, 9);\
1711 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1712 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1715 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1716 uint8_t full[16*9];\
1717 copy_block9(full, src, 16, stride, 9);\
1718 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1721 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1722 uint8_t full[16*9];\
1724 copy_block9(full, src, 16, stride, 9);\
1725 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1726 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* qpel8 diagonal quarter-pel positions (mc11/mc31/mc13/mc33).
 * Each position exists in two variants: the ff_*_old_c functions
 * average four planes (source, H-half, V-half, HV-half) via pixels8_l4,
 * while the current *_c functions first fold the source into the H
 * plane (pixels8_l2 over halfH and full), then average with the HV
 * plane — fewer intermediate buffers.  The +1 / +16 / +17 offsets pick
 * the right/lower/diagonal source corner for each position.
 * NOTE(review): declarations of `halfH`/`halfV`/`half` buffers and
 * closing braces for some functions are elided in this listing. */\
1728 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1729 uint8_t full[16*9];\
1732 uint8_t halfHV[64];\
1733 copy_block9(full, src, 16, stride, 9);\
1734 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1735 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1736 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1737 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1739 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1740 uint8_t full[16*9];\
1742 uint8_t halfHV[64];\
1743 copy_block9(full, src, 16, stride, 9);\
1744 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1745 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1746 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1747 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1749 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750 uint8_t full[16*9];\
1753 uint8_t halfHV[64];\
1754 copy_block9(full, src, 16, stride, 9);\
1755 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1756 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1757 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1758 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1760 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1761 uint8_t full[16*9];\
1763 uint8_t halfHV[64];\
1764 copy_block9(full, src, 16, stride, 9);\
1765 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1766 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1767 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1768 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1770 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1771 uint8_t full[16*9];\
1774 uint8_t halfHV[64];\
1775 copy_block9(full, src, 16, stride, 9);\
1776 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1777 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1778 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1779 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1781 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1782 uint8_t full[16*9];\
1784 uint8_t halfHV[64];\
1785 copy_block9(full, src, 16, stride, 9);\
1786 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1787 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1788 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1789 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1791 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1792 uint8_t full[16*9];\
1795 uint8_t halfHV[64];\
1796 copy_block9(full, src, 16, stride, 9);\
1797 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1798 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1799 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1800 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1802 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1803 uint8_t full[16*9];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* qpel8 edge-centre positions.  mc21/mc23: H-half averaged with the HV
 * plane (upper/lower row via halfH / halfH+8).  mc12/mc32: current
 * variants fold the source (or source+1) into the H plane, then run the
 * vertical filter straight into dst; the ff_*_old_c variants instead
 * average separate V and HV planes.  mc22 is the pure centre: H filter
 * then V filter, no averaging.
 * NOTE(review): `halfH`/`halfV` buffer declarations and some closing
 * braces are elided in this listing. */\
1812 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1814 uint8_t halfHV[64];\
1815 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1816 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1817 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1819 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1821 uint8_t halfHV[64];\
1822 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1824 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1826 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1827 uint8_t full[16*9];\
1830 uint8_t halfHV[64];\
1831 copy_block9(full, src, 16, stride, 9);\
1832 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1833 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1834 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1835 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1837 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1838 uint8_t full[16*9];\
1840 copy_block9(full, src, 16, stride, 9);\
1841 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1842 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1843 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1845 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1846 uint8_t full[16*9];\
1849 uint8_t halfHV[64];\
1850 copy_block9(full, src, 16, stride, 9);\
1851 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1852 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1853 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1854 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1856 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1857 uint8_t full[16*9];\
1859 copy_block9(full, src, 16, stride, 9);\
1860 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1861 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1862 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1864 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1866 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1867 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* qpel16 full-pel and half-pel positions — same scheme as the qpel8
 * versions but with 16x16 blocks: copy_block17 stages a 17-row working
 * copy with stride 24 for the vertical filter. */\
1869 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1870 OPNAME ## pixels16_c(dst, src, stride, 16);\
1873 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1875 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1876 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1879 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1880 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1883 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1885 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1886 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1889 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1890 uint8_t full[24*17];\
1892 copy_block17(full, src, 24, stride, 17);\
1893 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1894 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1897 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1898 uint8_t full[24*17];\
1899 copy_block17(full, src, 24, stride, 17);\
1900 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1903 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1904 uint8_t full[24*17];\
1906 copy_block17(full, src, 24, stride, 17);\
1907 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1908 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* qpel16 diagonal quarter-pel positions — 16x16 analogues of the qpel8
 * diagonals: ff_*_old_c averages four planes with pixels16_l4, the
 * current *_c variants fold the source into the H plane and average
 * with the HV plane.  Offsets +1 / +24 / +25 pick the right / lower /
 * diagonal source corner (stride of the staging buffer is 24). */\
1910 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1911 uint8_t full[24*17];\
1912 uint8_t halfH[272];\
1913 uint8_t halfV[256];\
1914 uint8_t halfHV[256];\
1915 copy_block17(full, src, 24, stride, 17);\
1916 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1917 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1918 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1919 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1921 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1922 uint8_t full[24*17];\
1923 uint8_t halfH[272];\
1924 uint8_t halfHV[256];\
1925 copy_block17(full, src, 24, stride, 17);\
1926 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1927 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1928 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1929 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1931 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1932 uint8_t full[24*17];\
1933 uint8_t halfH[272];\
1934 uint8_t halfV[256];\
1935 uint8_t halfHV[256];\
1936 copy_block17(full, src, 24, stride, 17);\
1937 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1938 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1939 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1940 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1942 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1943 uint8_t full[24*17];\
1944 uint8_t halfH[272];\
1945 uint8_t halfHV[256];\
1946 copy_block17(full, src, 24, stride, 17);\
1947 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1948 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1949 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1950 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1952 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1953 uint8_t full[24*17];\
1954 uint8_t halfH[272];\
1955 uint8_t halfV[256];\
1956 uint8_t halfHV[256];\
1957 copy_block17(full, src, 24, stride, 17);\
1958 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1959 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1960 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1961 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1963 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1964 uint8_t full[24*17];\
1965 uint8_t halfH[272];\
1966 uint8_t halfHV[256];\
1967 copy_block17(full, src, 24, stride, 17);\
1968 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1969 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1970 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1971 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1973 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1974 uint8_t full[24*17];\
1975 uint8_t halfH[272];\
1976 uint8_t halfV[256];\
1977 uint8_t halfHV[256];\
1978 copy_block17(full, src, 24, stride, 17);\
1979 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1980 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1981 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1982 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1984 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1985 uint8_t full[24*17];\
1986 uint8_t halfH[272];\
1987 uint8_t halfHV[256];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1991 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* qpel16 edge-centre positions — 16x16 analogues of qpel8 mc21/mc23/
 * mc12/mc32/mc22: H-half averaged with the HV plane (upper/lower row
 * via halfH / halfH+16), source folded into the H plane for mc12/mc32,
 * and the pure H-then-V centre for mc22. */\
1994 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t halfH[272];\
1996 uint8_t halfHV[256];\
1997 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1998 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1999 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2001 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2002 uint8_t halfH[272];\
2003 uint8_t halfHV[256];\
2004 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2006 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2008 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2009 uint8_t full[24*17];\
2010 uint8_t halfH[272];\
2011 uint8_t halfV[256];\
2012 uint8_t halfHV[256];\
2013 copy_block17(full, src, 24, stride, 17);\
2014 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2015 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2016 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2017 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2019 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2020 uint8_t full[24*17];\
2021 uint8_t halfH[272];\
2022 copy_block17(full, src, 24, stride, 17);\
2023 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2024 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2025 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2027 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2028 uint8_t full[24*17];\
2029 uint8_t halfH[272];\
2030 uint8_t halfV[256];\
2031 uint8_t halfHV[256];\
2032 copy_block17(full, src, 24, stride, 17);\
2033 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2034 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2035 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2036 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2038 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2039 uint8_t full[24*17];\
2040 uint8_t halfH[272];\
2041 copy_block17(full, src, 24, stride, 17);\
2042 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2043 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2044 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2046 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t halfH[272];\
2048 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2049 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store operations plugged into the QPEL_MC template above.  'b' is the
 * raw 6-tap filter sum; cm[] (ff_cropTbl + MAX_NEG_CROP) clips the
 * normalised value to 0..255.  (+16)>>5 rounds to nearest, (+15)>>5 is
 * the "no rounding" variant; op_avg additionally averages with the
 * pixel already in the destination. */
2052 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2053 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2054 #define op_put(a, b) a = cm[((b) + 16)>>5]
2055 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the MPEG-4 quarter-pel MC function families:
 * put (rounded), put_no_rnd, and avg.  The avg_no_rnd family is
 * intentionally left disabled. */
2057 QPEL_MC(0, put_ , _ , op_put)
2058 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2059 QPEL_MC(0, avg_ , _ , op_avg)
2060 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* The op_* helpers are local to the instantiations above. */
2062 #undef op_avg_no_rnd
2064 #undef op_put_no_rnd
/* H264_LOWPASS: template generating the half-pel interpolation
 * primitives for 2-, 4-, 8- and 16-wide blocks using the 6-tap
 * (1,-5,20,20,-5,1) filter visible in every OP()/OP2() expression.
 * _h_lowpass filters horizontally, _v_lowpass vertically; _hv_lowpass
 * first filters horizontally into the 16-bit tmp[] plane (h+5 rows to
 * cover the vertical filter's reach), then filters tmp[] vertically.
 * OP normalises a single-pass sum ((+16)>>5 inside op_put/op_avg);
 * OP2 normalises the wider two-pass sum ((+512)>>10 in op2_*).
 * NOTE(review): this excerpt elides some scaffolding lines (loop
 * headers, braces, increments) — see the gaps in the original line
 * numbers; the filter taps and statement order shown here are exact. */
2067 #define H264_LOWPASS(OPNAME, OP, OP2) \
2068 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2070 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2074 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2075 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2081 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2083 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2087 const int srcB= src[-2*srcStride];\
2088 const int srcA= src[-1*srcStride];\
2089 const int src0= src[0 *srcStride];\
2090 const int src1= src[1 *srcStride];\
2091 const int src2= src[2 *srcStride];\
2092 const int src3= src[3 *srcStride];\
2093 const int src4= src[4 *srcStride];\
2094 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2095 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2101 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2104 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2106 src -= 2*srcStride;\
2107 for(i=0; i<h+5; i++)\
2109 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2110 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2114 tmp -= tmpStride*(h+5-2);\
2117 const int tmpB= tmp[-2*tmpStride];\
2118 const int tmpA= tmp[-1*tmpStride];\
2119 const int tmp0= tmp[0 *tmpStride];\
2120 const int tmp1= tmp[1 *tmpStride];\
2121 const int tmp2= tmp[2 *tmpStride];\
2122 const int tmp3= tmp[3 *tmpStride];\
2123 const int tmp4= tmp[4 *tmpStride];\
2124 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2125 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* 4-wide variants: same filter, four output columns/rows. */\
2130 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2132 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2136 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2137 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2138 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2139 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2145 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2147 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2151 const int srcB= src[-2*srcStride];\
2152 const int srcA= src[-1*srcStride];\
2153 const int src0= src[0 *srcStride];\
2154 const int src1= src[1 *srcStride];\
2155 const int src2= src[2 *srcStride];\
2156 const int src3= src[3 *srcStride];\
2157 const int src4= src[4 *srcStride];\
2158 const int src5= src[5 *srcStride];\
2159 const int src6= src[6 *srcStride];\
2160 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2161 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2162 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2163 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2169 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2172 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2174 src -= 2*srcStride;\
2175 for(i=0; i<h+5; i++)\
2177 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2178 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2179 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2180 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2184 tmp -= tmpStride*(h+5-2);\
2187 const int tmpB= tmp[-2*tmpStride];\
2188 const int tmpA= tmp[-1*tmpStride];\
2189 const int tmp0= tmp[0 *tmpStride];\
2190 const int tmp1= tmp[1 *tmpStride];\
2191 const int tmp2= tmp[2 *tmpStride];\
2192 const int tmp3= tmp[3 *tmpStride];\
2193 const int tmp4= tmp[4 *tmpStride];\
2194 const int tmp5= tmp[5 *tmpStride];\
2195 const int tmp6= tmp[6 *tmpStride];\
2196 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2197 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2198 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2199 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide variants: fully unrolled across eight columns/rows. */\
2205 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2207 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2211 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2212 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2213 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2214 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2215 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2216 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2217 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2218 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2224 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2226 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2230 const int srcB= src[-2*srcStride];\
2231 const int srcA= src[-1*srcStride];\
2232 const int src0= src[0 *srcStride];\
2233 const int src1= src[1 *srcStride];\
2234 const int src2= src[2 *srcStride];\
2235 const int src3= src[3 *srcStride];\
2236 const int src4= src[4 *srcStride];\
2237 const int src5= src[5 *srcStride];\
2238 const int src6= src[6 *srcStride];\
2239 const int src7= src[7 *srcStride];\
2240 const int src8= src[8 *srcStride];\
2241 const int src9= src[9 *srcStride];\
2242 const int src10=src[10*srcStride];\
2243 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2244 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2245 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2246 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2247 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2248 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2249 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2250 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2256 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2259 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2261 src -= 2*srcStride;\
2262 for(i=0; i<h+5; i++)\
2264 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2265 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2266 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2267 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2268 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2269 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2270 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2271 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2275 tmp -= tmpStride*(h+5-2);\
2278 const int tmpB= tmp[-2*tmpStride];\
2279 const int tmpA= tmp[-1*tmpStride];\
2280 const int tmp0= tmp[0 *tmpStride];\
2281 const int tmp1= tmp[1 *tmpStride];\
2282 const int tmp2= tmp[2 *tmpStride];\
2283 const int tmp3= tmp[3 *tmpStride];\
2284 const int tmp4= tmp[4 *tmpStride];\
2285 const int tmp5= tmp[5 *tmpStride];\
2286 const int tmp6= tmp[6 *tmpStride];\
2287 const int tmp7= tmp[7 *tmpStride];\
2288 const int tmp8= tmp[8 *tmpStride];\
2289 const int tmp9= tmp[9 *tmpStride];\
2290 const int tmp10=tmp[10*tmpStride];\
2291 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2292 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2293 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2294 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2295 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2296 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2297 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2298 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide variants: composed from four 8-wide calls on the quadrants. */\
2304 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2305 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2306 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2307 src += 8*srcStride;\
2308 dst += 8*dstStride;\
2309 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2310 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2313 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2314 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2315 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2316 src += 8*srcStride;\
2317 dst += 8*dstStride;\
2318 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2319 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2322 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2323 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2324 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2325 src += 8*srcStride;\
2326 dst += 8*dstStride;\
2327 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2328 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC: template generating the 16 quarter-pel motion-compensation
 * entry points _mcXY_c for a given block SIZE, where X/Y are the
 * horizontal/vertical quarter-pel offsets (0..3).  mc00 is a plain
 * copy; pure half-pel positions call a single lowpass; quarter-pel
 * positions average two planes via OPNAME##pixels##SIZE##_l2().
 * full[] holds a (SIZE+5)-row copy of the source with the two rows of
 * top context needed by the vertical 6-tap filter (full_mid skips
 * them); tmp[] is the int16_t intermediate for the hv path.
 * NOTE(review): some closing braces are elided in this excerpt (gaps
 * in the original line numbers); call arguments shown are exact. */
2331 #define H264_MC(OPNAME, SIZE) \
2332 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2333 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2336 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2337 uint8_t half[SIZE*SIZE];\
2338 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2339 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2342 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2343 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2346 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2347 uint8_t half[SIZE*SIZE];\
2348 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2349 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2352 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2353 uint8_t full[SIZE*(SIZE+5)];\
2354 uint8_t * const full_mid= full + SIZE*2;\
2355 uint8_t half[SIZE*SIZE];\
2356 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2357 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2358 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2361 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2362 uint8_t full[SIZE*(SIZE+5)];\
2363 uint8_t * const full_mid= full + SIZE*2;\
2364 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2365 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2368 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2369 uint8_t full[SIZE*(SIZE+5)];\
2370 uint8_t * const full_mid= full + SIZE*2;\
2371 uint8_t half[SIZE*SIZE];\
2372 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2373 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2374 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2377 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2378 uint8_t full[SIZE*(SIZE+5)];\
2379 uint8_t * const full_mid= full + SIZE*2;\
2380 uint8_t halfH[SIZE*SIZE];\
2381 uint8_t halfV[SIZE*SIZE];\
2382 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2383 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2384 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2385 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2388 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2389 uint8_t full[SIZE*(SIZE+5)];\
2390 uint8_t * const full_mid= full + SIZE*2;\
2391 uint8_t halfH[SIZE*SIZE];\
2392 uint8_t halfV[SIZE*SIZE];\
2393 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2394 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2395 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2396 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2399 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2400 uint8_t full[SIZE*(SIZE+5)];\
2401 uint8_t * const full_mid= full + SIZE*2;\
2402 uint8_t halfH[SIZE*SIZE];\
2403 uint8_t halfV[SIZE*SIZE];\
2404 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2405 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2406 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2407 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2410 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2411 uint8_t full[SIZE*(SIZE+5)];\
2412 uint8_t * const full_mid= full + SIZE*2;\
2413 uint8_t halfH[SIZE*SIZE];\
2414 uint8_t halfV[SIZE*SIZE];\
2415 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2416 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2417 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2418 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2421 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2422 int16_t tmp[SIZE*(SIZE+5)];\
2423 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2426 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2427 int16_t tmp[SIZE*(SIZE+5)];\
2428 uint8_t halfH[SIZE*SIZE];\
2429 uint8_t halfHV[SIZE*SIZE];\
2430 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2431 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2432 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2435 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2436 int16_t tmp[SIZE*(SIZE+5)];\
2437 uint8_t halfH[SIZE*SIZE];\
2438 uint8_t halfHV[SIZE*SIZE];\
2439 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2440 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2441 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2444 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2445 uint8_t full[SIZE*(SIZE+5)];\
2446 uint8_t * const full_mid= full + SIZE*2;\
2447 int16_t tmp[SIZE*(SIZE+5)];\
2448 uint8_t halfV[SIZE*SIZE];\
2449 uint8_t halfHV[SIZE*SIZE];\
2450 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2451 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2452 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2453 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2456 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2457 uint8_t full[SIZE*(SIZE+5)];\
2458 uint8_t * const full_mid= full + SIZE*2;\
2459 int16_t tmp[SIZE*(SIZE+5)];\
2460 uint8_t halfV[SIZE*SIZE];\
2461 uint8_t halfHV[SIZE*SIZE];\
2462 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2463 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2464 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2465 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operations for the H264_LOWPASS template: op_* normalise a
 * single-pass 6-tap sum ((+16)>>5), op2_* the wider two-pass hv sum
 * ((+512)>>10); cm[] clips to 0..255 and the avg forms round-average
 * with the existing destination pixel. */
2468 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2469 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2470 #define op_put(a, b) a = cm[((b) + 16)>>5]
2471 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2472 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ H.264-style lowpass families. */
2474 H264_LOWPASS(put_ , op_put, op2_put)
2475 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* Weighted-prediction template.  op_scale1 applies explicit weighting
 * in place (block[x]*weight + offset, shifted by log2_denom);
 * op_scale2 blends two sources with weights/weightd.  H264_WEIGHT(W,H)
 * generates weight_/biweight_h264_pixels WxH functions; the W==N
 * 'continue' lines terminate each row early for narrower widths.
 * NOTE(review): the per-pixel op_scale* invocations between the
 * continue lines are elided in this excerpt (gaps in original line
 * numbers). */
2490 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2491 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2492 #define H264_WEIGHT(W,H) \
2493 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2495 offset <<= log2_denom; \
2496 if(log2_denom) offset += 1<<(log2_denom-1); \
2497 for(y=0; y<H; y++, block += stride){ \
2500 if(W==2) continue; \
2503 if(W==4) continue; \
2508 if(W==8) continue; \
2519 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
/* rounding offset made odd and pre-shifted for the (log2_denom+1) shift in op_scale2 */ \
2521 offset = ((offset + 1) | 1) << log2_denom; \
2522 for(y=0; y<H; y++, dst += stride, src += stride){ \
2525 if(W==2) continue; \
2528 if(W==4) continue; \
2533 if(W==8) continue; \
/* WMV2/MSPEL horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with
 * +8 rounding, clipped through cm[] (ff_cropTbl+MAX_NEG_CROP); fills
 * one 8-wide row per iteration for h rows.
 * NOTE(review): the h-loop scaffolding is elided in this excerpt. */
2560 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2561 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2565 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2566 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2567 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2568 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2569 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2570 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2571 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2572 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS decoder glue: the integer-pel (mc00) positions are plain pixel
 * copies, so they are thin wrappers over put/avg_pixels*_c exported
 * for cavsdsp. */
2578 #ifdef CONFIG_CAVS_DECODER
2580 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2582 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2583 put_pixels8_c(dst, src, stride, 8);
2585 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2586 avg_pixels8_c(dst, src, stride, 8);
2588 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2589 put_pixels16_c(dst, src, stride, 16);
2591 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2592 avg_pixels16_c(dst, src, stride, 16);
2594 #endif /* CONFIG_CAVS_DECODER */
/* VC-1/WMV3 glue: mc00 is a straight 8x8 copy; 'rnd' is unused here
 * since no filtering takes place at integer positions. */
2596 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2598 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2600 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2601 put_pixels8_c(dst, src, stride, 8);
2603 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
/* Init entry points implemented in other translation units. */
2605 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2608 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
/* WMV2/MSPEL vertical half-pel filter: same 4-tap (-1,9,9,-1)/16
 * kernel as the horizontal variant, applied down a column; produces
 * an 8-tall column per iteration for w columns.
 * NOTE(review): the w-loop scaffolding is elided in this excerpt. */
2610 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2611 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2615 const int src_1= src[ -srcStride];
2616 const int src0 = src[0 ];
2617 const int src1 = src[ srcStride];
2618 const int src2 = src[2*srcStride];
2619 const int src3 = src[3*srcStride];
2620 const int src4 = src[4*srcStride];
2621 const int src5 = src[5*srcStride];
2622 const int src6 = src[6*srcStride];
2623 const int src7 = src[7*srcStride];
2624 const int src8 = src[8*srcStride];
2625 const int src9 = src[9*srcStride];
2626 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2627 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2628 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2629 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2630 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2631 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2632 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2633 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* MSPEL (WMV2) 8x8 motion-compensation entry points, named _mcXY by
 * half-pel position.  mc00 copies; mc20/mc02 are pure H/V half-pel;
 * mc10/mc30 average the source (or source+1) with the H half-pel
 * plane; the *2 diagonal positions combine a V plane and an HV plane
 * via put_pixels8_l2.  The H pass for the *2 cases starts at
 * src-stride with h=11 to provide the rows the V filter reads above
 * and below; halfH+8 skips that one row of top context.
 * NOTE(review): local buffer declarations (half/halfH/halfV/halfHV)
 * are elided in this excerpt. */
2639 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2640 put_pixels8_c(dst, src, stride, 8);
2643 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2645 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2646 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2649 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2650 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2653 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2655 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2656 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2659 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2660 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2663 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2667 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2668 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2669 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2670 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2672 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2676 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2677 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2678 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2679 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2681 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2683 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2684 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking, vertical filter (operates across a horizontal
 * edge): reads the four pixels p0..p3 straddling the edge, derives a
 * ramped correction d1 from d = (p0-p3+4*(p2-p1))/8 using the
 * qscale-indexed strength table, adjusts p1/p2 and then p0/p3 with a
 * secondary correction d2 clipped to |d1|.  The (p&256) tests clamp
 * the adjusted values to 0..255 branchlessly (~(p>>31) yields 0 for
 * negative, 255 for >255).
 * NOTE(review): the x loop header and the d1/ad1 setup lines are
 * elided in this excerpt. */
2687 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2688 if(ENABLE_ANY_H263) {
2690 const int strength= ff_h263_loop_filter_strength[qscale];
2694 int p0= src[x-2*stride];
2695 int p1= src[x-1*stride];
2696 int p2= src[x+0*stride];
2697 int p3= src[x+1*stride];
2698 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2700 if (d<-2*strength) d1= 0;
2701 else if(d<- strength) d1=-2*strength - d;
2702 else if(d< strength) d1= d;
2703 else if(d< 2*strength) d1= 2*strength - d;
2708 if(p1&256) p1= ~(p1>>31);
2709 if(p2&256) p2= ~(p2>>31);
2711 src[x-1*stride] = p1;
2712 src[x+0*stride] = p2;
2716 d2= av_clip((p0-p3)/4, -ad1, ad1);
2718 src[x-2*stride] = p0 - d2;
2719 src[x+ stride] = p3 + d2;
/* H.263 deblocking, horizontal filter (operates across a vertical
 * edge): identical arithmetic to h263_v_loop_filter_c but indexing
 * pixels along a row (y*stride-2 .. y*stride+1) instead of a column.
 * NOTE(review): the y loop header and the d1/ad1 setup lines are
 * elided in this excerpt. */
2724 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2725 if(ENABLE_ANY_H263) {
2727 const int strength= ff_h263_loop_filter_strength[qscale];
2731 int p0= src[y*stride-2];
2732 int p1= src[y*stride-1];
2733 int p2= src[y*stride+0];
2734 int p3= src[y*stride+1];
2735 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2737 if (d<-2*strength) d1= 0;
2738 else if(d<- strength) d1=-2*strength - d;
2739 else if(d< strength) d1= d;
2740 else if(d< 2*strength) d1= 2*strength - d;
2745 if(p1&256) p1= ~(p1>>31);
2746 if(p2&256) p2= ~(p2>>31);
2748 src[y*stride-1] = p1;
2749 src[y*stride+0] = p2;
2753 d2= av_clip((p0-p3)/4, -ad1, ad1);
2755 src[y*stride-2] = p0 - d2;
2756 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter over an 8x8 block: a separable (1,2,1) smooth.
 * First pass builds temp[] = vertical (1,2,1) of each column, with the
 * top/bottom rows copied scaled by 4 (so they are untouched after the
 * final >>2); second pass applies the horizontal (1,2,1) with +8
 * rounding and >>4, leaving the left/right columns only >>2-normalised.
 * NOTE(review): loop headers and the yz index setup are elided in this
 * excerpt. */
2761 static void h261_loop_filter_c(uint8_t *src, int stride){
2766 temp[x ] = 4*src[x ];
2767 temp[x + 7*8] = 4*src[x + 7*stride];
2771 xy = y * stride + x;
2773 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2778 src[ y*stride] = (temp[ y*8] + 2)>>2;
2779 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2781 xy = y * stride + x;
2783 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (tc0-based) luma deblocking across one edge: for each
 * of 4 groups of 4 pixels, if the p0/q0, p1/p0 and q1/q0 gradients
 * pass the alpha/beta thresholds, p1/q1 are optionally refined (when
 * the p2/q2 gradient also passes beta) and p0/q0 are adjusted by
 * i_delta clipped to +-tc.  xstride steps across the edge, ystride
 * along it; the v/h wrappers below select the orientation.
 * NOTE(review): the tc setup and inner loop increments are elided in
 * this excerpt. */
2788 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2791 for( i = 0; i < 4; i++ ) {
2796 for( d = 0; d < 4; d++ ) {
2797 const int p0 = pix[-1*xstride];
2798 const int p1 = pix[-2*xstride];
2799 const int p2 = pix[-3*xstride];
2800 const int q0 = pix[0];
2801 const int q1 = pix[1*xstride];
2802 const int q2 = pix[2*xstride];
2804 if( FFABS( p0 - q0 ) < alpha &&
2805 FFABS( p1 - p0 ) < beta &&
2806 FFABS( q1 - q0 ) < beta ) {
2811 if( FFABS( p2 - p0 ) < beta ) {
2812 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2815 if( FFABS( q2 - q0 ) < beta ) {
2816 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2820 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2821 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2822 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Orientation wrappers: xstride=stride walks vertically across the
 * edge; xstride=1 walks horizontally. */
2828 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2830 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2832 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2834 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal (tc0-based) chroma deblocking: same structure as the
 * luma filter but only 2 pixels per tc0 group, no p1/q1 refinement,
 * and delta clipped directly to +-tc.
 * NOTE(review): tc derivation between the tc0[i] read and the d loop
 * is elided in this excerpt. */
2837 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2840 for( i = 0; i < 4; i++ ) {
2841 const int tc = tc0[i];
2846 for( d = 0; d < 2; d++ ) {
2847 const int p0 = pix[-1*xstride];
2848 const int p1 = pix[-2*xstride];
2849 const int q0 = pix[0];
2850 const int q1 = pix[1*xstride];
2852 if( FFABS( p0 - q0 ) < alpha &&
2853 FFABS( p1 - p0 ) < beta &&
2854 FFABS( q1 - q0 ) < beta ) {
2856 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2858 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2859 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* Orientation wrappers, as for the luma filter. */
2865 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2867 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2869 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2871 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (intra) chroma deblocking: no tc0 table; when the
 * alpha/beta gradient tests pass, p0 and q0 are replaced by fixed
 * (2,1,1)/4 averages of their neighbours across all 8 positions of
 * the edge. */
2874 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2877 for( d = 0; d < 8; d++ ) {
2878 const int p0 = pix[-1*xstride];
2879 const int p1 = pix[-2*xstride];
2880 const int q0 = pix[0];
2881 const int q1 = pix[1*xstride];
2883 if( FFABS( p0 - q0 ) < alpha &&
2884 FFABS( p1 - p0 ) < beta &&
2885 FFABS( q1 - q0 ) < beta ) {
2887 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2888 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Orientation wrappers for the intra chroma filter. */
2893 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2895 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2897 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2899 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* 16-wide SAD (sum of absolute differences) comparison functions for
 * motion estimation.  Each accumulates |pix1[i] - ref| over a fully
 * unrolled 16-pixel row, for h rows; the _x2/_y2/_xy2 variants compare
 * against the half-pel reference interpolated with avg2 (horizontal /
 * vertical) or avg4 (diagonal).
 * NOTE(review): the accumulator init, row loop headers, pointer
 * increments and return statements are elided in this excerpt. */
2902 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2908 s += abs(pix1[0] - pix2[0]);
2909 s += abs(pix1[1] - pix2[1]);
2910 s += abs(pix1[2] - pix2[2]);
2911 s += abs(pix1[3] - pix2[3]);
2912 s += abs(pix1[4] - pix2[4]);
2913 s += abs(pix1[5] - pix2[5]);
2914 s += abs(pix1[6] - pix2[6]);
2915 s += abs(pix1[7] - pix2[7]);
2916 s += abs(pix1[8] - pix2[8]);
2917 s += abs(pix1[9] - pix2[9]);
2918 s += abs(pix1[10] - pix2[10]);
2919 s += abs(pix1[11] - pix2[11]);
2920 s += abs(pix1[12] - pix2[12]);
2921 s += abs(pix1[13] - pix2[13]);
2922 s += abs(pix1[14] - pix2[14]);
2923 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel reference avg2(pix2[i], pix2[i+1]). */
2930 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2936 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2937 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2938 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2939 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2940 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2941 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2942 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2943 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2944 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2945 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2946 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2947 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2948 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2949 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2950 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2951 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel reference avg2(row, next row). */
2958 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2961 uint8_t *pix3 = pix2 + line_size;
2965 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2966 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2967 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2968 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2969 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2970 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2971 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2972 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2973 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2974 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2975 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2976 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2977 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2978 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2979 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2980 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonal half-pel reference avg4 of the 2x2
 * neighbourhood. */
2988 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2991 uint8_t *pix3 = pix2 + line_size;
2995 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2996 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2997 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2998 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2999 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3000 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3001 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3002 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3003 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3004 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3005 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3006 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3007 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3008 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3009 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3010 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* Plain SAD of an 8-pixel-wide block: per-row sum of |pix1[x] - pix2[x]|,
 * no interpolation. Row loop and return are elided from this excerpt. */
3018 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3024         s += abs(pix1[0] - pix2[0]);
3025         s += abs(pix1[1] - pix2[1]);
3026         s += abs(pix1[2] - pix2[2]);
3027         s += abs(pix1[3] - pix2[3]);
3028         s += abs(pix1[4] - pix2[4]);
3029         s += abs(pix1[5] - pix2[5]);
3030         s += abs(pix1[6] - pix2[6]);
3031         s += abs(pix1[7] - pix2[7]);
/* SAD against the horizontal (x) half-pel interpolation of pix2: each
 * reference sample is avg2() of two horizontally adjacent pixels; column 7
 * reads pix2[8] (one past the 8-wide block). Loop/return lines elided. */
3038 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3044         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3045         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3046         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3047         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3048         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3049         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3050         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3051         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* SAD against the vertical (y) half-pel interpolation of pix2: each
 * reference sample is avg2() of a pixel and the one a line below
 * (pix3 = pix2 + line_size). Loop/return lines elided from this excerpt. */
3058 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3061     uint8_t *pix3 = pix2 + line_size;
3065         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3066         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3067         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3068         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3069         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3070         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3071         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3072         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD against the diagonal (x+y) half-pel interpolation of pix2: 2x2
 * neighbourhood average via avg4(); column 7 reads pix2[8]/pix3[8], one
 * pixel past the 8-wide block. Loop/return lines elided from this excerpt. */
3080 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3083     uint8_t *pix3 = pix2 + line_size;
3087         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3088         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3089         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3090         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3091         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3092         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3093         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3094         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16 pixels wide: score1 accumulates the plain squared
 * error; score2 accumulates the difference in local 2x2 gradient magnitude
 * between the two images (how much the "texture" differs). The final metric
 * weights |score2| by avctx->nsse_weight, falling back to 8 when no context
 * pointer was supplied. Outer row loops / variable declarations are elided
 * from this excerpt. */
3102 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3103     MpegEncContext *c = v;
3109         for(x=0; x<16; x++){
3110             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3113         for(x=0; x<15; x++){
3114             score2+= FFABS(  s1[x  ] - s1[x  +stride]
3115                              - s1[x+1] + s1[x+1+stride])
3116                     -FFABS(  s2[x  ] - s2[x  +stride]
3117                              - s2[x+1] + s2[x+1+stride]);
3124     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3125     else  return score1 + FFABS(score2)*8;
/* 8-pixel-wide variant of nsse16_c: same squared-error + gradient-difference
 * metric (see nsse16_c above in the original file). Loop headers and
 * declarations are elided from this excerpt. */
3128 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3129     MpegEncContext *c = v;
3136             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3140             score2+= FFABS(  s1[x  ] - s1[x  +stride]
3141                              - s1[x+1] + s1[x+1+stride])
3142                     -FFABS(  s2[x  ] - s2[x  +stride]
3143                              - s2[x+1] + s2[x+1+stride]);
3150     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3151     else  return score1 + FFABS(score2)*8;
/* Rate-distortion helper: evaluates the weighted squared error that would
 * remain if `basis` scaled by `scale` were added to the residual `rem`.
 * The scale is applied in BASIS_SHIFT fixed point and rounded down to
 * RECON_SHIFT precision. `w` is the per-coefficient weight (its load is in
 * an elided line); the >>4 rescales the squared product. */
3154 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3158     for(i=0; i<8*8; i++){
3159         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3162         assert(-512<b && b<512);
3164         sum += (w*b)*(w*b)>>4;
/* Commits what try_8x8basis_c evaluated: adds `basis` scaled by `scale`
 * (rounded from BASIS_SHIFT to RECON_SHIFT fixed point) into the residual. */
3169 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3172     for(i=0; i<8*8; i++){
3173         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
 3178  * permutes an 8x8 block.
 3179  * @param block the block which will be permuted according to the given permutation vector
 3180  * @param permutation the permutation vector
 3181  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 3182  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 3183  *                  (inverse) permutated to scantable order!
3185 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3191 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass (copy into temp[], stores elided from this excerpt): snapshot
 * the coefficients touched by the scan; second pass scatters them to their
 * permuted positions. Only indices up to `last` in scan order are visited. */
3193     for(i=0; i<=last; i++){
3194         const int j= scantable[i];
3199     for(i=0; i<=last; i++){
3200         const int j= scantable[i];
3201         const int perm_j= permutation[j];
3202         block[perm_j]= temp[j];
3206 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills a 5-entry me_cmp_func table from DSPContext according to the
 * requested comparison `type` (hadamard, dct_sad, dct264_sad, dct_max,
 * quant_psnr, ...). The switch/case scaffolding and the loop over the five
 * block sizes are elided from this excerpt; an unknown type reaches the
 * av_log() error below. */
3210 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3213     memset(cmp, 0, sizeof(void*)*5);
3221             cmp[i]= c->hadamard8_diff[i];
3227             cmp[i]= c->dct_sad[i];
3230             cmp[i]= c->dct264_sad[i];
3233             cmp[i]= c->dct_max[i];
3236             cmp[i]= c->quant_psnr[i];
3256 #ifdef CONFIG_SNOW_ENCODER
3265         av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Zeroes six 8x8 DCT coefficient blocks in one call (one macroblock's worth:
 * 4 luma + 2 chroma). */
3271  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3273 static void clear_blocks_c(DCTELEM *blocks)
3275     memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* Byte-wise dst[i] += src[i] for i in [0,w): main loop is unrolled 8x,
 * a scalar tail loop (its header is elided from this excerpt) handles the
 * remaining w%8 bytes. Additions wrap modulo 256 (uint8_t arithmetic). */
3278 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3280     for(i=0; i+7<w; i+=8){
3281         dst[i+0] += src[i+0];
3282         dst[i+1] += src[i+1];
3283         dst[i+2] += src[i+2];
3284         dst[i+3] += src[i+3];
3285         dst[i+4] += src[i+4];
3286         dst[i+5] += src[i+5];
3287         dst[i+6] += src[i+6];
3288         dst[i+7] += src[i+7];
3291         dst[i+0] += src[i+0];
/* dst[i] = src1[i] + src2[i] (mod 256), vectorized one machine word at a
 * time using the classic SWAR carry-less byte add:
 *   (a&0x7f..) + (b&0x7f..) adds the low 7 bits of each byte, then
 *   ^ ((a^b)&0x80..) restores the top bit without letting carries cross
 * byte lanes. A scalar tail handles the last w%sizeof(long) bytes.
 * NOTE(review): the 0x7f7f... / 0x8080... literals are written as 64-bit
 * `long` constants — on platforms where long is 32 bits these look
 * truncated; confirm the build defines masks appropriately. The word loads
 * also assume src1/src2/dst share alignment. */
3294 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3296     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3297         long a = *(long*)(src1+i);
3298         long b = *(long*)(src2+i);
3299         *(long*)(dst+i) = ((a&0x7f7f7f7f7f7f7f7fL) + (b&0x7f7f7f7f7f7f7f7fL)) ^ ((a^b)&0x8080808080808080L);
3302         dst[i] = src1[i]+src2[i];
/* Byte-wise dst[i] = src1[i] - src2[i] (mod 256), 8x unrolled with a scalar
 * tail loop whose header is elided from this excerpt. Inverse of
 * add_bytes_l2_c / add_bytes_c — used by lossless/DPCM-style coders. */
3305 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3307     for(i=0; i+7<w; i+=8){
3308         dst[i+0] = src1[i+0]-src2[i+0];
3309         dst[i+1] = src1[i+1]-src2[i+1];
3310         dst[i+2] = src1[i+2]-src2[i+2];
3311         dst[i+3] = src1[i+3]-src2[i+3];
3312         dst[i+4] = src1[i+4]-src2[i+4];
3313         dst[i+5] = src1[i+5]-src2[i+5];
3314         dst[i+6] = src1[i+6]-src2[i+6];
3315         dst[i+7] = src1[i+7]-src2[i+7];
3318         dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median predictor subtraction: pred is the median of the left
 * sample `l`, the above sample src1[i], and the gradient l+src1[i]-lt
 * (lt = above-left), all taken mod 256. Loop body, state updates and the
 * write-back of *left / *left_top are elided from this excerpt. */
3321 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3329         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard butterfly helpers. BUTTERFLY2/BUTTERFLY1 bodies are on
 * continuation lines elided from this excerpt — presumably the usual
 * sum/difference pair (o1=i1+i2, o2=i1-i2 and the in-place variant).
 * BUTTERFLYA folds a pair into |x+y| + |x-y| for the final accumulation. */
3339 #define BUTTERFLY2(o1,o2,i1,i2) \
3343 #define BUTTERFLY1(x,y) \
3352 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the difference src-dst, then sum of
 * absolute transform coefficients. Stage 1 (per row): butterflies over
 * columns. Stage 2 (per column): butterflies over rows, accumulated with
 * BUTTERFLYA. The temp[] declaration, the two `for(i=...)` headers, the
 * final `return sum;` and the debug scaffolding around the printf are all
 * elided from this excerpt. */
3354 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3362 //FIXME try pointer walks
3363         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3364         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3365         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3366         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3368         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3369         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3370         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3371         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3373         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3374         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3375         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3376         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3380         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3381         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3382         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3383         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3385         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3386         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3387         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3388         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3391             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3392             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3393             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3394             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3400     printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard + sum-of-abs as hadamard8_diff8x8_c but on
 * the source pixels directly (no reference subtraction); the DC term
 * |temp[0]+temp[32]|, i.e. 8x the mean, is subtracted at the end so the
 * metric measures AC energy only. temp[] declaration, loop headers and the
 * return are elided from this excerpt. */
3406 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3414 //FIXME try pointer walks
3415         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3416         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3417         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3418         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3420         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3421         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3422         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3423         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3425         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3426         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3427         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3428         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3432         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3433         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3434         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3435         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3437         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3438         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3439         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3440         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3443             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3444             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3445             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3446             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3449     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: takes the 8x8 pixel difference, forward-DCTs it (the
 * s->dsp.fdct call sits in an elided line), and returns the sum of absolute
 * coefficients. The 16-byte-aligned uint64_t array is only storage; it is
 * reinterpreted as DCTELEM[64]. */
3454 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3455     MpegEncContext * const s= (MpegEncContext *)c;
3456     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3457     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3461     s->dsp.diff_pixels(temp, src1, src2, stride);
3463     return s->dsp.sum_abs_dctelem(temp);
3468 const int s07 = SRC(0) + SRC(7);\
3469 const int s16 = SRC(1) + SRC(6);\
3470 const int s25 = SRC(2) + SRC(5);\
3471 const int s34 = SRC(3) + SRC(4);\
3472 const int a0 = s07 + s34;\
3473 const int a1 = s16 + s25;\
3474 const int a2 = s07 - s34;\
3475 const int a3 = s16 - s25;\
3476 const int d07 = SRC(0) - SRC(7);\
3477 const int d16 = SRC(1) - SRC(6);\
3478 const int d25 = SRC(2) - SRC(5);\
3479 const int d34 = SRC(3) - SRC(4);\
3480 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3481 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3482 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3483 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3485 DST(1, a4 + (a7>>2)) ;\
3486 DST(2, a2 + (a3>>1)) ;\
3487 DST(3, a5 + (a6>>2)) ;\
3489 DST(5, a6 - (a5>>2)) ;\
3490 DST(6, (a2>>1) - a3 ) ;\
3491 DST(7, (a4>>2) - a7 ) ;\
/* H.264-style transform SAD: diff the 8x8 block, run the integer DCT8_1D
 * (defined above, outside this excerpt) first over rows via the
 * SRC/DST redefinitions, then over columns where DST accumulates |v| into
 * `sum`. The #undef lines, the dct[][] declaration and the return are
 * elided from this excerpt. */
3494 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3495     MpegEncContext * const s= (MpegEncContext *)c;
3500     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3502 #define SRC(x) dct[i][x]
3503 #define DST(x,v) dct[i][x]= v
3504     for( i = 0; i < 8; i++ )
3509 #define SRC(x) dct[x][i]
3510 #define DST(x,v) sum += FFABS(v)
3511     for( i = 0; i < 8; i++ )
/* DCT "max" metric: forward-DCT the 8x8 pixel difference (the fdct call is
 * in an elided line) and return the largest absolute coefficient rather
 * than the sum. Loop header and return are elided from this excerpt. */
3519 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3520     MpegEncContext * const s= (MpegEncContext *)c;
3521     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3522     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3527     s->dsp.diff_pixels(temp, src1, src2, stride);
3531         sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCT the pixel difference, keep a copy in bak,
 * quantize + dequantize + IDCT it (simple_idct, flagged FIXME upstream),
 * and return the squared error between the round-tripped coefficients and
 * the originals — i.e. the distortion the quantizer at s->qscale would add.
 * The fdct call, loop header and return are elided from this excerpt. */
3536 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3537     MpegEncContext * const s= (MpegEncContext *)c;
3538     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3539     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3540     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3546     s->dsp.diff_pixels(temp, src1, src2, stride);
3548     memcpy(bak, temp, 64*sizeof(DCTELEM));
3550     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3551     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3552     ff_simple_idct(temp); //FIXME
3555         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Full rate-distortion metric for one 8x8 block: backs up the reference
 * pixels, DCT-quantizes the difference, counts the VLC bits of the
 * resulting coefficients (intra vs inter tables chosen by elided
 * conditionals), dequantizes + idct_adds back onto the backup, and returns
 * distortion + lambda-weighted bits.
 * NOTE(review): `distoration` is a long-standing typo for "distortion" —
 * kept as-is since elided lines may also reference it.
 * NOTE(review): aligned_bak[stride] is a variable-length array sized by the
 * runtime stride; it backs up h(=8) rows of 8 bytes via two uint32_t copies
 * per row (row loop header elided). */
3560 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3561     MpegEncContext * const s= (MpegEncContext *)c;
3562     const uint8_t *scantable= s->intra_scantable.permutated;
3563     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3564     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3565     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3566     uint8_t * const bak= (uint8_t*)aligned_bak;
3567     int i, last, run, bits, level, distoration, start_i;
3568     const int esc_length= s->ac_esc_length;
3570     uint8_t * last_length;
3575         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3576         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3579     s->dsp.diff_pixels(temp, src1, src2, stride);
3581     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3587         length     = s->intra_ac_vlc_length;
3588         last_length= s->intra_ac_vlc_last_length;
3589         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3592         length     = s->inter_ac_vlc_length;
3593         last_length= s->inter_ac_vlc_last_length;
/* Bit counting: levels within [-64,63] (after +64 bias, (level&~127)==0)
 * use the uni-VLC length tables, others cost esc_length (elided branches). */
3598         for(i=start_i; i<last; i++){
3599             int j= scantable[i];
3604                 if((level&(~127)) == 0){
3605                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3614         level= temp[i] + 64;
3618         if((level&(~127)) == 0){
3619             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3627             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3629             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3632     s->dsp.idct_add(bak, stride, temp);
3634     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3636     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only metric: same DCT-quantize + VLC bit-counting pipeline as
 * rd8x8_c above, but skips the reconstruction/distortion step and returns
 * just the estimated bit cost (return statement elided from this excerpt). */
3639 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3640     MpegEncContext * const s= (MpegEncContext *)c;
3641     const uint8_t *scantable= s->intra_scantable.permutated;
3642     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3643     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3644     int i, last, run, bits, level, start_i;
3645     const int esc_length= s->ac_esc_length;
3647     uint8_t * last_length;
3651     s->dsp.diff_pixels(temp, src1, src2, stride);
3653     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3659         length     = s->intra_ac_vlc_length;
3660         last_length= s->intra_ac_vlc_last_length;
3661         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3664         length     = s->inter_ac_vlc_length;
3665         last_length= s->inter_ac_vlc_last_length;
3670         for(i=start_i; i<last; i++){
3671             int j= scantable[i];
3676                 if((level&(~127)) == 0){
3677                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3686         level= temp[i] + 64;
3690         if((level&(~127)) == 0){
3691             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-gradient SAD (intra): sums |s[x] - s[x+stride]| over a 16-wide
 * block, 4 columns per iteration — a measure of vertical activity in the
 * source alone. Row loop and return are elided from this excerpt. */
3699 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3704         for(x=0; x<16; x+=4){
3705             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
3706                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* Vertical-gradient SAD between two images: sums the absolute difference of
 * the per-column vertical gradients of s1 and s2 over 16 columns. Row loop
 * and return are elided from this excerpt. */
3714 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3719         for(x=0; x<16; x++){
3720             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* SQ: squaring helper. vsse_intra16_c is the squared-error counterpart of
 * vsad_intra16_c: sums (s[x]-s[x+stride])^2 over a 16-wide block.
 * Row loop and return are elided from this excerpt. */
3729 #define SQ(a) ((a)*(a))
3730 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3735         for(x=0; x<16; x+=4){
3736             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
3737                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Squared-error counterpart of vsad16_c: sums the squared difference of the
 * vertical gradients of s1 and s2 over 16 columns. Row loop and return are
 * elided from this excerpt. */
3745 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3750         for(x=0; x<16; x++){
3751             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 array and an int16 array of
 * the same length; return statement is elided from this excerpt. */
3760 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3764     for(i=0; i<size; i++)
3765         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 comparison functions from the 8x8 kernels above:
 * WRAPPER8_16_SQ (defined outside this excerpt) presumably applies the 8x8
 * function to the four quadrants and sums — confirm against its definition. */
3769 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3770 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3771 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3773 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3775 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3776 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3777 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3778 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise float multiply over len elements; the loop body
 * (presumably dst[i] *= src[i]) is elided from this excerpt. */
3780 static void vector_fmul_c(float *dst, const float *src, int len){
3782     for(i=0; i<len; i++)
/* dst[i] = src0[i] * src1 read backwards. The negative index is valid only
 * because an elided line first advances src1 to its last element
 * (src1 += len-1 in the full source) — do not read this loop in isolation. */
3786 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3789     for(i=0; i<len; i++)
3790         dst[i] = src0[i] * src1[-i];
/* dst[i*step] = src0[i]*src1[i] + src2[i] + src3 — fused multiply-add-add
 * with a strided destination. Note src3 is an int scalar added to every
 * element, not a pointer, despite the src* naming. */
3793 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3795     for(i=0; i<len; i++)
3796         dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* Float -> int16 conversion with saturation, done by bit-twiddling the IEEE
 * representation: the float is read as its raw 32-bit pattern (this
 * reinterpretation assumes IEEE-754 floats and relies on the elided lines
 * to bias/clamp `tmp`; 0x43c0ffff is the bit pattern just above the maximum
 * in-range value). The final -0x8000 removes the encoding bias.
 * NOTE(review): the (const int32_t*) cast of a float buffer is a strict-
 * aliasing violation by modern standards — confirm the build flags. */
3799 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3801     for(i=0; i<len; i++) {
3802         int_fast32_t tmp = ((const int32_t*)src)[i];
3804             tmp = (0x43c0ffff - tmp)>>31;
3805             // is this faster on some gcc/cpu combinations?
3806             // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3809         dst[i] = tmp - 0x8000;
/* Fixed-point IDCT weights: 2048*sqrt(2)*cos(k*pi/16) rounded to integers,
 * shared by the WMV2 row/column IDCT below. NOTE(review): W0 is used by
 * wmv2_idct_row/col but its #define is not in this excerpt — it lives in an
 * elided line; confirm before editing the IDCT. */
3814 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3815 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3816 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3817 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3818 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3819 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3820 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8-point integer IDCT: odd-coefficient butterflies
 * (a1,a7,a5,a3), even pairs (a2,a6,a0,a4), the sqrt(1/2)-scaled cross terms
 * s1/s2 (181/256 ~= 1/sqrt(2)), then the output butterflies with +128
 * rounding and >>8 descale. Declarations of s1/s2 are in an elided line. */
3822 static void wmv2_idct_row(short * b)
3825     int a0,a1,a2,a3,a4,a5,a6,a7;
3827     a1 = W1*b[1]+W7*b[7];
3828     a7 = W7*b[1]-W1*b[7];
3829     a5 = W5*b[5]+W3*b[3];
3830     a3 = W3*b[5]-W5*b[3];
3831     a2 = W2*b[2]+W6*b[6];
3832     a6 = W6*b[2]-W2*b[6];
3833     a0 = W0*b[0]+W0*b[4];
3834     a4 = W0*b[0]-W0*b[4];
3836     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3837     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3839     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3840     b[1] = (a4+a6 +s1   + (1<<7))>>8;
3841     b[2] = (a4-a6 +s2   + (1<<7))>>8;
3842     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3843     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3844     b[5] = (a4-a6 -s2   + (1<<7))>>8;
3845     b[6] = (a4+a6 -s1   + (1<<7))>>8;
3846     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column (stride 8) of the WMV2 IDCT. Same butterfly structure as
 * wmv2_idct_row but with an extra >>3 pre-scale per term ("extended
 * precision") and a final +2^13 rounding with >>14 descale. */
3848 static void wmv2_idct_col(short * b)
3851     int a0,a1,a2,a3,a4,a5,a6,a7;
3852     /*step 1, with extended precision*/
3853     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3854     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3855     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3856     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3857     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3858     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3859     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3860     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3862     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3863     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3865     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3866     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3867     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3868     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3870     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3871     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3872     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3873     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 IDCT: all rows, then all columns (the two loop headers and
 * their step sizes are elided from this excerpt). */
3875 void ff_wmv2_idct_c(short * block){
3879         wmv2_idct_row(block+i);
3882         wmv2_idct_col(block+i);
/* IDCT + store wrappers: _put clamps and writes the result over dest,
 * _add clamps and accumulates it onto dest. Both mutate `block` in place. */
3885 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3887 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3889     ff_wmv2_idct_c(block);
3890     put_pixels_clamped_c(block, dest, line_size);
3892 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3894     ff_wmv2_idct_c(block);
3895     add_pixels_clamped_c(block, dest, line_size);
/* jref (reference JPEG) IDCT wrappers at full, 1/2, 1/4 and 1/8 resolution
 * for lowres decoding: each runs the appropriately sized j_rev_dct* (those
 * calls are in elided lines) then clamps the result into dest with put/add.
 * The 1x1 variants skip the transform entirely: the DC term (block[0]+4)>>3
 * is the whole output pixel. */
3897 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3900     put_pixels_clamped_c(block, dest, line_size);
3902 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3905     add_pixels_clamped_c(block, dest, line_size);
3908 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3911     put_pixels_clamped4_c(block, dest, line_size);
3913 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3916     add_pixels_clamped4_c(block, dest, line_size);
3919 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3922     put_pixels_clamped2_c(block, dest, line_size);
3924 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3927     add_pixels_clamped2_c(block, dest, line_size);
3930 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3932     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3934     dest[0] = cm[(block[0] + 4)>>3];
3936 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3938     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3940     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3943 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3945 /* init static data */
/* One-time table setup: ff_cropTbl becomes a clamp-to-[0,255] lookup with
 * MAX_NEG_CROP guard bytes of 0 below and 255 above (the low-side store is
 * in an elided line); ff_squareTbl[i] = (i-256)^2 for signed-difference
 * squaring; inv_zigzag_direct16 is the 1-based inverse of the zigzag scan. */
3946 void dsputil_static_init(void)
3950     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3951     for(i=0;i<MAX_NEG_CROP;i++) {
3953         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3956     for(i=0;i<512;i++) {
3957         ff_squareTbl[i] = (i - 256) * (i - 256);
3960     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Runtime probe for compiler stack alignment: declares a 16-byte-aligned
 * stack variable and checks its actual address. On MMX/AltiVec builds a
 * misaligned result means SIMD code will crash, hence the loud one-time
 * warning (the did_fail latch logic and return are in elided lines). */
3963 int ff_check_alignment(void){
3964     static int did_fail=0;
3965     DECLARE_ALIGNED_16(int, aligned);
3967     if((long)&aligned & 15){
3969 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3970             av_log(NULL, AV_LOG_ERROR,
3971                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3972                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3973                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
3974                 "Do not report crashes to FFmpeg developers.\n");
3983 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3987 ff_check_alignment();
3989 #ifdef CONFIG_ENCODERS
3990 if(avctx->dct_algo==FF_DCT_FASTINT) {
3991 c->fdct = fdct_ifast;
3992 c->fdct248 = fdct_ifast248;
3994 else if(avctx->dct_algo==FF_DCT_FAAN) {
3995 c->fdct = ff_faandct;
3996 c->fdct248 = ff_faandct248;
3999 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4000 c->fdct248 = ff_fdct248_islow;
4002 #endif //CONFIG_ENCODERS
4004 if(avctx->lowres==1){
4005 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4006 c->idct_put= ff_jref_idct4_put;
4007 c->idct_add= ff_jref_idct4_add;
4009 c->idct_put= ff_h264_lowres_idct_put_c;
4010 c->idct_add= ff_h264_lowres_idct_add_c;
4012 c->idct = j_rev_dct4;
4013 c->idct_permutation_type= FF_NO_IDCT_PERM;
4014 }else if(avctx->lowres==2){
4015 c->idct_put= ff_jref_idct2_put;
4016 c->idct_add= ff_jref_idct2_add;
4017 c->idct = j_rev_dct2;
4018 c->idct_permutation_type= FF_NO_IDCT_PERM;
4019 }else if(avctx->lowres==3){
4020 c->idct_put= ff_jref_idct1_put;
4021 c->idct_add= ff_jref_idct1_add;
4022 c->idct = j_rev_dct1;
4023 c->idct_permutation_type= FF_NO_IDCT_PERM;
4025 if(avctx->idct_algo==FF_IDCT_INT){
4026 c->idct_put= ff_jref_idct_put;
4027 c->idct_add= ff_jref_idct_add;
4028 c->idct = j_rev_dct;
4029 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4030 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4031 avctx->idct_algo==FF_IDCT_VP3){
4032 c->idct_put= ff_vp3_idct_put_c;
4033 c->idct_add= ff_vp3_idct_add_c;
4034 c->idct = ff_vp3_idct_c;
4035 c->idct_permutation_type= FF_NO_IDCT_PERM;
4036 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4037 c->idct_put= ff_wmv2_idct_put_c;
4038 c->idct_add= ff_wmv2_idct_add_c;
4039 c->idct = ff_wmv2_idct_c;
4040 c->idct_permutation_type= FF_NO_IDCT_PERM;
4041 }else{ //accurate/default
4042 c->idct_put= ff_simple_idct_put;
4043 c->idct_add= ff_simple_idct_add;
4044 c->idct = ff_simple_idct;
4045 c->idct_permutation_type= FF_NO_IDCT_PERM;
4049 if (ENABLE_H264_DECODER) {
4050 c->h264_idct_add= ff_h264_idct_add_c;
4051 c->h264_idct8_add= ff_h264_idct8_add_c;
4052 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4053 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4056 c->get_pixels = get_pixels_c;
4057 c->diff_pixels = diff_pixels_c;
4058 c->put_pixels_clamped = put_pixels_clamped_c;
4059 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4060 c->add_pixels_clamped = add_pixels_clamped_c;
4061 c->add_pixels8 = add_pixels8_c;
4062 c->add_pixels4 = add_pixels4_c;
4063 c->sum_abs_dctelem = sum_abs_dctelem_c;
4066 c->clear_blocks = clear_blocks_c;
4067 c->pix_sum = pix_sum_c;
4068 c->pix_norm1 = pix_norm1_c;
4070 /* TODO [0] 16 [1] 8 */
4071 c->pix_abs[0][0] = pix_abs16_c;
4072 c->pix_abs[0][1] = pix_abs16_x2_c;
4073 c->pix_abs[0][2] = pix_abs16_y2_c;
4074 c->pix_abs[0][3] = pix_abs16_xy2_c;
4075 c->pix_abs[1][0] = pix_abs8_c;
4076 c->pix_abs[1][1] = pix_abs8_x2_c;
4077 c->pix_abs[1][2] = pix_abs8_y2_c;
4078 c->pix_abs[1][3] = pix_abs8_xy2_c;
4080 #define dspfunc(PFX, IDX, NUM) \
4081 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4082 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4083 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4084 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4086 dspfunc(put, 0, 16);
4087 dspfunc(put_no_rnd, 0, 16);
4089 dspfunc(put_no_rnd, 1, 8);
4093 dspfunc(avg, 0, 16);
4094 dspfunc(avg_no_rnd, 0, 16);
4096 dspfunc(avg_no_rnd, 1, 8);
4101 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4102 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4104 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4105 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4106 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4107 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4108 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4109 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4110 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4111 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4112 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4114 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4115 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4116 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4117 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4118 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4119 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4120 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4121 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4122 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4124 #define dspfunc(PFX, IDX, NUM) \
4125 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4126 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4127 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4128 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4129 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4130 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4131 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4132 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4133 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4134 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4135 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4136 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4137 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4138 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4139 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4140 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4142 dspfunc(put_qpel, 0, 16);
4143 dspfunc(put_no_rnd_qpel, 0, 16);
4145 dspfunc(avg_qpel, 0, 16);
4146 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4148 dspfunc(put_qpel, 1, 8);
4149 dspfunc(put_no_rnd_qpel, 1, 8);
4151 dspfunc(avg_qpel, 1, 8);
4152 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4154 dspfunc(put_h264_qpel, 0, 16);
4155 dspfunc(put_h264_qpel, 1, 8);
4156 dspfunc(put_h264_qpel, 2, 4);
4157 dspfunc(put_h264_qpel, 3, 2);
4158 dspfunc(avg_h264_qpel, 0, 16);
4159 dspfunc(avg_h264_qpel, 1, 8);
4160 dspfunc(avg_h264_qpel, 2, 4);
4163 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4164 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4165 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4166 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4167 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4168 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4169 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4171 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4172 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4173 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4174 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4175 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4176 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4177 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4178 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4179 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4180 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4181 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4182 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4183 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4184 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4185 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4186 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4187 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4188 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4189 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4190 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4192 #ifdef CONFIG_CAVS_DECODER
4193 ff_cavsdsp_init(c,avctx);
4195 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4196 ff_vc1dsp_init(c,avctx);
4198 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4199 ff_intrax8dsp_init(c,avctx);
4201 #if defined(CONFIG_H264_ENCODER)
4202 ff_h264dspenc_init(c,avctx);
4205 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4206 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4207 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4208 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4209 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4210 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4211 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4212 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4214 #define SET_CMP_FUNC(name) \
4215 c->name[0]= name ## 16_c;\
4216 c->name[1]= name ## 8x8_c;
4218 SET_CMP_FUNC(hadamard8_diff)
4219 c->hadamard8_diff[4]= hadamard8_intra16_c;
4220 SET_CMP_FUNC(dct_sad)
4221 SET_CMP_FUNC(dct_max)
4223 SET_CMP_FUNC(dct264_sad)
4225 c->sad[0]= pix_abs16_c;
4226 c->sad[1]= pix_abs8_c;
4230 SET_CMP_FUNC(quant_psnr)
4233 c->vsad[0]= vsad16_c;
4234 c->vsad[4]= vsad_intra16_c;
4235 c->vsse[0]= vsse16_c;
4236 c->vsse[4]= vsse_intra16_c;
4237 c->nsse[0]= nsse16_c;
4238 c->nsse[1]= nsse8_c;
4239 #ifdef CONFIG_SNOW_ENCODER
4240 c->w53[0]= w53_16_c;
4242 c->w97[0]= w97_16_c;
4246 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4248 c->add_bytes= add_bytes_c;
4249 c->add_bytes_l2= add_bytes_l2_c;
4250 c->diff_bytes= diff_bytes_c;
4251 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4252 c->bswap_buf= bswap_buf;
4253 #ifdef CONFIG_PNG_DECODER
4254 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4257 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4258 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4259 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4260 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4261 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4262 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4263 c->h264_loop_filter_strength= NULL;
4265 if (ENABLE_ANY_H263) {
4266 c->h263_h_loop_filter= h263_h_loop_filter_c;
4267 c->h263_v_loop_filter= h263_v_loop_filter_c;
4270 c->h261_loop_filter= h261_loop_filter_c;
4272 c->try_8x8basis= try_8x8basis_c;
4273 c->add_8x8basis= add_8x8basis_c;
4275 #ifdef CONFIG_SNOW_DECODER
4276 c->vertical_compose97i = ff_snow_vertical_compose97i;
4277 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4278 c->inner_add_yblock = ff_snow_inner_add_yblock;
4281 #ifdef CONFIG_VORBIS_DECODER
4282 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4284 #ifdef CONFIG_FLAC_ENCODER
4285 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4287 c->vector_fmul = vector_fmul_c;
4288 c->vector_fmul_reverse = vector_fmul_reverse_c;
4289 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4290 c->float_to_int16 = ff_float_to_int16_c;
4292 c->shrink[0]= ff_img_copy_plane;
4293 c->shrink[1]= ff_shrink22;
4294 c->shrink[2]= ff_shrink44;
4295 c->shrink[3]= ff_shrink88;
4297 c->prefetch= just_return;
4299 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4300 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4302 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4303 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4304 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4305 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4306 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4307 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4308 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4309 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4310 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4312 for(i=0; i<64; i++){
4313 if(!c->put_2tap_qpel_pixels_tab[0][i])
4314 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4315 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4316 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4319 switch(c->idct_permutation_type){
4320 case FF_NO_IDCT_PERM:
4322 c->idct_permutation[i]= i;
4324 case FF_LIBMPEG2_IDCT_PERM:
4326 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4328 case FF_SIMPLE_IDCT_PERM:
4330 c->idct_permutation[i]= simple_mmx_permutation[i];
4332 case FF_TRANSPOSE_IDCT_PERM:
4334 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4336 case FF_PARTTRANS_IDCT_PERM:
4338 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4341 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");