libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "h263.h"
  36 #include "snow.h"
  37
/*
 * Prototypes of routines that are referenced from this file but, according to
 * the per-declaration notes below, implemented in other source files.
 */

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  52
/* Lookup table that the *_clamped_c routines in this file index through
 * (ff_cropTbl + MAX_NEG_CROP), so that values on either side of 0..255 can be
 * looked up directly. Only zero-initialized here; presumably filled in by an
 * init routine elsewhere -- TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table that the sse*_c / pix_norm1_c routines in this file index
 * through (ff_squareTbl + 256) with values in -255..255. Only zero-initialized
 * here; presumably filled with squares elsewhere -- TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
  55
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 is the constant with 0x01 in every byte of an unsigned long;
//  multiplying it by a byte value replicates that value into every byte)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
  59
/* Zigzag scan order for a 64-entry block: entry i is the index (0..63) of the
 * coefficient visited at scan position i. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  70
/* Specific zigzag scan for the 248 IDCT. NOTE that, unlike the
   specification, we interleave the fields. Same layout as the other scan
   tables: entry i is the coefficient index visited at scan position i. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
  83
/* Non-permuted inverse of ff_zigzag_direct, plus 1, for the MMX quantizer.
 * Only zero-initialized here; the actual values are presumably written by an
 * init routine elsewhere -- TODO confirm. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  86
/* Alternate ("horizontal") scan order for a 64-entry block: entry i is the
 * coefficient index (0..63) visited at scan position i. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  97
/* Alternate ("vertical") scan order for a 64-entry block: entry i is the
 * coefficient index (0..63) visited at scan position i. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 108
/* Fixed-point reciprocals: a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255.
 * For b >= 2 the entries are 2^32/b rounded up; entry 1 holds 2^32-1 (2^32
 * itself does not fit in 32 bits) and entry 0 is 0, which is why the identity
 * above is only stated for b >= 2. The product needs a 64-bit intermediate. */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 144
/* Input permutation for the simple_idct_mmx: 64 entries, each a distinct
 * coefficient index in 0x00..0x3F. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
 156
/* 8-entry permutation of 0..7; judging by the name it reorders the elements
 * of a row for an SSE2 IDCT -- its use is outside this excerpt, TODO confirm. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 158
 159 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 160     int i;
 161     int end;
 162
 163     st->scantable= src_scantable;
 164
 165     for(i=0; i<64; i++){
 166         int j;
 167         j = src_scantable[i];
 168         st->permutated[i] = permutation[j];
 169 #ifdef ARCH_POWERPC
 170         st->inverse[j] = i;
 171 #endif
 172     }
 173
 174     end=-1;
 175     for(i=0; i<64; i++){
 176         int j;
 177         j = st->permutated[i];
 178         if(j>end) end=j;
 179         st->raster_end[i]= end;
 180     }
 181 }
 182
 183 static int pix_sum_c(uint8_t * pix, int line_size)
 184 {
 185     int s, i, j;
 186
 187     s = 0;
 188     for (i = 0; i < 16; i++) {
 189         for (j = 0; j < 16; j += 8) {
 190             s += pix[0];
 191             s += pix[1];
 192             s += pix[2];
 193             s += pix[3];
 194             s += pix[4];
 195             s += pix[5];
 196             s += pix[6];
 197             s += pix[7];
 198             pix += 8;
 199         }
 200         pix += line_size - 16;
 201     }
 202     return s;
 203 }
 204
 205 static int pix_norm1_c(uint8_t * pix, int line_size)
 206 {
 207     int s, i, j;
 208     uint32_t *sq = ff_squareTbl + 256;
 209
 210     s = 0;
 211     for (i = 0; i < 16; i++) {
 212         for (j = 0; j < 16; j += 8) {
 213 #if 0
 214             s += sq[pix[0]];
 215             s += sq[pix[1]];
 216             s += sq[pix[2]];
 217             s += sq[pix[3]];
 218             s += sq[pix[4]];
 219             s += sq[pix[5]];
 220             s += sq[pix[6]];
 221             s += sq[pix[7]];
 222 #else
 223 #if LONG_MAX > 2147483647
 224             register uint64_t x=*(uint64_t*)pix;
 225             s += sq[x&0xff];
 226             s += sq[(x>>8)&0xff];
 227             s += sq[(x>>16)&0xff];
 228             s += sq[(x>>24)&0xff];
 229             s += sq[(x>>32)&0xff];
 230             s += sq[(x>>40)&0xff];
 231             s += sq[(x>>48)&0xff];
 232             s += sq[(x>>56)&0xff];
 233 #else
 234             register uint32_t x=*(uint32_t*)pix;
 235             s += sq[x&0xff];
 236             s += sq[(x>>8)&0xff];
 237             s += sq[(x>>16)&0xff];
 238             s += sq[(x>>24)&0xff];
 239             x=*(uint32_t*)(pix+4);
 240             s += sq[x&0xff];
 241             s += sq[(x>>8)&0xff];
 242             s += sq[(x>>16)&0xff];
 243             s += sq[(x>>24)&0xff];
 244 #endif
 245 #endif
 246             pix += 8;
 247         }
 248         pix += line_size - 16;
 249     }
 250     return s;
 251 }
 252
 253 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 254     int i;
 255
 256     for(i=0; i+8<=w; i+=8){
 257         dst[i+0]= bswap_32(src[i+0]);
 258         dst[i+1]= bswap_32(src[i+1]);
 259         dst[i+2]= bswap_32(src[i+2]);
 260         dst[i+3]= bswap_32(src[i+3]);
 261         dst[i+4]= bswap_32(src[i+4]);
 262         dst[i+5]= bswap_32(src[i+5]);
 263         dst[i+6]= bswap_32(src[i+6]);
 264         dst[i+7]= bswap_32(src[i+7]);
 265     }
 266     for(;i<w; i++){
 267         dst[i+0]= bswap_32(src[i+0]);
 268     }
 269 }
 270
 271 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 272 {
 273     int s, i;
 274     uint32_t *sq = ff_squareTbl + 256;
 275
 276     s = 0;
 277     for (i = 0; i < h; i++) {
 278         s += sq[pix1[0] - pix2[0]];
 279         s += sq[pix1[1] - pix2[1]];
 280         s += sq[pix1[2] - pix2[2]];
 281         s += sq[pix1[3] - pix2[3]];
 282         pix1 += line_size;
 283         pix2 += line_size;
 284     }
 285     return s;
 286 }
 287
 288 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 289 {
 290     int s, i;
 291     uint32_t *sq = ff_squareTbl + 256;
 292
 293     s = 0;
 294     for (i = 0; i < h; i++) {
 295         s += sq[pix1[0] - pix2[0]];
 296         s += sq[pix1[1] - pix2[1]];
 297         s += sq[pix1[2] - pix2[2]];
 298         s += sq[pix1[3] - pix2[3]];
 299         s += sq[pix1[4] - pix2[4]];
 300         s += sq[pix1[5] - pix2[5]];
 301         s += sq[pix1[6] - pix2[6]];
 302         s += sq[pix1[7] - pix2[7]];
 303         pix1 += line_size;
 304         pix2 += line_size;
 305     }
 306     return s;
 307 }
 308
 309 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 310 {
 311     int s, i;
 312     uint32_t *sq = ff_squareTbl + 256;
 313
 314     s = 0;
 315     for (i = 0; i < h; i++) {
 316         s += sq[pix1[ 0] - pix2[ 0]];
 317         s += sq[pix1[ 1] - pix2[ 1]];
 318         s += sq[pix1[ 2] - pix2[ 2]];
 319         s += sq[pix1[ 3] - pix2[ 3]];
 320         s += sq[pix1[ 4] - pix2[ 4]];
 321         s += sq[pix1[ 5] - pix2[ 5]];
 322         s += sq[pix1[ 6] - pix2[ 6]];
 323         s += sq[pix1[ 7] - pix2[ 7]];
 324         s += sq[pix1[ 8] - pix2[ 8]];
 325         s += sq[pix1[ 9] - pix2[ 9]];
 326         s += sq[pix1[10] - pix2[10]];
 327         s += sq[pix1[11] - pix2[11]];
 328         s += sq[pix1[12] - pix2[12]];
 329         s += sq[pix1[13] - pix2[13]];
 330         s += sq[pix1[14] - pix2[14]];
 331         s += sq[pix1[15] - pix2[15]];
 332
 333         pix1 += line_size;
 334         pix2 += line_size;
 335     }
 336     return s;
 337 }
 338
 339
 340 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 341 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 342     int s, i, j;
 343     const int dec_count= w==8 ? 3 : 4;
 344     int tmp[32*32];
 345     int level, ori;
 346     static const int scale[2][2][4][4]={
 347       {
 348         {
 349             // 9/7 8x8 dec=3
 350             {268, 239, 239, 213},
 351             {  0, 224, 224, 152},
 352             {  0, 135, 135, 110},
 353         },{
 354             // 9/7 16x16 or 32x32 dec=4
 355             {344, 310, 310, 280},
 356             {  0, 320, 320, 228},
 357             {  0, 175, 175, 136},
 358             {  0, 129, 129, 102},
 359         }
 360       },{
 361         {
 362             // 5/3 8x8 dec=3
 363             {275, 245, 245, 218},
 364             {  0, 230, 230, 156},
 365             {  0, 138, 138, 113},
 366         },{
 367             // 5/3 16x16 or 32x32 dec=4
 368             {352, 317, 317, 286},
 369             {  0, 328, 328, 233},
 370             {  0, 180, 180, 140},
 371             {  0, 132, 132, 105},
 372         }
 373       }
 374     };
 375
 376     for (i = 0; i < h; i++) {
 377         for (j = 0; j < w; j+=4) {
 378             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 379             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 380             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 381             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 382         }
 383         pix1 += line_size;
 384         pix2 += line_size;
 385     }
 386
 387     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 388
 389     s=0;
 390     assert(w==h);
 391     for(level=0; level<dec_count; level++){
 392         for(ori= level ? 1 : 0; ori<4; ori++){
 393             int size= w>>(dec_count-level);
 394             int sx= (ori&1) ? size : 0;
 395             int stride= 32<<(dec_count-level);
 396             int sy= (ori&2) ? stride>>1 : 0;
 397
 398             for(i=0; i<size; i++){
 399                 for(j=0; j<size; j++){
 400                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 401                     s += FFABS(v);
 402                 }
 403             }
 404         }
 405     }
 406     assert(s>=0);
 407     return s>>9;
 408 }
 409
 410 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 411     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 412 }
 413
 414 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 415     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 416 }
 417
 418 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 419     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 420 }
 421
 422 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 423     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 424 }
 425
 426 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 427     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 428 }
 429
 430 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 431     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 432 }
 433 #endif
 434
 435 /* draw the edges of width 'w' of an image of size width, height */
 436 //FIXME check that this is ok for mpeg4 interlaced
 437 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 438 {
 439     uint8_t *ptr, *last_line;
 440     int i;
 441
 442     last_line = buf + (height - 1) * wrap;
 443     for(i=0;i<w;i++) {
 444         /* top and bottom */
 445         memcpy(buf - (i + 1) * wrap, buf, width);
 446         memcpy(last_line + (i + 1) * wrap, last_line, width);
 447     }
 448     /* left and right */
 449     ptr = buf;
 450     for(i=0;i<height;i++) {
 451         memset(ptr - w, ptr[0], w);
 452         memset(ptr + width, ptr[width-1], w);
 453         ptr += wrap;
 454     }
 455     /* corners */
 456     for(i=0;i<w;i++) {
 457         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 458         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 459         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 460         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 461     }
 462 }
 463
 464 /**
 465  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 466  * @param buf destination buffer
 467  * @param src source buffer
 468  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 469  * @param block_w width of block
 470  * @param block_h height of block
 471  * @param src_x x coordinate of the top left sample of the block in the source buffer
 472  * @param src_y y coordinate of the top left sample of the block in the source buffer
 473  * @param w width of the source buffer
 474  * @param h height of the source buffer
 475  */
 476 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 477                                     int src_x, int src_y, int w, int h){
 478     int x, y;
 479     int start_y, start_x, end_y, end_x;
 480
 481     if(src_y>= h){
 482         src+= (h-1-src_y)*linesize;
 483         src_y=h-1;
 484     }else if(src_y<=-block_h){
 485         src+= (1-block_h-src_y)*linesize;
 486         src_y=1-block_h;
 487     }
 488     if(src_x>= w){
 489         src+= (w-1-src_x);
 490         src_x=w-1;
 491     }else if(src_x<=-block_w){
 492         src+= (1-block_w-src_x);
 493         src_x=1-block_w;
 494     }
 495
 496     start_y= FFMAX(0, -src_y);
 497     start_x= FFMAX(0, -src_x);
 498     end_y= FFMIN(block_h, h-src_y);
 499     end_x= FFMIN(block_w, w-src_x);
 500
 501     // copy existing part
 502     for(y=start_y; y<end_y; y++){
 503         for(x=start_x; x<end_x; x++){
 504             buf[x + y*linesize]= src[x + y*linesize];
 505         }
 506     }
 507
 508     //top
 509     for(y=0; y<start_y; y++){
 510         for(x=start_x; x<end_x; x++){
 511             buf[x + y*linesize]= buf[x + start_y*linesize];
 512         }
 513     }
 514
 515     //bottom
 516     for(y=end_y; y<block_h; y++){
 517         for(x=start_x; x<end_x; x++){
 518             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 519         }
 520     }
 521
 522     for(y=0; y<block_h; y++){
 523        //left
 524         for(x=0; x<start_x; x++){
 525             buf[x + y*linesize]= buf[start_x + y*linesize];
 526         }
 527
 528        //right
 529         for(x=end_x; x<block_w; x++){
 530             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 531         }
 532     }
 533 }
 534
 535 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 536 {
 537     int i;
 538
 539     /* read the pixels */
 540     for(i=0;i<8;i++) {
 541         block[0] = pixels[0];
 542         block[1] = pixels[1];
 543         block[2] = pixels[2];
 544         block[3] = pixels[3];
 545         block[4] = pixels[4];
 546         block[5] = pixels[5];
 547         block[6] = pixels[6];
 548         block[7] = pixels[7];
 549         pixels += line_size;
 550         block += 8;
 551     }
 552 }
 553
 554 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 555                           const uint8_t *s2, int stride){
 556     int i;
 557
 558     /* read the pixels */
 559     for(i=0;i<8;i++) {
 560         block[0] = s1[0] - s2[0];
 561         block[1] = s1[1] - s2[1];
 562         block[2] = s1[2] - s2[2];
 563         block[3] = s1[3] - s2[3];
 564         block[4] = s1[4] - s2[4];
 565         block[5] = s1[5] - s2[5];
 566         block[6] = s1[6] - s2[6];
 567         block[7] = s1[7] - s2[7];
 568         s1 += stride;
 569         s2 += stride;
 570         block += 8;
 571     }
 572 }
 573
 574
 575 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 576                                  int line_size)
 577 {
 578     int i;
 579     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 580
 581     /* read the pixels */
 582     for(i=0;i<8;i++) {
 583         pixels[0] = cm[block[0]];
 584         pixels[1] = cm[block[1]];
 585         pixels[2] = cm[block[2]];
 586         pixels[3] = cm[block[3]];
 587         pixels[4] = cm[block[4]];
 588         pixels[5] = cm[block[5]];
 589         pixels[6] = cm[block[6]];
 590         pixels[7] = cm[block[7]];
 591
 592         pixels += line_size;
 593         block += 8;
 594     }
 595 }
 596
 597 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 598                                  int line_size)
 599 {
 600     int i;
 601     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 602
 603     /* read the pixels */
 604     for(i=0;i<4;i++) {
 605         pixels[0] = cm[block[0]];
 606         pixels[1] = cm[block[1]];
 607         pixels[2] = cm[block[2]];
 608         pixels[3] = cm[block[3]];
 609
 610         pixels += line_size;
 611         block += 8;
 612     }
 613 }
 614
 615 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 616                                  int line_size)
 617 {
 618     int i;
 619     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 620
 621     /* read the pixels */
 622     for(i=0;i<2;i++) {
 623         pixels[0] = cm[block[0]];
 624         pixels[1] = cm[block[1]];
 625
 626         pixels += line_size;
 627         block += 8;
 628     }
 629 }
 630
 631 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 632                                         uint8_t *restrict pixels,
 633                                         int line_size)
 634 {
 635     int i, j;
 636
 637     for (i = 0; i < 8; i++) {
 638         for (j = 0; j < 8; j++) {
 639             if (*block < -128)
 640                 *pixels = 0;
 641             else if (*block > 127)
 642                 *pixels = 255;
 643             else
 644                 *pixels = (uint8_t)(*block + 128);
 645             block++;
 646             pixels++;
 647         }
 648         pixels += (line_size - 8);
 649     }
 650 }
 651
 652 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 653                           int line_size)
 654 {
 655     int i;
 656     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 657
 658     /* read the pixels */
 659     for(i=0;i<8;i++) {
 660         pixels[0] = cm[pixels[0] + block[0]];
 661         pixels[1] = cm[pixels[1] + block[1]];
 662         pixels[2] = cm[pixels[2] + block[2]];
 663         pixels[3] = cm[pixels[3] + block[3]];
 664         pixels[4] = cm[pixels[4] + block[4]];
 665         pixels[5] = cm[pixels[5] + block[5]];
 666         pixels[6] = cm[pixels[6] + block[6]];
 667         pixels[7] = cm[pixels[7] + block[7]];
 668         pixels += line_size;
 669         block += 8;
 670     }
 671 }
 672
 673 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 674                           int line_size)
 675 {
 676     int i;
 677     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 678
 679     /* read the pixels */
 680     for(i=0;i<4;i++) {
 681         pixels[0] = cm[pixels[0] + block[0]];
 682         pixels[1] = cm[pixels[1] + block[1]];
 683         pixels[2] = cm[pixels[2] + block[2]];
 684         pixels[3] = cm[pixels[3] + block[3]];
 685         pixels += line_size;
 686         block += 8;
 687     }
 688 }
 689
 690 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 691                           int line_size)
 692 {
 693     int i;
 694     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 695
 696     /* read the pixels */
 697     for(i=0;i<2;i++) {
 698         pixels[0] = cm[pixels[0] + block[0]];
 699         pixels[1] = cm[pixels[1] + block[1]];
 700         pixels += line_size;
 701         block += 8;
 702     }
 703 }
 704
 705 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 706 {
 707     int i;
 708     for(i=0;i<8;i++) {
 709         pixels[0] += block[0];
 710         pixels[1] += block[1];
 711         pixels[2] += block[2];
 712         pixels[3] += block[3];
 713         pixels[4] += block[4];
 714         pixels[5] += block[5];
 715         pixels[6] += block[6];
 716         pixels[7] += block[7];
 717         pixels += line_size;
 718         block += 8;
 719     }
 720 }
 721
 722 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 723 {
 724     int i;
 725     for(i=0;i<4;i++) {
 726         pixels[0] += block[0];
 727         pixels[1] += block[1];
 728         pixels[2] += block[2];
 729         pixels[3] += block[3];
 730         pixels += line_size;
 731         block += 4;
 732     }
 733 }
 734
 735 static int sum_abs_dctelem_c(DCTELEM *block)
 736 {
 737     int sum=0, i;
 738     for(i=0; i<64; i++)
 739         sum+= FFABS(block[i]);
 740     return sum;
 741 }
 742
 743 #if 0
 744
/*
 * PIXOP2(OPNAME, OP) -- variant that handles 8 samples at once in a uint64_t.
 * This definition sits in the "#if 0" branch, so it is currently compiled out.
 *
 * It generates block copy / half-sample interpolation functions named
 * OPNAME_...; OP(dst, value) is the store operation applied to each 8-byte
 * destination word. Per generated function:
 *   _pixels           plain copy of h rows
 *   _pixels_x2/_y2    per-byte average of two neighbours (right / below),
 *                     rounding up:   (a|b) - (((a^b) & 0xFE..) >> 1)
 *   _no_rnd_..x2/_y2  same, rounding down: (a&b) + (((a^b) & 0xFE..) >> 1)
 *   _pixels_xy2       per-byte average of four neighbours; every byte is
 *                     split into its low 2 bits (l0/l1) and its high 6 bits
 *                     pre-shifted by 2 (h0/h1) so that the byte-wise sums
 *                     cannot carry into the adjacent byte. The rounding
 *                     term is 2 per byte, or 1 in the _no_rnd_ version.
 *                     Two rows are produced per loop iteration.
 * The trailing CALL_2X_PIXELS lines derive 16-sample-wide versions.
 *
 * NOTE(review): the first function is named OPNAME_pixels while
 * CALL_2X_PIXELS refers to OPNAME_pixels_c; this would have to be reconciled
 * before the branch could be enabled.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* Store operation for the "avg" functions: replaces a by the per-byte average
 * of a and b, rounding up, for eight bytes packed in a 64-bit word. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 885 #else // 64 bit variant
 886
 887 #define PIXOP2(OPNAME, OP) \
 888 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 889     int i;\
 890     for(i=0; i<h; i++){\
 891         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 892         pixels+=line_size;\
 893         block +=line_size;\
 894     }\
 895 }\
 896 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 897     int i;\
 898     for(i=0; i<h; i++){\
 899         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 900         pixels+=line_size;\
 901         block +=line_size;\
 902     }\
 903 }\
 904 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 905     int i;\
 906     for(i=0; i<h; i++){\
 907         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 908         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 909         pixels+=line_size;\
 910         block +=line_size;\
 911     }\
 912 }\
 913 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 914     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 915 }\
 916 \
 917 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 918                                                 int src_stride1, int src_stride2, int h){\
 919     int i;\
 920     for(i=0; i<h; i++){\
 921         uint32_t a,b;\
 922         a= AV_RN32(&src1[i*src_stride1  ]);\
 923         b= AV_RN32(&src2[i*src_stride2  ]);\
 924         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 925         a= AV_RN32(&src1[i*src_stride1+4]);\
 926         b= AV_RN32(&src2[i*src_stride2+4]);\
 927         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 928     }\
 929 }\
 930 \
 931 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 932                                                 int src_stride1, int src_stride2, int h){\
 933     int i;\
 934     for(i=0; i<h; i++){\
 935         uint32_t a,b;\
 936         a= AV_RN32(&src1[i*src_stride1  ]);\
 937         b= AV_RN32(&src2[i*src_stride2  ]);\
 938         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 939         a= AV_RN32(&src1[i*src_stride1+4]);\
 940         b= AV_RN32(&src2[i*src_stride2+4]);\
 941         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 942     }\
 943 }\
 944 \
 945 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 946                                                 int src_stride1, int src_stride2, int h){\
 947     int i;\
 948     for(i=0; i<h; i++){\
 949         uint32_t a,b;\
 950         a= AV_RN32(&src1[i*src_stride1  ]);\
 951         b= AV_RN32(&src2[i*src_stride2  ]);\
 952         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 953     }\
 954 }\
 955 \
 956 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 957                                                 int src_stride1, int src_stride2, int h){\
 958     int i;\
 959     for(i=0; i<h; i++){\
 960         uint32_t a,b;\
 961         a= AV_RN16(&src1[i*src_stride1  ]);\
 962         b= AV_RN16(&src2[i*src_stride2  ]);\
 963         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 964     }\
 965 }\
 966 \
 967 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 968                                                 int src_stride1, int src_stride2, int h){\
 969     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 970     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 971 }\
 972 \
 973 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 974                                                 int src_stride1, int src_stride2, int h){\
 975     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 976     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 977 }\
 978 \
 979 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 980     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 981 }\
 982 \
 983 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 984     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 985 }\
 986 \
 987 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 988     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 989 }\
 990 \
 991 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 992     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 993 }\
 994 \
 995 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 996                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 997     int i;\
 998     for(i=0; i<h; i++){\
 999         uint32_t a, b, c, d, l0, l1, h0, h1;\
1000         a= AV_RN32(&src1[i*src_stride1]);\
1001         b= AV_RN32(&src2[i*src_stride2]);\
1002         c= AV_RN32(&src3[i*src_stride3]);\
1003         d= AV_RN32(&src4[i*src_stride4]);\
1004         l0=  (a&0x03030303UL)\
1005            + (b&0x03030303UL)\
1006            + 0x02020202UL;\
1007         h0= ((a&0xFCFCFCFCUL)>>2)\
1008           + ((b&0xFCFCFCFCUL)>>2);\
1009         l1=  (c&0x03030303UL)\
1010            + (d&0x03030303UL);\
1011         h1= ((c&0xFCFCFCFCUL)>>2)\
1012           + ((d&0xFCFCFCFCUL)>>2);\
1013         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1014         a= AV_RN32(&src1[i*src_stride1+4]);\
1015         b= AV_RN32(&src2[i*src_stride2+4]);\
1016         c= AV_RN32(&src3[i*src_stride3+4]);\
1017         d= AV_RN32(&src4[i*src_stride4+4]);\
1018         l0=  (a&0x03030303UL)\
1019            + (b&0x03030303UL)\
1020            + 0x02020202UL;\
1021         h0= ((a&0xFCFCFCFCUL)>>2)\
1022           + ((b&0xFCFCFCFCUL)>>2);\
1023         l1=  (c&0x03030303UL)\
1024            + (d&0x03030303UL);\
1025         h1= ((c&0xFCFCFCFCUL)>>2)\
1026           + ((d&0xFCFCFCFCUL)>>2);\
1027         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1028     }\
1029 }\
1030 \
1031 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1032     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1033 }\
1034 \
1035 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1036     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1037 }\
1038 \
1039 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1040     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1041 }\
1042 \
1043 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1044     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1045 }\
1046 \
1047 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1048                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1049     int i;\
1050     for(i=0; i<h; i++){\
1051         uint32_t a, b, c, d, l0, l1, h0, h1;\
1052         a= AV_RN32(&src1[i*src_stride1]);\
1053         b= AV_RN32(&src2[i*src_stride2]);\
1054         c= AV_RN32(&src3[i*src_stride3]);\
1055         d= AV_RN32(&src4[i*src_stride4]);\
1056         l0=  (a&0x03030303UL)\
1057            + (b&0x03030303UL)\
1058            + 0x01010101UL;\
1059         h0= ((a&0xFCFCFCFCUL)>>2)\
1060           + ((b&0xFCFCFCFCUL)>>2);\
1061         l1=  (c&0x03030303UL)\
1062            + (d&0x03030303UL);\
1063         h1= ((c&0xFCFCFCFCUL)>>2)\
1064           + ((d&0xFCFCFCFCUL)>>2);\
1065         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1066         a= AV_RN32(&src1[i*src_stride1+4]);\
1067         b= AV_RN32(&src2[i*src_stride2+4]);\
1068         c= AV_RN32(&src3[i*src_stride3+4]);\
1069         d= AV_RN32(&src4[i*src_stride4+4]);\
1070         l0=  (a&0x03030303UL)\
1071            + (b&0x03030303UL)\
1072            + 0x01010101UL;\
1073         h0= ((a&0xFCFCFCFCUL)>>2)\
1074           + ((b&0xFCFCFCFCUL)>>2);\
1075         l1=  (c&0x03030303UL)\
1076            + (d&0x03030303UL);\
1077         h1= ((c&0xFCFCFCFCUL)>>2)\
1078           + ((d&0xFCFCFCFCUL)>>2);\
1079         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1080     }\
1081 }\
1082 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1083                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1084     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1085     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1086 }\
1087 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1088                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1089     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1090     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1091 }\
1092 \
1093 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1094 {\
1095         int i, a0, b0, a1, b1;\
1096         a0= pixels[0];\
1097         b0= pixels[1] + 2;\
1098         a0 += b0;\
1099         b0 += pixels[2];\
1100 \
1101         pixels+=line_size;\
1102         for(i=0; i<h; i+=2){\
1103             a1= pixels[0];\
1104             b1= pixels[1];\
1105             a1 += b1;\
1106             b1 += pixels[2];\
1107 \
1108             block[0]= (a1+a0)>>2; /* FIXME non put */\
1109             block[1]= (b1+b0)>>2;\
1110 \
1111             pixels+=line_size;\
1112             block +=line_size;\
1113 \
1114             a0= pixels[0];\
1115             b0= pixels[1] + 2;\
1116             a0 += b0;\
1117             b0 += pixels[2];\
1118 \
1119             block[0]= (a1+a0)>>2;\
1120             block[1]= (b1+b0)>>2;\
1121             pixels+=line_size;\
1122             block +=line_size;\
1123         }\
1124 }\
1125 \
1126 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1127 {\
1128         int i;\
1129         const uint32_t a= AV_RN32(pixels  );\
1130         const uint32_t b= AV_RN32(pixels+1);\
1131         uint32_t l0=  (a&0x03030303UL)\
1132                     + (b&0x03030303UL)\
1133                     + 0x02020202UL;\
1134         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1135                    + ((b&0xFCFCFCFCUL)>>2);\
1136         uint32_t l1,h1;\
1137 \
1138         pixels+=line_size;\
1139         for(i=0; i<h; i+=2){\
1140             uint32_t a= AV_RN32(pixels  );\
1141             uint32_t b= AV_RN32(pixels+1);\
1142             l1=  (a&0x03030303UL)\
1143                + (b&0x03030303UL);\
1144             h1= ((a&0xFCFCFCFCUL)>>2)\
1145               + ((b&0xFCFCFCFCUL)>>2);\
1146             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1147             pixels+=line_size;\
1148             block +=line_size;\
1149             a= AV_RN32(pixels  );\
1150             b= AV_RN32(pixels+1);\
1151             l0=  (a&0x03030303UL)\
1152                + (b&0x03030303UL)\
1153                + 0x02020202UL;\
1154             h0= ((a&0xFCFCFCFCUL)>>2)\
1155               + ((b&0xFCFCFCFCUL)>>2);\
1156             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1157             pixels+=line_size;\
1158             block +=line_size;\
1159         }\
1160 }\
1161 \
1162 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1163 {\
1164     int j;\
1165     for(j=0; j<2; j++){\
1166         int i;\
1167         const uint32_t a= AV_RN32(pixels  );\
1168         const uint32_t b= AV_RN32(pixels+1);\
1169         uint32_t l0=  (a&0x03030303UL)\
1170                     + (b&0x03030303UL)\
1171                     + 0x02020202UL;\
1172         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1173                    + ((b&0xFCFCFCFCUL)>>2);\
1174         uint32_t l1,h1;\
1175 \
1176         pixels+=line_size;\
1177         for(i=0; i<h; i+=2){\
1178             uint32_t a= AV_RN32(pixels  );\
1179             uint32_t b= AV_RN32(pixels+1);\
1180             l1=  (a&0x03030303UL)\
1181                + (b&0x03030303UL);\
1182             h1= ((a&0xFCFCFCFCUL)>>2)\
1183               + ((b&0xFCFCFCFCUL)>>2);\
1184             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1185             pixels+=line_size;\
1186             block +=line_size;\
1187             a= AV_RN32(pixels  );\
1188             b= AV_RN32(pixels+1);\
1189             l0=  (a&0x03030303UL)\
1190                + (b&0x03030303UL)\
1191                + 0x02020202UL;\
1192             h0= ((a&0xFCFCFCFCUL)>>2)\
1193               + ((b&0xFCFCFCFCUL)>>2);\
1194             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1195             pixels+=line_size;\
1196             block +=line_size;\
1197         }\
1198         pixels+=4-line_size*(h+1);\
1199         block +=4-line_size*h;\
1200     }\
1201 }\
1202 \
1203 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1204 {\
1205     int j;\
1206     for(j=0; j<2; j++){\
1207         int i;\
1208         const uint32_t a= AV_RN32(pixels  );\
1209         const uint32_t b= AV_RN32(pixels+1);\
1210         uint32_t l0=  (a&0x03030303UL)\
1211                     + (b&0x03030303UL)\
1212                     + 0x01010101UL;\
1213         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1214                    + ((b&0xFCFCFCFCUL)>>2);\
1215         uint32_t l1,h1;\
1216 \
1217         pixels+=line_size;\
1218         for(i=0; i<h; i+=2){\
1219             uint32_t a= AV_RN32(pixels  );\
1220             uint32_t b= AV_RN32(pixels+1);\
1221             l1=  (a&0x03030303UL)\
1222                + (b&0x03030303UL);\
1223             h1= ((a&0xFCFCFCFCUL)>>2)\
1224               + ((b&0xFCFCFCFCUL)>>2);\
1225             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1226             pixels+=line_size;\
1227             block +=line_size;\
1228             a= AV_RN32(pixels  );\
1229             b= AV_RN32(pixels+1);\
1230             l0=  (a&0x03030303UL)\
1231                + (b&0x03030303UL)\
1232                + 0x01010101UL;\
1233             h0= ((a&0xFCFCFCFCUL)>>2)\
1234               + ((b&0xFCFCFCFCUL)>>2);\
1235             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1236             pixels+=line_size;\
1237             block +=line_size;\
1238         }\
1239         pixels+=4-line_size*(h+1);\
1240         block +=4-line_size*h;\
1241     }\
1242 }\
1243 \
1244 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1245 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1246 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1247 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1248 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1249 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1250 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1251 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1252
/* Store operation of the avg flavour for this branch: a is replaced by
 * rnd_avg32(a, b), i.e. the new value is combined with what a already holds. */
#define op_avg(a, b) a = rnd_avg32(a, b)
1254 #endif
/* Store operation of the put flavour: plain assignment of b to a. */
#define op_put(a, b) a = b

/* Expand PIXOP2 twice: the avg_* helpers store through op_avg, the put_*
 * helpers through op_put. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
/* The store operations are dropped again once both expansions are done. */
#undef op_avg
#undef op_put
1261
/* Rounded-up average of two values: (a + b + 1) >> 1.  The arguments are
 * parenthesized so that expressions with operators of lower precedence than
 * '+' (|, &, ?:, ...) are averaged as a whole. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded average of four values: (a + b + c + d + 2) >> 2.  The arguments
 * are parenthesized so that expressions with operators of lower precedence
 * than '+' (|, &, ?:, ...) are averaged as a whole. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1264
/**
 * Single-stride wrapper around put_no_rnd_pixels16_l2(): the destination and
 * both sources a and b all use the same stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1268
/**
 * Single-stride wrapper around put_no_rnd_pixels8_l2(): the destination and
 * both sources a and b all use the same stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1272
/**
 * Bilinear interpolation of an 8 pixel wide block of h rows.
 *
 * Each output pixel is the weighted sum of the 2x2 source neighbourhood
 * starting at the same position; x16 and y16 are the horizontal and vertical
 * weights in 1/16 units, so the four weights add up to 256.  rounder is
 * added before the final shift by 8.  Each row reads 9 source pixels, and the
 * row below is read as well.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16-x16)*(16-y16);   /* top left     */
    const int w_tr = (   x16)*(16-y16);   /* top right    */
    const int w_bl = (16-x16)*(   y16);   /* bottom left  */
    const int w_br = (   x16)*(   y16);   /* bottom right */
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *upper = src;
        const uint8_t *lower = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl*upper[col] + w_tr*upper[col+1] + w_bl*lower[col] + w_br*lower[col+1] + rounder)>>8;
        }
        dst += stride;
        src += stride;
    }
}
1295
/**
 * Warped 8 pixel wide block fetch with bilinear interpolation.
 *
 * For every output pixel a source position (vx, vy) is tracked: it starts at
 * (ox, oy), advances by (dxx, dyx) per column and the row start advances by
 * (dxy, dyy) per row.  vx>>16 and vy>>16 are positions with shift fractional
 * bits; r is added before the final shift by 2*shift.
 *
 * Positions whose integer part lies in [0, width-1) x [0, height-1) are
 * interpolated from their 2x2 neighbourhood.  If only one coordinate is in
 * range the other one is clipped and the interpolation is done along the
 * in-range axis only; if both are out of range the clipped pixel is copied.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1<<shift;
    const int frac_mask = s-1;
    const int last_x    = width -1;
    const int last_y    = height-1;
    int row;

    for(row=0; row<h; row++){
        uint8_t *out= dst + row*stride;
        int vx= ox;
        int vy= oy;
        int col;

        for(col=0; col<8; col++){
            const int pos_x = vx>>16;
            const int pos_y = vy>>16;
            const int frac_x= pos_x & frac_mask;
            const int frac_y= pos_y & frac_mask;
            const int src_x = pos_x>>shift;
            const int src_y = pos_y>>shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_inside= (unsigned)src_x < last_x;
            const int y_inside= (unsigned)src_y < last_y;

            if(x_inside && y_inside){
                const int index= src_x + src_y*stride;
                const int upper= src[index         ]*(s-frac_x)
                               + src[index       +1]*   frac_x;
                const int lower= src[index+stride  ]*(s-frac_x)
                               + src[index+stride+1]*   frac_x;

                out[col]= (upper*(s-frac_y) + lower*frac_y + r)>>(shift*2);
            }else if(x_inside){
                /* row out of range: horizontal interpolation on the clipped row */
                const int index= src_x + av_clip(src_y, 0, last_y)*stride;

                out[col]= ((  src[index  ]*(s-frac_x)
                            + src[index+1]*   frac_x )*s
                           + r)>>(shift*2);
            }else if(y_inside){
                /* column out of range: vertical interpolation on the clipped column */
                const int index= av_clip(src_x, 0, last_x) + src_y*stride;

                out[col]= ((  src[index       ]*(s-frac_y)
                            + src[index+stride]*   frac_y )*s
                           + r)>>(shift*2);
            }else{
                /* both out of range: copy the clipped pixel */
                const int index= av_clip(src_x, 0, last_x) + av_clip(src_y, 0, last_y)*stride;

                out[col]= src[index];
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1353
/**
 * Integer position (no interpolation): dispatch to the put_pixelsN_c copy
 * matching width.  Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1362
/**
 * Horizontal interpolation with weights 2:1 on (src[x], src[x+1]).
 * The weighted sum plus 1 is scaled by 683/2048, which approximates a
 * division by 3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1373
/**
 * Horizontal interpolation with weights 1:2 on (src[x], src[x+1]).
 * The weighted sum plus 1 is scaled by 683/2048, which approximates a
 * division by 3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1384
/**
 * Vertical interpolation with weights 2:1 on (src[x], src[x+stride]).
 * The weighted sum plus 1 is scaled by 683/2048, which approximates a
 * division by 3.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + below[col] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1395
/**
 * 2x2 interpolation with weights 4,3 (current row) and 3,2 (next row).
 * The weighted sum plus 6 is scaled by 2731/32768, which approximates a
 * division by 12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1406
/**
 * 2x2 interpolation with weights 3,2 (current row) and 4,3 (next row).
 * The weighted sum plus 6 is scaled by 2731/32768, which approximates a
 * division by 12.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1417
/**
 * Vertical interpolation with weights 1:2 on (src[x], src[x+stride]).
 * The weighted sum plus 1 is scaled by 683/2048, which approximates a
 * division by 3.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*below[col] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1428
/**
 * 2x2 interpolation with weights 3,4 (current row) and 2,3 (next row).
 * The weighted sum plus 6 is scaled by 2731/32768, which approximates a
 * division by 12.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1439
/**
 * 2x2 interpolation with weights 2,3 (current row) and 3,4 (next row).
 * The weighted sum plus 6 is scaled by 2731/32768, which approximates a
 * division by 12.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1450
/**
 * Integer position (no interpolation): dispatch to the avg_pixelsN_c helper
 * matching width.  Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1459
/**
 * Averaging variant of the 2:1 horizontal interpolation: the interpolated
 * value (weighted sum plus 1, scaled by 683/2048) is combined with the
 * existing destination pixel as (dst + value + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1470
/**
 * Averaging variant of the 1:2 horizontal interpolation: the interpolated
 * value (weighted sum plus 1, scaled by 683/2048) is combined with the
 * existing destination pixel as (dst + value + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1481
/**
 * Averaging variant of the 2:1 vertical interpolation: the interpolated
 * value (weighted sum plus 1, scaled by 683/2048) is combined with the
 * existing destination pixel as (dst + value + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1492
/**
 * Averaging variant of the 2x2 interpolation with weights 4,3 / 3,2: the
 * interpolated value (weighted sum plus 6, scaled by 2731/32768) is combined
 * with the existing destination pixel as (dst + value + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1503
/**
 * Averaging variant of the 2x2 interpolation with weights 3,2 / 4,3: the
 * interpolated value (weighted sum plus 6, scaled by 2731/32768) is combined
 * with the existing destination pixel as (dst + value + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1514
/**
 * Averaging variant of the 1:2 vertical interpolation: the interpolated
 * value (weighted sum plus 1, scaled by 683/2048) is combined with the
 * existing destination pixel as (dst + value + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1525
/**
 * Averaging variant of the 2x2 interpolation with weights 3,4 / 2,3: the
 * interpolated value (weighted sum plus 6, scaled by 2731/32768) is combined
 * with the existing destination pixel as (dst + value + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1536
/**
 * Averaging variant of the 2x2 interpolation with weights 2,3 / 3,4: the
 * interpolated value (weighted sum plus 6, scaled by 2731/32768) is combined
 * with the existing destination pixel as (dst + value + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled: TPEL_WIDTH(width) would generate fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c() that forward to the generic
 * put_tpel_pixels_mcXY_c() helpers with width passed as a constant.
 * Each body is a plain call (a leading "void" would turn it into an invalid
 * declaration), so the macro compiles if it is ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1568
/**
 * Emit one bilinear chroma interpolation routine that is W pixels wide.
 *
 * The generated function is named OPNAME ## h264_chroma_mc ## W ## _c and
 * processes h rows. x and y are the fractional offsets and must lie in 0..7
 * (checked by assert). Every output pixel is handed to OP(dst, sum), where sum
 * is the weighted combination of the neighbouring source samples; the weights
 * A, B, C, D always add up to 64.
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(!D){\
        /* x or y is zero: at most two samples contribute per pixel. The  */\
        /* second one lies one row down when C is non-zero, otherwise one */\
        /* column to the right.                                            */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<W; col++)\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            dst+= stride;\
            src+= stride;\
        }\
        return;\
    }\
\
    /* general case: all four neighbours contribute */\
    for(row=0; row<h; row++){\
        for(col=0; col<W; col++)\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/**
 * Emit the 2-, 4- and 8-pixel-wide chroma interpolation routines
 * (OPNAME ## h264_chroma_mc2_c, ..._mc4_c, ..._mc8_c) for one store operation.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* op_put: scale the weighted sum back by 64 with rounding and store it.    */
/* op_avg: same scaling, then a rounded average with the value already in a. */
#define op_put(a, b) a = (((b) + 32)>>6)
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_put
#undef op_avg
1677
/**
 * 8-pixel-wide bilinear chroma interpolation with a reduced rounding bias.
 *
 * Each output pixel is the weighted sum of the source sample, its right
 * neighbour and the two samples directly below them, using weights derived
 * from the fractional offsets, plus a bias of 28 (32 - 4), shifted right by 6.
 *
 * @param dst    destination, 8 bytes written per row
 * @param src    source; columns 0..8 of rows 0..h are read
 * @param stride byte distance between rows, shared by dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fractional offset, 0..7 (asserted)
 * @param y      vertical fractional offset, 0..7 (asserted)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    const int bias= 32 - 4;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++){
        const uint8_t *below= src + stride;

        for(col=0; col<8; col++)
            dst[col] = (A*src[col] + B*src[col+1] + C*below[col] + D*below[col+1] + bias) >> 6;

        dst+= stride;
        src+= stride;
    }
}
1701
/**
 * Generates the C versions of the MPEG-4 quarter-pel ("qpel") functions for
 * 8x8 and 16x16 blocks.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME prefix pasted onto every generated function name and onto the
 *               pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers that produce the
 *               final output
 * @param RND    infix pasted as put ## RND ## ... to select the helpers that
 *               build intermediate buffers
 * @param OP     store macro invoked as OP(dst, value) by the low-pass filters;
 *               the filters declare a local table pointer cm for its use
 *
 * Generated per block size N (8 or 16):
 *  - OPNAME ## mpeg4_qpelN_h_lowpass / _v_lowpass: symmetric filters with
 *    weights 20, -6, 3, -1 that read N+1 input samples per row / column.
 *  - OPNAME ## qpelN_mcXY_c for X, Y in 0..3. Judging by which source pointer
 *    and filter each one uses, X looks like the horizontal and Y the vertical
 *    quarter-sample offset -- confirm against the callers.
 *  - non-static ff_ ## OPNAME ## qpelN_mcXY_old_c variants that combine four
 *    planes through pixelsN_l4.
 *
 * copy_block9/copy_block17 and the pixelsN_* helpers are defined outside this
 * macro; their exact behaviour is not restated here.
 */
1702 #define QPEL_MC(r, OPNAME, RND, OP) \
    /* Horizontal filter, 8 outputs per row for h rows. Output k is centred   */\
    /* between src[k] and src[k+1]; taps that would fall outside src[0..8]    */\
    /* are reflected back inside that range.                                   */\
1703 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1704     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1705     int i;\
1706     for(i=0; i<h; i++)\
1707     {\
1708         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1709         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1710         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1711         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1712         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1713         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1714         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1715         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1716         dst+=dstStride;\
1717         src+=srcStride;\
1718     }\
1719 }\
1720 \
    /* Vertical filter: same weights applied down each of 8 columns; reads 9  */\
    /* input rows and always writes 8 output rows.                             */\
1721 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1722     const int w=8;\
1723     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1724     int i;\
1725     for(i=0; i<w; i++)\
1726     {\
1727         const int src0= src[0*srcStride];\
1728         const int src1= src[1*srcStride];\
1729         const int src2= src[2*srcStride];\
1730         const int src3= src[3*srcStride];\
1731         const int src4= src[4*srcStride];\
1732         const int src5= src[5*srcStride];\
1733         const int src6= src[6*srcStride];\
1734         const int src7= src[7*srcStride];\
1735         const int src8= src[8*srcStride];\
1736         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1737         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1738         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1739         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1740         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1741         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1742         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1743         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1744         dst++;\
1745         src++;\
1746     }\
1747 }\
1748 \
    /* 16-wide horizontal filter: 16 outputs per row from src[0..16], with    */\
    /* the same reflection of out-of-range taps at both ends.                  */\
1749 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1750     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1751     int i;\
1752     \
1753     for(i=0; i<h; i++)\
1754     {\
1755         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1756         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1757         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1758         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1759         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1760         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1761         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1762         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1763         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1764         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1765         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1766         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1767         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1768         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1769         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1770         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1771         dst+=dstStride;\
1772         src+=srcStride;\
1773     }\
1774 }\
1775 \
    /* 16-wide vertical filter: 16 columns, 17 input rows, 16 output rows.    */\
1776 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1777     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1778     int i;\
1779     const int w=16;\
1780     for(i=0; i<w; i++)\
1781     {\
1782         const int src0= src[0*srcStride];\
1783         const int src1= src[1*srcStride];\
1784         const int src2= src[2*srcStride];\
1785         const int src3= src[3*srcStride];\
1786         const int src4= src[4*srcStride];\
1787         const int src5= src[5*srcStride];\
1788         const int src6= src[6*srcStride];\
1789         const int src7= src[7*srcStride];\
1790         const int src8= src[8*srcStride];\
1791         const int src9= src[9*srcStride];\
1792         const int src10= src[10*srcStride];\
1793         const int src11= src[11*srcStride];\
1794         const int src12= src[12*srcStride];\
1795         const int src13= src[13*srcStride];\
1796         const int src14= src[14*srcStride];\
1797         const int src15= src[15*srcStride];\
1798         const int src16= src[16*srcStride];\
1799         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1800         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1801         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1802         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1803         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1804         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1805         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1806         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1807         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1808         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1809         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1810         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1811         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1812         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1813         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1814         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1815         dst++;\
1816         src++;\
1817     }\
1818 }\
1819 \
    /* ---- 8x8 entry points ------------------------------------------------ */\
    /* mc00: no filtering, forwards to OPNAME ## pixels8_c with height 8.      */\
1820 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1821     OPNAME ## pixels8_c(dst, src, stride, 8);\
1822 }\
1823 \
    /* mc10 / mc30: horizontal filter into an 8x8 scratch block, then         */\
    /* pixels8_l2 combines it with src (mc10) or src+1 (mc30).                 */\
1824 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1825     uint8_t half[64];\
1826     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1827     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1828 }\
1829 \
    /* mc20: horizontal filter written straight to dst.                        */\
1830 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1831     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1832 }\
1833 \
1834 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1835     uint8_t half[64];\
1836     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1837     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1838 }\
1839 \
    /* mc01 / mc02 / mc03: vertical counterparts. The source is first copied  */\
    /* into the local buffer full (row pitch 16, 9 rows) via copy_block9;     */\
    /* mc03 pairs the filtered block with full+16, i.e. one row further down. */\
1840 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1841     uint8_t full[16*9];\
1842     uint8_t half[64];\
1843     copy_block9(full, src, 16, stride, 9);\
1844     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1845     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1846 }\
1847 \
1848 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1849     uint8_t full[16*9];\
1850     copy_block9(full, src, 16, stride, 9);\
1851     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1852 }\
1853 \
1854 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1855     uint8_t full[16*9];\
1856     uint8_t half[64];\
1857     copy_block9(full, src, 16, stride, 9);\
1858     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1859     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1860 }\
    /* Diagonal positions. Each *_old_c variant (non-static, ff_ prefix)      */\
    /* builds halfH (9 rows of 8), halfV and halfHV separately and merges     */\
    /* four planes with pixels8_l4. The static variant next to it first       */\
    /* merges halfH with full (or full+1) in place, filters that vertically   */\
    /* into halfHV and finishes with a two-input pixels8_l2.                  */\
1861 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1862     uint8_t full[16*9];\
1863     uint8_t halfH[72];\
1864     uint8_t halfV[64];\
1865     uint8_t halfHV[64];\
1866     copy_block9(full, src, 16, stride, 9);\
1867     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1868     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1869     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1870     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1871 }\
1872 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1873     uint8_t full[16*9];\
1874     uint8_t halfH[72];\
1875     uint8_t halfHV[64];\
1876     copy_block9(full, src, 16, stride, 9);\
1877     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1878     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1879     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1880     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1881 }\
1882 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1883     uint8_t full[16*9];\
1884     uint8_t halfH[72];\
1885     uint8_t halfV[64];\
1886     uint8_t halfHV[64];\
1887     copy_block9(full, src, 16, stride, 9);\
1888     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1889     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1890     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1891     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1892 }\
1893 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1894     uint8_t full[16*9];\
1895     uint8_t halfH[72];\
1896     uint8_t halfHV[64];\
1897     copy_block9(full, src, 16, stride, 9);\
1898     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1899     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1900     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1901     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1902 }\
    /* The mcX3 variants use halfH+8 (second row of halfH) and, in the old    */\
    /* form, full+16 / full+17 (one row down) where mcX1 uses the first row.  */\
1903 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1904     uint8_t full[16*9];\
1905     uint8_t halfH[72];\
1906     uint8_t halfV[64];\
1907     uint8_t halfHV[64];\
1908     copy_block9(full, src, 16, stride, 9);\
1909     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1910     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1911     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1912     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1913 }\
1914 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1915     uint8_t full[16*9];\
1916     uint8_t halfH[72];\
1917     uint8_t halfHV[64];\
1918     copy_block9(full, src, 16, stride, 9);\
1919     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1920     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1921     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1922     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1923 }\
1924 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1925     uint8_t full[16*9];\
1926     uint8_t halfH[72];\
1927     uint8_t halfV[64];\
1928     uint8_t halfHV[64];\
1929     copy_block9(full, src, 16, stride, 9);\
1930     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1931     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1932     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1933     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1934 }\
1935 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1936     uint8_t full[16*9];\
1937     uint8_t halfH[72];\
1938     uint8_t halfHV[64];\
1939     copy_block9(full, src, 16, stride, 9);\
1940     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1941     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1942     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1943     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1944 }\
    /* mc21 / mc23: horizontal filter straight from src (9 rows), vertical    */\
    /* filter of that result, then pixels8_l2 of halfH (mc21) or halfH+8      */\
    /* (mc23) with halfHV.                                                     */\
1945 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1946     uint8_t halfH[72];\
1947     uint8_t halfHV[64];\
1948     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1949     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1950     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1951 }\
1952 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1953     uint8_t halfH[72];\
1954     uint8_t halfHV[64];\
1955     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1956     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1957     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1958 }\
    /* mc12 / mc32: the old form merges halfV with halfHV; the static form    */\
    /* merges halfH with full (mc12) or full+1 (mc32) and then runs the       */\
    /* OPNAME vertical filter directly into dst.                               */\
1959 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1960     uint8_t full[16*9];\
1961     uint8_t halfH[72];\
1962     uint8_t halfV[64];\
1963     uint8_t halfHV[64];\
1964     copy_block9(full, src, 16, stride, 9);\
1965     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1966     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1967     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1968     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1969 }\
1970 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1971     uint8_t full[16*9];\
1972     uint8_t halfH[72];\
1973     copy_block9(full, src, 16, stride, 9);\
1974     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1975     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1976     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1977 }\
1978 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1979     uint8_t full[16*9];\
1980     uint8_t halfH[72];\
1981     uint8_t halfV[64];\
1982     uint8_t halfHV[64];\
1983     copy_block9(full, src, 16, stride, 9);\
1984     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1985     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1986     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1987     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1988 }\
1989 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1990     uint8_t full[16*9];\
1991     uint8_t halfH[72];\
1992     copy_block9(full, src, 16, stride, 9);\
1993     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1994     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1995     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1996 }\
    /* mc22: horizontal filter (9 rows) followed by the OPNAME vertical       */\
    /* filter into dst.                                                        */\
1997 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1998     uint8_t halfH[72];\
1999     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2000     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2001 }\
    /* ---- 16x16 entry points ---------------------------------------------- */\
    /* Same call structure as the 8x8 set above, using the 16-wide filters    */\
    /* and helpers: full is 17 rows with row pitch 24 (filled by              */\
    /* copy_block17), halfH is 17 rows of 16, halfV and halfHV are 16x16.     */\
2002 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2003     OPNAME ## pixels16_c(dst, src, stride, 16);\
2004 }\
2005 \
2006 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2007     uint8_t half[256];\
2008     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2009     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2010 }\
2011 \
2012 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2013     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2014 }\
2015 \
2016 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2017     uint8_t half[256];\
2018     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2019     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2020 }\
2021 \
2022 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2023     uint8_t full[24*17];\
2024     uint8_t half[256];\
2025     copy_block17(full, src, 24, stride, 17);\
2026     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2027     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2028 }\
2029 \
2030 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2031     uint8_t full[24*17];\
2032     copy_block17(full, src, 24, stride, 17);\
2033     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2034 }\
2035 \
2036 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2037     uint8_t full[24*17];\
2038     uint8_t half[256];\
2039     copy_block17(full, src, 24, stride, 17);\
2040     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2041     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2042 }\
2043 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2044     uint8_t full[24*17];\
2045     uint8_t halfH[272];\
2046     uint8_t halfV[256];\
2047     uint8_t halfHV[256];\
2048     copy_block17(full, src, 24, stride, 17);\
2049     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2050     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2051     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2052     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2053 }\
2054 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2055     uint8_t full[24*17];\
2056     uint8_t halfH[272];\
2057     uint8_t halfHV[256];\
2058     copy_block17(full, src, 24, stride, 17);\
2059     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2060     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2061     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2062     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2063 }\
2064 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2065     uint8_t full[24*17];\
2066     uint8_t halfH[272];\
2067     uint8_t halfV[256];\
2068     uint8_t halfHV[256];\
2069     copy_block17(full, src, 24, stride, 17);\
2070     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2071     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2072     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2073     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2074 }\
2075 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2076     uint8_t full[24*17];\
2077     uint8_t halfH[272];\
2078     uint8_t halfHV[256];\
2079     copy_block17(full, src, 24, stride, 17);\
2080     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2081     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2082     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2083     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2084 }\
2085 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2086     uint8_t full[24*17];\
2087     uint8_t halfH[272];\
2088     uint8_t halfV[256];\
2089     uint8_t halfHV[256];\
2090     copy_block17(full, src, 24, stride, 17);\
2091     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2092     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2093     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2094     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2095 }\
2096 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2097     uint8_t full[24*17];\
2098     uint8_t halfH[272];\
2099     uint8_t halfHV[256];\
2100     copy_block17(full, src, 24, stride, 17);\
2101     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2102     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2103     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2104     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2105 }\
2106 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2107     uint8_t full[24*17];\
2108     uint8_t halfH[272];\
2109     uint8_t halfV[256];\
2110     uint8_t halfHV[256];\
2111     copy_block17(full, src, 24, stride, 17);\
2112     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2113     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2114     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2115     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2116 }\
2117 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2118     uint8_t full[24*17];\
2119     uint8_t halfH[272];\
2120     uint8_t halfHV[256];\
2121     copy_block17(full, src, 24, stride, 17);\
2122     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2123     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2124     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2125     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2126 }\
2127 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2128     uint8_t halfH[272];\
2129     uint8_t halfHV[256];\
2130     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2131     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2132     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2133 }\
2134 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2135     uint8_t halfH[272];\
2136     uint8_t halfHV[256];\
2137     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2138     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2139     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2140 }\
2141 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2142     uint8_t full[24*17];\
2143     uint8_t halfH[272];\
2144     uint8_t halfV[256];\
2145     uint8_t halfHV[256];\
2146     copy_block17(full, src, 24, stride, 17);\
2147     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2148     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2149     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2150     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2151 }\
2152 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2153     uint8_t full[24*17];\
2154     uint8_t halfH[272];\
2155     copy_block17(full, src, 24, stride, 17);\
2156     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2157     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2158     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2159 }\
2160 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2161     uint8_t full[24*17];\
2162     uint8_t halfH[272];\
2163     uint8_t halfV[256];\
2164     uint8_t halfHV[256];\
2165     copy_block17(full, src, 24, stride, 17);\
2166     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2167     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2168     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2169     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2170 }\
2171 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2172     uint8_t full[24*17];\
2173     uint8_t halfH[272];\
2174     copy_block17(full, src, 24, stride, 17);\
2175     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2176     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2177     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2178 }\
2179 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2180     uint8_t halfH[272];\
2181     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2182     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2183 }
2184
/*
 * Store operations passed to QPEL_MC as its OP argument. b is the raw filter
 * sum; it is offset, shifted right by 5 and looked up in cm, the table pointer
 * that every QPEL_MC low-pass function declares locally (ff_cropTbl +
 * MAX_NEG_CROP).
 *   op_put / op_put_no_rnd  store the looked-up value, using offset 16 / 15
 *   op_avg                  averages it with the current value of a, adding 1
 *                           before halving
 *   op_avg_no_rnd           offset 15 and no +1 before halving; not passed to
 *                           any of the QPEL_MC instantiations below
 */
2185 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2186 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2187 #define op_put(a, b) a = cm[((b) + 16)>>5]
2188 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2189
/* Instantiate the put_, put_no_rnd_ and avg_ families of qpel functions. */
2190 QPEL_MC(0, put_       , _       , op_put)
2191 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2192 QPEL_MC(0, avg_       , _       , op_avg)
2193 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The store macros are only needed for the expansions above. */
2194 #undef op_avg
2195 #undef op_avg_no_rnd
2196 #undef op_put
2197 #undef op_put_no_rnd
2198
2199 #if 1
/*
 * H264_LOWPASS(OPNAME, OP, OP2) generates the 6-tap (1,-5,20,20,-5,1)
 * interpolation helpers for block sizes 2, 4, 8 and 16:
 *
 *   OPNAME h264_qpelN_h_lowpass   filter along each row
 *   OPNAME h264_qpelN_v_lowpass   filter along each column
 *   OPNAME h264_qpelN_hv_lowpass  row filter into the int16_t scratch 'tmp'
 *                                 (N+5 rows, starting 2 rows above src),
 *                                 then column filter over 'tmp'
 *
 * OP(dst, sum) stores a single-pass sum and OP2(dst, sum) a two-pass sum;
 * both are expanded where the local crop-table pointer 'cm' is in scope.
 * The row filters read src[-2 .. N+2] on every line, the column filters read
 * rows -2 .. N+2.  Each column is copied into a local array before anything
 * is stored, so all reads of a column happen before its writes.
 * The 16x16 variants are assembled from four 8x8 calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<h; y++){\
        for(x=0; x<2; x++)\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col[2+5];\
    int x, k;\
    for(x=0; x<w; x++){\
        for(k=0; k<w+5; k++)\
            col[k]= src[(k-2)*srcStride];\
        for(k=0; k<w; k++)\
            OP(dst[k*dstStride], (col[k+2]+col[k+3])*20 - (col[k+1]+col[k+4])*5 + (col[k]+col[k+5]));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col[2+5];\
    int x, y, k;\
    src -= 2*srcStride;\
    for(y=0; y<h+5; y++){\
        for(x=0; x<w; x++)\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(x=0; x<w; x++){\
        for(k=0; k<h+5; k++)\
            col[k]= tmp[(k-2)*tmpStride];\
        for(k=0; k<h; k++)\
            OP2(dst[k*dstStride], (col[k+2]+col[k+3])*20 - (col[k+1]+col[k+4])*5 + (col[k]+col[k+5]));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<h; y++){\
        for(x=0; x<4; x++)\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col[4+5];\
    int x, k;\
    for(x=0; x<w; x++){\
        for(k=0; k<w+5; k++)\
            col[k]= src[(k-2)*srcStride];\
        for(k=0; k<w; k++)\
            OP(dst[k*dstStride], (col[k+2]+col[k+3])*20 - (col[k+1]+col[k+4])*5 + (col[k]+col[k+5]));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col[4+5];\
    int x, y, k;\
    src -= 2*srcStride;\
    for(y=0; y<h+5; y++){\
        for(x=0; x<w; x++)\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(x=0; x<w; x++){\
        for(k=0; k<h+5; k++)\
            col[k]= tmp[(k-2)*tmpStride];\
        for(k=0; k<h; k++)\
            OP2(dst[k*dstStride], (col[k+2]+col[k+3])*20 - (col[k+1]+col[k+4])*5 + (col[k]+col[k+5]));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<h; y++){\
        for(x=0; x<8; x++)\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col[8+5];\
    int x, k;\
    for(x=0; x<w; x++){\
        for(k=0; k<w+5; k++)\
            col[k]= src[(k-2)*srcStride];\
        for(k=0; k<w; k++)\
            OP(dst[k*dstStride], (col[k+2]+col[k+3])*20 - (col[k+1]+col[k+4])*5 + (col[k]+col[k+5]));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int col[8+5];\
    int x, y, k;\
    src -= 2*srcStride;\
    for(y=0; y<h+5; y++){\
        for(x=0; x<w; x++)\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(x=0; x<w; x++){\
        for(k=0; k<h+5; k++)\
            col[k]= tmp[(k-2)*tmpStride];\
        for(k=0; k<h; k++)\
            OP2(dst[k*dstStride], (col[k+2]+col[k+3])*20 - (col[k+1]+col[k+4])*5 + (col[k]+col[k+5]));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *const dst_lo= dst + 8*dstStride;\
    uint8_t *const src_lo= src + 8*srcStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst     , src     , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst   +8, src   +8, dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst_lo  , src_lo  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst_lo+8, src_lo+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *const dst_lo= dst + 8*dstStride;\
    uint8_t *const src_lo= src + 8*srcStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst     , src     , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst   +8, src   +8, dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst_lo  , src_lo  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst_lo+8, src_lo+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *const dst_lo= dst + 8*dstStride;\
    uint8_t *const src_lo= src + 8*srcStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst     , tmp  , src     , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst   +8, tmp+8, src   +8, dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst_lo  , tmp  , src_lo  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst_lo+8, tmp+8, src_lo+8, dstStride, tmpStride, srcStride);\
}
2463
/*
 * H264_MC(OPNAME, SIZE) generates the sixteen motion-compensation entry
 * points OPNAME h264_qpelSIZE_mcXY_c for a SIZE x SIZE block.  In the name,
 * X selects the horizontal and Y the vertical sub-sample position:
 * mc00 forwards to the plain pixel routine, mc20/mc02/mc22 are the pure
 * h/v/hv lowpass outputs, and every other position combines two planes
 * through OPNAME pixelsSIZE_l2 (defined elsewhere; presumably a rounded
 * average of its two inputs -- confirm against its definition).
 *
 * Scratch buffers:
 *   ext     SIZE+5 source rows copied with copy_blockSIZE, starting two rows
 *           above the block; ext0 points at the block's own first row
 *   tmp     int16_t scratch for the hv lowpass
 *   halfH / halfV / halfHV   intermediate filtered planes, row stride SIZE
 * Intermediate planes are always produced with the put_ lowpass functions;
 * OPNAME only decides how the final result reaches dst.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, halfH, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, halfH, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, ext0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, ext0, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, ext0, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, ext0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, ext0+SIZE, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, ext0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, ext0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, ext0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, ext0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfHV[SIZE*SIZE];\
    uint8_t halfH[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfHV[SIZE*SIZE];\
    uint8_t halfH[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfHV[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, ext0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ext[SIZE*(SIZE+5)];\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfHV[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t * const ext0= ext + SIZE*2;\
    copy_block ## SIZE (ext, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, ext0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2600
2601 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2602 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2603 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2604 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2605 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2606
2607 H264_LOWPASS(put_       , op_put, op2_put)
2608 H264_LOWPASS(avg_       , op_avg, op2_avg)
2609 H264_MC(put_, 2)
2610 H264_MC(put_, 4)
2611 H264_MC(put_, 8)
2612 H264_MC(put_, 16)
2613 H264_MC(avg_, 4)
2614 H264_MC(avg_, 8)
2615 H264_MC(avg_, 16)
2616
2617 #undef op_avg
2618 #undef op_put
2619 #undef op2_avg
2620 #undef op2_put
2621 #endif
2622
/*
 * H.264 weighted prediction.
 *
 * H264_WEIGHT(W,H) generates, for a W x H block:
 *   weight_h264_pixelsWxH_c    in-place scaling of 'block':
 *       block[x] = clip_uint8((block[x]*weight + offset') >> log2_denom)
 *       with offset' = (offset << log2_denom) plus a rounding term of
 *       1 << (log2_denom-1) when log2_denom is non-zero;
 *   biweight_h264_pixelsWxH_c  blend of 'src' into 'dst':
 *       dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset')
 *                           >> (log2_denom+1))
 *       with offset' = ((offset + 1) | 1) << log2_denom.
 *
 * 'offset' is a signed int and left-shifting a negative signed value is
 * undefined behaviour in C, so both scalings are carried out on 'unsigned'
 * and converted back; on two's-complement targets this yields the same bit
 * pattern the plain shift was relied on to produce.
 *
 * The 'if(W==n) continue;' tests compare against a compile-time constant, so
 * each instantiation only keeps the stores for its own width.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset = (int)((unsigned)offset << log2_denom); \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2692
2693 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2694     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2695     int i;
2696
2697     for(i=0; i<h; i++){
2698         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2699         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2700         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2701         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2702         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2703         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2704         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2705         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2706         dst+=dstStride;
2707         src+=srcStride;
2708     }
2709 }
2710
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entries for CAVS: each one forwards to the generic 8- or 16-wide
 * put_/avg_ pixel routine with a row count equal to the block width.
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    put_pixels8_c(dst, src, stride, rows);
}

void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    avg_pixels8_c(dst, src, stride, rows);
}

void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;
    put_pixels16_c(dst, src, stride, rows);
}

void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 16;
    avg_pixels16_c(dst, src, stride, rows);
}
#endif /* CONFIG_CAVS_DECODER */
2728
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry for VC-1: forwards to the generic 8x8 put routine.
 * The 'rnd' argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    const int rows = 8;
    put_pixels8_c(dst, src, stride, rows);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2737
2738 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2739
2740 /* H264 specific */
2741 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2742
2743 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2744     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2745     int i;
2746
2747     for(i=0; i<w; i++){
2748         const int src_1= src[ -srcStride];
2749         const int src0 = src[0          ];
2750         const int src1 = src[  srcStride];
2751         const int src2 = src[2*srcStride];
2752         const int src3 = src[3*srcStride];
2753         const int src4 = src[4*srcStride];
2754         const int src5 = src[5*srcStride];
2755         const int src6 = src[6*srcStride];
2756         const int src7 = src[7*srcStride];
2757         const int src8 = src[8*srcStride];
2758         const int src9 = src[9*srcStride];
2759         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2760         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2761         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2762         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2763         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2764         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2765         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2766         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2767         src++;
2768         dst++;
2769     }
2770 }
2771
/* mspel position (0,0): no filtering, forwards to the 8x8 put routine. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    put_pixels8_c(dst, src, stride, rows);
}
2775
/*
 * mspel position (1,0): the horizontally filtered block is combined with the
 * unshifted source through put_pixels8_l2 (defined elsewhere).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8*8]; /* 8 rows, row stride 8 */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2781
/* mspel position (2,0): horizontal filter written straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int rows = 8;
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2785
/*
 * mspel position (3,0): the horizontally filtered block is combined with the
 * source shifted one sample to the right through put_pixels8_l2.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[8*8]; /* 8 rows, row stride 8 */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, hfilt, stride, stride, 8, 8);
}
2791
/* mspel position (0,2): vertical filter written straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    const int cols = 8;
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, cols);
}
2795
/*
 * mspel position (1,2): combines the vertically filtered source with the
 * horizontally-then-vertically filtered source through put_pixels8_l2.
 * The horizontal pass covers 11 rows starting one row above src, so row 0 of
 * the block sits at hrows + 8.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];
    uint8_t vfilt[8*8];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hvfilt, hrows + 8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/*
 * mspel position (3,2): like (1,2), but the purely vertical filter runs on
 * the source shifted one sample to the right.
 * The horizontal pass covers 11 rows starting one row above src, so row 0 of
 * the block sits at hrows + 8.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];
    uint8_t vfilt[8*8];
    uint8_t hvfilt[8*8];

    wmv2_mspel8_v_lowpass(vfilt, src + 1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hvfilt, hrows + 8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/*
 * mspel position (2,2): horizontal filter over 11 rows (starting one row
 * above src) into a scratch buffer, then vertical filter from its second
 * row straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];

    wmv2_mspel8_h_lowpass(hrows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows + 8, stride, 8, 8);
}
2819
2820 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2821     if(ENABLE_ANY_H263) {
2822     int x;
2823     const int strength= ff_h263_loop_filter_strength[qscale];
2824
2825     for(x=0; x<8; x++){
2826         int d1, d2, ad1;
2827         int p0= src[x-2*stride];
2828         int p1= src[x-1*stride];
2829         int p2= src[x+0*stride];
2830         int p3= src[x+1*stride];
2831         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2832
2833         if     (d<-2*strength) d1= 0;
2834         else if(d<-  strength) d1=-2*strength - d;
2835         else if(d<   strength) d1= d;
2836         else if(d< 2*strength) d1= 2*strength - d;
2837         else                   d1= 0;
2838
2839         p1 += d1;
2840         p2 -= d1;
2841         if(p1&256) p1= ~(p1>>31);
2842         if(p2&256) p2= ~(p2>>31);
2843
2844         src[x-1*stride] = p1;
2845         src[x+0*stride] = p2;
2846
2847         ad1= FFABS(d1)>>1;
2848
2849         d2= av_clip((p0-p3)/4, -ad1, ad1);
2850
2851         src[x-2*stride] = p0 - d2;
2852         src[x+  stride] = p3 + d2;
2853     }
2854     }
2855 }
2856
2857 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2858     if(ENABLE_ANY_H263) {
2859     int y;
2860     const int strength= ff_h263_loop_filter_strength[qscale];
2861
2862     for(y=0; y<8; y++){
2863         int d1, d2, ad1;
2864         int p0= src[y*stride-2];
2865         int p1= src[y*stride-1];
2866         int p2= src[y*stride+0];
2867         int p3= src[y*stride+1];
2868         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2869
2870         if     (d<-2*strength) d1= 0;
2871         else if(d<-  strength) d1=-2*strength - d;
2872         else if(d<   strength) d1= d;
2873         else if(d< 2*strength) d1= 2*strength - d;
2874         else                   d1= 0;
2875
2876         p1 += d1;
2877         p2 -= d1;
2878         if(p1&256) p1= ~(p1>>31);
2879         if(p2&256) p2= ~(p2>>31);
2880
2881         src[y*stride-1] = p1;
2882         src[y*stride+0] = p2;
2883
2884         ad1= FFABS(d1)>>1;
2885
2886         d2= av_clip((p0-p3)/4, -ad1, ad1);
2887
2888         src[y*stride-2] = p0 - d2;
2889         src[y*stride+1] = p3 + d2;
2890     }
2891     }
2892 }
2893
/**
 * H.261 loop filter: separable (1,2,1) smoothing of an 8x8 block, in place.
 * The vertical pass goes into a scratch buffer; the top and bottom rows are
 * not filtered vertically, only scaled by 4 to match the gain of the others.
 * The horizontal pass then writes back to src; the left and right columns
 * are not filtered horizontally, only rounded and scaled back down.
 * @param src    top-left sample of the 8x8 block
 * @param stride distance between rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64]; /* vertically filtered block, gain 4, row stride 8 */
    int row, col;

    for(col=0; col<8; col++){
        vert[col     ]= 4*src[col           ];
        vert[col + 56]= 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        const uint8_t *in = src  + row*stride;
        int           *out= vert + row*8;

        for(col=0; col<8; col++)
            out[col]= in[col - stride] + 2*in[col] + in[col + stride];
    }

    for(row=0; row<8; row++){
        const int *in = vert + row*8;
        uint8_t   *out= src  + row*stride;

        out[0]= (in[0] + 2)>>2;
        out[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2920
/**
 * H.264 luma deblocking of one 16-sample edge.
 * The edge is handled as 4 segments of 4 lines, each with its own clipping
 * value tc0[i]; a segment whose tc0 is negative is skipped.
 * For every line, p0..p2 are the samples before the edge and q0..q2 the
 * samples from the edge onwards, spaced xstride apart; consecutive lines are
 * ystride apart.
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between samples across the edge
 * @param ystride distance between lines along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     4 per-segment clipping values
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc_seg = tc0[i];
        if( tc_seg < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++, pix += ystride ) {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];
            int tc, i_delta, mid;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta  ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            tc  = tc_seg;
            mid = ( p0 + q0 + 1 ) >> 1;

            /* each side that is smooth enough also gets p1/q1 filtered and
               widens the clipping range for p0/q0 by one */
            if( FFABS( p2 - p0 ) < beta ) {
                pix[-2*xstride] = p1 + av_clip( (( p2 + mid ) >> 1) - p1, -tc_seg, tc_seg );
                tc++;
            }
            if( FFABS( q2 - q0 ) < beta ) {
                pix[   xstride] = q1 + av_clip( (( q2 + mid ) >> 1) - q1, -tc_seg, tc_seg );
                tc++;
            }

            /* (q0 - p0) may be negative: multiply instead of left-shifting,
               which would be undefined for negative values */
            i_delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
        }
    }
}
/*
 * Luma deblocking with the filter taps 'stride' bytes apart and the 16 lines
 * at consecutive byte positions.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride; /* step between p/q samples */
    const int along  = 1;      /* step between filtered lines */

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
/*
 * Luma deblocking with the filter taps at consecutive byte positions and the
 * 16 lines 'stride' bytes apart.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;      /* step between p/q samples */
    const int along  = stride; /* step between filtered lines */

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
2969
/**
 * H.264 chroma deblocking of one 8-sample edge.
 * The edge is handled as 4 segments of 2 lines, each with its own clipping
 * value tc0[i]; a segment whose tc0 is zero or negative is skipped.
 * Only p0 and q0 (the samples adjacent to the edge) are modified.
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between samples across the edge
 * @param ystride distance between lines along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     4 per-segment clipping values
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++, pix += ystride ) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0*xstride];
            const int q1 = pix[ 1*xstride];
            int delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta  ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            /* (q0 - p0) may be negative: multiply instead of left-shifting,
               which would be undefined for negative values */
            delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );

            pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Chroma loop filter with the samples across the edge stride bytes apart
 * and successive filtered positions 1 byte apart.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int xstride = stride;
    const int ystride = 1;

    h264_loop_filter_chroma_c(pix, xstride, ystride, alpha, beta, tc0);
}
/**
 * Chroma loop filter with the samples across the edge 1 byte apart
 * and successive filtered positions stride bytes apart.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int xstride = 1;
    const int ystride = stride;

    h264_loop_filter_chroma_c(pix, xstride, ystride, alpha, beta, tc0);
}
3006
/**
 * Filters 8 positions of a chroma edge without any clipping table.
 * A position is modified only if |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta; p0 and q0 are then replaced by rounded weighted
 * averages of their neighbours.
 * @param pix     points at q0 of the first position
 * @param xstride offset between neighbouring samples across the edge
 * @param ystride offset from one filtered position to the next
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int remaining;

    for( remaining = 8; remaining > 0; remaining--, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-xstride];
        const int q0 = pix[0];
        const int q1 = pix[xstride];

        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta  ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma loop filter with the samples across the edge stride bytes
 * apart and successive filtered positions 1 byte apart.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int xstride = stride;
    const int ystride = 1;

    h264_loop_filter_chroma_intra_c(pix, xstride, ystride, alpha, beta);
}
/**
 * Intra chroma loop filter with the samples across the edge 1 byte apart
 * and successive filtered positions stride bytes apart.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int xstride = 1;
    const int ystride = stride;

    h264_loop_filter_chroma_intra_c(pix, xstride, ystride, alpha, beta);
}
3034
/**
 * Sum of absolute differences over a block 16 pixels wide and h rows high.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over all 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3062
/**
 * SAD of a 16 pixel wide block against avg2() of each pix2 pixel and its
 * right-hand neighbour (reads 17 pixels per pix2 row).
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3090
/**
 * SAD of a 16 pixel wide block against avg2() of each pix2 pixel and the
 * pixel one row below it (reads h+1 rows of pix2).
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3120
/**
 * SAD of a 16 pixel wide block against avg4() of each 2x2 neighbourhood of
 * pix2 (the pixel, its right neighbour and the two pixels below them).
 * Reads 17 pixels per row over h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3150
/**
 * Sum of absolute differences over a block 8 pixels wide and h rows high.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over all 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3170
/**
 * SAD of an 8 pixel wide block against avg2() of each pix2 pixel and its
 * right-hand neighbour (reads 9 pixels per pix2 row).
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3190
/**
 * SAD of an 8 pixel wide block against avg2() of each pix2 pixel and the
 * pixel one row below it (reads h+1 rows of pix2).
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3212
/**
 * SAD of an 8 pixel wide block against avg4() of each 2x2 neighbourhood of
 * pix2 (the pixel, its right neighbour and the two pixels below them).
 * Reads 9 pixels per row over h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows, for both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3234
3235 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3236     MpegEncContext *c = v;
3237     int score1=0;
3238     int score2=0;
3239     int x,y;
3240
3241     for(y=0; y<h; y++){
3242         for(x=0; x<16; x++){
3243             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3244         }
3245         if(y+1<h){
3246             for(x=0; x<15; x++){
3247                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3248                              - s1[x+1] + s1[x+1+stride])
3249                         -FFABS(  s2[x  ] - s2[x  +stride]
3250                              - s2[x+1] + s2[x+1+stride]);
3251             }
3252         }
3253         s1+= stride;
3254         s2+= stride;
3255     }
3256
3257     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3258     else  return score1 + FFABS(score2)*8;
3259 }
3260
3261 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3262     MpegEncContext *c = v;
3263     int score1=0;
3264     int score2=0;
3265     int x,y;
3266
3267     for(y=0; y<h; y++){
3268         for(x=0; x<8; x++){
3269             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3270         }
3271         if(y+1<h){
3272             for(x=0; x<7; x++){
3273                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3274                              - s1[x+1] + s1[x+1+stride])
3275                         -FFABS(  s2[x  ] - s2[x  +stride]
3276                              - s2[x+1] + s2[x+1+stride]);
3277             }
3278         }
3279         s1+= stride;
3280         s2+= stride;
3281     }
3282
3283     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3284     else  return score1 + FFABS(score2)*8;
3285 }
3286
3287 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3288     int i;
3289     unsigned int sum=0;
3290
3291     for(i=0; i<8*8; i++){
3292         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3293         int w= weight[i];
3294         b>>= RECON_SHIFT;
3295         assert(-512<b && b<512);
3296
3297         sum += (w*b)*(w*b)>>4;
3298     }
3299     return sum>>2;
3300 }
3301
3302 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3303     int i;
3304
3305     for(i=0; i<8*8; i++){
3306         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3307     }
3308 }
3309
3310 /**
3311  * permutes an 8x8 block.
3312  * @param block the block which will be permuted according to the given permutation vector
3313  * @param permutation the permutation vector
3314  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3315  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3316  *                  (inverse) permutated to scantable order!
3317  */
3318 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3319 {
3320     int i;
3321     DCTELEM temp[64];
3322
3323     if(last<=0) return;
3324     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3325
3326     for(i=0; i<=last; i++){
3327         const int j= scantable[i];
3328         temp[j]= block[j];
3329         block[j]=0;
3330     }
3331
3332     for(i=0; i<=last; i++){
3333         const int j= scantable[i];
3334         const int perm_j= permutation[j];
3335         block[perm_j]= temp[j];
3336     }
3337 }
3338
/**
 * Comparison function that ignores all of its arguments and scores 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    const int score = 0;

    return score;
}
3342
3343 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3344     int i;
3345
3346     memset(cmp, 0, sizeof(void*)*5);
3347
3348     for(i=0; i<5; i++){
3349         switch(type&0xFF){
3350         case FF_CMP_SAD:
3351             cmp[i]= c->sad[i];
3352             break;
3353         case FF_CMP_SATD:
3354             cmp[i]= c->hadamard8_diff[i];
3355             break;
3356         case FF_CMP_SSE:
3357             cmp[i]= c->sse[i];
3358             break;
3359         case FF_CMP_DCT:
3360             cmp[i]= c->dct_sad[i];
3361             break;
3362         case FF_CMP_DCT264:
3363             cmp[i]= c->dct264_sad[i];
3364             break;
3365         case FF_CMP_DCTMAX:
3366             cmp[i]= c->dct_max[i];
3367             break;
3368         case FF_CMP_PSNR:
3369             cmp[i]= c->quant_psnr[i];
3370             break;
3371         case FF_CMP_BIT:
3372             cmp[i]= c->bit[i];
3373             break;
3374         case FF_CMP_RD:
3375             cmp[i]= c->rd[i];
3376             break;
3377         case FF_CMP_VSAD:
3378             cmp[i]= c->vsad[i];
3379             break;
3380         case FF_CMP_VSSE:
3381             cmp[i]= c->vsse[i];
3382             break;
3383         case FF_CMP_ZERO:
3384             cmp[i]= zero_cmp;
3385             break;
3386         case FF_CMP_NSSE:
3387             cmp[i]= c->nsse[i];
3388             break;
3389 #ifdef CONFIG_SNOW_ENCODER
3390         case FF_CMP_W53:
3391             cmp[i]= c->w53[i];
3392             break;
3393         case FF_CMP_W97:
3394             cmp[i]= c->w97[i];
3395             break;
3396 #endif
3397         default:
3398             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3399         }
3400     }
3401 }
3402
3403 /**
3404  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3405  */
3406 static void clear_blocks_c(DCTELEM *blocks)
3407 {
3408     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3409 }
3410
3411 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3412     long i;
3413     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3414         long a = *(long*)(src+i);
3415         long b = *(long*)(dst+i);
3416         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3417     }
3418     for(; i<w; i++)
3419         dst[i+0] += src[i+0];
3420 }
3421
3422 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3423     long i;
3424     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3425         long a = *(long*)(src1+i);
3426         long b = *(long*)(src2+i);
3427         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3428     }
3429     for(; i<w; i++)
3430         dst[i] = src1[i]+src2[i];
3431 }
3432
3433 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3434     long i;
3435 #ifndef HAVE_FAST_UNALIGNED
3436     if((long)src2 & (sizeof(long)-1)){
3437         for(i=0; i+7<w; i+=8){
3438             dst[i+0] = src1[i+0]-src2[i+0];
3439             dst[i+1] = src1[i+1]-src2[i+1];
3440             dst[i+2] = src1[i+2]-src2[i+2];
3441             dst[i+3] = src1[i+3]-src2[i+3];
3442             dst[i+4] = src1[i+4]-src2[i+4];
3443             dst[i+5] = src1[i+5]-src2[i+5];
3444             dst[i+6] = src1[i+6]-src2[i+6];
3445             dst[i+7] = src1[i+7]-src2[i+7];
3446         }
3447     }else
3448 #endif
3449     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3450         long a = *(long*)(src1+i);
3451         long b = *(long*)(src2+i);
3452         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3453     }
3454     for(; i<w; i++)
3455         dst[i+0] = src1[i+0]-src2[i+0];
3456 }
3457
/**
 * Writes to dst the difference between src2 and a median prediction.
 * For each position the prediction is mid_pred() of the previous src2
 * value, src1[i], and (previous src2 + src1[i] - previous src1) & 0xFF.
 * @param dst      receives src2[i] minus the prediction
 * @param src1     the values used as the second predictor input
 * @param src2     the values being predicted
 * @param w        number of bytes to process
 * @param left     in: value used as "previous src2" for i = 0;
 *                 out: last src2 value processed
 * @param left_top in: value used as "previous src1" for i = 0;
 *                 out: last src1 value processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev  = *left;      /* truncated to 8 bits on load */
    uint8_t prev1 = *left_top;
    int i;

    for(i=0; i<w; i++){
        const uint8_t up = src1[i];
        const int pred= mid_pred(prev, up, (prev + up - prev1)&0xFF);

        prev1= up;
        prev = src2[i];
        dst[i]= prev - pred;
    }

    *left    = prev;
    *left_top= prev1;
}
3475
/* Stores the sum of i1 and i2 in o1 and their difference in o2.
 * Expands to two plain statements, so it must not be used as the
 * unbraced body of an if/for/while. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place variant: x becomes x+y and y becomes x-y (both computed from
 * the values held before the macro ran). x and y must be int lvalues. */
#define BUTTERFLY1(x,y) \
{\
    int bf_in0, bf_in1;\
    bf_in0= (x);\
    bf_in1= (y);\
    x= bf_in0+bf_in1;\
    y= bf_in0-bf_in1;\
}

/* |x+y| + |x-y|, without storing anything. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3490
/**
 * Sum of absolute values of an 8x8 butterfly (Hadamard-style) transform of
 * the difference src - dst.
 * @param s      unused
 * @param stride distance in bytes between rows, for both blocks
 * @param h      must be 8 (asserted)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* three butterfly stages along each row of the difference */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        const uint8_t *sp= src + stride*i;
        const uint8_t *dp= dst + stride*i;
        int *row= temp + 8*i;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0],sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2],sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4],sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6],sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* two stored stages along each column; the third is folded into the sum */
    for(i=0; i<8; i++){
        int *col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
3542
/**
 * Sum of absolute values of an 8x8 butterfly (Hadamard-style) transform of
 * src itself, with the |temp[0] + temp[32]| term subtracted at the end
 * (marked "-mean" in the code).
 * @param s      unused
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      must be 8 (asserted)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* three butterfly stages along each row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        const uint8_t *sp= src + stride*i;
        int *row= temp + 8*i;

        BUTTERFLY2(row[0], row[1], sp[0],sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2],sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4],sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6],sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* two stored stages along each column; the third is folded into the sum */
    for(i=0; i<8; i++){
        int *col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3590
3591 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3592     MpegEncContext * const s= (MpegEncContext *)c;
3593     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3594     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3595
3596     assert(h==8);
3597
3598     s->dsp.diff_pixels(temp, src1, src2, stride);
3599     s->dsp.fdct(temp);
3600     return s->dsp.sum_abs_dctelem(temp);
3601 }
3602
3603 #ifdef CONFIG_GPL
/*
 * One 8-point 1-D transform pass built from adds, subtracts and shifts.
 * The code using it must define SRC(x) to read input x and DST(x,v) to
 * consume output x. All inputs are read into locals before the first
 * DST() is issued, so SRC and DST may refer to the same storage.
 */
#define DCT8_1D {\
    /* sums and differences of the mirrored input pairs */\
    const int s07 = SRC(0) + SRC(7);\
    const int d07 = SRC(0) - SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int d16 = SRC(1) - SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int d25 = SRC(2) - SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int d34 = SRC(3) - SRC(4);\
    /* even part */\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    /* odd part */\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3630
3631 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3632     MpegEncContext * const s= (MpegEncContext *)c;
3633     DCTELEM dct[8][8];
3634     int i;
3635     int sum=0;
3636
3637     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3638
3639 #define SRC(x) dct[i][x]
3640 #define DST(x,v) dct[i][x]= v
3641     for( i = 0; i < 8; i++ )
3642         DCT8_1D
3643 #undef SRC
3644 #undef DST
3645
3646 #define SRC(x) dct[x][i]
3647 #define DST(x,v) sum += FFABS(v)
3648     for( i = 0; i < 8; i++ )
3649         DCT8_1D
3650 #undef SRC
3651 #undef DST
3652     return sum;
3653 }
3654 #endif
3655
3656 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3657     MpegEncContext * const s= (MpegEncContext *)c;
3658     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3659     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3660     int sum=0, i;
3661
3662     assert(h==8);
3663
3664     s->dsp.diff_pixels(temp, src1, src2, stride);
3665     s->dsp.fdct(temp);
3666
3667     for(i=0; i<64; i++)
3668         sum= FFMAX(sum, FFABS(temp[i]));
3669
3670     return sum;
3671 }
3672
3673 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3674     MpegEncContext * const s= (MpegEncContext *)c;
3675     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3676     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3677     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3678     int sum=0, i;
3679
3680     assert(h==8);
3681     s->mb_intra=0;
3682
3683     s->dsp.diff_pixels(temp, src1, src2, stride);
3684
3685     memcpy(bak, temp, 64*sizeof(DCTELEM));
3686
3687     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3688     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3689     ff_simple_idct(temp); //FIXME
3690
3691     for(i=0; i<64; i++)
3692         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3693
3694     return sum;
3695 }
3696
3697 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3698     MpegEncContext * const s= (MpegEncContext *)c;
3699     const uint8_t *scantable= s->intra_scantable.permutated;
3700     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3701     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3702     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3703     uint8_t * const bak= (uint8_t*)aligned_bak;
3704     int i, last, run, bits, level, distortion, start_i;
3705     const int esc_length= s->ac_esc_length;
3706     uint8_t * length;
3707     uint8_t * last_length;
3708
3709     assert(h==8);
3710
3711     for(i=0; i<8; i++){
3712         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3713         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3714     }
3715
3716     s->dsp.diff_pixels(temp, src1, src2, stride);
3717
3718     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3719
3720     bits=0;
3721
3722     if (s->mb_intra) {
3723         start_i = 1;
3724         length     = s->intra_ac_vlc_length;
3725         last_length= s->intra_ac_vlc_last_length;
3726         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3727     } else {
3728         start_i = 0;
3729         length     = s->inter_ac_vlc_length;
3730         last_length= s->inter_ac_vlc_last_length;
3731     }
3732
3733     if(last>=start_i){
3734         run=0;
3735         for(i=start_i; i<last; i++){
3736             int j= scantable[i];
3737             level= temp[j];
3738
3739             if(level){
3740                 level+=64;
3741                 if((level&(~127)) == 0){
3742                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3743                 }else
3744                     bits+= esc_length;
3745                 run=0;
3746             }else
3747                 run++;
3748         }
3749         i= scantable[last];
3750
3751         level= temp[i] + 64;
3752
3753         assert(level - 64);
3754
3755         if((level&(~127)) == 0){
3756             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3757         }else
3758             bits+= esc_length;
3759
3760     }
3761
3762     if(last>=0){
3763         if(s->mb_intra)
3764             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3765         else
3766             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3767     }
3768
3769     s->dsp.idct_add(bak, stride, temp);
3770
3771     distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3772
3773     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3774 }
3775
3776 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3777     MpegEncContext * const s= (MpegEncContext *)c;
3778     const uint8_t *scantable= s->intra_scantable.permutated;
3779     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3780     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3781     int i, last, run, bits, level, start_i;
3782     const int esc_length= s->ac_esc_length;
3783     uint8_t * length;
3784     uint8_t * last_length;
3785
3786     assert(h==8);
3787
3788     s->dsp.diff_pixels(temp, src1, src2, stride);
3789
3790     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3791
3792     bits=0;
3793
3794     if (s->mb_intra) {
3795         start_i = 1;
3796         length     = s->intra_ac_vlc_length;
3797         last_length= s->intra_ac_vlc_last_length;
3798         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3799     } else {
3800         start_i = 0;
3801         length     = s->inter_ac_vlc_length;
3802         last_length= s->inter_ac_vlc_last_length;
3803     }
3804
3805     if(last>=start_i){
3806         run=0;
3807         for(i=start_i; i<last; i++){
3808             int j= scantable[i];
3809             level= temp[j];
3810
3811             if(level){
3812                 level+=64;
3813                 if((level&(~127)) == 0){
3814                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3815                 }else
3816                     bits+= esc_length;
3817                 run=0;
3818             }else
3819                 run++;
3820         }
3821         i= scantable[last];
3822
3823         level= temp[i] + 64;
3824
3825         assert(level - 64);
3826
3827         if((level&(~127)) == 0){
3828             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3829         }else
3830             bits+= esc_length;
3831     }
3832
3833     return bits;
3834 }
3835
/**
 * Sum of absolute differences between each pixel and the one directly
 * below it, over a 16 pixel wide block (h-1 row pairs).
 * @param c      unused
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      number of rows
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t *below= s + stride;

        for(col=0; col<16; col++)
            score+= FFABS(s[col] - below[col]);
        s+= stride;
    }

    return score;
}
3850
/**
 * Sum of absolute vertical gradients of the difference s1 - s2, over a
 * 16 pixel wide block (h-1 row pairs).
 * @param c      unused
 * @param stride distance in bytes between rows, for both blocks
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int here = s1[col]        - s2[col];
            const int below= s1[col+stride] - s2[col+stride];
            const int grad = here - below;

            score+= FFABS(grad);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3865
#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between each pixel and the one directly
 * below it, over a 16 pixel wide block (h-1 row pairs).
 * @param c      unused
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      number of rows
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int step= s[col] - s[col+stride];
            score+= SQ(step);
        }
        s+= stride;
    }

    return score;
}
3881
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-pixel-wide block: for every row but the last, the residual of a pixel
 * is compared with the residual of the pixel directly below it.
 *
 * @param c      unused
 * @param s1     first pixel of the first block
 * @param s2     first pixel of the second block
 * @param stride offset between two consecutive rows (shared by both blocks)
 * @param h      number of rows
 * @return accumulated squared differences (0 when h <= 1)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for(row = 1; row < h; row++){
        uint8_t *below1 = s1 + stride;
        uint8_t *below2 = s2 + stride;

        for(col = 0; col < 16; col++){
            const int delta = s1[col] - s2[col] - below1[col] + below2[col];
            total += SQ(delta);
        }
        s1 = below1;
        s2 = below2;
    }

    return total;
}
3896
/**
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector, size elements
 * @param pix2 second vector, size elements
 * @param size number of elements to compare
 * @return sum over all elements of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int acc = 0;
    int n;

    for(n = 0; n < size; n++){
        const int diff = pix1[n] - pix2[n];
        acc += diff*diff;
    }
    return acc;
}
3905
/*
 * Instantiate the functions named by the second argument from the 8x8
 * functions named by the first one. WRAPPER8_16_SQ is defined outside this
 * section; presumably it builds the 16-wide comparison out of several calls
 * to the 8x8 one -- TODO confirm against the macro definition.
 * The resulting *16_c functions are the ones dsputil_init() stores in the
 * comparison tables (hadamard8_diff, dct_sad, dct_max, quant_psnr, rd, bit;
 * dct264_sad only when CONFIG_GPL is defined).
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3916
/**
 * In-place element-wise product: dst[i] = dst[i] * src[i] for i in [0, len).
 *
 * @param dst vector that is both multiplicand and destination
 * @param src multiplier vector
 * @param len number of elements; nothing is done when len <= 0
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int remaining = len;

    while(remaining-- > 0){
        *dst *= *src;
        dst++;
        src++;
    }
}
3922
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 *
 * @param dst  destination vector, len elements
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, back;

    for(fwd = 0, back = len-1; fwd < len; fwd++, back--)
        dst[fwd] = src0[fwd] * src1[back];
}
3929
/**
 * Strided multiply-add: dst[i*step] = src0[i] * src1[i] + src2[i] + src3
 * for i in [0, len).
 *
 * @param dst  destination, written every step elements
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 per-element addend
 * @param src3 integer constant added to every element
 * @param len  number of elements to produce
 * @param step distance between two written elements of dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int in, out;

    for(in = 0, out = 0; in < len; in++, out += step)
        dst[out] = src0[in] * src1[in] + src2[in] + src3;
}
3935
/**
 * Windowed combination of two len-element inputs into 2*len outputs.
 * For k in [0, len), with m = 2*len-1-k:
 *   dst[k] = src0[k]*win[m] - src1[len-1-k]*win[k] + add_bias
 *   dst[m] = src0[k]*win[k] + src1[len-1-k]*win[m] + add_bias
 *
 * @param dst      destination, 2*len elements
 * @param src0     first input, len elements, read forwards
 * @param src1     second input, len elements, read backwards
 * @param win      window, 2*len elements
 * @param add_bias constant added to every output
 * @param len      half of the output length
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;

    for(k = 0; k < len; k++){
        const int mirror = 2*len - 1 - k;
        /* fetch all four operands before storing anything */
        const float a  = src0[k];
        const float b  = src1[len - 1 - k];
        const float wk = win[k];
        const float wm = win[mirror];

        dst[k]      = a*wm - b*wk + add_bias;
        dst[mirror] = a*wk + b*wm + add_bias;
    }
}
3950
/**
 * Convert integers to float and scale them: dst[i] = src[i] * mul.
 *
 * @param dst destination vector, len elements
 * @param src integer source vector
 * @param mul scale factor
 * @param len number of elements; nothing is done when len <= 0
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int remaining = len;

    while(remaining-- > 0){
        *dst = *src * mul;
        dst++;
        src++;
    }
}
3956
3957 static av_always_inline int float_to_int16_one(const float *src){
3958     int_fast32_t tmp = *(const int32_t*)src;
3959     if(tmp & 0xf0000){
3960         tmp = (0x43c0ffff - tmp)>>31;
3961         // is this faster on some gcc/cpu combinations?
3962 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3963 //      else                 tmp = 0;
3964     }
3965     return tmp - 0x8000;
3966 }
3967
/**
 * Convert len float samples to int16_t with float_to_int16_one().
 *
 * @param dst destination, len elements
 * @param src source samples, len elements
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long i; /* same width as len so the index cannot overflow before reaching it */
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}
3973
/**
 * Convert planar float samples to interleaved int16_t: sample i of channel ch
 * is written to dst[i*channels + ch].
 *
 * @param dst      destination, len*channels elements
 * @param src      array of channels pointers, each to len samples
 * @param len      number of samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    long i, j; /* as wide as len: neither the sample nor the output index may overflow first */
    int c;
    if(channels==2){
        /* dedicated stereo path: both channels handled in one pass */
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
3987
/**
 * Add v2 to v1 element by element, in place.
 *
 * @param v1    accumulator vector, updated in place
 * @param v2    vector to add
 * @param order number of elements
 */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    for (; order; order--, v1++, v2++)
        *v1 += *v2;
}
3993
/**
 * Subtract v2 from v1 element by element, in place.
 *
 * @param v1    vector that is decreased in place
 * @param v2    vector to subtract
 * @param order number of elements
 */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    for (; order; order--, v1++, v2++)
        *v1 -= *v2;
}
3999
/**
 * Scalar product of two int16_t vectors where every single product is
 * shifted right by shift before being accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to each product
 * @return sum of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order; order--, v1++, v2++) {
        const int product = *v1 * *v2;
        acc += product >> shift;
    }

    return acc;
}
4009
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 IDCT: transforms the 8 consecutive coefficients
 * b[0..7] in place. Results are rounded and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    /* fetch the whole row first, b[] is overwritten at the end */
    const int c0 = b[0], c1 = b[1], c2 = b[2], c3 = b[3];
    const int c4 = b[4], c5 = b[5], c6 = b[6], c7 = b[7];
    const int round = 1<<7;

    /* step 1: weighted pairs */
    const int o1 = W1*c1 + W7*c7;
    const int o7 = W7*c1 - W1*c7;
    const int o5 = W5*c5 + W3*c3;
    const int o3 = W3*c5 - W5*c3;
    const int e2 = W2*c2 + W6*c6;
    const int e6 = W6*c2 - W2*c6;
    const int e0 = W0*c0 + W0*c4;
    const int e4 = W0*c0 - W0*c4;

    /* step 2: 181/256 is close to 1/sqrt(2) */
    const int rot1 = (181*(o1-o5+o7-o3)+128)>>8;
    const int rot2 = (181*(o1-o5-o7+o3)+128)>>8;

    /* step 3: butterflies, outputs k and 7-k share the same even part */
    const int even0 = e0 + e2, odd0 = o1 + o5;
    const int even1 = e4 + e6;
    const int even2 = e4 - e6;
    const int even3 = e0 - e2, odd3 = o7 + o3;

    b[0] = (even0 + odd0 + round)>>8;
    b[7] = (even0 - odd0 + round)>>8;
    b[1] = (even1 + rot1 + round)>>8;
    b[6] = (even1 - rot1 + round)>>8;
    b[2] = (even2 + rot2 + round)>>8;
    b[5] = (even2 - rot2 + round)>>8;
    b[3] = (even3 + odd3 + round)>>8;
    b[4] = (even3 - odd3 + round)>>8;
}
4045 static void wmv2_idct_col(short * b)
4046 {
4047     int s1,s2;
4048     int a0,a1,a2,a3,a4,a5,a6,a7;
4049     /*step 1, with extended precision*/
4050     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4051     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4052     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4053     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4054     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4055     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4056     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4057     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4058     /*step 2*/
4059     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4060     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4061     /*step 3*/
4062     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4063     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4064     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4065     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4066
4067     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4068     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4069     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4070     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4071 }
/**
 * In-place WMV2 IDCT of a 64-coefficient block: all 8 rows are transformed
 * first, then all 8 columns.
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for(row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for(col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
4082 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4083  converted */
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the transformed block
 * to put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_wmv2_idct_c() on block in place, then hand the transformed block
 * to add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block, then hand the block to put_pixels_clamped_c()
 * together with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block, then hand the block to add_pixels_clamped_c()
 * together with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4104
/**
 * Run j_rev_dct4() on block, then hand the block to put_pixels_clamped4_c()
 * together with dest and line_size. Installed by dsputil_init() when
 * avctx->lowres is 1.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Run j_rev_dct4() on block, then hand the block to add_pixels_clamped4_c()
 * together with dest and line_size. Installed by dsputil_init() when
 * avctx->lowres is 1.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4115
/**
 * Run j_rev_dct2() on block, then hand the block to put_pixels_clamped2_c()
 * together with dest and line_size. Installed by dsputil_init() when
 * avctx->lowres is 2.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Run j_rev_dct2() on block, then hand the block to add_pixels_clamped2_c()
 * together with dest and line_size. Installed by dsputil_init() when
 * avctx->lowres is 2.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4126
4127 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4128 {
4129     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4130
4131     dest[0] = cm[(block[0] + 4)>>3];
4132 }
4133 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4134 {
4135     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4136
4137     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4138 }
4139
4140 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4141
4142 /* init static data */
4143 void dsputil_static_init(void)
4144 {
4145     int i;
4146
4147     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4148     for(i=0;i<MAX_NEG_CROP;i++) {
4149         ff_cropTbl[i] = 0;
4150         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4151     }
4152
4153     for(i=0;i<512;i++) {
4154         ff_squareTbl[i] = (i - 256) * (i - 256);
4155     }
4156
4157     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4158 }
4159
4160 int ff_check_alignment(void){
4161     static int did_fail=0;
4162     DECLARE_ALIGNED_16(int, aligned);
4163
4164     if((long)&aligned & 15){
4165         if(!did_fail){
4166 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4167             av_log(NULL, AV_LOG_ERROR,
4168                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4169                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4170                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4171                 "Do not report crashes to FFmpeg developers.\n");
4172 #endif
4173             did_fail=1;
4174         }
4175         return -1;
4176     }
4177     return 0;
4178 }
4179
/**
 * Fill a DSPContext with function pointers.
 *
 * Order of operations (later steps depend on earlier ones):
 *  1. forward DCT (encoders only) and inverse DCT are chosen from
 *     avctx->dct_algo, avctx->idct_algo and avctx->lowres;
 *  2. every other entry gets its C implementation;
 *  3. the enabled architecture-specific init functions are called with the
 *     same context -- presumably they replace some of the C pointers and may
 *     change idct_permutation_type; TODO confirm in the per-arch files;
 *  4. 2-tap qpel entries that are still NULL are copied from the H.264 qpel
 *     tables;
 *  5. c->idct_permutation is derived from c->idct_permutation_type.
 *
 * @param c     context to fill
 * @param avctx codec context supplying dct_algo, idct_algo and lowres
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    /* the return value is not used here */
    ff_check_alignment();

    /* forward DCT, selected by avctx->dct_algo */
#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT: lowres 1, 2 and 3 select the reduced j_rev_dct4/2/1
     * variants, otherwise avctx->idct_algo decides */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            /* the only C IDCT here that needs a coefficient permutation */
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }

    /* generic pixel helpers, C versions */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* installs the plain, _x2, _y2 and _xy2 functions of width NUM into
     * row IDX of c->PFX_pixels_tab */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* tpel tables: entry X + 4*Y holds mcXY; entries 3 and 7 are not
     * assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* installs the 16 mcXY functions of size NUM into row IDX of
     * c->PFX_pixels_tab; entry X + 4*Y holds mcXY */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    /* NOTE(review): put_h264_qpel gets rows 0..3 but avg_h264_qpel only
     * rows 0..2; no row 3 is installed for avg_h264_qpel in this function */
    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    c->draw_edges = draw_edges_c;

    /* codec-specific DSP init functions, compiled in per enabled codec */
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions: SET_CMP_FUNC stores the 16_c function at [0]
     * and the 8x8_c function at [1]; index 4 receives the intra16 variants
     * where they are assigned below */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
#ifdef CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    /* no C implementation is installed for this entry */
    c->h264_loop_filter_strength= NULL;

    if (ENABLE_ANY_H263) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#ifdef CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
#ifdef CONFIG_FLAC_ENCODER
    c->flac_compute_autocorr = ff_flac_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->add_int16 = add_int16_c;
    c->sub_int16 = sub_int16_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;

    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    /* default prefetch is a no-op */
    c->prefetch= just_return;

    /* clear the 2-tap qpel tables so that the loop after the arch-specific
     * init can tell which entries are still unset */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture-specific initialisation; must run after all C defaults
     * above have been stored */
    if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
    if (ENABLE_ARMV4L)   dsputil_init_armv4l(c, avctx);
    if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
    if (ENABLE_VIS)      dsputil_init_vis   (c, avctx);
    if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
    if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
    if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
    if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
    if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);

    /* every 2-tap qpel entry that is still NULL falls back to the H.264 qpel
     * function at the same position.
     * NOTE(review): i runs to 64 on row 0, so this presumably relies on the
     * tables being 4 rows of 16 contiguous entries -- confirm against the
     * DSPContext declaration */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation matching the selected IDCT; an
     * unknown type is only logged and leaves idct_permutation untouched */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        /* swaps the row (bits 3..5) and column (bits 0..2) of the index */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
4566