libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  *
  22  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  23  */
  24
  25 /**
  26  * @file dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "mpegvideo.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "snow.h"
  36
  37 /* snow.c */
  38 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  39
  40 /* vorbis.c */
  41 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  42
/* Lookup table read by the *_pixels_clamped*_c routines in this file through
 * `ff_cropTbl + MAX_NEG_CROP`, which makes indices in
 * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] addressable. It is only zero-initialized
 * here; the contents are filled in elsewhere (presumably a clamp-to-0..255
 * map, judging by the names of its users -- confirm against the init code). */
   43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table read by pix_norm1_c and the sse*_c routines through
 * `ff_squareTbl + 256`, so that signed pixel differences in [-256, 255] can be
 * used directly as indices. Only zero-initialized here; filled in elsewhere
 * (presumably entry d holds d*d -- confirm against the init code). */
   44 uint32_t ff_squareTbl[512] = {0, };
  45
/* Zigzag scan table: entry i is the coefficient index (0..63) visited at scan
 * position i. The sequence 0, 1, 8, 16, 9, 2, ... matches the classic zigzag
 * over an 8x8 block addressed as row*8 + column. */
   46 const uint8_t ff_zigzag_direct[64] = {
   47     0,   1,  8, 16,  9,  2,  3, 10,
   48     17, 24, 32, 25, 18, 11,  4,  5,
   49     12, 19, 26, 33, 40, 48, 41, 34,
   50     27, 20, 13,  6,  7, 14, 21, 28,
   51     35, 42, 49, 56, 57, 50, 43, 36,
   52     29, 22, 15, 23, 30, 37, 44, 51,
   53     58, 59, 52, 45, 38, 31, 39, 46,
   54     53, 60, 61, 54, 47, 55, 62, 63
   55 };
  56
  57 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  58    specification, we interleave the fields */
/* 64-entry scan table. The entries come in pairs (k, k + 8): 0, 8, 1, 9,
 * 16, 24, ... -- i.e. two vertically adjacent positions when the index is
 * read as row*8 + column, which is how the field interleaving shows up. */
   59 const uint8_t ff_zigzag248_direct[64] = {
   60      0,  8,  1,  9, 16, 24,  2, 10,
   61     17, 25, 32, 40, 48, 56, 33, 41,
   62     18, 26,  3, 11,  4, 12, 19, 27,
   63     34, 42, 49, 57, 50, 58, 35, 43,
   64     20, 28,  5, 13,  6, 14, 21, 29,
   65     36, 44, 51, 59, 52, 60, 37, 45,
   66     22, 30,  7, 15, 23, 31, 38, 46,
   67     53, 61, 54, 62, 39, 47, 55, 63,
   68 };
  69
   70 /* non-permuted inverse of ff_zigzag_direct, + 1, for the MMX quantizer */
/* 64 uint16_t entries declared through DECLARE_ALIGNED_8 and zero-initialized
 * here. Nothing in the visible part of this file writes to it, so it is
 * presumably filled during initialization -- confirm. */
   71 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  72
/* Alternate scan table, 64 coefficient indices in scan order. It starts along
 * the first row (0, 1, 2, 3) before moving to 8, 9, 16, 17, i.e. it favours
 * horizontal runs if indices are read as row*8 + column. */
   73 const uint8_t ff_alternate_horizontal_scan[64] = {
   74     0,  1,   2,  3,  8,  9, 16, 17,
   75     10, 11,  4,  5,  6,  7, 15, 14,
   76     13, 12, 19, 18, 24, 25, 32, 33,
   77     26, 27, 20, 21, 22, 23, 28, 29,
   78     30, 31, 34, 35, 40, 41, 48, 49,
   79     42, 43, 36, 37, 38, 39, 44, 45,
   80     46, 47, 50, 51, 56, 57, 58, 59,
   81     52, 53, 54, 55, 60, 61, 62, 63,
   82 };
  83
/* Alternate scan table, 64 coefficient indices in scan order. It starts down
 * the first column (0, 8, 16, 24) before moving to 1, 9, 2, 10, i.e. it
 * favours vertical runs if indices are read as row*8 + column. */
   84 const uint8_t ff_alternate_vertical_scan[64] = {
   85     0,  8,  16, 24,  1,  9,  2, 10,
   86     17, 25, 32, 40, 48, 56, 57, 49,
   87     41, 33, 26, 18,  3, 11,  4, 12,
   88     19, 27, 34, 42, 50, 58, 35, 43,
   89     51, 59, 20, 28,  5, 13,  6, 14,
   90     21, 29, 36, 44, 52, 60, 37, 45,
   91     53, 61, 22, 30,  7, 15, 23, 31,
   92     38, 46, 54, 62, 39, 47, 55, 63,
   93 };
  94
  95 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* 32-bit fixed-point reciprocals. For b >= 2 the entry is 2^32 / b rounded up
 * (e.g. [2] = 2147483648 = 2^31, [3] = 1431655766, [255] = 16843010).
 * Entry 0 is 0 and entry 1 is 4294967295 because 2^32 itself does not fit in
 * a uint32_t, which is why the identity is only stated for b >= 2. The `>>32`
 * in that identity implies the product has to be formed in 64 bits. */
   96 const uint32_t ff_inverse[256]={
   97          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
   98  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
   99  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  100  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  101  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  102  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  103   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  104   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  105   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  106   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  107   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  108   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  109   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  110   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  111   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  112   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  113   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  114   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  115   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  116   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  117   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  118   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  119   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  120   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  121   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  122   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  123   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  124   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  125   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  126   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  127   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  128   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  129 };
 130
 131 /* Input permutation for the simple_idct_mmx */
/* File-local table of 64 coefficient indices, each in 0x00..0x3F. It is not
 * referenced in the visible part of this file; its consumer is presumably the
 * code that sets up the IDCT permutation -- confirm. */
  132 static const uint8_t simple_mmx_permutation[64]={
  133         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
  134         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
  135         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
  136         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
  137         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
  138         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
  139         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
  140         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
  141 };
 142
 143 static int pix_sum_c(uint8_t * pix, int line_size)
 144 {
 145     int s, i, j;
 146
 147     s = 0;
 148     for (i = 0; i < 16; i++) {
 149         for (j = 0; j < 16; j += 8) {
 150             s += pix[0];
 151             s += pix[1];
 152             s += pix[2];
 153             s += pix[3];
 154             s += pix[4];
 155             s += pix[5];
 156             s += pix[6];
 157             s += pix[7];
 158             pix += 8;
 159         }
 160         pix += line_size - 16;
 161     }
 162     return s;
 163 }
 164
 165 static int pix_norm1_c(uint8_t * pix, int line_size)
 166 {
 167     int s, i, j;
 168     uint32_t *sq = ff_squareTbl + 256;
 169
 170     s = 0;
 171     for (i = 0; i < 16; i++) {
 172         for (j = 0; j < 16; j += 8) {
 173 #if 0
 174             s += sq[pix[0]];
 175             s += sq[pix[1]];
 176             s += sq[pix[2]];
 177             s += sq[pix[3]];
 178             s += sq[pix[4]];
 179             s += sq[pix[5]];
 180             s += sq[pix[6]];
 181             s += sq[pix[7]];
 182 #else
 183 #if LONG_MAX > 2147483647
 184             register uint64_t x=*(uint64_t*)pix;
 185             s += sq[x&0xff];
 186             s += sq[(x>>8)&0xff];
 187             s += sq[(x>>16)&0xff];
 188             s += sq[(x>>24)&0xff];
 189             s += sq[(x>>32)&0xff];
 190             s += sq[(x>>40)&0xff];
 191             s += sq[(x>>48)&0xff];
 192             s += sq[(x>>56)&0xff];
 193 #else
 194             register uint32_t x=*(uint32_t*)pix;
 195             s += sq[x&0xff];
 196             s += sq[(x>>8)&0xff];
 197             s += sq[(x>>16)&0xff];
 198             s += sq[(x>>24)&0xff];
 199             x=*(uint32_t*)(pix+4);
 200             s += sq[x&0xff];
 201             s += sq[(x>>8)&0xff];
 202             s += sq[(x>>16)&0xff];
 203             s += sq[(x>>24)&0xff];
 204 #endif
 205 #endif
 206             pix += 8;
 207         }
 208         pix += line_size - 16;
 209     }
 210     return s;
 211 }
 212
 213 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 214     int i;
 215
 216     for(i=0; i+8<=w; i+=8){
 217         dst[i+0]= bswap_32(src[i+0]);
 218         dst[i+1]= bswap_32(src[i+1]);
 219         dst[i+2]= bswap_32(src[i+2]);
 220         dst[i+3]= bswap_32(src[i+3]);
 221         dst[i+4]= bswap_32(src[i+4]);
 222         dst[i+5]= bswap_32(src[i+5]);
 223         dst[i+6]= bswap_32(src[i+6]);
 224         dst[i+7]= bswap_32(src[i+7]);
 225     }
 226     for(;i<w; i++){
 227         dst[i+0]= bswap_32(src[i+0]);
 228     }
 229 }
 230
 231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 232 {
 233     int s, i;
 234     uint32_t *sq = ff_squareTbl + 256;
 235
 236     s = 0;
 237     for (i = 0; i < h; i++) {
 238         s += sq[pix1[0] - pix2[0]];
 239         s += sq[pix1[1] - pix2[1]];
 240         s += sq[pix1[2] - pix2[2]];
 241         s += sq[pix1[3] - pix2[3]];
 242         pix1 += line_size;
 243         pix2 += line_size;
 244     }
 245     return s;
 246 }
 247
 248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 249 {
 250     int s, i;
 251     uint32_t *sq = ff_squareTbl + 256;
 252
 253     s = 0;
 254     for (i = 0; i < h; i++) {
 255         s += sq[pix1[0] - pix2[0]];
 256         s += sq[pix1[1] - pix2[1]];
 257         s += sq[pix1[2] - pix2[2]];
 258         s += sq[pix1[3] - pix2[3]];
 259         s += sq[pix1[4] - pix2[4]];
 260         s += sq[pix1[5] - pix2[5]];
 261         s += sq[pix1[6] - pix2[6]];
 262         s += sq[pix1[7] - pix2[7]];
 263         pix1 += line_size;
 264         pix2 += line_size;
 265     }
 266     return s;
 267 }
 268
 269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 270 {
 271     int s, i;
 272     uint32_t *sq = ff_squareTbl + 256;
 273
 274     s = 0;
 275     for (i = 0; i < h; i++) {
 276         s += sq[pix1[ 0] - pix2[ 0]];
 277         s += sq[pix1[ 1] - pix2[ 1]];
 278         s += sq[pix1[ 2] - pix2[ 2]];
 279         s += sq[pix1[ 3] - pix2[ 3]];
 280         s += sq[pix1[ 4] - pix2[ 4]];
 281         s += sq[pix1[ 5] - pix2[ 5]];
 282         s += sq[pix1[ 6] - pix2[ 6]];
 283         s += sq[pix1[ 7] - pix2[ 7]];
 284         s += sq[pix1[ 8] - pix2[ 8]];
 285         s += sq[pix1[ 9] - pix2[ 9]];
 286         s += sq[pix1[10] - pix2[10]];
 287         s += sq[pix1[11] - pix2[11]];
 288         s += sq[pix1[12] - pix2[12]];
 289         s += sq[pix1[13] - pix2[13]];
 290         s += sq[pix1[14] - pix2[14]];
 291         s += sq[pix1[15] - pix2[15]];
 292
 293         pix1 += line_size;
 294         pix2 += line_size;
 295     }
 296     return s;
 297 }
 298
 299
 300 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 301 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 302     int s, i, j;
 303     const int dec_count= w==8 ? 3 : 4;
 304     int tmp[32*32];
 305     int level, ori;
 306     static const int scale[2][2][4][4]={
 307       {
 308         {
 309             // 9/7 8x8 dec=3
 310             {268, 239, 239, 213},
 311             {  0, 224, 224, 152},
 312             {  0, 135, 135, 110},
 313         },{
 314             // 9/7 16x16 or 32x32 dec=4
 315             {344, 310, 310, 280},
 316             {  0, 320, 320, 228},
 317             {  0, 175, 175, 136},
 318             {  0, 129, 129, 102},
 319         }
 320       },{
 321         {
 322             // 5/3 8x8 dec=3
 323             {275, 245, 245, 218},
 324             {  0, 230, 230, 156},
 325             {  0, 138, 138, 113},
 326         },{
 327             // 5/3 16x16 or 32x32 dec=4
 328             {352, 317, 317, 286},
 329             {  0, 328, 328, 233},
 330             {  0, 180, 180, 140},
 331             {  0, 132, 132, 105},
 332         }
 333       }
 334     };
 335
 336     for (i = 0; i < h; i++) {
 337         for (j = 0; j < w; j+=4) {
 338             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 339             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 340             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 341             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 342         }
 343         pix1 += line_size;
 344         pix2 += line_size;
 345     }
 346
 347     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 348
 349     s=0;
 350     assert(w==h);
 351     for(level=0; level<dec_count; level++){
 352         for(ori= level ? 1 : 0; ori<4; ori++){
 353             int size= w>>(dec_count-level);
 354             int sx= (ori&1) ? size : 0;
 355             int stride= 32<<(dec_count-level);
 356             int sy= (ori&2) ? stride>>1 : 0;
 357
 358             for(i=0; i<size; i++){
 359                 for(j=0; j<size; j++){
 360                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 361                     s += FFABS(v);
 362                 }
 363             }
 364         }
 365     }
 366     assert(s>=0);
 367     return s>>9;
 368 }
 369
 370 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 371     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 372 }
 373
 374 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 375     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 376 }
 377
 378 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 379     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 380 }
 381
 382 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 383     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 384 }
 385
 386 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 387     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 388 }
 389
 390 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 391     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 392 }
 393 #endif
 394
 395 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 396 {
 397     int i;
 398
 399     /* read the pixels */
 400     for(i=0;i<8;i++) {
 401         block[0] = pixels[0];
 402         block[1] = pixels[1];
 403         block[2] = pixels[2];
 404         block[3] = pixels[3];
 405         block[4] = pixels[4];
 406         block[5] = pixels[5];
 407         block[6] = pixels[6];
 408         block[7] = pixels[7];
 409         pixels += line_size;
 410         block += 8;
 411     }
 412 }
 413
 414 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 415                           const uint8_t *s2, int stride){
 416     int i;
 417
 418     /* read the pixels */
 419     for(i=0;i<8;i++) {
 420         block[0] = s1[0] - s2[0];
 421         block[1] = s1[1] - s2[1];
 422         block[2] = s1[2] - s2[2];
 423         block[3] = s1[3] - s2[3];
 424         block[4] = s1[4] - s2[4];
 425         block[5] = s1[5] - s2[5];
 426         block[6] = s1[6] - s2[6];
 427         block[7] = s1[7] - s2[7];
 428         s1 += stride;
 429         s2 += stride;
 430         block += 8;
 431     }
 432 }
 433
 434
 435 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 436                                  int line_size)
 437 {
 438     int i;
 439     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 440
 441     /* read the pixels */
 442     for(i=0;i<8;i++) {
 443         pixels[0] = cm[block[0]];
 444         pixels[1] = cm[block[1]];
 445         pixels[2] = cm[block[2]];
 446         pixels[3] = cm[block[3]];
 447         pixels[4] = cm[block[4]];
 448         pixels[5] = cm[block[5]];
 449         pixels[6] = cm[block[6]];
 450         pixels[7] = cm[block[7]];
 451
 452         pixels += line_size;
 453         block += 8;
 454     }
 455 }
 456
 457 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 458                                  int line_size)
 459 {
 460     int i;
 461     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 462
 463     /* read the pixels */
 464     for(i=0;i<4;i++) {
 465         pixels[0] = cm[block[0]];
 466         pixels[1] = cm[block[1]];
 467         pixels[2] = cm[block[2]];
 468         pixels[3] = cm[block[3]];
 469
 470         pixels += line_size;
 471         block += 8;
 472     }
 473 }
 474
 475 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 476                                  int line_size)
 477 {
 478     int i;
 479     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 480
 481     /* read the pixels */
 482     for(i=0;i<2;i++) {
 483         pixels[0] = cm[block[0]];
 484         pixels[1] = cm[block[1]];
 485
 486         pixels += line_size;
 487         block += 8;
 488     }
 489 }
 490
 491 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 492                                         uint8_t *restrict pixels,
 493                                         int line_size)
 494 {
 495     int i, j;
 496
 497     for (i = 0; i < 8; i++) {
 498         for (j = 0; j < 8; j++) {
 499             if (*block < -128)
 500                 *pixels = 0;
 501             else if (*block > 127)
 502                 *pixels = 255;
 503             else
 504                 *pixels = (uint8_t)(*block + 128);
 505             block++;
 506             pixels++;
 507         }
 508         pixels += (line_size - 8);
 509     }
 510 }
 511
 512 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 513                           int line_size)
 514 {
 515     int i;
 516     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 517
 518     /* read the pixels */
 519     for(i=0;i<8;i++) {
 520         pixels[0] = cm[pixels[0] + block[0]];
 521         pixels[1] = cm[pixels[1] + block[1]];
 522         pixels[2] = cm[pixels[2] + block[2]];
 523         pixels[3] = cm[pixels[3] + block[3]];
 524         pixels[4] = cm[pixels[4] + block[4]];
 525         pixels[5] = cm[pixels[5] + block[5]];
 526         pixels[6] = cm[pixels[6] + block[6]];
 527         pixels[7] = cm[pixels[7] + block[7]];
 528         pixels += line_size;
 529         block += 8;
 530     }
 531 }
 532
 533 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 534                           int line_size)
 535 {
 536     int i;
 537     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 538
 539     /* read the pixels */
 540     for(i=0;i<4;i++) {
 541         pixels[0] = cm[pixels[0] + block[0]];
 542         pixels[1] = cm[pixels[1] + block[1]];
 543         pixels[2] = cm[pixels[2] + block[2]];
 544         pixels[3] = cm[pixels[3] + block[3]];
 545         pixels += line_size;
 546         block += 8;
 547     }
 548 }
 549
 550 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 551                           int line_size)
 552 {
 553     int i;
 554     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 555
 556     /* read the pixels */
 557     for(i=0;i<2;i++) {
 558         pixels[0] = cm[pixels[0] + block[0]];
 559         pixels[1] = cm[pixels[1] + block[1]];
 560         pixels += line_size;
 561         block += 8;
 562     }
 563 }
 564
 565 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 566 {
 567     int i;
 568     for(i=0;i<8;i++) {
 569         pixels[0] += block[0];
 570         pixels[1] += block[1];
 571         pixels[2] += block[2];
 572         pixels[3] += block[3];
 573         pixels[4] += block[4];
 574         pixels[5] += block[5];
 575         pixels[6] += block[6];
 576         pixels[7] += block[7];
 577         pixels += line_size;
 578         block += 8;
 579     }
 580 }
 581
 582 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 583 {
 584     int i;
 585     for(i=0;i<4;i++) {
 586         pixels[0] += block[0];
 587         pixels[1] += block[1];
 588         pixels[2] += block[2];
 589         pixels[3] += block[3];
 590         pixels += line_size;
 591         block += 4;
 592     }
 593 }
 594
 595 static int sum_abs_dctelem_c(DCTELEM *block)
 596 {
 597     int sum=0, i;
 598     for(i=0; i<64; i++)
 599         sum+= FFABS(block[i]);
 600     return sum;
 601 }
 602
 603 #if 0
 604
/* Disabled (this branch sits under `#if 0`) variant of the PIXOP2 generator
 * that handles 8 pixels per row with one 64-bit load (LD64) and one 64-bit
 * store through OP.
 *
 * For a given OPNAME/OP pair it expands to:
 *   - a plain copy routine and x2 / y2 / xy2 routines, the latter three also
 *     in a no_rnd flavour;
 *   - a list of CALL_2X_PIXELS(...) invocations naming *_pixels16_* functions
 *     (presumably 16-pixel-wide wrappers around the 8-wide ones -- the macro
 *     is defined elsewhere).
 *
 * x2/y2 average two packed-byte words without letting carries cross byte
 * lanes: (a|b) - (((a^b)&0xFE..FE)>>1) rounds up, while
 * (a&b) + (((a^b)&0xFE..FE)>>1) (no_rnd) rounds down.
 * xy2 averages four neighbours: every byte is split into its low 2 bits
 * (l0/l1, which also carry the rounding constant 0x02, or 0x01 for no_rnd)
 * and its high 6 bits pre-shifted by 2 (h0/h1), and the parts are recombined
 * as h0+h1+(((l0+l1)>>2)&0x0F..0F). Its loop handles two rows per iteration,
 * reusing the previous row's partial sums.
 *
 * NOTE(review): the first generated function is named OPNAME##_pixels, but
 * the first CALL_2X_PIXELS line refers to OPNAME##_pixels_c; in the xy2
 * routines the loop-local `a`/`b` shadow the outer const `a`/`b`. Both would
 * need attention if this branch were ever re-enabled. */
  605 #define PIXOP2(OPNAME, OP) \
  606 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  607 {\
  608     int i;\
  609     for(i=0; i<h; i++){\
  610         OP(*((uint64_t*)block), LD64(pixels));\
  611         pixels+=line_size;\
  612         block +=line_size;\
  613     }\
  614 }\
  615 \
  616 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  617 {\
  618     int i;\
  619     for(i=0; i<h; i++){\
  620         const uint64_t a= LD64(pixels  );\
  621         const uint64_t b= LD64(pixels+1);\
  622         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  623         pixels+=line_size;\
  624         block +=line_size;\
  625     }\
  626 }\
  627 \
  628 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  629 {\
  630     int i;\
  631     for(i=0; i<h; i++){\
  632         const uint64_t a= LD64(pixels  );\
  633         const uint64_t b= LD64(pixels+1);\
  634         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  635         pixels+=line_size;\
  636         block +=line_size;\
  637     }\
  638 }\
  639 \
  640 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  641 {\
  642     int i;\
  643     for(i=0; i<h; i++){\
  644         const uint64_t a= LD64(pixels          );\
  645         const uint64_t b= LD64(pixels+line_size);\
  646         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  647         pixels+=line_size;\
  648         block +=line_size;\
  649     }\
  650 }\
  651 \
  652 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  653 {\
  654     int i;\
  655     for(i=0; i<h; i++){\
  656         const uint64_t a= LD64(pixels          );\
  657         const uint64_t b= LD64(pixels+line_size);\
  658         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  659         pixels+=line_size;\
  660         block +=line_size;\
  661     }\
  662 }\
  663 \
  664 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  665 {\
  666         int i;\
  667         const uint64_t a= LD64(pixels  );\
  668         const uint64_t b= LD64(pixels+1);\
  669         uint64_t l0=  (a&0x0303030303030303ULL)\
  670                     + (b&0x0303030303030303ULL)\
  671                     + 0x0202020202020202ULL;\
  672         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  673                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  674         uint64_t l1,h1;\
  675 \
  676         pixels+=line_size;\
  677         for(i=0; i<h; i+=2){\
  678             uint64_t a= LD64(pixels  );\
  679             uint64_t b= LD64(pixels+1);\
  680             l1=  (a&0x0303030303030303ULL)\
  681                + (b&0x0303030303030303ULL);\
  682             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  683               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  684             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  685             pixels+=line_size;\
  686             block +=line_size;\
  687             a= LD64(pixels  );\
  688             b= LD64(pixels+1);\
  689             l0=  (a&0x0303030303030303ULL)\
  690                + (b&0x0303030303030303ULL)\
  691                + 0x0202020202020202ULL;\
  692             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  693               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  694             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  695             pixels+=line_size;\
  696             block +=line_size;\
  697         }\
  698 }\
  699 \
  700 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  701 {\
  702         int i;\
  703         const uint64_t a= LD64(pixels  );\
  704         const uint64_t b= LD64(pixels+1);\
  705         uint64_t l0=  (a&0x0303030303030303ULL)\
  706                     + (b&0x0303030303030303ULL)\
  707                     + 0x0101010101010101ULL;\
  708         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  709                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  710         uint64_t l1,h1;\
  711 \
  712         pixels+=line_size;\
  713         for(i=0; i<h; i+=2){\
  714             uint64_t a= LD64(pixels  );\
  715             uint64_t b= LD64(pixels+1);\
  716             l1=  (a&0x0303030303030303ULL)\
  717                + (b&0x0303030303030303ULL);\
  718             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  719               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  720             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  721             pixels+=line_size;\
  722             block +=line_size;\
  723             a= LD64(pixels  );\
  724             b= LD64(pixels+1);\
  725             l0=  (a&0x0303030303030303ULL)\
  726                + (b&0x0303030303030303ULL)\
  727                + 0x0101010101010101ULL;\
  728             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  729               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  730             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  731             pixels+=line_size;\
  732             block +=line_size;\
  733         }\
  734 }\
  735 \
  736 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
  737 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
  738 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
  739 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
  740 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
  741 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
  742 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 743
/* 64-bit form of op_avg for the disabled (`#if 0`) branch: assigns to `a` the
 * per-byte average of `a` and `b`, rounded up, for eight packed bytes at
 * once. The 0xFE mask keeps the shifted bits from leaking into the
 * neighbouring byte. Both arguments are evaluated more than once, so they
 * must be free of side effects. */
  744 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 745 #else // 64 bit variant
 746
 747 #define PIXOP2(OPNAME, OP) \
 748 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 749     int i;\
 750     for(i=0; i<h; i++){\
 751         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 752         pixels+=line_size;\
 753         block +=line_size;\
 754     }\
 755 }\
 756 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 757     int i;\
 758     for(i=0; i<h; i++){\
 759         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 760         pixels+=line_size;\
 761         block +=line_size;\
 762     }\
 763 }\
 764 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 765     int i;\
 766     for(i=0; i<h; i++){\
 767         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 768         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 769         pixels+=line_size;\
 770         block +=line_size;\
 771     }\
 772 }\
 773 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 774     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 775 }\
 776 \
 777 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 778                                                 int src_stride1, int src_stride2, int h){\
 779     int i;\
 780     for(i=0; i<h; i++){\
 781         uint32_t a,b;\
 782         a= LD32(&src1[i*src_stride1  ]);\
 783         b= LD32(&src2[i*src_stride2  ]);\
 784         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 785         a= LD32(&src1[i*src_stride1+4]);\
 786         b= LD32(&src2[i*src_stride2+4]);\
 787         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 788     }\
 789 }\
 790 \
 791 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 792                                                 int src_stride1, int src_stride2, int h){\
 793     int i;\
 794     for(i=0; i<h; i++){\
 795         uint32_t a,b;\
 796         a= LD32(&src1[i*src_stride1  ]);\
 797         b= LD32(&src2[i*src_stride2  ]);\
 798         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 799         a= LD32(&src1[i*src_stride1+4]);\
 800         b= LD32(&src2[i*src_stride2+4]);\
 801         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 802     }\
 803 }\
 804 \
 805 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 806                                                 int src_stride1, int src_stride2, int h){\
 807     int i;\
 808     for(i=0; i<h; i++){\
 809         uint32_t a,b;\
 810         a= LD32(&src1[i*src_stride1  ]);\
 811         b= LD32(&src2[i*src_stride2  ]);\
 812         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 813     }\
 814 }\
 815 \
 816 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 817                                                 int src_stride1, int src_stride2, int h){\
 818     int i;\
 819     for(i=0; i<h; i++){\
 820         uint32_t a,b;\
 821         a= LD16(&src1[i*src_stride1  ]);\
 822         b= LD16(&src2[i*src_stride2  ]);\
 823         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 824     }\
 825 }\
 826 \
 827 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 828                                                 int src_stride1, int src_stride2, int h){\
 829     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 830     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 831 }\
 832 \
 833 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 834                                                 int src_stride1, int src_stride2, int h){\
 835     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 836     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 837 }\
 838 \
 839 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 840     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 841 }\
 842 \
 843 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 844     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 845 }\
 846 \
 847 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 848     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 849 }\
 850 \
 851 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 852     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 853 }\
 854 \
 855 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 856                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 857     int i;\
 858     for(i=0; i<h; i++){\
 859         uint32_t a, b, c, d, l0, l1, h0, h1;\
 860         a= LD32(&src1[i*src_stride1]);\
 861         b= LD32(&src2[i*src_stride2]);\
 862         c= LD32(&src3[i*src_stride3]);\
 863         d= LD32(&src4[i*src_stride4]);\
 864         l0=  (a&0x03030303UL)\
 865            + (b&0x03030303UL)\
 866            + 0x02020202UL;\
 867         h0= ((a&0xFCFCFCFCUL)>>2)\
 868           + ((b&0xFCFCFCFCUL)>>2);\
 869         l1=  (c&0x03030303UL)\
 870            + (d&0x03030303UL);\
 871         h1= ((c&0xFCFCFCFCUL)>>2)\
 872           + ((d&0xFCFCFCFCUL)>>2);\
 873         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 874         a= LD32(&src1[i*src_stride1+4]);\
 875         b= LD32(&src2[i*src_stride2+4]);\
 876         c= LD32(&src3[i*src_stride3+4]);\
 877         d= LD32(&src4[i*src_stride4+4]);\
 878         l0=  (a&0x03030303UL)\
 879            + (b&0x03030303UL)\
 880            + 0x02020202UL;\
 881         h0= ((a&0xFCFCFCFCUL)>>2)\
 882           + ((b&0xFCFCFCFCUL)>>2);\
 883         l1=  (c&0x03030303UL)\
 884            + (d&0x03030303UL);\
 885         h1= ((c&0xFCFCFCFCUL)>>2)\
 886           + ((d&0xFCFCFCFCUL)>>2);\
 887         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 888     }\
 889 }\
 890 \
 891 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 892     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 893 }\
 894 \
 895 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 896     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 897 }\
 898 \
 899 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 900     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 901 }\
 902 \
 903 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 904     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 905 }\
 906 \
 907 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 908                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 909     int i;\
 910     for(i=0; i<h; i++){\
 911         uint32_t a, b, c, d, l0, l1, h0, h1;\
 912         a= LD32(&src1[i*src_stride1]);\
 913         b= LD32(&src2[i*src_stride2]);\
 914         c= LD32(&src3[i*src_stride3]);\
 915         d= LD32(&src4[i*src_stride4]);\
 916         l0=  (a&0x03030303UL)\
 917            + (b&0x03030303UL)\
 918            + 0x01010101UL;\
 919         h0= ((a&0xFCFCFCFCUL)>>2)\
 920           + ((b&0xFCFCFCFCUL)>>2);\
 921         l1=  (c&0x03030303UL)\
 922            + (d&0x03030303UL);\
 923         h1= ((c&0xFCFCFCFCUL)>>2)\
 924           + ((d&0xFCFCFCFCUL)>>2);\
 925         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 926         a= LD32(&src1[i*src_stride1+4]);\
 927         b= LD32(&src2[i*src_stride2+4]);\
 928         c= LD32(&src3[i*src_stride3+4]);\
 929         d= LD32(&src4[i*src_stride4+4]);\
 930         l0=  (a&0x03030303UL)\
 931            + (b&0x03030303UL)\
 932            + 0x01010101UL;\
 933         h0= ((a&0xFCFCFCFCUL)>>2)\
 934           + ((b&0xFCFCFCFCUL)>>2);\
 935         l1=  (c&0x03030303UL)\
 936            + (d&0x03030303UL);\
 937         h1= ((c&0xFCFCFCFCUL)>>2)\
 938           + ((d&0xFCFCFCFCUL)>>2);\
 939         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 940     }\
 941 }\
 942 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 943                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 944     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 945     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 946 }\
 947 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 948                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 949     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 950     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 951 }\
 952 \
 953 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 954 {\
 955         int i, a0, b0, a1, b1;\
 956         a0= pixels[0];\
 957         b0= pixels[1] + 2;\
 958         a0 += b0;\
 959         b0 += pixels[2];\
 960 \
 961         pixels+=line_size;\
 962         for(i=0; i<h; i+=2){\
 963             a1= pixels[0];\
 964             b1= pixels[1];\
 965             a1 += b1;\
 966             b1 += pixels[2];\
 967 \
 968             block[0]= (a1+a0)>>2; /* FIXME non put */\
 969             block[1]= (b1+b0)>>2;\
 970 \
 971             pixels+=line_size;\
 972             block +=line_size;\
 973 \
 974             a0= pixels[0];\
 975             b0= pixels[1] + 2;\
 976             a0 += b0;\
 977             b0 += pixels[2];\
 978 \
 979             block[0]= (a1+a0)>>2;\
 980             block[1]= (b1+b0)>>2;\
 981             pixels+=line_size;\
 982             block +=line_size;\
 983         }\
 984 }\
 985 \
 986 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 987 {\
 988         int i;\
 989         const uint32_t a= LD32(pixels  );\
 990         const uint32_t b= LD32(pixels+1);\
 991         uint32_t l0=  (a&0x03030303UL)\
 992                     + (b&0x03030303UL)\
 993                     + 0x02020202UL;\
 994         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 995                    + ((b&0xFCFCFCFCUL)>>2);\
 996         uint32_t l1,h1;\
 997 \
 998         pixels+=line_size;\
 999         for(i=0; i<h; i+=2){\
1000             uint32_t a= LD32(pixels  );\
1001             uint32_t b= LD32(pixels+1);\
1002             l1=  (a&0x03030303UL)\
1003                + (b&0x03030303UL);\
1004             h1= ((a&0xFCFCFCFCUL)>>2)\
1005               + ((b&0xFCFCFCFCUL)>>2);\
1006             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007             pixels+=line_size;\
1008             block +=line_size;\
1009             a= LD32(pixels  );\
1010             b= LD32(pixels+1);\
1011             l0=  (a&0x03030303UL)\
1012                + (b&0x03030303UL)\
1013                + 0x02020202UL;\
1014             h0= ((a&0xFCFCFCFCUL)>>2)\
1015               + ((b&0xFCFCFCFCUL)>>2);\
1016             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1017             pixels+=line_size;\
1018             block +=line_size;\
1019         }\
1020 }\
1021 \
1022 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1023 {\
1024     int j;\
1025     for(j=0; j<2; j++){\
1026         int i;\
1027         const uint32_t a= LD32(pixels  );\
1028         const uint32_t b= LD32(pixels+1);\
1029         uint32_t l0=  (a&0x03030303UL)\
1030                     + (b&0x03030303UL)\
1031                     + 0x02020202UL;\
1032         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1033                    + ((b&0xFCFCFCFCUL)>>2);\
1034         uint32_t l1,h1;\
1035 \
1036         pixels+=line_size;\
1037         for(i=0; i<h; i+=2){\
1038             uint32_t a= LD32(pixels  );\
1039             uint32_t b= LD32(pixels+1);\
1040             l1=  (a&0x03030303UL)\
1041                + (b&0x03030303UL);\
1042             h1= ((a&0xFCFCFCFCUL)>>2)\
1043               + ((b&0xFCFCFCFCUL)>>2);\
1044             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045             pixels+=line_size;\
1046             block +=line_size;\
1047             a= LD32(pixels  );\
1048             b= LD32(pixels+1);\
1049             l0=  (a&0x03030303UL)\
1050                + (b&0x03030303UL)\
1051                + 0x02020202UL;\
1052             h0= ((a&0xFCFCFCFCUL)>>2)\
1053               + ((b&0xFCFCFCFCUL)>>2);\
1054             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1055             pixels+=line_size;\
1056             block +=line_size;\
1057         }\
1058         pixels+=4-line_size*(h+1);\
1059         block +=4-line_size*h;\
1060     }\
1061 }\
1062 \
1063 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1064 {\
1065     int j;\
1066     for(j=0; j<2; j++){\
1067         int i;\
1068         const uint32_t a= LD32(pixels  );\
1069         const uint32_t b= LD32(pixels+1);\
1070         uint32_t l0=  (a&0x03030303UL)\
1071                     + (b&0x03030303UL)\
1072                     + 0x01010101UL;\
1073         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1074                    + ((b&0xFCFCFCFCUL)>>2);\
1075         uint32_t l1,h1;\
1076 \
1077         pixels+=line_size;\
1078         for(i=0; i<h; i+=2){\
1079             uint32_t a= LD32(pixels  );\
1080             uint32_t b= LD32(pixels+1);\
1081             l1=  (a&0x03030303UL)\
1082                + (b&0x03030303UL);\
1083             h1= ((a&0xFCFCFCFCUL)>>2)\
1084               + ((b&0xFCFCFCFCUL)>>2);\
1085             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086             pixels+=line_size;\
1087             block +=line_size;\
1088             a= LD32(pixels  );\
1089             b= LD32(pixels+1);\
1090             l0=  (a&0x03030303UL)\
1091                + (b&0x03030303UL)\
1092                + 0x01010101UL;\
1093             h0= ((a&0xFCFCFCFCUL)>>2)\
1094               + ((b&0xFCFCFCFCUL)>>2);\
1095             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1096             pixels+=line_size;\
1097             block +=line_size;\
1098         }\
1099         pixels+=4-line_size*(h+1);\
1100         block +=4-line_size*h;\
1101     }\
1102 }\
1103 \
1104 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1105 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1106 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1107 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1108 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1109 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1110 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1111 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1112
1113 #define op_avg(a, b) a = rnd_avg32(a, b)
1114 #endif
1115 #define op_put(a, b) a = b
1116
1117 PIXOP2(avg, op_avg)
1118 PIXOP2(put, op_put)
1119 #undef op_avg
1120 #undef op_put
1121
/*
 * Rounded averages of two and of four values.
 * avg2 adds 1 before halving, avg4 adds 2 before dividing by four.
 * Every argument is parenthesised so that an argument containing a
 * low-precedence operator (|, ?:, >>, ...) still expands as a single operand.
 * Arguments are evaluated exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1124
/**
 * Single-stride front end for put_no_rnd_pixels16_l2().
 * Forwards dst and the two sources a and b, passing the same stride for the
 * destination and for both sources.
 * @param h forwarded unchanged as the last argument (row count of the helper)
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1128
/**
 * Single-stride front end for put_no_rnd_pixels8_l2().
 * Forwards dst and the two sources a and b, passing the same stride for the
 * destination and for both sources.
 * @param h forwarded unchanged as the last argument (row count of the helper)
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1132
/**
 * Bilinear blend of an 8-pixel-wide block with 1/16 sample weights.
 * Each output pixel mixes the source pixel, its right neighbour, the pixel
 * below and the pixel below-right; the four weights sum to 256, rounder is
 * added and the result is shifted right by 8.
 * @param dst     destination, 8 pixels written per row
 * @param src     source, 9 pixels read per row on rows 0..h
 * @param stride  distance between rows, used for both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand samples
 * @param y16     vertical weight of the lower samples
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16-x16)*(16-y16);
    const int w_top_right = (   x16)*(16-y16);
    const int w_bot_left  = (16-x16)*(   y16);
    const int w_bot_right = (   x16)*(   y16);
    int row;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left *src[col]   + w_top_right*src[col+1]
                      + w_bot_left *below[col] + w_bot_right*below[col+1]
                      + rounder) >> 8;
        }
    }
}
1155
/**
 * Fills an 8-pixel-wide, h-row block of dst by sampling src along an
 * incrementally stepped position.
 *
 * For every row the running position (vx, vy) restarts at (ox, oy) and is
 * advanced by (dxx, dyx) after each pixel; (ox, oy) themselves are advanced
 * by (dxy, dyy) after each row.  The position is shifted right by 16, its low
 * `shift` bits are taken as the fractional part and the remaining bits as the
 * integer sample coordinate.
 *
 * @param dst    destination, written at dst[y*stride + x] for x in 0..7
 * @param src    source picture, indexed as src[x + y*stride]
 * @param stride row distance used for both dst and src
 * @param h      number of rows to produce
 * @param shift  number of fractional bits kept after the >>16
 * @param r      value added before the final >>(shift*2)
 * @param width  horizontal bound used for range checks and clamping
 * @param height vertical bound used for range checks and clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* The bounds are lowered by one: av_clip() below then clamps to
     * width-1 / height-1, and the "in range" tests only pass when the +1
     * (resp. +stride) neighbour is still below the original bound.
     * NOTE(review): presumably width/height are the picture dimensions - confirm. */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            /* split into fractional (low `shift` bits) and integer parts */
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* The unsigned cast makes negative coordinates compare as huge
             * values, so a single test rejects both "< 0" and ">= bound". */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* both coordinates in range: blend all four neighbours */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* row out of range: clamp it and blend horizontally only;
                     * the factor s keeps the scale equal to the 2-D case */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* column out of range: clamp it and blend vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both out of range: copy the clamped sample unchanged */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1213
/**
 * Plain copy case of the tpel "put" family: dispatches on the block width to
 * the matching put_pixelsN_c() routine.  Widths other than 2, 4, 8 and 16
 * do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1222
/**
 * tpel "put", horizontal blend weighted 2:1 towards the current pixel.
 * Output = (683 * (2*cur + right + 1)) >> 11; 683/2048 is roughly 1/3.
 * Writes a width x height block and reads one column past width.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1233
/**
 * tpel "put", horizontal blend weighted 1:2 towards the right neighbour.
 * Output = (683 * (cur + 2*right + 1)) >> 11; 683/2048 is roughly 1/3.
 * Writes a width x height block and reads one column past width.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1244
/**
 * tpel "put", vertical blend weighted 2:1 towards the current pixel.
 * Output = (683 * (2*cur + below + 1)) >> 11; 683/2048 is roughly 1/3.
 * Writes a width x height block and reads one row past height.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1255
/**
 * tpel "put", two-dimensional blend with weights 4/3/3/2 on the current,
 * right, below and below-right pixels (sum 12).
 * Output = (2731 * (weighted sum + 6)) >> 15; 2731/32768 is roughly 1/12.
 * Reads one column past width and one row past height.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1266
/**
 * tpel "put", two-dimensional blend with weights 3/2/4/3 on the current,
 * right, below and below-right pixels (sum 12).
 * Output = (2731 * (weighted sum + 6)) >> 15; 2731/32768 is roughly 1/12.
 * Reads one column past width and one row past height.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1277
/**
 * tpel "put", vertical blend weighted 1:2 towards the pixel below.
 * Output = (683 * (cur + 2*below + 1)) >> 11; 683/2048 is roughly 1/3.
 * Writes a width x height block and reads one row past height.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1288
/**
 * tpel "put", two-dimensional blend with weights 3/4/2/3 on the current,
 * right, below and below-right pixels (sum 12).
 * Output = (2731 * (weighted sum + 6)) >> 15; 2731/32768 is roughly 1/12.
 * Reads one column past width and one row past height.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1299
/**
 * tpel "put", two-dimensional blend with weights 2/3/3/4 on the current,
 * right, below and below-right pixels (sum 12).
 * Output = (2731 * (weighted sum + 6)) >> 15; 2731/32768 is roughly 1/12.
 * Reads one column past width and one row past height.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1310
/**
 * Unfiltered case of the tpel "avg" family: dispatches on the block width to
 * the matching avg_pixelsN_c() routine.  Widths other than 2, 4, 8 and 16
 * do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1319
/**
 * tpel "avg", horizontal blend weighted 2:1 towards the current pixel.
 * The filtered value (683 * (2*cur + right + 1)) >> 11 is averaged with the
 * existing dst pixel, rounding up.
 * Reads one column past width.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int filtered = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1330
/**
 * tpel "avg", horizontal blend weighted 1:2 towards the right neighbour.
 * The filtered value (683 * (cur + 2*right + 1)) >> 11 is averaged with the
 * existing dst pixel, rounding up.
 * Reads one column past width.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int filtered = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1341
/**
 * tpel "avg", vertical blend weighted 2:1 towards the current pixel.
 * The filtered value (683 * (2*cur + below + 1)) >> 11 is averaged with the
 * existing dst pixel, rounding up.
 * Reads one row past height.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int filtered = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1352
/**
 * tpel "avg", two-dimensional blend with weights 4/3/3/2 on the current,
 * right, below and below-right pixels.
 * The filtered value (2731 * (weighted sum + 6)) >> 15 is averaged with the
 * existing dst pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int filtered = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1363
/**
 * tpel "avg", two-dimensional blend with weights 3/2/4/3 on the current,
 * right, below and below-right pixels.
 * The filtered value (2731 * (weighted sum + 6)) >> 15 is averaged with the
 * existing dst pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int filtered = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1374
/**
 * tpel "avg", vertical blend weighted 1:2 towards the pixel below.
 * The filtered value (683 * (cur + 2*below + 1)) >> 11 is averaged with the
 * existing dst pixel, rounding up.
 * Reads one row past height.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int filtered = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1385
/**
 * tpel "avg", two-dimensional blend with weights 3/4/2/3 on the current,
 * right, below and below-right pixels.
 * The filtered value (2731 * (weighted sum + 6)) >> 15 is averaged with the
 * existing dst pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int filtered = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
1396
/**
 * tpel "avg", two-dimensional blend with weights 2/3/3/4 on the current,
 * right, below and below-right pixels.
 * The filtered value (2731 * (weighted sum + 6)) >> 15 is averaged with the
 * existing dst pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int filtered = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + filtered + 1) >> 1;
        }
    }
}
/*
 * Compiled out.  TPEL_WIDTH(width) would generate fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height) around the generic
 * put_tpel_pixels_mcXY_c(dst, src, stride, width, height) helpers.
 * NOTE(review): every generated body starts with `void`, so the line reads as
 * a declaration rather than a call to the helper; this would need fixing
 * before the block could be enabled.
 */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1428
/*
 * H264_CHROMA_MC(OPNAME, OP) generates three bilinear chroma interpolators,
 * OPNAME##h264_chroma_mc{2,4,8}_c, producing 2, 4 or 8 pixels per row.
 *
 * x and y must lie in 0..7 (checked with assert).  The four weights
 * (8-x)(8-y), x(8-y), (8-x)y and xy sum to 64 and are applied to the current
 * pixel, its right neighbour, the pixel below and the pixel below-right.
 * The unnormalised sum is handed to OP together with the destination pixel;
 * OP is responsible for rounding, scaling and storing.
 * Each row reads one source pixel more than it writes, and rows 0..h are read.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
        }\
    }\
}

/* op_put stores the sum rounded and scaled down by 64;
 * op_avg additionally averages that value with the old destination pixel,
 * rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1499
/**
 * 8-pixel-wide bilinear chroma interpolation with a reduced rounding term.
 * The weights (8-x)(8-y), x(8-y), (8-x)y and xy (sum 64) are applied to the
 * current pixel, its right neighbour, the pixel below and the pixel
 * below-right; 32 - 4 is added before the shift by 6, instead of the 32 used
 * by the rounding "put" operation.
 * @param dst    destination, 8 pixels written per row
 * @param src    source, 9 pixels read per row on rows 0..h
 * @param stride row distance for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fraction, must be 0..7 (asserted)
 * @param y      vertical fraction, must be 0..7 (asserted)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w00 = (8-x)*(8-y);
    const int w01 = (  x)*(8-y);
    const int w10 = (8-x)*(  y);
    const int w11 = (  x)*(  y);
    int row;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w00*src[col] + w01*src[col+1] + w10*below[col] + w11*below[col+1] + 32 - 4) >> 6;
        }
    }
}
1523
1524 #define QPEL_MC(r, OPNAME, RND, OP) \
1525 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1526     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1527     int i;\
1528     for(i=0; i<h; i++)\
1529     {\
1530         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1531         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1532         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1533         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1534         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1535         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1536         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1537         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1538         dst+=dstStride;\
1539         src+=srcStride;\
1540     }\
1541 }\
1542 \
1543 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 8-tap (-1,3,-6,20,20,-6,3,-1) filter: 9 source rows -> 8 output rows, 8 columns */\
1544     const int w=8; /* number of columns processed */\
1545     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; indexed by the op_* macros that are passed in as OP */\
1546     int i;\
1547     for(i=0; i<w; i++)\
1548     {\
1549         const int src0= src[0*srcStride]; /* load the 9 vertically adjacent samples of this column */\
1550         const int src1= src[1*srcStride];\
1551         const int src2= src[2*srcStride];\
1552         const int src3= src[3*srcStride];\
1553         const int src4= src[4*srcStride];\
1554         const int src5= src[5*srcStride];\
1555         const int src6= src[6*srcStride];\
1556         const int src7= src[7*srcStride];\
1557         const int src8= src[8*srcStride];\
1558         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4)); /* taps above row 0 are reflected back onto rows 0..2 */\
1559         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1560         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1561         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7)); /* rows 3 and 4 use the filter with no reflection */\
1562         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1563         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8)); /* taps below row 8 are reflected back onto rows 8..6 */\
1564         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1565         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1566         dst++; /* advance to the next column */\
1567         src++;\
1568     }\
1569 }\
1570 \
1571 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){ /* horizontal 8-tap (-1,3,-6,20,20,-6,3,-1) filter: src[0..16] -> dst[0..15] on each of h rows */\
1572     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; indexed by the op_* macros that are passed in as OP */\
1573     int i;\
1574     \
1575     for(i=0; i<h; i++)\
1576     {\
1577         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4])); /* taps left of src[0] are reflected back onto src[0..2] */\
1578         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1579         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1580         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7])); /* dst[3..12] use the filter with no reflection */\
1581         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1582         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1583         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1584         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1585         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1586         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1587         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1588         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1589         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1590         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16])); /* taps right of src[16] are reflected back onto src[16..14] */\
1591         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1592         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1593         dst+=dstStride; /* advance to the next row */\
1594         src+=srcStride;\
1595     }\
1596 }\
1597 \
1598 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 8-tap (-1,3,-6,20,20,-6,3,-1) filter: 17 source rows -> 16 output rows, 16 columns */\
1599     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; indexed by the op_* macros that are passed in as OP */\
1600     int i;\
1601     const int w=16; /* number of columns processed */\
1602     for(i=0; i<w; i++)\
1603     {\
1604         const int src0= src[0*srcStride]; /* load the 17 vertically adjacent samples of this column */\
1605         const int src1= src[1*srcStride];\
1606         const int src2= src[2*srcStride];\
1607         const int src3= src[3*srcStride];\
1608         const int src4= src[4*srcStride];\
1609         const int src5= src[5*srcStride];\
1610         const int src6= src[6*srcStride];\
1611         const int src7= src[7*srcStride];\
1612         const int src8= src[8*srcStride];\
1613         const int src9= src[9*srcStride];\
1614         const int src10= src[10*srcStride];\
1615         const int src11= src[11*srcStride];\
1616         const int src12= src[12*srcStride];\
1617         const int src13= src[13*srcStride];\
1618         const int src14= src[14*srcStride];\
1619         const int src15= src[15*srcStride];\
1620         const int src16= src[16*srcStride];\
1621         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 )); /* taps above row 0 are reflected back onto rows 0..2 */\
1622         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1623         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1624         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 )); /* rows 3..12 use the filter with no reflection */\
1625         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1626         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1627         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1628         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1629         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1630         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1631         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1632         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1633         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1634         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16)); /* taps below row 16 are reflected back onto rows 16..14 */\
1635         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1636         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1637         dst++; /* advance to the next column */\
1638         src++;\
1639     }\
1640 }\
1641 \
1642 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* no filtering: forwards to OPNAME pixels8_c with a height of 8 */\
1643     OPNAME ## pixels8_c(dst, src, stride, 8);\
1644 }\
1645 \
1646 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* combines src with its horizontal lowpass through pixels8_l2 */\
1647     uint8_t half[64]; /* 8x8 horizontally filtered block, stride 8 */\
1648     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); /* intermediate pass always uses the put variant selected by RND */\
1649     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8); /* pixels8_l2 is defined elsewhere; presumably averages its two sources */\
1650 }\
1651 \
1652 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass of 8 rows written straight to dst with the OPNAME operation */\
1653     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1654 }\
1655 \
1656 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* combines src shifted one sample right with the horizontal lowpass of src */\
1657     uint8_t half[64]; /* 8x8 horizontally filtered block, stride 8 */\
1658     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1659     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8); /* src+1: the unfiltered operand starts one column to the right */\
1660 }\
1661 \
1662 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the source block with its vertical lowpass through pixels8_l2 */\
1663     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1664     uint8_t half[64]; /* 8x8 vertically filtered block, stride 8 */\
1665     copy_block9(full, src, 16, stride, 9); /* copy_block9 is defined elsewhere; presumably copies 9 rows of 9 samples */\
1666     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1667     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1668 }\
1669 \
1670 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass of a local source copy, written to dst with the OPNAME operation */\
1671     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1672     copy_block9(full, src, 16, stride, 9);\
1673     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1674 }\
1675 \
1676 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the source block shifted one row down with the vertical lowpass */\
1677     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1678     uint8_t half[64]; /* 8x8 vertically filtered block, stride 8 */\
1679     copy_block9(full, src, 16, stride, 9);\
1680     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1681     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8); /* full+16: the unfiltered operand starts one row down (stride is 16) */\
1682 }\
1683 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: combines four planes (full, halfH, halfV, halfHV) through pixels8_l4 */\
1684     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1685     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1686     uint8_t halfV[64]; /* vertical lowpass of full, 8x8 */\
1687     uint8_t halfHV[64]; /* vertical lowpass of halfH, 8x8 */\
1688     copy_block9(full, src, 16, stride, 9);\
1689     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); /* 9 rows are needed because the vertical pass reads rows 0..8 */\
1690     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1691     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1692     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* pixels8_l4 is defined elsewhere; presumably averages four sources */\
1693 }\
1694 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH is merged with full first, then with its own vertical lowpass */\
1695     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1696     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1697     uint8_t halfHV[64]; /* vertical lowpass of the merged halfH, 8x8 */\
1698     copy_block9(full, src, 16, stride, 9);\
1699     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1700     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9); /* in place: halfH is both destination and first source, all 9 rows */\
1701     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1702     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8); /* only this final step uses the OPNAME operation */\
1703 }\
1704 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: four-plane combine with the unfiltered and vertical operands taken one column right */\
1705     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1706     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1707     uint8_t halfV[64]; /* vertical lowpass of full+1, 8x8 */\
1708     uint8_t halfHV[64]; /* vertical lowpass of halfH, 8x8 */\
1709     copy_block9(full, src, 16, stride, 9);\
1710     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1711     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* full+1: vertical pass starts one column to the right */\
1712     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1713     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1714 }\
1715 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH is merged with full shifted one column right, then with its vertical lowpass */\
1716     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1717     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1718     uint8_t halfHV[64]; /* vertical lowpass of the merged halfH, 8x8 */\
1719     copy_block9(full, src, 16, stride, 9);\
1720     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1721     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9); /* in place merge with the source one column to the right */\
1722     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1723     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1724 }\
1725 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: four-plane combine with full and halfH taken one row down */\
1726     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1727     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1728     uint8_t halfV[64]; /* vertical lowpass of full, 8x8 */\
1729     uint8_t halfHV[64]; /* vertical lowpass of halfH, 8x8 */\
1730     copy_block9(full, src, 16, stride, 9);\
1731     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1732     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1733     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1734     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* +16 and +8 each skip one row of their buffer */\
1735 }\
1736 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: merged halfH taken one row down is combined with its vertical lowpass */\
1737     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1738     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1739     uint8_t halfHV[64]; /* vertical lowpass of the merged halfH, 8x8 */\
1740     copy_block9(full, src, 16, stride, 9);\
1741     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1742     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9); /* in place: halfH is both destination and first source */\
1743     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1744     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8: start at the second row of halfH */\
1745 }\
1746 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: four-plane combine with operands shifted one column right and one row down */\
1747     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1748     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1749     uint8_t halfV[64]; /* vertical lowpass of full+1, 8x8 */\
1750     uint8_t halfHV[64]; /* vertical lowpass of halfH, 8x8 */\
1751     copy_block9(full, src, 16, stride, 9);\
1752     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1753     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* full+1: vertical pass starts one column to the right */\
1754     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1755     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8); /* full+17 = one row (16) plus one column; halfH+8 = one row */\
1756 }\
1757 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH merged with full+1, then its second row onward combined with its vertical lowpass */\
1758     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1759     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1760     uint8_t halfHV[64]; /* vertical lowpass of the merged halfH, 8x8 */\
1761     copy_block9(full, src, 16, stride, 9);\
1762     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1763     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9); /* in place merge with the source one column to the right */\
1764     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1765     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8: start at the second row of halfH */\
1766 }\
1767 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the horizontal lowpass with the horizontal-then-vertical lowpass */\
1768     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1769     uint8_t halfHV[64]; /* vertical lowpass of halfH, 8x8 */\
1770     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); /* filters src directly; no local copy is made here */\
1771     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1772     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1773 }\
1774 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the horizontal lowpass taken one row down with the horizontal-then-vertical lowpass */\
1775     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1776     uint8_t halfHV[64]; /* vertical lowpass of halfH, 8x8 */\
1777     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1778     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1779     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8); /* halfH+8: start at the second row of halfH */\
1780 }\
1781 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: combines the vertical lowpass with the horizontal-then-vertical lowpass */\
1782     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1783     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8; only feeds halfHV */\
1784     uint8_t halfV[64]; /* vertical lowpass of full, 8x8 */\
1785     uint8_t halfHV[64]; /* vertical lowpass of halfH, 8x8 */\
1786     copy_block9(full, src, 16, stride, 9);\
1787     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1788     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1789     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1790     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1791 }\
1792 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH merged with full, then vertically lowpassed straight into dst */\
1793     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1794     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1795     copy_block9(full, src, 16, stride, 9);\
1796     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1797     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9); /* in place: halfH is both destination and first source */\
1798     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); /* only this final step uses the OPNAME operation */\
1799 }\
1800 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: vertical lowpass taken one column right, combined with the horizontal-then-vertical lowpass */\
1801     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1802     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8; only feeds halfHV */\
1803     uint8_t halfV[64]; /* vertical lowpass of full+1, 8x8 */\
1804     uint8_t halfHV[64]; /* vertical lowpass of halfH, 8x8 */\
1805     copy_block9(full, src, 16, stride, 9);\
1806     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16); /* full+1: vertical pass starts one column to the right */\
1808     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1810 }\
1811 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH merged with full+1, then vertically lowpassed straight into dst */\
1812     uint8_t full[16*9]; /* local copy of the source, 9 rows at stride 16 */\
1813     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1814     copy_block9(full, src, 16, stride, 9);\
1815     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1816     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9); /* in place merge with the source one column to the right */\
1817     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1818 }\
1819 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass into a temporary, then vertical lowpass of it into dst */\
1820     uint8_t halfH[72]; /* horizontal lowpass, 9 rows at stride 8 */\
1821     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); /* 9 rows are needed because the vertical pass reads rows 0..8 */\
1822     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1823 }\
1824 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* no filtering: forwards to OPNAME pixels16_c with a height of 16 */\
1825     OPNAME ## pixels16_c(dst, src, stride, 16);\
1826 }\
1827 \
1828 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* combines src with its horizontal lowpass through pixels16_l2 */\
1829     uint8_t half[256]; /* 16x16 horizontally filtered block, stride 16 */\
1830     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); /* intermediate pass always uses the put variant selected by RND */\
1831     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16); /* pixels16_l2 is defined elsewhere; presumably averages its two sources */\
1832 }\
1833 \
1834 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass of 16 rows written straight to dst with the OPNAME operation */\
1835     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1836 }\
1837 \
1838 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* combines src shifted one sample right with the horizontal lowpass of src */\
1839     uint8_t half[256]; /* 16x16 horizontally filtered block, stride 16 */\
1840     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1841     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16); /* src+1: the unfiltered operand starts one column to the right */\
1842 }\
1843 \
1844 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the source block with its vertical lowpass through pixels16_l2 */\
1845     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1846     uint8_t half[256]; /* 16x16 vertically filtered block, stride 16 */\
1847     copy_block17(full, src, 24, stride, 17); /* copy_block17 is defined elsewhere; presumably copies 17 rows of 17 samples */\
1848     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1849     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1850 }\
1851 \
1852 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* vertical lowpass of a local source copy, written to dst with the OPNAME operation */\
1853     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1854     copy_block17(full, src, 24, stride, 17);\
1855     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1856 }\
1857 \
1858 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the source block shifted one row down with the vertical lowpass */\
1859     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1860     uint8_t half[256]; /* 16x16 vertically filtered block, stride 16 */\
1861     copy_block17(full, src, 24, stride, 17);\
1862     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1863     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16); /* full+24: the unfiltered operand starts one row down (stride is 24) */\
1864 }\
1865 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: combines four planes (full, halfH, halfV, halfHV) through pixels16_l4 */\
1866     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1867     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1868     uint8_t halfV[256]; /* vertical lowpass of full, 16x16 */\
1869     uint8_t halfHV[256]; /* vertical lowpass of halfH, 16x16 */\
1870     copy_block17(full, src, 24, stride, 17);\
1871     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); /* 17 rows are needed because the vertical pass reads rows 0..16 */\
1872     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1873     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1874     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* pixels16_l4 is defined elsewhere; presumably averages four sources */\
1875 }\
1876 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH is merged with full first, then with its own vertical lowpass */\
1877     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1878     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1879     uint8_t halfHV[256]; /* vertical lowpass of the merged halfH, 16x16 */\
1880     copy_block17(full, src, 24, stride, 17);\
1881     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1882     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* in place: halfH is both destination and first source, all 17 rows */\
1883     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1884     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16); /* only this final step uses the OPNAME operation */\
1885 }\
1886 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: four-plane combine with the unfiltered and vertical operands taken one column right */\
1887     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1888     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1889     uint8_t halfV[256]; /* vertical lowpass of full+1, 16x16 */\
1890     uint8_t halfHV[256]; /* vertical lowpass of halfH, 16x16 */\
1891     copy_block17(full, src, 24, stride, 17);\
1892     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1893     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* full+1: vertical pass starts one column to the right */\
1894     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1895     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1896 }\
1897 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH is merged with full shifted one column right, then with its vertical lowpass */\
1898     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1899     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1900     uint8_t halfHV[256]; /* vertical lowpass of the merged halfH, 16x16 */\
1901     copy_block17(full, src, 24, stride, 17);\
1902     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1903     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17); /* in place merge with the source one column to the right */\
1904     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1905     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1906 }\
1907 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: four-plane combine with full and halfH taken one row down */\
1908     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1909     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1910     uint8_t halfV[256]; /* vertical lowpass of full, 16x16 */\
1911     uint8_t halfHV[256]; /* vertical lowpass of halfH, 16x16 */\
1912     copy_block17(full, src, 24, stride, 17);\
1913     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1914     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1915     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1916     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* +24 and +16 each skip one row of their buffer */\
1917 }\
1918 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: merged halfH taken one row down is combined with its vertical lowpass */\
1919     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1920     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1921     uint8_t halfHV[256]; /* vertical lowpass of the merged halfH, 16x16 */\
1922     copy_block17(full, src, 24, stride, 17);\
1923     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1924     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* in place: halfH is both destination and first source */\
1925     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1926     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16: start at the second row of halfH */\
1927 }\
1928 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: four-plane combine with operands shifted one column right and one row down */\
1929     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1930     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1931     uint8_t halfV[256]; /* vertical lowpass of full+1, 16x16 */\
1932     uint8_t halfHV[256]; /* vertical lowpass of halfH, 16x16 */\
1933     copy_block17(full, src, 24, stride, 17);\
1934     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1935     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* full+1: vertical pass starts one column to the right */\
1936     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1937     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* full+25 = one row (24) plus one column; halfH+16 = one row */\
1938 }\
1939 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH merged with full+1, then its second row onward combined with its vertical lowpass */\
1940     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1941     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1942     uint8_t halfHV[256]; /* vertical lowpass of the merged halfH, 16x16 */\
1943     copy_block17(full, src, 24, stride, 17);\
1944     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1945     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17); /* in place merge with the source one column to the right */\
1946     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1947     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16: start at the second row of halfH */\
1948 }\
1949 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the horizontal lowpass with the horizontal-then-vertical lowpass */\
1950     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1951     uint8_t halfHV[256]; /* vertical lowpass of halfH, 16x16 */\
1952     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* filters src directly; no local copy is made here */\
1953     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1954     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1955 }\
1956 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* combines the horizontal lowpass taken one row down with the horizontal-then-vertical lowpass */\
1957     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1958     uint8_t halfHV[256]; /* vertical lowpass of halfH, 16x16 */\
1959     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1960     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1961     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16: start at the second row of halfH */\
1962 }\
1963 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: combines the vertical lowpass with the horizontal-then-vertical lowpass */\
1964     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1965     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16; only feeds halfHV */\
1966     uint8_t halfV[256]; /* vertical lowpass of full, 16x16 */\
1967     uint8_t halfHV[256]; /* vertical lowpass of halfH, 16x16 */\
1968     copy_block17(full, src, 24, stride, 17);\
1969     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1970     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1971     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1972     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1973 }\
1974 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH merged with full, then vertically lowpassed straight into dst */\
1975     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1976     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1977     copy_block17(full, src, 24, stride, 17);\
1978     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1979     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* in place: halfH is both destination and first source */\
1980     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); /* only this final step uses the OPNAME operation */\
1981 }\
1982 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static variant: vertical lowpass taken one column right, combined with the horizontal-then-vertical lowpass */\
1983     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1984     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16; only feeds halfHV */\
1985     uint8_t halfV[256]; /* vertical lowpass of full+1, 16x16 */\
1986     uint8_t halfHV[256]; /* vertical lowpass of halfH, 16x16 */\
1987     copy_block17(full, src, 24, stride, 17);\
1988     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1989     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* full+1: vertical pass starts one column to the right */\
1990     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1991     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1992 }\
1993 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* two-stage variant: halfH merged with full+1, then vertically lowpassed straight into dst */\
1994     uint8_t full[24*17]; /* local copy of the source, 17 rows at stride 24 */\
1995     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
1996     copy_block17(full, src, 24, stride, 17);\
1997     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1998     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17); /* in place merge with the source one column to the right */\
1999     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2000 }\
2001 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* horizontal lowpass into a temporary, then vertical lowpass of it into dst */\
2002     uint8_t halfH[272]; /* horizontal lowpass, 17 rows at stride 16 */\
2003     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* 17 rows are needed because the vertical pass reads rows 0..16 */\
2004     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2005 } /* no trailing backslash: this line ends the enclosing macro body */
2006
2007 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1) /* look up cm[(b+16)>>5], then average it with the current a, adding 1 before the halving */
2008 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1) /* same with bias 15 and no +1; no active QPEL_MC line below passes it */
2009 #define op_put(a, b) a = cm[((b) + 16)>>5] /* store cm[(b+16)>>5]; cm must be in scope at the expansion site (presumably clamps to the 8-bit range) */
2010 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5] /* store cm[(b+15)>>5]: bias 15 instead of 16 */
2011
2012 QPEL_MC(0, put_       , _       , op_put) /* instantiate the put_ family with op_put */
2013 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd) /* instantiate the put_no_rnd_ family; its intermediate passes use the _no_rnd_ put functions */
2014 QPEL_MC(0, avg_       , _       , op_avg) /* instantiate the avg_ family; its intermediate passes use the plain put functions */
2015 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2016 #undef op_avg /* the op_* helpers are local to the instantiations above */
2017 #undef op_avg_no_rnd
2018 #undef op_put
2019 #undef op_put_no_rnd
2020
2021 #if 1
2022 #define H264_LOWPASS(OPNAME, OP, OP2) \
2023 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap (1,-5,20,20,-5,1) filter on a 2x2 block; reads src[-2..4] on each row */\
2024     const int h=2; /* number of rows processed */\
2025     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; presumably consumed by the OP expansion */\
2026     int i;\
2027     for(i=0; i<h; i++)\
2028     {\
2029         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3])); /* no edge reflection: samples left and right of the block are read */\
2030         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2031         dst+=dstStride; /* advance to the next row */\
2032         src+=srcStride;\
2033     }\
2034 }\
2035 \
2036 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap (1,-5,20,20,-5,1) filter on a 2x2 block; reads rows -2..4 of each column */\
2037     const int w=2; /* number of columns processed */\
2038     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; presumably consumed by the OP expansion */\
2039     int i;\
2040     for(i=0; i<w; i++)\
2041     {\
2042         const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above row 0 */\
2043         const int srcA= src[-1*srcStride];\
2044         const int src0= src[0 *srcStride];\
2045         const int src1= src[1 *srcStride];\
2046         const int src2= src[2 *srcStride];\
2047         const int src3= src[3 *srcStride];\
2048         const int src4= src[4 *srcStride];\
2049         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2050         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2051         dst++; /* advance to the next column */\
2052         src++;\
2053     }\
2054 }\
2055 \
2056 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 2x2 separable 6-tap filter: horizontal pass into int16 tmp, then vertical pass over tmp stored with OP2 */\
2057     const int h=2;\
2058     const int w=2;\
2059     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; presumably consumed by the OP2 expansion */\
2060     int i;\
2061     src -= 2*srcStride; /* start two rows above the block: the vertical taps reach rows -2..h+2 */\
2062     for(i=0; i<h+5; i++) /* h+5 intermediate rows, stored in tmp as raw filter sums */\
2063     {\
2064         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2065         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2066         tmp+=tmpStride;\
2067         src+=srcStride;\
2068     }\
2069     tmp -= tmpStride*(h+5-2); /* rewind to the tmp row that corresponds to source row 0 */\
2070     for(i=0; i<w; i++)\
2071     {\
2072         const int tmpB= tmp[-2*tmpStride]; /* tmpB and tmpA are the two intermediate rows above row 0 */\
2073         const int tmpA= tmp[-1*tmpStride];\
2074         const int tmp0= tmp[0 *tmpStride];\
2075         const int tmp1= tmp[1 *tmpStride];\
2076         const int tmp2= tmp[2 *tmpStride];\
2077         const int tmp3= tmp[3 *tmpStride];\
2078         const int tmp4= tmp[4 *tmpStride];\
2079         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2080         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2081         dst++; /* advance to the next column */\
2082         tmp++;\
2083     }\
2084 }\
2085 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap (1,-5,20,20,-5,1) filter on a 4x4 block; reads src[-2..6] on each row */\
2086     const int h=4; /* number of rows processed */\
2087     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; presumably consumed by the OP expansion */\
2088     int i;\
2089     for(i=0; i<h; i++)\
2090     {\
2091         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3])); /* no edge reflection: samples left and right of the block are read */\
2092         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2093         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2094         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2095         dst+=dstStride; /* advance to the next row */\
2096         src+=srcStride;\
2097     }\
2098 }\
2099 \
2100 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap (1,-5,20,20,-5,1) filter on a 4x4 block; reads rows -2..6 of each column */\
2101     const int w=4; /* number of columns processed */\
2102     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; presumably consumed by the OP expansion */\
2103     int i;\
2104     for(i=0; i<w; i++)\
2105     {\
2106         const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above row 0 */\
2107         const int srcA= src[-1*srcStride];\
2108         const int src0= src[0 *srcStride];\
2109         const int src1= src[1 *srcStride];\
2110         const int src2= src[2 *srcStride];\
2111         const int src3= src[3 *srcStride];\
2112         const int src4= src[4 *srcStride];\
2113         const int src5= src[5 *srcStride];\
2114         const int src6= src[6 *srcStride];\
2115         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2116         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2117         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2118         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2119         dst++; /* advance to the next column */\
2120         src++;\
2121     }\
2122 }\
2123 \
2124 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 4x4 separable 6-tap filter: horizontal pass into int16 tmp, then vertical pass over tmp stored with OP2 */\
2125     const int h=4;\
2126     const int w=4;\
2127     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; presumably consumed by the OP2 expansion */\
2128     int i;\
2129     src -= 2*srcStride; /* start two rows above the block: the vertical taps reach rows -2..h+2 */\
2130     for(i=0; i<h+5; i++) /* h+5 intermediate rows, stored in tmp as raw filter sums */\
2131     {\
2132         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2133         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2134         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2135         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2136         tmp+=tmpStride;\
2137         src+=srcStride;\
2138     }\
2139     tmp -= tmpStride*(h+5-2); /* rewind to the tmp row that corresponds to source row 0 */\
2140     for(i=0; i<w; i++)\
2141     {\
2142         const int tmpB= tmp[-2*tmpStride]; /* tmpB and tmpA are the two intermediate rows above row 0 */\
2143         const int tmpA= tmp[-1*tmpStride];\
2144         const int tmp0= tmp[0 *tmpStride];\
2145         const int tmp1= tmp[1 *tmpStride];\
2146         const int tmp2= tmp[2 *tmpStride];\
2147         const int tmp3= tmp[3 *tmpStride];\
2148         const int tmp4= tmp[4 *tmpStride];\
2149         const int tmp5= tmp[5 *tmpStride];\
2150         const int tmp6= tmp[6 *tmpStride];\
2151         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2152         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2153         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2154         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2155         dst++; /* advance to the next column */\
2156         tmp++;\
2157     }\
2158 }\
2159 \
2160 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* horizontal 6-tap (1,-5,20,20,-5,1) filter on an 8x8 block; reads src[-2..10] on each row */\
2161     const int h=8; /* number of rows processed */\
2162     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; presumably consumed by the OP expansion */\
2163     int i;\
2164     for(i=0; i<h; i++)\
2165     {\
2166         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ])); /* no edge reflection: samples left and right of the block are read */\
2167         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2168         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2169         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2170         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2171         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2172         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2173         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2174         dst+=dstStride; /* advance to the next row */\
2175         src+=srcStride;\
2176     }\
2177 }\
2178 \
2179 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* vertical 6-tap (1,-5,20,20,-5,1) filter on an 8x8 block; reads rows -2..10 of each column */\
2180     const int w=8; /* number of columns processed */\
2181     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not used directly; presumably consumed by the OP expansion */\
2182     int i;\
2183     for(i=0; i<w; i++)\
2184     {\
2185         const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above row 0 */\
2186         const int srcA= src[-1*srcStride];\
2187         const int src0= src[0 *srcStride];\
2188         const int src1= src[1 *srcStride];\
2189         const int src2= src[2 *srcStride];\
2190         const int src3= src[3 *srcStride];\
2191         const int src4= src[4 *srcStride];\
2192         const int src5= src[5 *srcStride];\
2193         const int src6= src[6 *srcStride];\
2194         const int src7= src[7 *srcStride];\
2195         const int src8= src[8 *srcStride];\
2196         const int src9= src[9 *srcStride];\
2197         const int src10=src[10*srcStride];\
2198         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2199         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2200         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2201         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2202         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2203         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2204         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2205         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2206         dst++; /* advance to the next column */\
2207         src++;\
2208     }\
2209 }\
2210 \
2211 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2212     const int h=8;\
2213     const int w=8;\
2214     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2215     int i;\
2216     src -= 2*srcStride;\
2217     for(i=0; i<h+5; i++)\
2218     {\
2219         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2220         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2221         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2222         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2223         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2224         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2225         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2226         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2227         tmp+=tmpStride;\
2228         src+=srcStride;\
2229     }\
2230     tmp -= tmpStride*(h+5-2);\
2231     for(i=0; i<w; i++)\
2232     {\
2233         const int tmpB= tmp[-2*tmpStride];\
2234         const int tmpA= tmp[-1*tmpStride];\
2235         const int tmp0= tmp[0 *tmpStride];\
2236         const int tmp1= tmp[1 *tmpStride];\
2237         const int tmp2= tmp[2 *tmpStride];\
2238         const int tmp3= tmp[3 *tmpStride];\
2239         const int tmp4= tmp[4 *tmpStride];\
2240         const int tmp5= tmp[5 *tmpStride];\
2241         const int tmp6= tmp[6 *tmpStride];\
2242         const int tmp7= tmp[7 *tmpStride];\
2243         const int tmp8= tmp[8 *tmpStride];\
2244         const int tmp9= tmp[9 *tmpStride];\
2245         const int tmp10=tmp[10*tmpStride];\
2246         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2247         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2248         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2249         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2250         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2251         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2252         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2253         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2254         dst++;\
2255         tmp++;\
2256     }\
2257 }\
2258 \
2259 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2260     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2261     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2262     src += 8*srcStride;\
2263     dst += 8*dstStride;\
2264     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2265     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2266 }\
2267 \
2268 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2269     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2270     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2271     src += 8*srcStride;\
2272     dst += 8*dstStride;\
2273     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2274     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2275 }\
2276 \
2277 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2278     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2279     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2280     src += 8*srcStride;\
2281     dst += 8*dstStride;\
2282     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2283     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2284 }\
2285
#define H264_MC(OPNAME, SIZE) \
/* mc00: plain SIZE x SIZE pixel op, no filtering */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: pixels_l2 of src and its horizontal lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, filtH, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontal lowpass written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: pixels_l2 of src+1 and the horizontal lowpass of src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, filtH, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: pixels_l2 of the source rows and their vertical lowpass; SIZE+5 rows are first copied to a packed buffer */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, rowsMid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rowsMid, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertical lowpass of the copied rows written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rowsMid, stride, SIZE);\
}\
\
/* mc03: as mc01, but the unfiltered operand starts one row lower */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, rowsMid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rowsMid+SIZE, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: pixels_l2 of the horizontal lowpass at src and the vertical lowpass at src */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, rowsMid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: as mc11, with the vertical lowpass taken one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, rowsMid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: as mc11, with the horizontal lowpass taken one row lower */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, rowsMid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: horizontal lowpass one row lower, vertical lowpass one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, rowsMid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: two-pass (hv) lowpass written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid16[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, mid16, src, stride, SIZE, stride);\
}\
\
/* mc21: pixels_l2 of the horizontal lowpass at src and the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(filtHV, mid16, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: as mc21, with the horizontal lowpass taken one row lower */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t filtH[SIZE*SIZE];\
    uint8_t filtHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(filtHV, mid16, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(filtH, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtH, filtHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: pixels_l2 of the vertical lowpass at src and the hv lowpass */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t filtHV[SIZE*SIZE];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    put_h264_qpel ## SIZE ## _hv_lowpass(filtHV, mid16, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, rowsMid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtV, filtHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: as mc12, with the vertical lowpass taken one column to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    int16_t mid16[SIZE*(SIZE+5)];\
    uint8_t filtV[SIZE*SIZE];\
    uint8_t filtHV[SIZE*SIZE];\
    uint8_t * const rowsMid= rows + 2*SIZE;\
    put_h264_qpel ## SIZE ## _hv_lowpass(filtHV, mid16, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(filtV, rowsMid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, filtV, filtHV, stride, SIZE, SIZE, SIZE);\
}\
2422
2423 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2424 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2425 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2426 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2427 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2428
2429 H264_LOWPASS(put_       , op_put, op2_put)
2430 H264_LOWPASS(avg_       , op_avg, op2_avg)
2431 H264_MC(put_, 2)
2432 H264_MC(put_, 4)
2433 H264_MC(put_, 8)
2434 H264_MC(put_, 16)
2435 H264_MC(avg_, 4)
2436 H264_MC(avg_, 8)
2437 H264_MC(avg_, 16)
2438
2439 #undef op_avg
2440 #undef op_put
2441 #undef op2_avg
2442 #undef op2_put
2443 #endif
2444
/*
 * Per-sample operators for weighted prediction. They read the
 * parameters/locals (block, dst, src, weight, weights, weightd, offset,
 * log2_denom) of the functions generated by H264_WEIGHT below.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/*
 * H264_WEIGHT(W,H) generates, for a W x H block:
 *  - weight_h264_pixels<W>x<H>_c:   in place, block[x] = clip8((block[x]*weight + o) >> log2_denom)
 *    where o is offset scaled by 2^log2_denom plus the rounding term.
 *  - biweight_h264_pixels<W>x<H>_c: dst[x] = clip8((src[x]*weights + dst[x]*weightd + o) >> (log2_denom+1))
 *    where o is ((offset+1)|1) scaled by 2^log2_denom.
 * The offset scaling is done on an unsigned value: offset may be negative and
 * left-shifting a negative int is undefined behaviour.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int x, y; \
    offset = (unsigned)offset << log2_denom; \
    /* rounding term; there is nothing to round when log2_denom is 0 */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        for(x=0; x<W; x++){ \
            op_scale1(x); \
        } \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int x, y; \
    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(x=0; x<W; x++){ \
            op_scale2(x); \
        } \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2514
2515 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2516     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2517     int i;
2518
2519     for(i=0; i<h; i++){
2520         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2521         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2522         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2523         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2524         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2525         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2526         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2527         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2528         dst+=dstStride;
2529         src+=srcStride;
2530     }
2531 }
2532
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* The mc00 cases need no filtering: they forward to the generic
 * put/avg pixel routines with the block height filled in. */

/** 8x8 put. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}

/** 8x8 avg. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}

/** 16x16 put. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}

/** 16x16 avg. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2550
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/**
 * mc00 case of the VC-1 mspel functions: forwards to put_pixels8_c for
 * 8 rows. The rnd argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2559
2560 #if defined(CONFIG_H264_ENCODER)
2561 /* H264 specific */
2562 void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
2563 #endif /* CONFIG_H264_ENCODER */
2564
2565 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2566     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2567     int i;
2568
2569     for(i=0; i<w; i++){
2570         const int src_1= src[ -srcStride];
2571         const int src0 = src[0          ];
2572         const int src1 = src[  srcStride];
2573         const int src2 = src[2*srcStride];
2574         const int src3 = src[3*srcStride];
2575         const int src4 = src[4*srcStride];
2576         const int src5 = src[5*srcStride];
2577         const int src6 = src[6*srcStride];
2578         const int src7 = src[7*srcStride];
2579         const int src8 = src[8*srcStride];
2580         const int src9 = src[9*srcStride];
2581         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2582         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2583         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2584         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2585         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2586         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2587         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2588         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2589         src++;
2590         dst++;
2591     }
2592 }
2593
/** mspel mc00: no filtering, forwards to put_pixels8_c for 8 rows. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2597
/** mspel mc10: put_pixels8_l2 of src and its horizontal lowpass. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtH[8*8];

    wmv2_mspel8_h_lowpass(filtH, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtH, stride, stride, 8, 8);
}
2603
/** mspel mc20: horizontal lowpass written directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2607
/** mspel mc30: put_pixels8_l2 of src+1 and the horizontal lowpass of src. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtH[8*8];

    wmv2_mspel8_h_lowpass(filtH, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtH, stride, stride, 8, 8);
}
2613
/** mspel mc02: vertical lowpass written directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2617
/**
 * mspel mc12: put_pixels8_l2 of the vertical lowpass of src and the
 * horizontal-then-vertical lowpass.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11];   /* 11 horizontally filtered rows, starting one row above src */
    uint8_t filtV[8*8];
    uint8_t filtHV[8*8];

    wmv2_mspel8_v_lowpass(filtV, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    /* skip the extra top row so the vertical filter can reach it as row -1 */
    wmv2_mspel8_v_lowpass(filtHV, rowsH+8, 8, 8, 8);
    put_pixels8_l2(dst, filtV, filtHV, stride, 8, 8, 8);
}
/**
 * mspel mc32: like mc12, except that the vertical-only lowpass is taken
 * one column to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11];   /* 11 horizontally filtered rows, starting one row above src */
    uint8_t filtV[8*8];
    uint8_t filtHV[8*8];

    wmv2_mspel8_v_lowpass(filtV, src+1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    /* skip the extra top row so the vertical filter can reach it as row -1 */
    wmv2_mspel8_v_lowpass(filtHV, rowsH+8, 8, 8, 8);
    put_pixels8_l2(dst, filtV, filtHV, stride, 8, 8, 8);
}
/** mspel mc22: horizontal lowpass into a scratch buffer, then vertical lowpass into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11];   /* 11 horizontally filtered rows, starting one row above src */

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rowsH+8, stride, 8, 8);
}
2641
2642 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2643     int x;
2644     const int strength= ff_h263_loop_filter_strength[qscale];
2645
2646     for(x=0; x<8; x++){
2647         int d1, d2, ad1;
2648         int p0= src[x-2*stride];
2649         int p1= src[x-1*stride];
2650         int p2= src[x+0*stride];
2651         int p3= src[x+1*stride];
2652         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2653
2654         if     (d<-2*strength) d1= 0;
2655         else if(d<-  strength) d1=-2*strength - d;
2656         else if(d<   strength) d1= d;
2657         else if(d< 2*strength) d1= 2*strength - d;
2658         else                   d1= 0;
2659
2660         p1 += d1;
2661         p2 -= d1;
2662         if(p1&256) p1= ~(p1>>31);
2663         if(p2&256) p2= ~(p2>>31);
2664
2665         src[x-1*stride] = p1;
2666         src[x+0*stride] = p2;
2667
2668         ad1= FFABS(d1)>>1;
2669
2670         d2= av_clip((p0-p3)/4, -ad1, ad1);
2671
2672         src[x-2*stride] = p0 - d2;
2673         src[x+  stride] = p3 + d2;
2674     }
2675 }
2676
2677 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2678     int y;
2679     const int strength= ff_h263_loop_filter_strength[qscale];
2680
2681     for(y=0; y<8; y++){
2682         int d1, d2, ad1;
2683         int p0= src[y*stride-2];
2684         int p1= src[y*stride-1];
2685         int p2= src[y*stride+0];
2686         int p3= src[y*stride+1];
2687         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2688
2689         if     (d<-2*strength) d1= 0;
2690         else if(d<-  strength) d1=-2*strength - d;
2691         else if(d<   strength) d1= d;
2692         else if(d< 2*strength) d1= 2*strength - d;
2693         else                   d1= 0;
2694
2695         p1 += d1;
2696         p2 -= d1;
2697         if(p1&256) p1= ~(p1>>31);
2698         if(p2&256) p2= ~(p2>>31);
2699
2700         src[y*stride-1] = p1;
2701         src[y*stride+0] = p2;
2702
2703         ad1= FFABS(d1)>>1;
2704
2705         d2= av_clip((p0-p3)/4, -ad1, ad1);
2706
2707         src[y*stride-2] = p0 - d2;
2708         src[y*stride+1] = p3 + d2;
2709     }
2710 }
2711
/**
 * H.261 loop filter on an 8x8 block, in place.
 * A separable (1,2,1) filter is applied vertically, then horizontally.
 * The first/last row skip the vertical filter and the first/last column
 * skip the horizontal one, so border samples are only filtered along the
 * border and the four corners are unchanged.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int acc[64];
    int row, col;

    /* vertical pass into acc, every entry carrying a gain of 4 */
    for(row=0; row<8; row++){
        const uint8_t *in= src + row*stride;
        int *out= acc + row*8;

        for(col=0; col<8; col++){
            if(row==0 || row==7)
                out[col]= 4*in[col];
            else
                out[col]= in[col - stride] + 2*in[col] + in[col + stride];
        }
    }

    /* horizontal pass back into src; the gain is removed with rounding */
    for(row=0; row<8; row++){
        const int *in= acc + row*8;
        uint8_t *out= src + row*stride;

        out[0]= (in[0] + 2)>>2;
        out[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2738
/**
 * H.264 luma edge filter driven by per-group clipping values.
 *
 * Processes 16 positions along the edge in 4 groups of 4. At each position
 * p0..p2 are the samples at -1..-3 * xstride and q0..q2 those at
 * 0..2 * xstride relative to pix.
 *
 * @param pix     sample on the q side of the edge at the first position
 * @param xstride step across the edge
 * @param ystride step along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     4 clipping values, one per group; a negative value skips the group
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* group disabled: step over its 4 positions */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* the clip range for p0/q0 grows by one for each side whose p1/q1 is also filtered */
                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* multiply instead of << 2: q0 - p0 may be negative and shifting a negative int left is undefined */
                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Luma filter with xstride = stride and ystride = 1: steps across rows, advances along the row. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
/** Luma filter with xstride = 1 and ystride = stride: steps across columns, advances down the rows. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
2787
/**
 * H.264 chroma edge filter driven by per-group clipping values.
 *
 * Processes 8 positions along the edge in 4 groups of 2; only p0 and q0
 * (the samples at -xstride and 0 relative to pix) are modified.
 *
 * @param pix     sample on the q side of the edge at the first position
 * @param xstride step across the edge
 * @param ystride step along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     4 clipping values, one per group; a value <= 0 skips the group
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            /* group disabled: step over its 2 positions */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* multiply instead of << 2: q0 - p0 may be negative and shifting a negative int left is undefined */
                int delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Chroma filter with xstride = stride and ystride = 1: steps across rows, advances along the row. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
/** Chroma filter with xstride = 1 and ystride = stride: steps across columns, advances down the rows. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
2824
/**
 * H.264 chroma edge filter without clipping values: at each of 8 positions
 * along the edge, p0 and q0 are replaced by fixed weighted means when the
 * alpha/beta thresholds are met.
 *
 * @param pix     sample on the q side of the edge at the first position
 * @param xstride step across the edge
 * @param ystride step along the edge
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int n;

    for( n = 0; n < 8; n++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* leave the position untouched unless all three differences are below their thresholds */
        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/** Intra chroma filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta){
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
/** Intra chroma filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta){
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
2852
/**
 * Sum of absolute differences between two blocks 16 samples wide and
 * h rows high. Both blocks use the same line_size; v is not used.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2880
/**
 * Sum of absolute differences, 16 wide and h high, between pix1 and the
 * avg2() of each pix2 sample with its right-hand neighbour.
 * Reads pix2[0..16] on every row; v is not used.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2908
/**
 * Sum of absolute differences, 16 wide and h high, between pix1 and the
 * avg2() of each pix2 sample with the sample one line below it.
 * Reads h+1 rows of pix2; v is not used.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2938
/**
 * Sum of absolute differences, 16 wide and h high, between pix1 and the
 * avg4() of each 2x2 neighbourhood of pix2 (sample, right, below,
 * below-right). Reads 17 columns and h+1 rows of pix2; v is not used.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2968
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *lhs = pix1;
    const uint8_t *rhs = pix2;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(lhs[col] - rhs[col]);
        lhs += line_size;
        rhs += line_size;
    }
    return total;
}
2988
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * in which every pixel is combined with its right neighbour via avg2().
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; columns 0..8 of each row are read
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur = pix1;
    const uint8_t *ref = pix2;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg2(ref[col], ref[col + 1]));
        cur += line_size;
        ref += line_size;
    }
    return total;
}
3008
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * in which every pixel is combined with the pixel one row below via avg2().
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; rows 0..h are read
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg2(upper[col], lower[col]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return total;
}
3030
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * in which every pixel is combined with its right, lower and lower-right
 * neighbours via avg4().
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block; rows 0..h and columns 0..8 are read
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows to accumulate
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(cur[col] - avg4(upper[col], upper[col + 1], lower[col], lower[col + 1]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return total;
}
3052
3053 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3054     MpegEncContext *c = v;
3055     int score1=0;
3056     int score2=0;
3057     int x,y;
3058
3059     for(y=0; y<h; y++){
3060         for(x=0; x<16; x++){
3061             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3062         }
3063         if(y+1<h){
3064             for(x=0; x<15; x++){
3065                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3066                              - s1[x+1] + s1[x+1+stride])
3067                         -FFABS(  s2[x  ] - s2[x  +stride]
3068                              - s2[x+1] + s2[x+1+stride]);
3069             }
3070         }
3071         s1+= stride;
3072         s2+= stride;
3073     }
3074
3075     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3076     else  return score1 + FFABS(score2)*8;
3077 }
3078
3079 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3080     MpegEncContext *c = v;
3081     int score1=0;
3082     int score2=0;
3083     int x,y;
3084
3085     for(y=0; y<h; y++){
3086         for(x=0; x<8; x++){
3087             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3088         }
3089         if(y+1<h){
3090             for(x=0; x<7; x++){
3091                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3092                              - s1[x+1] + s1[x+1+stride])
3093                         -FFABS(  s2[x  ] - s2[x  +stride]
3094                              - s2[x+1] + s2[x+1+stride]);
3095             }
3096         }
3097         s1+= stride;
3098         s2+= stride;
3099     }
3100
3101     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3102     else  return score1 + FFABS(score2)*8;
3103 }
3104
3105 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3106     int i;
3107     unsigned int sum=0;
3108
3109     for(i=0; i<8*8; i++){
3110         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3111         int w= weight[i];
3112         b>>= RECON_SHIFT;
3113         assert(-512<b && b<512);
3114
3115         sum += (w*b)*(w*b)>>4;
3116     }
3117     return sum>>2;
3118 }
3119
3120 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3121     int i;
3122
3123     for(i=0; i<8*8; i++){
3124         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3125     }
3126 }
3127
3128 /**
3129  * permutes an 8x8 block.
3130  * @param block the block which will be permuted according to the given permutation vector
3131  * @param permutation the permutation vector
3132  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3133  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3134  *                  (inverse) permutated to scantable order!
3135  */
3136 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3137 {
3138     int i;
3139     DCTELEM temp[64];
3140
3141     if(last<=0) return;
3142     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3143
3144     for(i=0; i<=last; i++){
3145         const int j= scantable[i];
3146         temp[j]= block[j];
3147         block[j]=0;
3148     }
3149
3150     for(i=0; i<=last; i++){
3151         const int j= scantable[i];
3152         const int perm_j= permutation[j];
3153         block[perm_j]= temp[j];
3154     }
3155 }
3156
/**
 * Comparison function that ignores all of its arguments and always reports
 * a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3160
3161 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3162     int i;
3163
3164     memset(cmp, 0, sizeof(void*)*5);
3165
3166     for(i=0; i<5; i++){
3167         switch(type&0xFF){
3168         case FF_CMP_SAD:
3169             cmp[i]= c->sad[i];
3170             break;
3171         case FF_CMP_SATD:
3172             cmp[i]= c->hadamard8_diff[i];
3173             break;
3174         case FF_CMP_SSE:
3175             cmp[i]= c->sse[i];
3176             break;
3177         case FF_CMP_DCT:
3178             cmp[i]= c->dct_sad[i];
3179             break;
3180         case FF_CMP_DCT264:
3181             cmp[i]= c->dct264_sad[i];
3182             break;
3183         case FF_CMP_DCTMAX:
3184             cmp[i]= c->dct_max[i];
3185             break;
3186         case FF_CMP_PSNR:
3187             cmp[i]= c->quant_psnr[i];
3188             break;
3189         case FF_CMP_BIT:
3190             cmp[i]= c->bit[i];
3191             break;
3192         case FF_CMP_RD:
3193             cmp[i]= c->rd[i];
3194             break;
3195         case FF_CMP_VSAD:
3196             cmp[i]= c->vsad[i];
3197             break;
3198         case FF_CMP_VSSE:
3199             cmp[i]= c->vsse[i];
3200             break;
3201         case FF_CMP_ZERO:
3202             cmp[i]= zero_cmp;
3203             break;
3204         case FF_CMP_NSSE:
3205             cmp[i]= c->nsse[i];
3206             break;
3207 #ifdef CONFIG_SNOW_ENCODER
3208         case FF_CMP_W53:
3209             cmp[i]= c->w53[i];
3210             break;
3211         case FF_CMP_W97:
3212             cmp[i]= c->w97[i];
3213             break;
3214 #endif
3215         default:
3216             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3217         }
3218     }
3219 }
3220
3221 /**
3222  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3223  */
3224 static void clear_blocks_c(DCTELEM *blocks)
3225 {
3226     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3227 }
3228
/**
 * Adds src to dst byte by byte; each sum wraps modulo 256.
 * @param dst bytes updated in place
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] += src[pos];
        pos++;
    }
}
3244
/**
 * Stores src1 - src2 byte by byte into dst; each difference wraps modulo 256.
 * @param dst  destination bytes
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] = src1[pos] - src2[pos];
        pos++;
    }
}
3260
/**
 * Writes to dst the difference between src2 and a prediction obtained with
 * mid_pred() from the left value, the value in src1 at the same position and
 * (left + src1 - left_top) & 0xFF.
 * @param dst      output residuals, w bytes
 * @param src1     row supplying the second predictor and the next left_top
 * @param src2     row being predicted; also supplies the next left value
 * @param w        number of bytes to process
 * @param left     in: initial left value; out: last value read from src2
 * @param left_top in: initial left_top value; out: last value read from src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int gradient = (cur_left + src1[pos] - cur_left_top) & 0xFF;
        const int pred     = mid_pred(cur_left, src1[pos], gradient);

        cur_left_top = src1[pos];
        cur_left     = src2[pos];
        dst[pos]     = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_left_top;
}
3278
/**
 * Two-output butterfly: o1 = i1 + i2, o2 = i1 - i2.
 * i1 and i2 are each evaluated twice. The expansion is two bare statements,
 * so it must not be used as the unbraced body of an if/for/while.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/**
 * In-place butterfly: (x, y) becomes (x + y, x - y).
 * Both inputs are copied into block-local ints named a and b first, so the
 * arguments must be assignable lvalues that do not themselves refer to
 * variables called a or b.
 */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/**
 * Expression form of the last butterfly stage: |x + y| + |x - y|.
 * x and y are evaluated more than once.
 */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3293
/**
 * Applies three butterfly stages along the rows and then along the columns
 * of the 8x8 difference src - dst, and returns the sum of the absolute
 * values of the resulting 64 coefficients.
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    source block
 * @param stride offset between consecutive rows of both blocks
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total = 0;
    int n;

    assert(h==8);

    /* row pass: difference + three butterfly stages on each line */
    for (n = 0; n < 8; n++) {
        int *row = coef + 8*n;
        const uint8_t *sp = src + stride*n;
        const uint8_t *dp = dst + stride*n;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two in-place stages, the third is folded into the sum */
    for (n = 0; n < 8; n++) {
        int *col = coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return total;
}
3345
/**
 * Applies three butterfly stages along the rows and then along the columns
 * of the 8x8 block src, sums the absolute values of the resulting
 * coefficients and finally subtracts |coef[0] + coef[32]| (the "-mean" term).
 * @param s      unused
 * @param src    source block
 * @param dummy  unused
 * @param stride offset between consecutive rows of src
 * @param h      must be 8 (asserted)
 * @return sum of absolute transformed values minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total = 0;
    int n;

    assert(h==8);

    /* row pass: three butterfly stages on each line */
    for (n = 0; n < 8; n++) {
        int *row = coef + 8*n;
        const uint8_t *sp = src + stride*n;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two in-place stages, the third is folded into the sum */
    for (n = 0; n < 8; n++) {
        int *col = coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= FFABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
3393
3394 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3395     MpegEncContext * const s= (MpegEncContext *)c;
3396     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3397     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3398
3399     assert(h==8);
3400
3401     s->dsp.diff_pixels(temp, src1, src2, stride);
3402     s->dsp.fdct(temp);
3403     return s->dsp.sum_abs_dctelem(temp);
3404 }
3405
3406 #ifdef CONFIG_GPL
/**
 * One 8-point 1-D integer transform pass built from additions, subtractions
 * and right shifts only.
 * The expansion reads its eight inputs through SRC(0)..SRC(7) and emits its
 * eight outputs through DST(0, v)..DST(7, v); the user must #define SRC and
 * DST before expanding it. Each SRC(n) is evaluated twice (once for the sums,
 * once for the differences), and all inputs are read before the first DST,
 * so SRC and DST may refer to the same storage.
 * NOTE(review): judging by the name of its user dct264_sad8x8_c this is
 * presumably the H.264 8x8 transform -- confirm before relying on it.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3433
3434 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3435     MpegEncContext * const s= (MpegEncContext *)c;
3436     int16_t dct[8][8];
3437     int i;
3438     int sum=0;
3439
3440     s->dsp.diff_pixels(dct, src1, src2, stride);
3441
3442 #define SRC(x) dct[i][x]
3443 #define DST(x,v) dct[i][x]= v
3444     for( i = 0; i < 8; i++ )
3445         DCT8_1D
3446 #undef SRC
3447 #undef DST
3448
3449 #define SRC(x) dct[x][i]
3450 #define DST(x,v) sum += FFABS(v)
3451     for( i = 0; i < 8; i++ )
3452         DCT8_1D
3453 #undef SRC
3454 #undef DST
3455     return sum;
3456 }
3457 #endif
3458
3459 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3460     MpegEncContext * const s= (MpegEncContext *)c;
3461     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3462     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3463     int sum=0, i;
3464
3465     assert(h==8);
3466
3467     s->dsp.diff_pixels(temp, src1, src2, stride);
3468     s->dsp.fdct(temp);
3469
3470     for(i=0; i<64; i++)
3471         sum= FFMAX(sum, FFABS(temp[i]));
3472
3473     return sum;
3474 }
3475
3476 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3477     MpegEncContext * const s= (MpegEncContext *)c;
3478     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3479     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3480     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3481     int sum=0, i;
3482
3483     assert(h==8);
3484     s->mb_intra=0;
3485
3486     s->dsp.diff_pixels(temp, src1, src2, stride);
3487
3488     memcpy(bak, temp, 64*sizeof(DCTELEM));
3489
3490     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3491     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3492     simple_idct(temp); //FIXME
3493
3494     for(i=0; i<64; i++)
3495         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3496
3497     return sum;
3498 }
3499
/**
 * Rate-distortion score for coding the 8x8 difference of src1 and src2.
 * The difference is quantized, the VLC bit cost of the resulting coefficients
 * is summed, the block is dequantized and added (via idct_add) onto a copy of
 * src2, and the squared error of that reconstruction against src1 is
 * combined with the bit cost scaled by qscale^2.
 * Side effect: overwrites s->block_last_index[0].
 * @param c      MpegEncContext providing quantizer, VLC length tables, qscale
 *               and mb_intra
 * @param src1   block being coded
 * @param src2   prediction block
 * @param stride offset between consecutive rows of both blocks
 * @param h      must be 8 (asserted)
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    /* variable-length array of stride*8 bytes: room for 8 rows laid out with
       the same stride as the inputs */
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* copy the 8x8 prediction (8 bytes per row, as two 32-bit words) so the
       reconstruction below does not modify src2 */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* i only receives the quantizer's last output argument; it is reused as a
       loop counter below */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks: coefficient 0 is costed through the luma DC table and the
       AC scan starts at 1; inter blocks scan from 0 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every coefficient before the last one: table lookup when the level
           biased by 64 fits in 0..127, escape length otherwise */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        /* the coefficient at position last is expected to be non-zero */
        assert(level - 64);

        /* the final coefficient uses the separate "last" length table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantize only when the quantizer reported at least one coefficient */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct onto the saved prediction */
    s->dsp.idct_add(bak, stride, temp);

    /* squared error of the reconstruction against src1 over 8 rows */
    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3578
3579 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3580     MpegEncContext * const s= (MpegEncContext *)c;
3581     const uint8_t *scantable= s->intra_scantable.permutated;
3582     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3583     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3584     int i, last, run, bits, level, start_i;
3585     const int esc_length= s->ac_esc_length;
3586     uint8_t * length;
3587     uint8_t * last_length;
3588
3589     assert(h==8);
3590
3591     s->dsp.diff_pixels(temp, src1, src2, stride);
3592
3593     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3594
3595     bits=0;
3596
3597     if (s->mb_intra) {
3598         start_i = 1;
3599         length     = s->intra_ac_vlc_length;
3600         last_length= s->intra_ac_vlc_last_length;
3601         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3602     } else {
3603         start_i = 0;
3604         length     = s->inter_ac_vlc_length;
3605         last_length= s->inter_ac_vlc_last_length;
3606     }
3607
3608     if(last>=start_i){
3609         run=0;
3610         for(i=start_i; i<last; i++){
3611             int j= scantable[i];
3612             level= temp[j];
3613
3614             if(level){
3615                 level+=64;
3616                 if((level&(~127)) == 0){
3617                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3618                 }else
3619                     bits+= esc_length;
3620                 run=0;
3621             }else
3622                 run++;
3623         }
3624         i= scantable[last];
3625
3626         level= temp[i] + 64;
3627
3628         assert(level - 64);
3629
3630         if((level&(~127)) == 0){
3631             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3632         }else
3633             bits+= esc_length;
3634     }
3635
3636     return bits;
3637 }
3638
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block to analyse
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated absolute difference
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *below = s + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(s[col] - below[col]);
        s = below;
    }

    return total;
}
3653
/**
 * Sum of absolute differences between vertically adjacent rows of the
 * difference s1 - s2, for 16-pixel-wide blocks.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated absolute difference
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *below1 = s1 + stride;
        uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - below1[col] + below2[col]);
        s1 = below1;
        s2 = below2;
    }

    return total;
}
3668
/** Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block to analyse
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated squared difference
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *below = s + stride;

        for (col = 0; col < 16; col++)
            total += SQ(s[col] - below[col]);
        s = below;
    }

    return total;
}
3684
/**
 * Sum of squared differences between vertically adjacent rows of the
 * difference s1 - s2, for 16-pixel-wide blocks.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows; h-1 row pairs are compared
 * @return the accumulated squared difference
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *below1 = s1 + stride;
        uint8_t *below2 = s2 + stride;

        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - below1[col] + below2[col];
            total += d * d;
        }
        s1 = below1;
        s2 = below2;
    }

    return total;
}
3699
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 * @return the accumulated squared difference
 */
static int ssd_int8_vs_int16_c(int8_t *pix1, int16_t *pix2, int size){
    int total = 0;
    int k = 0;

    while (k < size) {
        const int d = pix1[k] - pix2[k];
        total += d * d;
        k++;
    }
    return total;
}
3707
/* Instantiate the *16_c counterparts of the 8x8 comparison functions above.
 * WARPER8_16_SQ is defined outside this part of the file; presumably each
 * invocation emits a function with the second name that applies the first
 * one to the 8x8 sub-blocks of a larger block -- confirm against the macro
 * definition. The dct264 variant only exists when CONFIG_GPL is defined,
 * matching the guard around dct264_sad8x8_c. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3718
/**
 * Multiplies dst element-wise by src, in place.
 * @param dst vector updated in place
 * @param src multiplier vector
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k = 0;

    while (k < len) {
        dst[k] *= src[k];
        k++;
    }
}
3724
/**
 * Stores in dst the element-wise product of src0 with src1 read backwards:
 * dst[i] = src0[i] * src1[len-1-i].
 * @param dst  output vector
 * @param src0 first factor, read front to back
 * @param src1 second factor, read back to front
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        dst[k] = src0[k] * src1[len - 1 - k];
    }
}
3731
/**
 * Computes src0[i]*src1[i] + src2[i] + src3 for every i and stores the
 * result at dst[i*step].
 * @param dst  output, written with a spacing of step elements
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 per-element addend
 * @param src3 constant integer addend
 * @param len  number of elements
 * @param step distance between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k = 0;

    while (k < len) {
        dst[k*step] = src0[k] * src1[k] + src2[k] + src3;
        k++;
    }
}
3737
/**
 * Converts floats to int16_t by inspecting their 32-bit representation.
 * When bits 16..19 of the representation are clear, the low 16 bits minus
 * 0x8000 are the output. Otherwise the output saturates: 32767 when the
 * representation, read as a signed 32-bit integer, is above 0x43c0ffff and
 * -32768 when it is not (this includes every negative input).
 * NOTE(review): 0x43c00000 is the pattern of 384.0f, so callers presumably
 * bias their samples into [384, 386) beforehand -- confirm at the call sites.
 * Assumes float is a 32-bit type.
 * @param dst output samples
 * @param src input floats
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int32_t bits;

        /* memcpy instead of a pointer cast: reading a float through an
           int32_t lvalue violates the aliasing rules, and the cast also
           dropped the const qualifier */
        memcpy(&bits, &src[i], sizeof(bits));

        if(bits & 0xf0000){
            /* explicit comparison instead of (0x43c0ffff - bits)>>31, whose
               result depended on the width of int_fast32_t and which
               overflowed signed arithmetic for negative inputs */
            bits = bits > 0x43c0ffff ? 0xFFFF : 0;
        }
        /* keep only the 16 payload bits so the value fits int16_t exactly */
        dst[i] = (int16_t)((bits & 0xFFFF) - 0x8000);
    }
}
3751
/* XXX: these wrapper functions should be removed as soon as all IDCTs are
 converted */

/**
 * Runs j_rev_dct() on block in place, then hands the result to
 * put_pixels_clamped_c() to be written to dest.
 * @param dest      destination pixels
 * @param line_size offset between consecutive destination rows
 * @param block     coefficients; overwritten by the inverse transform
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Runs j_rev_dct() on block in place, then hands the result to
 * add_pixels_clamped_c() to be added to dest.
 * @param dest      destination pixels
 * @param line_size offset between consecutive destination rows
 * @param block     coefficients; overwritten by the inverse transform
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3764
/**
 * Runs j_rev_dct4() on block in place, then hands the result to
 * put_pixels_clamped4_c() to be written to dest.
 * @param dest      destination pixels
 * @param line_size offset between consecutive destination rows
 * @param block     coefficients; overwritten by the inverse transform
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Runs j_rev_dct4() on block in place, then hands the result to
 * add_pixels_clamped4_c() to be added to dest.
 * @param dest      destination pixels
 * @param line_size offset between consecutive destination rows
 * @param block     coefficients; overwritten by the inverse transform
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3775
/**
 * Runs j_rev_dct2() on block in place, then hands the result to
 * put_pixels_clamped2_c() to be written to dest.
 * @param dest      destination pixels
 * @param line_size offset between consecutive destination rows
 * @param block     coefficients; overwritten by the inverse transform
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Runs j_rev_dct2() on block in place, then hands the result to
 * add_pixels_clamped2_c() to be added to dest.
 * @param dest      destination pixels
 * @param line_size offset between consecutive destination rows
 * @param block     coefficients; overwritten by the inverse transform
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3786
3787 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3788 {
3789     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3790
3791     dest[0] = cm[(block[0] + 4)>>3];
3792 }
3793 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3794 {
3795     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3796
3797     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3798 }
3799
/**
 * Does nothing.
 * NOTE(review): the empty, unprototyped parameter list presumably lets this
 * stand in for callbacks of differing signatures -- confirm at the sites
 * where it is assigned before changing it to (void).
 */
static void just_return() { return; }
3801
3802 /* init static data */
3803 void dsputil_static_init(void)
3804 {
3805     int i;
3806
3807     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3808     for(i=0;i<MAX_NEG_CROP;i++) {
3809         ff_cropTbl[i] = 0;
3810         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3811     }
3812
3813     for(i=0;i<512;i++) {
3814         ff_squareTbl[i] = (i - 256) * (i - 256);
3815     }
3816
3817     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3818 }
3819
3820 int ff_check_alignment(void){
3821     static int did_fail=0;
3822     DECLARE_ALIGNED_16(int, aligned);
3823
3824     if((int)&aligned & 15){
3825         if(!did_fail){
3826 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3827             av_log(NULL, AV_LOG_ERROR,
3828                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3829                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3830                 "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3831 #endif
3832             did_fail=1;
3833         }
3834         return -1;
3835     }
3836     return 0;
3837 }
3838
3839 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3840 {
3841     int i;
3842
3843     ff_check_alignment();
3844
3845 #ifdef CONFIG_ENCODERS
3846     if(avctx->dct_algo==FF_DCT_FASTINT) {
3847         c->fdct = fdct_ifast;
3848         c->fdct248 = fdct_ifast248;
3849     }
3850     else if(avctx->dct_algo==FF_DCT_FAAN) {
3851         c->fdct = ff_faandct;
3852         c->fdct248 = ff_faandct248;
3853     }
3854     else {
3855         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3856         c->fdct248 = ff_fdct248_islow;
3857     }
3858 #endif //CONFIG_ENCODERS
3859
3860     if(avctx->lowres==1){
3861         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3862             c->idct_put= ff_jref_idct4_put;
3863             c->idct_add= ff_jref_idct4_add;
3864         }else{
3865             c->idct_put= ff_h264_lowres_idct_put_c;
3866             c->idct_add= ff_h264_lowres_idct_add_c;
3867         }
3868         c->idct    = j_rev_dct4;
3869         c->idct_permutation_type= FF_NO_IDCT_PERM;
3870     }else if(avctx->lowres==2){
3871         c->idct_put= ff_jref_idct2_put;
3872         c->idct_add= ff_jref_idct2_add;
3873         c->idct    = j_rev_dct2;
3874         c->idct_permutation_type= FF_NO_IDCT_PERM;
3875     }else if(avctx->lowres==3){
3876         c->idct_put= ff_jref_idct1_put;
3877         c->idct_add= ff_jref_idct1_add;
3878         c->idct    = j_rev_dct1;
3879         c->idct_permutation_type= FF_NO_IDCT_PERM;
3880     }else{
3881         if(avctx->idct_algo==FF_IDCT_INT){
3882             c->idct_put= ff_jref_idct_put;
3883             c->idct_add= ff_jref_idct_add;
3884             c->idct    = j_rev_dct;
3885             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3886         }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
3887                 avctx->idct_algo==FF_IDCT_VP3){
3888             c->idct_put= ff_vp3_idct_put_c;
3889             c->idct_add= ff_vp3_idct_add_c;
3890             c->idct    = ff_vp3_idct_c;
3891             c->idct_permutation_type= FF_NO_IDCT_PERM;
3892         }else{ //accurate/default
3893             c->idct_put= simple_idct_put;
3894             c->idct_add= simple_idct_add;
3895             c->idct    = simple_idct;
3896             c->idct_permutation_type= FF_NO_IDCT_PERM;
3897         }
3898     }
3899
3900     if (ENABLE_H264_DECODER) {
3901         c->h264_idct_add= ff_h264_idct_add_c;
3902         c->h264_idct8_add= ff_h264_idct8_add_c;
3903         c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3904         c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3905     }
3906
3907     c->get_pixels = get_pixels_c;
3908     c->diff_pixels = diff_pixels_c;
3909     c->put_pixels_clamped = put_pixels_clamped_c;
3910     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3911     c->add_pixels_clamped = add_pixels_clamped_c;
3912     c->add_pixels8 = add_pixels8_c;
3913     c->add_pixels4 = add_pixels4_c;
3914     c->sum_abs_dctelem = sum_abs_dctelem_c;
3915     c->gmc1 = gmc1_c;
3916     c->gmc = ff_gmc_c;
3917     c->clear_blocks = clear_blocks_c;
3918     c->pix_sum = pix_sum_c;
3919     c->pix_norm1 = pix_norm1_c;
3920
3921     /* TODO [0] 16  [1] 8 */
3922     c->pix_abs[0][0] = pix_abs16_c;
3923     c->pix_abs[0][1] = pix_abs16_x2_c;
3924     c->pix_abs[0][2] = pix_abs16_y2_c;
3925     c->pix_abs[0][3] = pix_abs16_xy2_c;
3926     c->pix_abs[1][0] = pix_abs8_c;
3927     c->pix_abs[1][1] = pix_abs8_x2_c;
3928     c->pix_abs[1][2] = pix_abs8_y2_c;
3929     c->pix_abs[1][3] = pix_abs8_xy2_c;
3930
3931 #define dspfunc(PFX, IDX, NUM) \
3932     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3933     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3934     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3935     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3936
3937     dspfunc(put, 0, 16);
3938     dspfunc(put_no_rnd, 0, 16);
3939     dspfunc(put, 1, 8);
3940     dspfunc(put_no_rnd, 1, 8);
3941     dspfunc(put, 2, 4);
3942     dspfunc(put, 3, 2);
3943
3944     dspfunc(avg, 0, 16);
3945     dspfunc(avg_no_rnd, 0, 16);
3946     dspfunc(avg, 1, 8);
3947     dspfunc(avg_no_rnd, 1, 8);
3948     dspfunc(avg, 2, 4);
3949     dspfunc(avg, 3, 2);
3950 #undef dspfunc
3951
3952     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3953     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3954
3955     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3956     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3957     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3958     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3959     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3960     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3961     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3962     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3963     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3964
3965     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3966     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3967     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3968     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3969     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3970     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3971     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3972     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3973     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3974
3975 #define dspfunc(PFX, IDX, NUM) \
3976     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3977     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3978     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3979     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3980     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3981     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3982     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3983     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3984     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3985     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3986     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3987     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3988     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3989     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3990     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3991     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3992
3993     dspfunc(put_qpel, 0, 16);
3994     dspfunc(put_no_rnd_qpel, 0, 16);
3995
3996     dspfunc(avg_qpel, 0, 16);
3997     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3998
3999     dspfunc(put_qpel, 1, 8);
4000     dspfunc(put_no_rnd_qpel, 1, 8);
4001
4002     dspfunc(avg_qpel, 1, 8);
4003     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4004
4005     dspfunc(put_h264_qpel, 0, 16);
4006     dspfunc(put_h264_qpel, 1, 8);
4007     dspfunc(put_h264_qpel, 2, 4);
4008     dspfunc(put_h264_qpel, 3, 2);
4009     dspfunc(avg_h264_qpel, 0, 16);
4010     dspfunc(avg_h264_qpel, 1, 8);
4011     dspfunc(avg_h264_qpel, 2, 4);
4012
4013 #undef dspfunc
4014     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4015     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4016     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4017     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4018     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4019     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4020     c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4021
4022     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4023     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4024     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4025     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4026     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4027     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4028     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4029     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4030     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4031     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4032     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4033     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4034     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4035     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4036     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4037     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4038     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4039     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4040     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4041     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4042
4043 #ifdef CONFIG_CAVS_DECODER
4044     ff_cavsdsp_init(c,avctx);
4045 #endif
4046 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4047     ff_vc1dsp_init(c,avctx);
4048 #endif
4049 #if defined(CONFIG_H264_ENCODER)
4050     ff_h264dsp_init(c,avctx);
4051 #endif
4052
4053     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4054     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4055     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4056     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4057     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4058     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4059     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4060     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4061
4062 #define SET_CMP_FUNC(name) \
4063     c->name[0]= name ## 16_c;\
4064     c->name[1]= name ## 8x8_c;
4065
4066     SET_CMP_FUNC(hadamard8_diff)
4067     c->hadamard8_diff[4]= hadamard8_intra16_c;
4068     SET_CMP_FUNC(dct_sad)
4069     SET_CMP_FUNC(dct_max)
4070 #ifdef CONFIG_GPL
4071     SET_CMP_FUNC(dct264_sad)
4072 #endif
4073     c->sad[0]= pix_abs16_c;
4074     c->sad[1]= pix_abs8_c;
4075     c->sse[0]= sse16_c;
4076     c->sse[1]= sse8_c;
4077     c->sse[2]= sse4_c;
4078     SET_CMP_FUNC(quant_psnr)
4079     SET_CMP_FUNC(rd)
4080     SET_CMP_FUNC(bit)
4081     c->vsad[0]= vsad16_c;
4082     c->vsad[4]= vsad_intra16_c;
4083     c->vsse[0]= vsse16_c;
4084     c->vsse[4]= vsse_intra16_c;
4085     c->nsse[0]= nsse16_c;
4086     c->nsse[1]= nsse8_c;
4087 #ifdef CONFIG_SNOW_ENCODER
4088     c->w53[0]= w53_16_c;
4089     c->w53[1]= w53_8_c;
4090     c->w97[0]= w97_16_c;
4091     c->w97[1]= w97_8_c;
4092 #endif
4093
4094     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4095
4096     c->add_bytes= add_bytes_c;
4097     c->diff_bytes= diff_bytes_c;
4098     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4099     c->bswap_buf= bswap_buf;
4100
4101     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4102     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4103     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4104     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4105     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4106     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4107     c->h264_loop_filter_strength= NULL;
4108
4109     c->h263_h_loop_filter= h263_h_loop_filter_c;
4110     c->h263_v_loop_filter= h263_v_loop_filter_c;
4111
4112     c->h261_loop_filter= h261_loop_filter_c;
4113
4114     c->try_8x8basis= try_8x8basis_c;
4115     c->add_8x8basis= add_8x8basis_c;
4116
4117 #ifdef CONFIG_SNOW_DECODER
4118     c->vertical_compose97i = ff_snow_vertical_compose97i;
4119     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4120     c->inner_add_yblock = ff_snow_inner_add_yblock;
4121 #endif
4122
4123 #ifdef CONFIG_VORBIS_DECODER
4124     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4125 #endif
4126     c->vector_fmul = vector_fmul_c;
4127     c->vector_fmul_reverse = vector_fmul_reverse_c;
4128     c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4129     c->float_to_int16 = ff_float_to_int16_c;
4130
4131     c->shrink[0]= ff_img_copy_plane;
4132     c->shrink[1]= ff_shrink22;
4133     c->shrink[2]= ff_shrink44;
4134     c->shrink[3]= ff_shrink88;
4135
4136     c->prefetch= just_return;
4137
4138     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4139     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4140
4141 #ifdef HAVE_MMX
4142     dsputil_init_mmx(c, avctx);
4143 #endif
4144 #ifdef ARCH_ARMV4L
4145     dsputil_init_armv4l(c, avctx);
4146 #endif
4147 #ifdef HAVE_MLIB
4148     dsputil_init_mlib(c, avctx);
4149 #endif
4150 #ifdef ARCH_SPARC
4151    dsputil_init_vis(c,avctx);
4152 #endif
4153 #ifdef ARCH_ALPHA
4154     dsputil_init_alpha(c, avctx);
4155 #endif
4156 #ifdef ARCH_POWERPC
4157     dsputil_init_ppc(c, avctx);
4158 #endif
4159 #ifdef HAVE_MMI
4160     dsputil_init_mmi(c, avctx);
4161 #endif
4162 #ifdef ARCH_SH4
4163     dsputil_init_sh4(c,avctx);
4164 #endif
4165 #ifdef ARCH_BFIN
4166     dsputil_init_bfin(c,avctx);
4167 #endif
4168
4169     for(i=0; i<64; i++){
4170         if(!c->put_2tap_qpel_pixels_tab[0][i])
4171             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4172         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4173             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4174     }
4175
4176     switch(c->idct_permutation_type){
4177     case FF_NO_IDCT_PERM:
4178         for(i=0; i<64; i++)
4179             c->idct_permutation[i]= i;
4180         break;
4181     case FF_LIBMPEG2_IDCT_PERM:
4182         for(i=0; i<64; i++)
4183             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4184         break;
4185     case FF_SIMPLE_IDCT_PERM:
4186         for(i=0; i<64; i++)
4187             c->idct_permutation[i]= simple_mmx_permutation[i];
4188         break;
4189     case FF_TRANSPOSE_IDCT_PERM:
4190         for(i=0; i<64; i++)
4191             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4192         break;
4193     case FF_PARTTRANS_IDCT_PERM:
4194         for(i=0; i<64; i++)
4195             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4196         break;
4197     default:
4198         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4199     }
4200 }
4201