libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  *
  22  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  23  */
  24
  25 /**
  26  * @file dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "mpegvideo.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "snow.h"
  36
  37 /* snow.c */
  38 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  39
  40 /* vorbis.c */
  41 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  42
/* Lookup table used to map coefficient values to 8-bit pixels. The users in
 * this file read it through (ff_cropTbl + MAX_NEG_CROP) so that negative
 * values are valid indices. It is only zero-initialized here; presumably it is
 * filled by an init routine elsewhere -- TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table read through (ff_squareTbl + 256) by the norm/SSE helpers in
 * this file, indexed with pixel values or pixel differences (-255..255).
 * It is only zero-initialized here; presumably it is filled with squares by
 * an init routine elsewhere -- TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
  45
  46 const uint8_t ff_zigzag_direct[64] = {
  47     0,   1,  8, 16,  9,  2,  3, 10,
  48     17, 24, 32, 25, 18, 11,  4,  5,
  49     12, 19, 26, 33, 40, 48, 41, 34,
  50     27, 20, 13,  6,  7, 14, 21, 28,
  51     35, 42, 49, 56, 57, 50, 43, 36,
  52     29, 22, 15, 23, 30, 37, 44, 51,
  53     58, 59, 52, 45, 38, 31, 39, 46,
  54     53, 60, 61, 54, 47, 55, 62, 63
  55 };
  56
  57 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  58    specification, we interleave the fields */
  59 const uint8_t ff_zigzag248_direct[64] = {
  60      0,  8,  1,  9, 16, 24,  2, 10,
  61     17, 25, 32, 40, 48, 56, 33, 41,
  62     18, 26,  3, 11,  4, 12, 19, 27,
  63     34, 42, 49, 57, 50, 58, 35, 43,
  64     20, 28,  5, 13,  6, 14, 21, 29,
  65     36, 44, 51, 59, 52, 60, 37, 45,
  66     22, 30,  7, 15, 23, 31, 38, 46,
  67     53, 61, 54, 62, 39, 47, 55, 63,
  68 };
  69
/* Inverse of ff_zigzag_direct (not permuted) with 1 added to each entry, for
 * the MMX quantizer. Only zero-initialized here; presumably the real values
 * are written by an init routine elsewhere -- TODO confirm. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  72
  73 const uint8_t ff_alternate_horizontal_scan[64] = {
  74     0,  1,   2,  3,  8,  9, 16, 17,
  75     10, 11,  4,  5,  6,  7, 15, 14,
  76     13, 12, 19, 18, 24, 25, 32, 33,
  77     26, 27, 20, 21, 22, 23, 28, 29,
  78     30, 31, 34, 35, 40, 41, 48, 49,
  79     42, 43, 36, 37, 38, 39, 44, 45,
  80     46, 47, 50, 51, 56, 57, 58, 59,
  81     52, 53, 54, 55, 60, 61, 62, 63,
  82 };
  83
  84 const uint8_t ff_alternate_vertical_scan[64] = {
  85     0,  8,  16, 24,  1,  9,  2, 10,
  86     17, 25, 32, 40, 48, 56, 57, 49,
  87     41, 33, 26, 18,  3, 11,  4, 12,
  88     19, 27, 34, 42, 50, 58, 35, 43,
  89     51, 59, 20, 28,  5, 13,  6, 14,
  90     21, 29, 36, 44, 52, 60, 37, 45,
  91     53, 61, 22, 30,  7, 15, 23, 31,
  92     38, 46, 54, 62, 39, 47, 55, 63,
  93 };
  94
/**
 * Fixed-point reciprocal table that replaces an integer division by a
 * multiplication and a shift:
 *     a*ff_inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * The product has to be formed in a 64-bit type for the >>32 to be meaningful.
 * Entries 0 and 1 (0 and 4294967295U) lie outside the guaranteed range.
 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
 130
 131 /* Input permutation for the simple_idct_mmx */
 132 static const uint8_t simple_mmx_permutation[64]={
 133         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 134         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 135         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 136         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 137         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 138         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 139         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 140         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 141 };
 142
 143 static int pix_sum_c(uint8_t * pix, int line_size)
 144 {
 145     int s, i, j;
 146
 147     s = 0;
 148     for (i = 0; i < 16; i++) {
 149         for (j = 0; j < 16; j += 8) {
 150             s += pix[0];
 151             s += pix[1];
 152             s += pix[2];
 153             s += pix[3];
 154             s += pix[4];
 155             s += pix[5];
 156             s += pix[6];
 157             s += pix[7];
 158             pix += 8;
 159         }
 160         pix += line_size - 16;
 161     }
 162     return s;
 163 }
 164
 165 static int pix_norm1_c(uint8_t * pix, int line_size)
 166 {
 167     int s, i, j;
 168     uint32_t *sq = ff_squareTbl + 256;
 169
 170     s = 0;
 171     for (i = 0; i < 16; i++) {
 172         for (j = 0; j < 16; j += 8) {
 173 #if 0
 174             s += sq[pix[0]];
 175             s += sq[pix[1]];
 176             s += sq[pix[2]];
 177             s += sq[pix[3]];
 178             s += sq[pix[4]];
 179             s += sq[pix[5]];
 180             s += sq[pix[6]];
 181             s += sq[pix[7]];
 182 #else
 183 #if LONG_MAX > 2147483647
 184             register uint64_t x=*(uint64_t*)pix;
 185             s += sq[x&0xff];
 186             s += sq[(x>>8)&0xff];
 187             s += sq[(x>>16)&0xff];
 188             s += sq[(x>>24)&0xff];
 189             s += sq[(x>>32)&0xff];
 190             s += sq[(x>>40)&0xff];
 191             s += sq[(x>>48)&0xff];
 192             s += sq[(x>>56)&0xff];
 193 #else
 194             register uint32_t x=*(uint32_t*)pix;
 195             s += sq[x&0xff];
 196             s += sq[(x>>8)&0xff];
 197             s += sq[(x>>16)&0xff];
 198             s += sq[(x>>24)&0xff];
 199             x=*(uint32_t*)(pix+4);
 200             s += sq[x&0xff];
 201             s += sq[(x>>8)&0xff];
 202             s += sq[(x>>16)&0xff];
 203             s += sq[(x>>24)&0xff];
 204 #endif
 205 #endif
 206             pix += 8;
 207         }
 208         pix += line_size - 16;
 209     }
 210     return s;
 211 }
 212
 213 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 214     int i;
 215
 216     for(i=0; i+8<=w; i+=8){
 217         dst[i+0]= bswap_32(src[i+0]);
 218         dst[i+1]= bswap_32(src[i+1]);
 219         dst[i+2]= bswap_32(src[i+2]);
 220         dst[i+3]= bswap_32(src[i+3]);
 221         dst[i+4]= bswap_32(src[i+4]);
 222         dst[i+5]= bswap_32(src[i+5]);
 223         dst[i+6]= bswap_32(src[i+6]);
 224         dst[i+7]= bswap_32(src[i+7]);
 225     }
 226     for(;i<w; i++){
 227         dst[i+0]= bswap_32(src[i+0]);
 228     }
 229 }
 230
 231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 232 {
 233     int s, i;
 234     uint32_t *sq = ff_squareTbl + 256;
 235
 236     s = 0;
 237     for (i = 0; i < h; i++) {
 238         s += sq[pix1[0] - pix2[0]];
 239         s += sq[pix1[1] - pix2[1]];
 240         s += sq[pix1[2] - pix2[2]];
 241         s += sq[pix1[3] - pix2[3]];
 242         pix1 += line_size;
 243         pix2 += line_size;
 244     }
 245     return s;
 246 }
 247
 248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 249 {
 250     int s, i;
 251     uint32_t *sq = ff_squareTbl + 256;
 252
 253     s = 0;
 254     for (i = 0; i < h; i++) {
 255         s += sq[pix1[0] - pix2[0]];
 256         s += sq[pix1[1] - pix2[1]];
 257         s += sq[pix1[2] - pix2[2]];
 258         s += sq[pix1[3] - pix2[3]];
 259         s += sq[pix1[4] - pix2[4]];
 260         s += sq[pix1[5] - pix2[5]];
 261         s += sq[pix1[6] - pix2[6]];
 262         s += sq[pix1[7] - pix2[7]];
 263         pix1 += line_size;
 264         pix2 += line_size;
 265     }
 266     return s;
 267 }
 268
 269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 270 {
 271     int s, i;
 272     uint32_t *sq = ff_squareTbl + 256;
 273
 274     s = 0;
 275     for (i = 0; i < h; i++) {
 276         s += sq[pix1[ 0] - pix2[ 0]];
 277         s += sq[pix1[ 1] - pix2[ 1]];
 278         s += sq[pix1[ 2] - pix2[ 2]];
 279         s += sq[pix1[ 3] - pix2[ 3]];
 280         s += sq[pix1[ 4] - pix2[ 4]];
 281         s += sq[pix1[ 5] - pix2[ 5]];
 282         s += sq[pix1[ 6] - pix2[ 6]];
 283         s += sq[pix1[ 7] - pix2[ 7]];
 284         s += sq[pix1[ 8] - pix2[ 8]];
 285         s += sq[pix1[ 9] - pix2[ 9]];
 286         s += sq[pix1[10] - pix2[10]];
 287         s += sq[pix1[11] - pix2[11]];
 288         s += sq[pix1[12] - pix2[12]];
 289         s += sq[pix1[13] - pix2[13]];
 290         s += sq[pix1[14] - pix2[14]];
 291         s += sq[pix1[15] - pix2[15]];
 292
 293         pix1 += line_size;
 294         pix2 += line_size;
 295     }
 296     return s;
 297 }
 298
 299
 300 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 301 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 302     int s, i, j;
 303     const int dec_count= w==8 ? 3 : 4;
 304     int tmp[32*32];
 305     int level, ori;
 306     static const int scale[2][2][4][4]={
 307       {
 308         {
 309             // 9/7 8x8 dec=3
 310             {268, 239, 239, 213},
 311             {  0, 224, 224, 152},
 312             {  0, 135, 135, 110},
 313         },{
 314             // 9/7 16x16 or 32x32 dec=4
 315             {344, 310, 310, 280},
 316             {  0, 320, 320, 228},
 317             {  0, 175, 175, 136},
 318             {  0, 129, 129, 102},
 319         }
 320       },{
 321         {
 322             // 5/3 8x8 dec=3
 323             {275, 245, 245, 218},
 324             {  0, 230, 230, 156},
 325             {  0, 138, 138, 113},
 326         },{
 327             // 5/3 16x16 or 32x32 dec=4
 328             {352, 317, 317, 286},
 329             {  0, 328, 328, 233},
 330             {  0, 180, 180, 140},
 331             {  0, 132, 132, 105},
 332         }
 333       }
 334     };
 335
 336     for (i = 0; i < h; i++) {
 337         for (j = 0; j < w; j+=4) {
 338             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 339             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 340             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 341             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 342         }
 343         pix1 += line_size;
 344         pix2 += line_size;
 345     }
 346
 347     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 348
 349     s=0;
 350     assert(w==h);
 351     for(level=0; level<dec_count; level++){
 352         for(ori= level ? 1 : 0; ori<4; ori++){
 353             int size= w>>(dec_count-level);
 354             int sx= (ori&1) ? size : 0;
 355             int stride= 32<<(dec_count-level);
 356             int sy= (ori&2) ? stride>>1 : 0;
 357
 358             for(i=0; i<size; i++){
 359                 for(j=0; j<size; j++){
 360                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 361                     s += FFABS(v);
 362                 }
 363             }
 364         }
 365     }
 366     assert(s>=0);
 367     return s>>9;
 368 }
 369
 370 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 371     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 372 }
 373
 374 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 375     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 376 }
 377
 378 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 379     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 380 }
 381
 382 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 383     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 384 }
 385
 386 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 387     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 388 }
 389
 390 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 391     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 392 }
 393 #endif
 394
 395 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 396 {
 397     int i;
 398
 399     /* read the pixels */
 400     for(i=0;i<8;i++) {
 401         block[0] = pixels[0];
 402         block[1] = pixels[1];
 403         block[2] = pixels[2];
 404         block[3] = pixels[3];
 405         block[4] = pixels[4];
 406         block[5] = pixels[5];
 407         block[6] = pixels[6];
 408         block[7] = pixels[7];
 409         pixels += line_size;
 410         block += 8;
 411     }
 412 }
 413
 414 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 415                           const uint8_t *s2, int stride){
 416     int i;
 417
 418     /* read the pixels */
 419     for(i=0;i<8;i++) {
 420         block[0] = s1[0] - s2[0];
 421         block[1] = s1[1] - s2[1];
 422         block[2] = s1[2] - s2[2];
 423         block[3] = s1[3] - s2[3];
 424         block[4] = s1[4] - s2[4];
 425         block[5] = s1[5] - s2[5];
 426         block[6] = s1[6] - s2[6];
 427         block[7] = s1[7] - s2[7];
 428         s1 += stride;
 429         s2 += stride;
 430         block += 8;
 431     }
 432 }
 433
 434
 435 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 436                                  int line_size)
 437 {
 438     int i;
 439     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 440
 441     /* read the pixels */
 442     for(i=0;i<8;i++) {
 443         pixels[0] = cm[block[0]];
 444         pixels[1] = cm[block[1]];
 445         pixels[2] = cm[block[2]];
 446         pixels[3] = cm[block[3]];
 447         pixels[4] = cm[block[4]];
 448         pixels[5] = cm[block[5]];
 449         pixels[6] = cm[block[6]];
 450         pixels[7] = cm[block[7]];
 451
 452         pixels += line_size;
 453         block += 8;
 454     }
 455 }
 456
 457 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 458                                  int line_size)
 459 {
 460     int i;
 461     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 462
 463     /* read the pixels */
 464     for(i=0;i<4;i++) {
 465         pixels[0] = cm[block[0]];
 466         pixels[1] = cm[block[1]];
 467         pixels[2] = cm[block[2]];
 468         pixels[3] = cm[block[3]];
 469
 470         pixels += line_size;
 471         block += 8;
 472     }
 473 }
 474
 475 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 476                                  int line_size)
 477 {
 478     int i;
 479     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 480
 481     /* read the pixels */
 482     for(i=0;i<2;i++) {
 483         pixels[0] = cm[block[0]];
 484         pixels[1] = cm[block[1]];
 485
 486         pixels += line_size;
 487         block += 8;
 488     }
 489 }
 490
 491 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 492                                         uint8_t *restrict pixels,
 493                                         int line_size)
 494 {
 495     int i, j;
 496
 497     for (i = 0; i < 8; i++) {
 498         for (j = 0; j < 8; j++) {
 499             if (*block < -128)
 500                 *pixels = 0;
 501             else if (*block > 127)
 502                 *pixels = 255;
 503             else
 504                 *pixels = (uint8_t)(*block + 128);
 505             block++;
 506             pixels++;
 507         }
 508         pixels += (line_size - 8);
 509     }
 510 }
 511
 512 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 513                           int line_size)
 514 {
 515     int i;
 516     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 517
 518     /* read the pixels */
 519     for(i=0;i<8;i++) {
 520         pixels[0] = cm[pixels[0] + block[0]];
 521         pixels[1] = cm[pixels[1] + block[1]];
 522         pixels[2] = cm[pixels[2] + block[2]];
 523         pixels[3] = cm[pixels[3] + block[3]];
 524         pixels[4] = cm[pixels[4] + block[4]];
 525         pixels[5] = cm[pixels[5] + block[5]];
 526         pixels[6] = cm[pixels[6] + block[6]];
 527         pixels[7] = cm[pixels[7] + block[7]];
 528         pixels += line_size;
 529         block += 8;
 530     }
 531 }
 532
 533 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 534                           int line_size)
 535 {
 536     int i;
 537     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 538
 539     /* read the pixels */
 540     for(i=0;i<4;i++) {
 541         pixels[0] = cm[pixels[0] + block[0]];
 542         pixels[1] = cm[pixels[1] + block[1]];
 543         pixels[2] = cm[pixels[2] + block[2]];
 544         pixels[3] = cm[pixels[3] + block[3]];
 545         pixels += line_size;
 546         block += 8;
 547     }
 548 }
 549
 550 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 551                           int line_size)
 552 {
 553     int i;
 554     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 555
 556     /* read the pixels */
 557     for(i=0;i<2;i++) {
 558         pixels[0] = cm[pixels[0] + block[0]];
 559         pixels[1] = cm[pixels[1] + block[1]];
 560         pixels += line_size;
 561         block += 8;
 562     }
 563 }
 564
 565 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 566 {
 567     int i;
 568     for(i=0;i<8;i++) {
 569         pixels[0] += block[0];
 570         pixels[1] += block[1];
 571         pixels[2] += block[2];
 572         pixels[3] += block[3];
 573         pixels[4] += block[4];
 574         pixels[5] += block[5];
 575         pixels[6] += block[6];
 576         pixels[7] += block[7];
 577         pixels += line_size;
 578         block += 8;
 579     }
 580 }
 581
 582 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 583 {
 584     int i;
 585     for(i=0;i<4;i++) {
 586         pixels[0] += block[0];
 587         pixels[1] += block[1];
 588         pixels[2] += block[2];
 589         pixels[3] += block[3];
 590         pixels += line_size;
 591         block += 4;
 592     }
 593 }
 594
 595 #if 0
 596
 597 #define PIXOP2(OPNAME, OP) \
 598 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 599 {\
 600     int i;\
 601     for(i=0; i<h; i++){\
 602         OP(*((uint64_t*)block), LD64(pixels));\
 603         pixels+=line_size;\
 604         block +=line_size;\
 605     }\
 606 }\
 607 \
 608 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 609 {\
 610     int i;\
 611     for(i=0; i<h; i++){\
 612         const uint64_t a= LD64(pixels  );\
 613         const uint64_t b= LD64(pixels+1);\
 614         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 615         pixels+=line_size;\
 616         block +=line_size;\
 617     }\
 618 }\
 619 \
 620 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 621 {\
 622     int i;\
 623     for(i=0; i<h; i++){\
 624         const uint64_t a= LD64(pixels  );\
 625         const uint64_t b= LD64(pixels+1);\
 626         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 627         pixels+=line_size;\
 628         block +=line_size;\
 629     }\
 630 }\
 631 \
 632 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 633 {\
 634     int i;\
 635     for(i=0; i<h; i++){\
 636         const uint64_t a= LD64(pixels          );\
 637         const uint64_t b= LD64(pixels+line_size);\
 638         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 639         pixels+=line_size;\
 640         block +=line_size;\
 641     }\
 642 }\
 643 \
 644 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 645 {\
 646     int i;\
 647     for(i=0; i<h; i++){\
 648         const uint64_t a= LD64(pixels          );\
 649         const uint64_t b= LD64(pixels+line_size);\
 650         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 651         pixels+=line_size;\
 652         block +=line_size;\
 653     }\
 654 }\
 655 \
 656 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 657 {\
 658         int i;\
 659         const uint64_t a= LD64(pixels  );\
 660         const uint64_t b= LD64(pixels+1);\
 661         uint64_t l0=  (a&0x0303030303030303ULL)\
 662                     + (b&0x0303030303030303ULL)\
 663                     + 0x0202020202020202ULL;\
 664         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 665                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 666         uint64_t l1,h1;\
 667 \
 668         pixels+=line_size;\
 669         for(i=0; i<h; i+=2){\
 670             uint64_t a= LD64(pixels  );\
 671             uint64_t b= LD64(pixels+1);\
 672             l1=  (a&0x0303030303030303ULL)\
 673                + (b&0x0303030303030303ULL);\
 674             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 675               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 676             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 677             pixels+=line_size;\
 678             block +=line_size;\
 679             a= LD64(pixels  );\
 680             b= LD64(pixels+1);\
 681             l0=  (a&0x0303030303030303ULL)\
 682                + (b&0x0303030303030303ULL)\
 683                + 0x0202020202020202ULL;\
 684             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 685               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 686             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 687             pixels+=line_size;\
 688             block +=line_size;\
 689         }\
 690 }\
 691 \
 692 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 693 {\
 694         int i;\
 695         const uint64_t a= LD64(pixels  );\
 696         const uint64_t b= LD64(pixels+1);\
 697         uint64_t l0=  (a&0x0303030303030303ULL)\
 698                     + (b&0x0303030303030303ULL)\
 699                     + 0x0101010101010101ULL;\
 700         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 701                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 702         uint64_t l1,h1;\
 703 \
 704         pixels+=line_size;\
 705         for(i=0; i<h; i+=2){\
 706             uint64_t a= LD64(pixels  );\
 707             uint64_t b= LD64(pixels+1);\
 708             l1=  (a&0x0303030303030303ULL)\
 709                + (b&0x0303030303030303ULL);\
 710             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 711               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 712             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 713             pixels+=line_size;\
 714             block +=line_size;\
 715             a= LD64(pixels  );\
 716             b= LD64(pixels+1);\
 717             l0=  (a&0x0303030303030303ULL)\
 718                + (b&0x0303030303030303ULL)\
 719                + 0x0101010101010101ULL;\
 720             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 721               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 722             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 723             pixels+=line_size;\
 724             block +=line_size;\
 725         }\
 726 }\
 727 \
 728 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 729 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 730 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 731 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 733 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 734 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 735
 736 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 737 #else // 64 bit variant
 738
 739 #define PIXOP2(OPNAME, OP) \
 740 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 741     int i;\
 742     for(i=0; i<h; i++){\
 743         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 744         pixels+=line_size;\
 745         block +=line_size;\
 746     }\
 747 }\
 748 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 749     int i;\
 750     for(i=0; i<h; i++){\
 751         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 752         pixels+=line_size;\
 753         block +=line_size;\
 754     }\
 755 }\
 756 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 757     int i;\
 758     for(i=0; i<h; i++){\
 759         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 760         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 761         pixels+=line_size;\
 762         block +=line_size;\
 763     }\
 764 }\
 765 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 766     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 767 }\
 768 \
 769 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 770                                                 int src_stride1, int src_stride2, int h){\
 771     int i;\
 772     for(i=0; i<h; i++){\
 773         uint32_t a,b;\
 774         a= LD32(&src1[i*src_stride1  ]);\
 775         b= LD32(&src2[i*src_stride2  ]);\
 776         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 777         a= LD32(&src1[i*src_stride1+4]);\
 778         b= LD32(&src2[i*src_stride2+4]);\
 779         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 780     }\
 781 }\
 782 \
 783 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 784                                                 int src_stride1, int src_stride2, int h){\
 785     int i;\
 786     for(i=0; i<h; i++){\
 787         uint32_t a,b;\
 788         a= LD32(&src1[i*src_stride1  ]);\
 789         b= LD32(&src2[i*src_stride2  ]);\
 790         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 791         a= LD32(&src1[i*src_stride1+4]);\
 792         b= LD32(&src2[i*src_stride2+4]);\
 793         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 794     }\
 795 }\
 796 \
 797 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 798                                                 int src_stride1, int src_stride2, int h){\
 799     int i;\
 800     for(i=0; i<h; i++){\
 801         uint32_t a,b;\
 802         a= LD32(&src1[i*src_stride1  ]);\
 803         b= LD32(&src2[i*src_stride2  ]);\
 804         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 805     }\
 806 }\
 807 \
 808 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 809                                                 int src_stride1, int src_stride2, int h){\
 810     int i;\
 811     for(i=0; i<h; i++){\
 812         uint32_t a,b;\
 813         a= LD16(&src1[i*src_stride1  ]);\
 814         b= LD16(&src2[i*src_stride2  ]);\
 815         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 816     }\
 817 }\
 818 \
 819 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 820                                                 int src_stride1, int src_stride2, int h){\
 821     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 822     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 823 }\
 824 \
 825 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 826                                                 int src_stride1, int src_stride2, int h){\
 827     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 828     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 829 }\
 830 \
 831 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 832     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 833 }\
 834 \
 835 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 836     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 837 }\
 838 \
 839 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 840     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 841 }\
 842 \
 843 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 844     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 845 }\
 846 \
 847 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 848                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 849     int i;\
 850     for(i=0; i<h; i++){\
 851         uint32_t a, b, c, d, l0, l1, h0, h1;\
 852         a= LD32(&src1[i*src_stride1]);\
 853         b= LD32(&src2[i*src_stride2]);\
 854         c= LD32(&src3[i*src_stride3]);\
 855         d= LD32(&src4[i*src_stride4]);\
 856         l0=  (a&0x03030303UL)\
 857            + (b&0x03030303UL)\
 858            + 0x02020202UL;\
 859         h0= ((a&0xFCFCFCFCUL)>>2)\
 860           + ((b&0xFCFCFCFCUL)>>2);\
 861         l1=  (c&0x03030303UL)\
 862            + (d&0x03030303UL);\
 863         h1= ((c&0xFCFCFCFCUL)>>2)\
 864           + ((d&0xFCFCFCFCUL)>>2);\
 865         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 866         a= LD32(&src1[i*src_stride1+4]);\
 867         b= LD32(&src2[i*src_stride2+4]);\
 868         c= LD32(&src3[i*src_stride3+4]);\
 869         d= LD32(&src4[i*src_stride4+4]);\
 870         l0=  (a&0x03030303UL)\
 871            + (b&0x03030303UL)\
 872            + 0x02020202UL;\
 873         h0= ((a&0xFCFCFCFCUL)>>2)\
 874           + ((b&0xFCFCFCFCUL)>>2);\
 875         l1=  (c&0x03030303UL)\
 876            + (d&0x03030303UL);\
 877         h1= ((c&0xFCFCFCFCUL)>>2)\
 878           + ((d&0xFCFCFCFCUL)>>2);\
 879         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 880     }\
 881 }\
 882 \
 883 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 884     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 885 }\
 886 \
 887 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 888     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 889 }\
 890 \
 891 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 892     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 893 }\
 894 \
 895 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 896     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 897 }\
 898 \
 899 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 900                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 901     int i;\
 902     for(i=0; i<h; i++){\
 903         uint32_t a, b, c, d, l0, l1, h0, h1;\
 904         a= LD32(&src1[i*src_stride1]);\
 905         b= LD32(&src2[i*src_stride2]);\
 906         c= LD32(&src3[i*src_stride3]);\
 907         d= LD32(&src4[i*src_stride4]);\
 908         l0=  (a&0x03030303UL)\
 909            + (b&0x03030303UL)\
 910            + 0x01010101UL;\
 911         h0= ((a&0xFCFCFCFCUL)>>2)\
 912           + ((b&0xFCFCFCFCUL)>>2);\
 913         l1=  (c&0x03030303UL)\
 914            + (d&0x03030303UL);\
 915         h1= ((c&0xFCFCFCFCUL)>>2)\
 916           + ((d&0xFCFCFCFCUL)>>2);\
 917         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 918         a= LD32(&src1[i*src_stride1+4]);\
 919         b= LD32(&src2[i*src_stride2+4]);\
 920         c= LD32(&src3[i*src_stride3+4]);\
 921         d= LD32(&src4[i*src_stride4+4]);\
 922         l0=  (a&0x03030303UL)\
 923            + (b&0x03030303UL)\
 924            + 0x01010101UL;\
 925         h0= ((a&0xFCFCFCFCUL)>>2)\
 926           + ((b&0xFCFCFCFCUL)>>2);\
 927         l1=  (c&0x03030303UL)\
 928            + (d&0x03030303UL);\
 929         h1= ((c&0xFCFCFCFCUL)>>2)\
 930           + ((d&0xFCFCFCFCUL)>>2);\
 931         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 932     }\
 933 }\
 934 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 935                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 936     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 937     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 938 }\
 939 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 940                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 941     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 942     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 943 }\
 944 \
 945 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 946 {\
 947         int i, a0, b0, a1, b1;\
 948         a0= pixels[0];\
 949         b0= pixels[1] + 2;\
 950         a0 += b0;\
 951         b0 += pixels[2];\
 952 \
 953         pixels+=line_size;\
 954         for(i=0; i<h; i+=2){\
 955             a1= pixels[0];\
 956             b1= pixels[1];\
 957             a1 += b1;\
 958             b1 += pixels[2];\
 959 \
 960             block[0]= (a1+a0)>>2; /* FIXME non put */\
 961             block[1]= (b1+b0)>>2;\
 962 \
 963             pixels+=line_size;\
 964             block +=line_size;\
 965 \
 966             a0= pixels[0];\
 967             b0= pixels[1] + 2;\
 968             a0 += b0;\
 969             b0 += pixels[2];\
 970 \
 971             block[0]= (a1+a0)>>2;\
 972             block[1]= (b1+b0)>>2;\
 973             pixels+=line_size;\
 974             block +=line_size;\
 975         }\
 976 }\
 977 \
 978 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 979 {\
 980         int i;\
 981         const uint32_t a= LD32(pixels  );\
 982         const uint32_t b= LD32(pixels+1);\
 983         uint32_t l0=  (a&0x03030303UL)\
 984                     + (b&0x03030303UL)\
 985                     + 0x02020202UL;\
 986         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 987                    + ((b&0xFCFCFCFCUL)>>2);\
 988         uint32_t l1,h1;\
 989 \
 990         pixels+=line_size;\
 991         for(i=0; i<h; i+=2){\
 992             uint32_t a= LD32(pixels  );\
 993             uint32_t b= LD32(pixels+1);\
 994             l1=  (a&0x03030303UL)\
 995                + (b&0x03030303UL);\
 996             h1= ((a&0xFCFCFCFCUL)>>2)\
 997               + ((b&0xFCFCFCFCUL)>>2);\
 998             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 999             pixels+=line_size;\
1000             block +=line_size;\
1001             a= LD32(pixels  );\
1002             b= LD32(pixels+1);\
1003             l0=  (a&0x03030303UL)\
1004                + (b&0x03030303UL)\
1005                + 0x02020202UL;\
1006             h0= ((a&0xFCFCFCFCUL)>>2)\
1007               + ((b&0xFCFCFCFCUL)>>2);\
1008             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1009             pixels+=line_size;\
1010             block +=line_size;\
1011         }\
1012 }\
1013 \
1014 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1015 {\
1016     int j;\
1017     for(j=0; j<2; j++){\
1018         int i;\
1019         const uint32_t a= LD32(pixels  );\
1020         const uint32_t b= LD32(pixels+1);\
1021         uint32_t l0=  (a&0x03030303UL)\
1022                     + (b&0x03030303UL)\
1023                     + 0x02020202UL;\
1024         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1025                    + ((b&0xFCFCFCFCUL)>>2);\
1026         uint32_t l1,h1;\
1027 \
1028         pixels+=line_size;\
1029         for(i=0; i<h; i+=2){\
1030             uint32_t a= LD32(pixels  );\
1031             uint32_t b= LD32(pixels+1);\
1032             l1=  (a&0x03030303UL)\
1033                + (b&0x03030303UL);\
1034             h1= ((a&0xFCFCFCFCUL)>>2)\
1035               + ((b&0xFCFCFCFCUL)>>2);\
1036             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1037             pixels+=line_size;\
1038             block +=line_size;\
1039             a= LD32(pixels  );\
1040             b= LD32(pixels+1);\
1041             l0=  (a&0x03030303UL)\
1042                + (b&0x03030303UL)\
1043                + 0x02020202UL;\
1044             h0= ((a&0xFCFCFCFCUL)>>2)\
1045               + ((b&0xFCFCFCFCUL)>>2);\
1046             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1047             pixels+=line_size;\
1048             block +=line_size;\
1049         }\
1050         pixels+=4-line_size*(h+1);\
1051         block +=4-line_size*h;\
1052     }\
1053 }\
1054 \
1055 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1056 {\
1057     int j;\
1058     for(j=0; j<2; j++){\
1059         int i;\
1060         const uint32_t a= LD32(pixels  );\
1061         const uint32_t b= LD32(pixels+1);\
1062         uint32_t l0=  (a&0x03030303UL)\
1063                     + (b&0x03030303UL)\
1064                     + 0x01010101UL;\
1065         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1066                    + ((b&0xFCFCFCFCUL)>>2);\
1067         uint32_t l1,h1;\
1068 \
1069         pixels+=line_size;\
1070         for(i=0; i<h; i+=2){\
1071             uint32_t a= LD32(pixels  );\
1072             uint32_t b= LD32(pixels+1);\
1073             l1=  (a&0x03030303UL)\
1074                + (b&0x03030303UL);\
1075             h1= ((a&0xFCFCFCFCUL)>>2)\
1076               + ((b&0xFCFCFCFCUL)>>2);\
1077             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1078             pixels+=line_size;\
1079             block +=line_size;\
1080             a= LD32(pixels  );\
1081             b= LD32(pixels+1);\
1082             l0=  (a&0x03030303UL)\
1083                + (b&0x03030303UL)\
1084                + 0x01010101UL;\
1085             h0= ((a&0xFCFCFCFCUL)>>2)\
1086               + ((b&0xFCFCFCFCUL)>>2);\
1087             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1088             pixels+=line_size;\
1089             block +=line_size;\
1090         }\
1091         pixels+=4-line_size*(h+1);\
1092         block +=4-line_size*h;\
1093     }\
1094 }\
1095 \
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1098 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1102 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1103 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1104
/* Averaging store operator: a is replaced by rnd_avg32(a, b). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* Plain store operator: a is replaced by b. */
#define op_put(a, b) a = b

/* Expand PIXOP2 once per store operator. The first argument is pasted in
 * front of the generated names (presumably as OPNAME, giving avg_* and
 * put_* function families); the second is the operator applied on every
 * store. The operators are undefined again right after use. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
1113
/* Rounded-up integer mean of two / four values.
 * Every argument is parenthesised so that callers may pass expressions
 * containing operators of lower precedence than '+' (e.g. '|', '>>', '?:')
 * without the expansion silently regrouping them. Arguments are still
 * evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1116
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): destination and both
 * sources a and b are all walked with the same stride, for h rows.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_pitch = stride;
    const int a_pitch   = stride;
    const int b_pitch   = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_pitch, a_pitch, b_pitch, h);
}
1120
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): destination and both
 * sources a and b are all walked with the same stride, for h rows.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_pitch = stride;
    const int a_pitch   = stride;
    const int b_pitch   = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_pitch, a_pitch, b_pitch, h);
}
1124
/**
 * Bilinear interpolation of an 8-pixel-wide, h-row block.
 *
 * @param dst     output block, rows are stride bytes apart
 * @param src     input; columns 0..8 of rows 0..h are read
 * @param stride  row pitch shared by dst and src
 * @param h       number of output rows
 * @param x16     horizontal weight of the right-hand neighbour (out of 16)
 * @param y16     vertical weight of the row below (out of 16)
 * @param rounder value added before the final shift by 8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* Corner weights; algebraically they add up to 16*16 = 256. */
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = x16 * (16 - y16);
    const int w_bl = (16 - x16) * y16;
    const int w_br = x16 * y16;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1]
                      + w_bl * below[col] + w_br * below[col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1147
/**
 * Warped (affine) sampling of an 8-pixel-wide, h-row block with bilinear
 * filtering.
 *
 * The source position starts at (ox, oy) for each row, advances by
 * (dxx, dyx) per output column and by (dxy, dyy) per output row. After
 * dropping the low 16 bits, the next `shift` bits are the sub-pixel fraction
 * and the rest is the integer sample position. Positions outside
 * [0, width-1) x [0, height-1) are clamped with av_clip() and filtered only
 * along the axis that is still inside (or not at all if both are outside).
 * r is added before the final shift by 2*shift.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { /* XXX FIXME optimize */
            const int sx = vx >> 16;
            const int sy = vy >> 16;
            const int fx = sx & frac_mask;
            const int fy = sy & frac_mask;
            const int px = sx >> shift;
            const int py = sy >> shift;
            /* the unsigned compare rejects negative positions as well */
            const int x_inside = (unsigned)px < max_x;
            const int y_inside = (unsigned)py < max_y;
            int pos;

            if (x_inside && y_inside) {
                int top, bottom;

                pos    = px + py * stride;
                top    = src[pos         ] * (s - fx) + src[pos          + 1] * fx;
                bottom = src[pos + stride] * (s - fx) + src[pos + stride + 1] * fx;
                out[col] = (top * (s - fy) + bottom * fy + r) >> (shift * 2);
            } else if (x_inside) {
                /* row clamped: horizontal filtering only */
                pos = px + av_clip(py, 0, max_y) * stride;
                out[col] = ((src[pos] * (s - fx) + src[pos + 1] * fx) * s + r) >> (shift * 2);
            } else if (y_inside) {
                /* column clamped: vertical filtering only */
                pos = av_clip(px, 0, max_x) + py * stride;
                out[col] = ((src[pos] * (s - fy) + src[pos + stride] * fy) * s + r) >> (shift * 2);
            } else {
                /* both clamped: plain copy of the nearest sample */
                pos = av_clip(px, 0, max_x) + av_clip(py, 0, max_y) * stride;
                out[col] = src[pos];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1205
/**
 * Full-pel case: forward to the fixed-width put_pixelsN_c() routine matching
 * width. Widths other than 2, 4, 8 or 16 write nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1214
/**
 * Horizontal third-pel filter: 2:1 weighted mean of each pixel and its right
 * neighbour. Multiplying by 683 and shifting right by 11 approximates the
 * division by 3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*acc) >> 11;
        }
    }
}
1225
/**
 * Horizontal third-pel filter: 1:2 weighted mean of each pixel and its right
 * neighbour. Multiplying by 683 and shifting right by 11 approximates the
 * division by 3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int acc = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*acc) >> 11;
        }
    }
}
1236
/**
 * Vertical third-pel filter: 2:1 weighted mean of each pixel and the pixel
 * one stride below it. Multiplying by 683 and shifting right by 11
 * approximates the division by 3.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 2*src[col] + below[col] + 1;
            dst[col] = (683*acc) >> 11;
        }
    }
}
1247
/**
 * Diagonal third-pel filter over the 2x2 neighbourhood with weights
 * 4 (current), 3 (right), 3 (below), 2 (below-right); the weights sum to 12.
 * Multiplying by 2731 and shifting right by 15 approximates the division by 12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*acc) >> 15;
        }
    }
}
1258
/**
 * Diagonal third-pel filter over the 2x2 neighbourhood with weights
 * 3 (current), 2 (right), 4 (below), 3 (below-right); the weights sum to 12.
 * Multiplying by 2731 and shifting right by 15 approximates the division by 12.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*acc) >> 15;
        }
    }
}
1269
/**
 * Vertical third-pel filter: 1:2 weighted mean of each pixel and the pixel
 * one stride below it. Multiplying by 683 and shifting right by 11
 * approximates the division by 3.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = src[col] + 2*below[col] + 1;
            dst[col] = (683*acc) >> 11;
        }
    }
}
1280
/**
 * Diagonal third-pel filter over the 2x2 neighbourhood with weights
 * 3 (current), 4 (right), 2 (below), 3 (below-right); the weights sum to 12.
 * Multiplying by 2731 and shifting right by 15 approximates the division by 12.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*acc) >> 15;
        }
    }
}
1291
/**
 * Diagonal third-pel filter over the 2x2 neighbourhood with weights
 * 2 (current), 3 (right), 3 (below), 4 (below-right); the weights sum to 12.
 * Multiplying by 2731 and shifting right by 15 approximates the division by 12.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*acc) >> 15;
        }
    }
}
1302
/**
 * Full-pel case: forward to the fixed-width avg_pixelsN_c() routine matching
 * width. Widths other than 2, 4, 8 or 16 write nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1311
/**
 * Horizontal third-pel filter (2:1 mean of a pixel and its right neighbour,
 * with 683 >> 11 approximating the division by 3), then rounded averaging
 * of that result with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1322
/**
 * Horizontal third-pel filter (1:2 mean of a pixel and its right neighbour,
 * with 683 >> 11 approximating the division by 3), then rounded averaging
 * of that result with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1333
/**
 * Vertical third-pel filter (2:1 mean of a pixel and the one a stride below,
 * with 683 >> 11 approximating the division by 3), then rounded averaging
 * of that result with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1344
/**
 * Diagonal third-pel filter with 2x2 weights 4/3/3/2 (current, right, below,
 * below-right; 2731 >> 15 approximates the division by their sum, 12), then
 * rounded averaging of that result with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1355
/**
 * Diagonal third-pel filter with 2x2 weights 3/2/4/3 (current, right, below,
 * below-right; 2731 >> 15 approximates the division by their sum, 12), then
 * rounded averaging of that result with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1366
/**
 * Vertical third-pel filter (1:2 mean of a pixel and the one a stride below,
 * with 683 >> 11 approximating the division by 3), then rounded averaging
 * of that result with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1377
/**
 * Diagonal third-pel filter with 2x2 weights 3/4/2/3 (current, right, below,
 * below-right; 2731 >> 15 approximates the division by their sum, 12), then
 * rounded averaging of that result with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1388
/**
 * Diagonal third-pel filter with 2x2 weights 2/3/3/4 (current, right, below,
 * below-right; 2731 >> 15 approximates the division by their sum, 12), then
 * rounded averaging of that result with the value already stored in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
/*
 * Disabled code. TPEL_WIDTH(width) is meant to generate fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height) that forward to the
 * generic put_tpel_pixels_mcXY_c() helpers with the width filled in.
 * NOTE(review): every wrapper body is written as "void f(args);", which is a
 * declaration rather than a call; the leading "void" would have to be removed
 * before this block could be enabled.
 */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1420
/**
 * Template generating three bilinear filters,
 * OPNAME##h264_chroma_mc2_c, OPNAME##h264_chroma_mc4_c and
 * OPNAME##h264_chroma_mc8_c, for blocks 2, 4 and 8 pixels wide.
 *
 * x and y (asserted to lie in 0..7) select the corner weights
 * A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy, which add up to 64.
 * For each of the h rows, every output pixel is formed from the source
 * pixel, its right neighbour and the two pixels one stride below, weighted
 * by A, B, C and D. The sum is passed unnormalised to OP(dst_pixel, sum);
 * OP is responsible for scaling it back and storing it.
 * dst and src share the same stride and both advance one row per iteration.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1483
/* Store operators for H264_CHROMA_MC. b is the weighted sum produced by the
 * template (weights add up to 64), so ((b) + 32) >> 6 scales it back with
 * rounding. op_put stores that value; op_avg stores the rounded-up mean of
 * that value and the one already held in a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Generate the put_h264_chroma_mc{2,4,8}_c and avg_h264_chroma_mc{2,4,8}_c
 * functions, then drop the temporary operators. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1491
/**
 * 8-pixel-wide bilinear filter with a reduced rounding bias.
 *
 * x and y (asserted to lie in 0..7) give the weights of the right-hand and
 * lower neighbours out of 8; the four corner weights add up to 64. Each
 * output is the weighted 2x2 sum plus a bias of 32 - 4, shifted right by 6.
 * dst and src share the same stride; h rows are produced.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_tl = (8 - x) * (8 - y);
    const int w_tr = x * (8 - y);
    const int w_bl = (8 - x) * y;
    const int w_br = x * y;
    const int bias = 32 - 4;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1]
                      + w_bl * below[col] + w_br * below[col + 1]
                      + bias) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1515
1516 #define QPEL_MC(r, OPNAME, RND, OP) \
1517 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1518     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1519     int i;\
1520     for(i=0; i<h; i++)\
1521     {\
1522         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1523         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1524         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1525         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1526         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1527         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1528         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1529         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1530         dst+=dstStride;\
1531         src+=srcStride;\
1532     }\
1533 }\
1534 \
1535 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1536     const int w=8;\
1537     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1538     int i;\
1539     for(i=0; i<w; i++)\
1540     {\
1541         const int src0= src[0*srcStride];\
1542         const int src1= src[1*srcStride];\
1543         const int src2= src[2*srcStride];\
1544         const int src3= src[3*srcStride];\
1545         const int src4= src[4*srcStride];\
1546         const int src5= src[5*srcStride];\
1547         const int src6= src[6*srcStride];\
1548         const int src7= src[7*srcStride];\
1549         const int src8= src[8*srcStride];\
1550         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1551         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1552         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1553         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1554         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1555         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1556         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1557         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1558         dst++;\
1559         src++;\
1560     }\
1561 }\
1562 \
1563 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1564     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1565     int i;\
1566     \
1567     for(i=0; i<h; i++)\
1568     {\
1569         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1570         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1571         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1572         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1573         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1574         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1575         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1576         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1577         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1578         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1579         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1580         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1581         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1582         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1583         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1584         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1585         dst+=dstStride;\
1586         src+=srcStride;\
1587     }\
1588 }\
1589 \
1590 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1591     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1592     int i;\
1593     const int w=16;\
1594     for(i=0; i<w; i++)\
1595     {\
1596         const int src0= src[0*srcStride];\
1597         const int src1= src[1*srcStride];\
1598         const int src2= src[2*srcStride];\
1599         const int src3= src[3*srcStride];\
1600         const int src4= src[4*srcStride];\
1601         const int src5= src[5*srcStride];\
1602         const int src6= src[6*srcStride];\
1603         const int src7= src[7*srcStride];\
1604         const int src8= src[8*srcStride];\
1605         const int src9= src[9*srcStride];\
1606         const int src10= src[10*srcStride];\
1607         const int src11= src[11*srcStride];\
1608         const int src12= src[12*srcStride];\
1609         const int src13= src[13*srcStride];\
1610         const int src14= src[14*srcStride];\
1611         const int src15= src[15*srcStride];\
1612         const int src16= src[16*srcStride];\
1613         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1614         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1615         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1616         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1617         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1618         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1619         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1620         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1621         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1622         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1623         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1624         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1625         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1626         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1627         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1628         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1629         dst++;\
1630         src++;\
1631     }\
1632 }\
1633 \
1634 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1635     OPNAME ## pixels8_c(dst, src, stride, 8);\
1636 }\
1637 \
1638 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1639     uint8_t half[64];\
1640     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1641     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1642 }\
1643 \
1644 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1645     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1646 }\
1647 \
1648 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1649     uint8_t half[64];\
1650     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1651     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1652 }\
1653 \
1654 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1655     uint8_t full[16*9];\
1656     uint8_t half[64];\
1657     copy_block9(full, src, 16, stride, 9);\
1658     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1659     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1660 }\
1661 \
1662 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1663     uint8_t full[16*9];\
1664     copy_block9(full, src, 16, stride, 9);\
1665     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1666 }\
1667 \
1668 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1669     uint8_t full[16*9];\
1670     uint8_t half[64];\
1671     copy_block9(full, src, 16, stride, 9);\
1672     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1673     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1674 }\
1675 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1676     uint8_t full[16*9];\
1677     uint8_t halfH[72];\
1678     uint8_t halfV[64];\
1679     uint8_t halfHV[64];\
1680     copy_block9(full, src, 16, stride, 9);\
1681     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1682     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1683     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1684     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1685 }\
1686 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1687     uint8_t full[16*9];\
1688     uint8_t halfH[72];\
1689     uint8_t halfHV[64];\
1690     copy_block9(full, src, 16, stride, 9);\
1691     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1692     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1693     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1694     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1695 }\
1696 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1697     uint8_t full[16*9];\
1698     uint8_t halfH[72];\
1699     uint8_t halfV[64];\
1700     uint8_t halfHV[64];\
1701     copy_block9(full, src, 16, stride, 9);\
1702     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1703     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1704     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1705     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1706 }\
1707 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1708     uint8_t full[16*9];\
1709     uint8_t halfH[72];\
1710     uint8_t halfHV[64];\
1711     copy_block9(full, src, 16, stride, 9);\
1712     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1713     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1714     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1715     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1716 }\
1717 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1718     uint8_t full[16*9];\
1719     uint8_t halfH[72];\
1720     uint8_t halfV[64];\
1721     uint8_t halfHV[64];\
1722     copy_block9(full, src, 16, stride, 9);\
1723     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1724     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1725     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1726     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1727 }\
1728 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1729     uint8_t full[16*9];\
1730     uint8_t halfH[72];\
1731     uint8_t halfHV[64];\
1732     copy_block9(full, src, 16, stride, 9);\
1733     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1734     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1735     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1737 }\
1738 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1739     uint8_t full[16*9];\
1740     uint8_t halfH[72];\
1741     uint8_t halfV[64];\
1742     uint8_t halfHV[64];\
1743     copy_block9(full, src, 16, stride, 9);\
1744     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1745     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1746     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1747     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1748 }\
1749 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1750     uint8_t full[16*9];\
1751     uint8_t halfH[72];\
1752     uint8_t halfHV[64];\
1753     copy_block9(full, src, 16, stride, 9);\
1754     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1755     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1756     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1757     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1758 }\
1759 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1760     uint8_t halfH[72];\
1761     uint8_t halfHV[64];\
1762     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1763     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1764     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1765 }\
1766 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1767     uint8_t halfH[72];\
1768     uint8_t halfHV[64];\
1769     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1770     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1771     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1772 }\
1773 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1774     uint8_t full[16*9];\
1775     uint8_t halfH[72];\
1776     uint8_t halfV[64];\
1777     uint8_t halfHV[64];\
1778     copy_block9(full, src, 16, stride, 9);\
1779     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1780     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1781     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1782     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1783 }\
1784 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1785     uint8_t full[16*9];\
1786     uint8_t halfH[72];\
1787     copy_block9(full, src, 16, stride, 9);\
1788     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1790     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1791 }\
1792 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1793     uint8_t full[16*9];\
1794     uint8_t halfH[72];\
1795     uint8_t halfV[64];\
1796     uint8_t halfHV[64];\
1797     copy_block9(full, src, 16, stride, 9);\
1798     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1799     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1800     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1801     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1802 }\
1803 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1804     uint8_t full[16*9];\
1805     uint8_t halfH[72];\
1806     copy_block9(full, src, 16, stride, 9);\
1807     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1809     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1810 }\
1811 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1812     uint8_t halfH[72];\
1813     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1814     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1815 }\
1816 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1817     OPNAME ## pixels16_c(dst, src, stride, 16);\
1818 }\
1819 \
1820 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1821     uint8_t half[256];\
1822     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1823     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1824 }\
1825 \
1826 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1827     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1828 }\
1829 \
1830 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1831     uint8_t half[256];\
1832     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1833     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1834 }\
1835 \
1836 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1837     uint8_t full[24*17];\
1838     uint8_t half[256];\
1839     copy_block17(full, src, 24, stride, 17);\
1840     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1841     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1842 }\
1843 \
1844 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1845     uint8_t full[24*17];\
1846     copy_block17(full, src, 24, stride, 17);\
1847     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1848 }\
1849 \
1850 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1851     uint8_t full[24*17];\
1852     uint8_t half[256];\
1853     copy_block17(full, src, 24, stride, 17);\
1854     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1855     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1856 }\
1857 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1858     uint8_t full[24*17];\
1859     uint8_t halfH[272];\
1860     uint8_t halfV[256];\
1861     uint8_t halfHV[256];\
1862     copy_block17(full, src, 24, stride, 17);\
1863     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1864     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1865     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1866     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1867 }\
1868 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1869     uint8_t full[24*17];\
1870     uint8_t halfH[272];\
1871     uint8_t halfHV[256];\
1872     copy_block17(full, src, 24, stride, 17);\
1873     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1874     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1875     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1876     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1877 }\
1878 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1879     uint8_t full[24*17];\
1880     uint8_t halfH[272];\
1881     uint8_t halfV[256];\
1882     uint8_t halfHV[256];\
1883     copy_block17(full, src, 24, stride, 17);\
1884     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1885     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1886     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1887     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1888 }\
1889 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1890     uint8_t full[24*17];\
1891     uint8_t halfH[272];\
1892     uint8_t halfHV[256];\
1893     copy_block17(full, src, 24, stride, 17);\
1894     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1895     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1896     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1897     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1898 }\
1899 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1900     uint8_t full[24*17];\
1901     uint8_t halfH[272];\
1902     uint8_t halfV[256];\
1903     uint8_t halfHV[256];\
1904     copy_block17(full, src, 24, stride, 17);\
1905     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1906     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1907     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1908     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1909 }\
1910 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1911     uint8_t full[24*17];\
1912     uint8_t halfH[272];\
1913     uint8_t halfHV[256];\
1914     copy_block17(full, src, 24, stride, 17);\
1915     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1916     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1917     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1919 }\
1920 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1921     uint8_t full[24*17];\
1922     uint8_t halfH[272];\
1923     uint8_t halfV[256];\
1924     uint8_t halfHV[256];\
1925     copy_block17(full, src, 24, stride, 17);\
1926     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1927     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1928     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1929     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1930 }\
1931 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1932     uint8_t full[24*17];\
1933     uint8_t halfH[272];\
1934     uint8_t halfHV[256];\
1935     copy_block17(full, src, 24, stride, 17);\
1936     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1937     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1938     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1939     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1940 }\
1941 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1942     uint8_t halfH[272];\
1943     uint8_t halfHV[256];\
1944     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1945     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1946     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1947 }\
1948 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1949     uint8_t halfH[272];\
1950     uint8_t halfHV[256];\
1951     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1952     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1953     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1954 }\
1955 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956     uint8_t full[24*17];\
1957     uint8_t halfH[272];\
1958     uint8_t halfV[256];\
1959     uint8_t halfHV[256];\
1960     copy_block17(full, src, 24, stride, 17);\
1961     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1962     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1963     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1964     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1965 }\
1966 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1967     uint8_t full[24*17];\
1968     uint8_t halfH[272];\
1969     copy_block17(full, src, 24, stride, 17);\
1970     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1972     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1973 }\
1974 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1975     uint8_t full[24*17];\
1976     uint8_t halfH[272];\
1977     uint8_t halfV[256];\
1978     uint8_t halfHV[256];\
1979     copy_block17(full, src, 24, stride, 17);\
1980     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1981     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1982     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1983     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1984 }\
1985 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1986     uint8_t full[24*17];\
1987     uint8_t halfH[272];\
1988     copy_block17(full, src, 24, stride, 17);\
1989     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1991     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1992 }\
1993 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1994     uint8_t halfH[272];\
1995     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1996     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1997 }
1998
/*
 * Store operators handed to QPEL_MC as its OP argument.  Each one expands to
 * an assignment to `a`, so it is used in statement position as
 * OP(dst[i], sum); `b` is the un-normalised filter sum.
 *
 * `cm` is not a macro parameter: it resolves to the local `cm` pointer
 * (ff_cropTbl + MAX_NEG_CROP) that every lowpass function declares before
 * using OP.  NOTE(review): ff_cropTbl is presumably a clamp-to-0..255 lookup
 * table; it is filled in outside this part of the file, so confirm there.
 *
 * The MPEG-4 lowpass expressions weight pairs of samples by 20, -6, 3 and -1,
 * a total gain of 32, hence the >>5.  Adding 16 first rounds to nearest;
 * adding 15 is the "no_rnd" variant, which sends exact halves down instead
 * of up.  The avg forms additionally average the result with the value
 * already stored in `a`, with (+1) or without a rounding bias.
 */
1999 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2000 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2001 #define op_put(a, b) a = cm[((b) + 16)>>5]
2002 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2003
/*
 * Instantiate the quarter-pel function families.  The QPEL_MC header is
 * defined earlier in the file; judging from its body, the second argument is
 * pasted in as the prefix of every generated function name, the third is
 * pasted between `put` and the helper names used for intermediate buffers
 * (selecting the put_ or put_no_rnd_ helpers), and the fourth is the store
 * operator.  NOTE(review): the role of the first argument (0 or 1) is not
 * visible from the macro body; check the QPEL_MC definition.
 *
 * The put_ instantiation must stay ahead of the avg_ one: the avg_ functions
 * compute their intermediates with the static put_mpeg4_qpel8/16 lowpass
 * helpers that the put_ instantiation defines.
 */
2004 QPEL_MC(0, put_       , _       , op_put)
2005 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2006 QPEL_MC(0, avg_       , _       , op_avg)
/*
 * Disabled: no avg_no_rnd family is generated.  As written, the line below
 * passes op_avg and a name prefix without a trailing underscore, so
 * op_avg_no_rnd is not referenced by any instantiation in this group.
 */
2007 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* Release the operator names so they do not leak past these instantiations. */
2008 #undef op_avg
2009 #undef op_avg_no_rnd
2010 #undef op_put
2011 #undef op_put_no_rnd
2012
2013 #if 1
2014 #define H264_LOWPASS(OPNAME, OP, OP2) \
2015 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2016     const int h=2;\
2017     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2018     int i;\
2019     for(i=0; i<h; i++)\
2020     {\
2021         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2022         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2023         dst+=dstStride;\
2024         src+=srcStride;\
2025     }\
2026 }\
2027 \
2028 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2029     const int w=2;\
2030     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2031     int i;\
2032     for(i=0; i<w; i++)\
2033     {\
2034         const int srcB= src[-2*srcStride];\
2035         const int srcA= src[-1*srcStride];\
2036         const int src0= src[0 *srcStride];\
2037         const int src1= src[1 *srcStride];\
2038         const int src2= src[2 *srcStride];\
2039         const int src3= src[3 *srcStride];\
2040         const int src4= src[4 *srcStride];\
2041         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2042         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2043         dst++;\
2044         src++;\
2045     }\
2046 }\
2047 \
2048 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2049     const int h=2;\
2050     const int w=2;\
2051     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2052     int i;\
2053     src -= 2*srcStride;\
2054     for(i=0; i<h+5; i++)\
2055     {\
2056         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2057         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2058         tmp+=tmpStride;\
2059         src+=srcStride;\
2060     }\
2061     tmp -= tmpStride*(h+5-2);\
2062     for(i=0; i<w; i++)\
2063     {\
2064         const int tmpB= tmp[-2*tmpStride];\
2065         const int tmpA= tmp[-1*tmpStride];\
2066         const int tmp0= tmp[0 *tmpStride];\
2067         const int tmp1= tmp[1 *tmpStride];\
2068         const int tmp2= tmp[2 *tmpStride];\
2069         const int tmp3= tmp[3 *tmpStride];\
2070         const int tmp4= tmp[4 *tmpStride];\
2071         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2072         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2073         dst++;\
2074         tmp++;\
2075     }\
2076 }\
2077 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2078     const int h=4;\
2079     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2080     int i;\
2081     for(i=0; i<h; i++)\
2082     {\
2083         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2084         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2085         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2086         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2087         dst+=dstStride;\
2088         src+=srcStride;\
2089     }\
2090 }\
2091 \
2092 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2093     const int w=4;\
2094     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2095     int i;\
2096     for(i=0; i<w; i++)\
2097     {\
2098         const int srcB= src[-2*srcStride];\
2099         const int srcA= src[-1*srcStride];\
2100         const int src0= src[0 *srcStride];\
2101         const int src1= src[1 *srcStride];\
2102         const int src2= src[2 *srcStride];\
2103         const int src3= src[3 *srcStride];\
2104         const int src4= src[4 *srcStride];\
2105         const int src5= src[5 *srcStride];\
2106         const int src6= src[6 *srcStride];\
2107         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2108         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2109         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2110         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2111         dst++;\
2112         src++;\
2113     }\
2114 }\
2115 \
2116 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2117     const int h=4;\
2118     const int w=4;\
2119     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2120     int i;\
2121     src -= 2*srcStride;\
2122     for(i=0; i<h+5; i++)\
2123     {\
2124         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2125         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2126         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2127         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2128         tmp+=tmpStride;\
2129         src+=srcStride;\
2130     }\
2131     tmp -= tmpStride*(h+5-2);\
2132     for(i=0; i<w; i++)\
2133     {\
2134         const int tmpB= tmp[-2*tmpStride];\
2135         const int tmpA= tmp[-1*tmpStride];\
2136         const int tmp0= tmp[0 *tmpStride];\
2137         const int tmp1= tmp[1 *tmpStride];\
2138         const int tmp2= tmp[2 *tmpStride];\
2139         const int tmp3= tmp[3 *tmpStride];\
2140         const int tmp4= tmp[4 *tmpStride];\
2141         const int tmp5= tmp[5 *tmpStride];\
2142         const int tmp6= tmp[6 *tmpStride];\
2143         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2144         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2145         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2146         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2147         dst++;\
2148         tmp++;\
2149     }\
2150 }\
2151 \
2152 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2153     const int h=8;\
2154     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2155     int i;\
2156     for(i=0; i<h; i++)\
2157     {\
2158         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2159         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2160         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2161         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2162         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2163         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2164         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2165         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2166         dst+=dstStride;\
2167         src+=srcStride;\
2168     }\
2169 }\
2170 \
2171 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2172     const int w=8;\
2173     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2174     int i;\
2175     for(i=0; i<w; i++)\
2176     {\
2177         const int srcB= src[-2*srcStride];\
2178         const int srcA= src[-1*srcStride];\
2179         const int src0= src[0 *srcStride];\
2180         const int src1= src[1 *srcStride];\
2181         const int src2= src[2 *srcStride];\
2182         const int src3= src[3 *srcStride];\
2183         const int src4= src[4 *srcStride];\
2184         const int src5= src[5 *srcStride];\
2185         const int src6= src[6 *srcStride];\
2186         const int src7= src[7 *srcStride];\
2187         const int src8= src[8 *srcStride];\
2188         const int src9= src[9 *srcStride];\
2189         const int src10=src[10*srcStride];\
2190         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2191         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2192         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2193         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2194         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2195         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2196         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2197         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2198         dst++;\
2199         src++;\
2200     }\
2201 }\
2202 \
2203 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2204     const int h=8;\
2205     const int w=8;\
2206     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207     int i;\
2208     src -= 2*srcStride;\
2209     for(i=0; i<h+5; i++)\
2210     {\
2211         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2212         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2213         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2214         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2215         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2216         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2217         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2218         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2219         tmp+=tmpStride;\
2220         src+=srcStride;\
2221     }\
2222     tmp -= tmpStride*(h+5-2);\
2223     for(i=0; i<w; i++)\
2224     {\
2225         const int tmpB= tmp[-2*tmpStride];\
2226         const int tmpA= tmp[-1*tmpStride];\
2227         const int tmp0= tmp[0 *tmpStride];\
2228         const int tmp1= tmp[1 *tmpStride];\
2229         const int tmp2= tmp[2 *tmpStride];\
2230         const int tmp3= tmp[3 *tmpStride];\
2231         const int tmp4= tmp[4 *tmpStride];\
2232         const int tmp5= tmp[5 *tmpStride];\
2233         const int tmp6= tmp[6 *tmpStride];\
2234         const int tmp7= tmp[7 *tmpStride];\
2235         const int tmp8= tmp[8 *tmpStride];\
2236         const int tmp9= tmp[9 *tmpStride];\
2237         const int tmp10=tmp[10*tmpStride];\
2238         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2239         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2240         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2241         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2242         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2243         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2244         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2245         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2246         dst++;\
2247         tmp++;\
2248     }\
2249 }\
2250 \
2251 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2252     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2253     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2254     src += 8*srcStride;\
2255     dst += 8*dstStride;\
2256     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2257     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2258 }\
2259 \
2260 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2262     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2263     src += 8*srcStride;\
2264     dst += 8*dstStride;\
2265     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2266     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2267 }\
2268 \
2269 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2270     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2271     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2272     src += 8*srcStride;\
2273     dst += 8*dstStride;\
2274     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2275     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2276 }\
2277
/**
 * Generate the sixteen quarter-sample motion compensation entry points
 * OPNAME h264_qpel SIZE _mcXY_c for a SIZE x SIZE block.
 *
 * In the generated names X selects the horizontal and Y the vertical
 * fractional position (mcX0 variants only use the horizontal low-pass,
 * mc0Y variants only the vertical one, mc22 uses the combined hv filter).
 *
 * OPNAME is the store flavour prefix ("put_" / "avg_" in the instantiations
 * in this file); it selects the final pixels/_l2/_lowpass routine, while all
 * intermediate planes are always produced with the put_ variants.
 *
 * Scratch buffers used by the generated functions:
 *  - full : SIZE columns by SIZE+5 rows copied from src - 2*stride, so that
 *           the vertical filter can read 2 rows above and 3 rows below the
 *           block from a buffer with stride SIZE; full_mid points at the
 *           row that corresponds to src itself.
 *  - half, halfH, halfV, halfHV : SIZE x SIZE filtered planes, stride SIZE.
 *  - tmp  : int16_t intermediate of SIZE*(SIZE+5) entries for the hv filter.
 *
 * NOTE(review): the pixels SIZE _l2 helpers are defined elsewhere; they are
 * presumably the rounded average of their two sources - confirm there.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: integer position, plain block copy/average of SIZE rows. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: combine src with the horizontally filtered plane. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontally filtered plane written straight to dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: like mc10 but combined with the sample one column to the right. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: combine the source rows (via full_mid) with the vertically filtered plane. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertically filtered plane written straight to dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
/* mc03: like mc01 but combined with the source row below (full_mid+SIZE). */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11: combine the horizontal plane at src with the vertical plane at src. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc31: as mc11, but the vertical plane is taken one column to the right. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc13: as mc11, but the horizontal plane is taken one row down. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc33: horizontal plane one row down, vertical plane one column right. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: combined horizontal+vertical filter written straight to dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
/* mc21: combine the horizontal plane at src with the hv plane. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc23: as mc21, but the horizontal plane is taken one row down. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12: combine the vertical plane at src with the hv plane. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc32: as mc12, but the vertical plane is taken one column to the right. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2414
/*
 * Store operators handed to H264_LOWPASS as OP / OP2.
 * They expect a local `cm` clip table pointer in the expanding function.
 *  - op_*  : result of one 6-tap pass (taps 1,-5,20,20,-5,1 sum to 32),
 *            hence the +16 rounding and the shift by 5.
 *  - op2_* : result of two cascaded passes (gain 32*32), hence +512 and
 *            the shift by 10.
 *  - *_put overwrites the destination, *_avg stores the rounded mean of the
 *    destination and the new value.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* Disabled weighted-average variant, kept from the original source. */
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the low-pass filters for both store flavours ... */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
/* ... and the mcXY entry points; the 2x2 size only exists for put_. */
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The store operators are only meaningful for the expansions above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2435 #endif
2436
/* Per-sample kernels for H264_WEIGHT; they use the locals of the expanding function. */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/**
 * Generate the weighted prediction routines for a W x H block.
 *
 * weight_h264_pixelsWxH_c scales `block` in place:
 *   block[x] = clip_uint8((block[x]*weight + off) >> log2_denom)
 * with off = (offset << log2_denom) plus half of the divisor for rounding
 * when log2_denom is non-zero.
 *
 * biweight_h264_pixelsWxH_c blends `src` into `dst`:
 *   dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + off) >> (log2_denom+1))
 * with off = ((offset + 1) | 1) << log2_denom.
 *
 * Both walk H rows that are `stride` bytes apart.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int row, col; \
    offset <<= log2_denom; \
    if(log2_denom) \
        offset += 1<<(log2_denom-1); \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++) \
            op_scale1(col); \
        block += stride; \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int row, col; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++) \
            op_scale2(col); \
        dst += stride; \
        src += stride; \
    } \
}
2491
/* Emit weight_/biweight_h264_pixelsWxH_c for every block size listed here. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

/* The generator and its kernels are not needed past this point. */
#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2506
2507 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2508     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2509     int i;
2510
2511     for(i=0; i<h; i++){
2512         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2513         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2514         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2515         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2516         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2517         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2518         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2519         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2520         dst+=dstStride;
2521         src+=srcStride;
2522     }
2523 }
2524
2525 #ifdef CONFIG_CAVS_DECODER
2526 /* AVS specific */
2527 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2528
/** CAVS 8x8 integer-position case: forwards to put_pixels8_c with 8 rows. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
/** CAVS 8x8 integer-position case: forwards to avg_pixels8_c with 8 rows. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
/** CAVS 16x16 integer-position case: forwards to put_pixels16_c with 16 rows. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
/** CAVS 16x16 integer-position case: forwards to avg_pixels16_c with 16 rows. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
2541 #endif /* CONFIG_CAVS_DECODER */
2542
2543 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2544 /* VC-1 specific */
2545 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2546
/**
 * VC-1 8x8 integer-position case: forwards to put_pixels8_c with 8 rows.
 * The rnd argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
2550 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2551
2552 #if defined(CONFIG_H264_ENCODER)
2553 /* H264 specific */
2554 void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
2555 #endif /* CONFIG_H264_ENCODER */
2556
2557 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2558     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2559     int i;
2560
2561     for(i=0; i<w; i++){
2562         const int src_1= src[ -srcStride];
2563         const int src0 = src[0          ];
2564         const int src1 = src[  srcStride];
2565         const int src2 = src[2*srcStride];
2566         const int src3 = src[3*srcStride];
2567         const int src4 = src[4*srcStride];
2568         const int src5 = src[5*srcStride];
2569         const int src6 = src[6*srcStride];
2570         const int src7 = src[7*srcStride];
2571         const int src8 = src[8*srcStride];
2572         const int src9 = src[9*srcStride];
2573         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2574         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2575         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2576         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2577         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2578         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2579         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2580         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2581         src++;
2582         dst++;
2583     }
2584 }
2585
/** mspel 8x8, integer position: forwards to put_pixels8_c with 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2589
/**
 * mspel 8x8, mc10: build the horizontally filtered 8x8 plane and hand it,
 * together with src itself, to put_pixels8_l2.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2595
/** mspel 8x8, mc20: horizontal low-pass of 8 rows written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2599
/**
 * mspel 8x8, mc30: build the horizontally filtered 8x8 plane and hand it,
 * together with the source shifted one sample to the right, to
 * put_pixels8_l2.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];
    uint8_t *const right = src + 1;

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, right, filtered, stride, stride, 8, 8);
}
2605
/** mspel 8x8, mc02: vertical low-pass of 8 columns written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2609
/**
 * mspel 8x8, mc12: combine the vertically filtered plane with the
 * horizontally-then-vertically filtered plane via put_pixels8_l2.
 *
 * The horizontal pass covers 11 rows starting one row above src so that the
 * following vertical pass (which reads rows -1 .. 9) has all its inputs.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8*11];
    uint8_t plane_v[8*8];
    uint8_t plane_hv[8*8];
    uint8_t *const rows_h_mid = rows_h + 8;   /* row that corresponds to src */

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(plane_v, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(plane_hv, rows_h_mid, 8, 8, 8);
    put_pixels8_l2(dst, plane_v, plane_hv, stride, 8, 8, 8);
}
/**
 * mspel 8x8, mc32: like mc12, except that the purely vertical plane is
 * computed from the source shifted one sample to the right.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8*11];
    uint8_t plane_v[8*8];
    uint8_t plane_hv[8*8];
    uint8_t *const rows_h_mid = rows_h + 8;   /* row that corresponds to src */

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(plane_v, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(plane_hv, rows_h_mid, 8, 8, 8);
    put_pixels8_l2(dst, plane_v, plane_hv, stride, 8, 8, 8);
}
/**
 * mspel 8x8, mc22: horizontal pass over 11 rows (starting one row above
 * src) into a scratch buffer, then vertical pass from that buffer to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t rows_h[8*11];
    uint8_t *const rows_h_mid = rows_h + 8;   /* row that corresponds to src */

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h_mid, stride, 8, 8);
}
2633
2634 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2635     int x;
2636     const int strength= ff_h263_loop_filter_strength[qscale];
2637
2638     for(x=0; x<8; x++){
2639         int d1, d2, ad1;
2640         int p0= src[x-2*stride];
2641         int p1= src[x-1*stride];
2642         int p2= src[x+0*stride];
2643         int p3= src[x+1*stride];
2644         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2645
2646         if     (d<-2*strength) d1= 0;
2647         else if(d<-  strength) d1=-2*strength - d;
2648         else if(d<   strength) d1= d;
2649         else if(d< 2*strength) d1= 2*strength - d;
2650         else                   d1= 0;
2651
2652         p1 += d1;
2653         p2 -= d1;
2654         if(p1&256) p1= ~(p1>>31);
2655         if(p2&256) p2= ~(p2>>31);
2656
2657         src[x-1*stride] = p1;
2658         src[x+0*stride] = p2;
2659
2660         ad1= FFABS(d1)>>1;
2661
2662         d2= av_clip((p0-p3)/4, -ad1, ad1);
2663
2664         src[x-2*stride] = p0 - d2;
2665         src[x+  stride] = p3 + d2;
2666     }
2667 }
2668
2669 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2670     int y;
2671     const int strength= ff_h263_loop_filter_strength[qscale];
2672
2673     for(y=0; y<8; y++){
2674         int d1, d2, ad1;
2675         int p0= src[y*stride-2];
2676         int p1= src[y*stride-1];
2677         int p2= src[y*stride+0];
2678         int p3= src[y*stride+1];
2679         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2680
2681         if     (d<-2*strength) d1= 0;
2682         else if(d<-  strength) d1=-2*strength - d;
2683         else if(d<   strength) d1= d;
2684         else if(d< 2*strength) d1= 2*strength - d;
2685         else                   d1= 0;
2686
2687         p1 += d1;
2688         p2 -= d1;
2689         if(p1&256) p1= ~(p1>>31);
2690         if(p2&256) p2= ~(p2>>31);
2691
2692         src[y*stride-1] = p1;
2693         src[y*stride+0] = p2;
2694
2695         ad1= FFABS(d1)>>1;
2696
2697         d2= av_clip((p0-p3)/4, -ad1, ad1);
2698
2699         src[y*stride-2] = p0 - d2;
2700         src[y*stride+1] = p3 + d2;
2701     }
2702 }
2703
/**
 * In-place separable (1, 2, 1)/4 smoothing of an 8x8 block.
 *
 * Pass 1 filters vertically into a scratch array; the first and last rows
 * are not filtered and are simply scaled by 4 so that all rows share the
 * same gain. Pass 2 filters horizontally and writes back to src; the first
 * and last columns are not filtered horizontally and are only rescaled.
 *
 * @param src     top-left sample of the 8x8 block
 * @param stride  distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];
    int row, col;

    /* border rows: pass through with the gain of the vertical filter */
    for(col=0; col<8; col++){
        vert[0][col] = 4*src[col];
        vert[7][col] = 4*src[7*stride + col];
    }
    for(row=1; row<7; row++){
        const uint8_t *line = src + row*stride;
        for(col=0; col<8; col++)
            vert[row][col] = line[col - stride] + 2*line[col] + line[col + stride];
    }

    for(row=0; row<8; row++){
        uint8_t *line = src + row*stride;
        /* border columns: undo the vertical gain only */
        line[0] = (vert[row][0] + 2)>>2;
        line[7] = (vert[row][7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col] = (vert[row][col-1] + 2*vert[row][col] + vert[row][col+1] + 8)>>4;
    }
}
2730
/**
 * H.264 luma deblocking of one 16-line edge, processed as 4 segments of
 * 4 lines each.
 *
 * @param pix      first sample on the q side of the edge
 * @param xstride  step across the edge (p samples at negative multiples,
 *                 q samples at non-negative multiples)
 * @param ystride  step from one filtered line to the next
 * @param alpha    threshold for |p0 - q0|
 * @param beta     threshold for |p1 - p0|, |q1 - q0|, |p2 - p0|, |q2 - q0|
 * @param tc0      4 clipping values, one per segment; a negative entry
 *                 skips that segment entirely
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for( seg = 0; seg < 4; seg++ ) {
        const int tc_seg = tc0[seg];

        if( tc_seg < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( line = 0; line < 4; line++, pix += ystride ) {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0        ];
            const int q1 = pix[ 1*xstride];
            const int q2 = pix[ 2*xstride];
            const int pq_avg = ( p0 + q0 + 1 ) >> 1;
            int tc = tc_seg;
            int delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta  ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            /* each smooth side also gets p1/q1 adjusted and widens the
             * clipping range used for p0/q0 by one */
            if( FFABS( p2 - p0 ) < beta ) {
                pix[-2*xstride] = p1 + av_clip( (( p2 + pq_avg ) >> 1) - p1, -tc_seg, tc_seg );
                tc++;
            }
            if( FFABS( q2 - q0 ) < beta ) {
                pix[   xstride] = q1 + av_clip( (( q2 + pq_avg ) >> 1) - q1, -tc_seg, tc_seg );
                tc++;
            }

            delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Luma deblocking with the p/q samples `stride` bytes apart and consecutive
 * filtered lines 1 byte apart (xstride = stride, ystride = 1).
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/**
 * Luma deblocking with the p/q samples 1 byte apart and consecutive
 * filtered lines `stride` bytes apart (xstride = 1, ystride = stride).
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2779
/**
 * H.264 chroma deblocking of one 8-line edge, processed as 4 segments of
 * 2 lines each. Only p0 and q0 are modified.
 *
 * @param pix      first sample on the q side of the edge
 * @param xstride  step across the edge
 * @param ystride  step from one filtered line to the next
 * @param alpha    threshold for |p0 - q0|
 * @param beta     threshold for |p1 - p0| and |q1 - q0|
 * @param tc0      4 clipping values, one per segment; an entry <= 0 skips
 *                 that segment
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for( seg = 0; seg < 4; seg++ ) {
        const int tc = tc0[seg];

        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( line = 0; line < 2; line++, pix += ystride ) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[ 0        ];
            const int q1 = pix[ 1*xstride];
            int delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta  ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

            pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Chroma deblocking with the p/q samples `stride` bytes apart and
 * consecutive filtered lines 1 byte apart (xstride = stride, ystride = 1).
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/**
 * Chroma deblocking with the p/q samples 1 byte apart and consecutive
 * filtered lines `stride` bytes apart (xstride = 1, ystride = stride).
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2816
/**
 * H.264 chroma deblocking without a tc clipping table, over 8 lines.
 *
 * On every line that passes the alpha/beta tests, p0 and q0 are replaced by
 * (2*p1 + p0 + q1 + 2) >> 2 and (2*q1 + q0 + p1 + 2) >> 2 respectively.
 *
 * @param pix      first sample on the q side of the edge
 * @param xstride  step across the edge
 * @param ystride  step from one filtered line to the next
 * @param alpha    threshold for |p0 - q0|
 * @param beta     threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0        ];
        const int q1 = pix[ 1*xstride];

        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta  ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma deblocking with the p/q samples `stride` bytes apart and
 * consecutive filtered lines 1 byte apart (xstride = stride, ystride = 1).
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/**
 * Intra chroma deblocking with the p/q samples 1 byte apart and consecutive
 * filtered lines `stride` bytes apart (xstride = 1, ystride = stride).
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2844
/**
 * Sum of absolute differences between two blocks 16 samples wide.
 *
 * @param v          unused
 * @param pix1       first block
 * @param pix2       second block
 * @param line_size  distance between rows, shared by both blocks
 * @param h          number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h samples
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2872
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and pix2
 * where each pix2 sample is first combined with its right neighbour via
 * avg2. Every row therefore reads pix2[0] .. pix2[16].
 *
 * @param v          unused
 * @param pix1       reference block
 * @param pix2       block to interpolate horizontally
 * @param line_size  distance between rows, shared by both blocks
 * @param h          number of rows
 * @return the accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2900
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and pix2
 * where each pix2 sample is first combined with the sample one row below
 * via avg2. h+1 rows of pix2 are therefore read.
 *
 * @param v          unused
 * @param pix1       reference block
 * @param pix2       block to interpolate vertically
 * @param line_size  distance between rows, shared by both blocks
 * @param h          number of rows
 * @return the accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2930
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and pix2
 * where each pix2 sample is first combined with its right, lower and
 * lower-right neighbours via avg4. h+1 rows of 17 pix2 samples are read.
 *
 * @param v          unused
 * @param pix1       reference block
 * @param pix2       block to interpolate in both directions
 * @param line_size  distance between rows, shared by both blocks
 * @param h          number of rows
 * @return the accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2960
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte distance between consecutive rows of both buffers
 * @param h         number of rows to compare
 * @return accumulated absolute difference over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - pix2[x]);

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2980
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * whose samples are avg2() of two horizontally adjacent bytes.
 *
 * Nine bytes are read from every reference row (columns 0..8).
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference area
 * @param line_size byte distance between consecutive rows of both buffers
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3000
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * whose samples are avg2() of two vertically adjacent bytes.
 *
 * The reference is read over h+1 rows, 8 bytes each.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference area
 * @param line_size byte distance between consecutive rows of both buffers
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));

        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3022
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * whose samples are formed with avg4() from a 2x2 neighbourhood.
 *
 * Nine bytes are read from each of h+1 reference rows.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      top-left corner of the reference area
 * @param line_size byte distance between consecutive rows of both buffers
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));

        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3044
3045 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3046     MpegEncContext *c = v;
3047     int score1=0;
3048     int score2=0;
3049     int x,y;
3050
3051     for(y=0; y<h; y++){
3052         for(x=0; x<16; x++){
3053             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3054         }
3055         if(y+1<h){
3056             for(x=0; x<15; x++){
3057                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3058                              - s1[x+1] + s1[x+1+stride])
3059                         -FFABS(  s2[x  ] - s2[x  +stride]
3060                              - s2[x+1] + s2[x+1+stride]);
3061             }
3062         }
3063         s1+= stride;
3064         s2+= stride;
3065     }
3066
3067     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3068     else  return score1 + FFABS(score2)*8;
3069 }
3070
3071 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3072     MpegEncContext *c = v;
3073     int score1=0;
3074     int score2=0;
3075     int x,y;
3076
3077     for(y=0; y<h; y++){
3078         for(x=0; x<8; x++){
3079             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3080         }
3081         if(y+1<h){
3082             for(x=0; x<7; x++){
3083                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3084                              - s1[x+1] + s1[x+1+stride])
3085                         -FFABS(  s2[x  ] - s2[x  +stride]
3086                              - s2[x+1] + s2[x+1+stride]);
3087             }
3088         }
3089         s1+= stride;
3090         s2+= stride;
3091     }
3092
3093     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3094     else  return score1 + FFABS(score2)*8;
3095 }
3096
3097 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3098     int i;
3099     unsigned int sum=0;
3100
3101     for(i=0; i<8*8; i++){
3102         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3103         int w= weight[i];
3104         b>>= RECON_SHIFT;
3105         assert(-512<b && b<512);
3106
3107         sum += (w*b)*(w*b)>>4;
3108     }
3109     return sum>>2;
3110 }
3111
3112 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3113     int i;
3114
3115     for(i=0; i<8*8; i++){
3116         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3117     }
3118 }
3119
3120 /**
3121  * permutes an 8x8 block.
3122  * @param block the block which will be permuted according to the given permutation vector
3123  * @param permutation the permutation vector
3124  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3125  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3126  *                  (inverse) permutated to scantable order!
3127  */
3128 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3129 {
3130     int i;
3131     DCTELEM temp[64];
3132
3133     if(last<=0) return;
3134     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3135
3136     for(i=0; i<=last; i++){
3137         const int j= scantable[i];
3138         temp[j]= block[j];
3139         block[j]=0;
3140     }
3141
3142     for(i=0; i<=last; i++){
3143         const int j= scantable[i];
3144         const int perm_j= permutation[j];
3145         block[perm_j]= temp[j];
3146     }
3147 }
3148
/**
 * Comparison function that ignores its inputs and always reports a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3152
3153 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3154     int i;
3155
3156     memset(cmp, 0, sizeof(void*)*5);
3157
3158     for(i=0; i<5; i++){
3159         switch(type&0xFF){
3160         case FF_CMP_SAD:
3161             cmp[i]= c->sad[i];
3162             break;
3163         case FF_CMP_SATD:
3164             cmp[i]= c->hadamard8_diff[i];
3165             break;
3166         case FF_CMP_SSE:
3167             cmp[i]= c->sse[i];
3168             break;
3169         case FF_CMP_DCT:
3170             cmp[i]= c->dct_sad[i];
3171             break;
3172         case FF_CMP_DCT264:
3173             cmp[i]= c->dct264_sad[i];
3174             break;
3175         case FF_CMP_DCTMAX:
3176             cmp[i]= c->dct_max[i];
3177             break;
3178         case FF_CMP_PSNR:
3179             cmp[i]= c->quant_psnr[i];
3180             break;
3181         case FF_CMP_BIT:
3182             cmp[i]= c->bit[i];
3183             break;
3184         case FF_CMP_RD:
3185             cmp[i]= c->rd[i];
3186             break;
3187         case FF_CMP_VSAD:
3188             cmp[i]= c->vsad[i];
3189             break;
3190         case FF_CMP_VSSE:
3191             cmp[i]= c->vsse[i];
3192             break;
3193         case FF_CMP_ZERO:
3194             cmp[i]= zero_cmp;
3195             break;
3196         case FF_CMP_NSSE:
3197             cmp[i]= c->nsse[i];
3198             break;
3199 #ifdef CONFIG_SNOW_ENCODER
3200         case FF_CMP_W53:
3201             cmp[i]= c->w53[i];
3202             break;
3203         case FF_CMP_W97:
3204             cmp[i]= c->w97[i];
3205             break;
3206 #endif
3207         default:
3208             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3209         }
3210     }
3211 }
3212
3213 /**
3214  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3215  */
3216 static void clear_blocks_c(DCTELEM *blocks)
3217 {
3218     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3219 }
3220
/**
 * Add src to dst byte by byte, in place, with uint8_t wrap-around.
 *
 * @param dst destination, updated (dst[i] += src[i])
 * @param src bytes to add
 * @param w   number of bytes to process; nothing happens when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    int pos = 0;

    /* bytes are handled strictly in ascending order, one at a time */
    while (pos < w) {
        dst[pos] += src[pos];
        pos++;
    }
}
3236
/**
 * Store the byte-wise difference src1 - src2 into dst, with uint8_t
 * wrap-around.
 *
 * @param dst  destination (dst[i] = src1[i] - src2[i])
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process; nothing happens when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    int pos = 0;

    /* bytes are handled strictly in ascending order, one at a time */
    while (pos < w) {
        dst[pos] = src1[pos] - src2[pos];
        pos++;
    }
}
3252
/**
 * Write the residual of src2 against a median-style predictor.
 *
 * For every position the predictor is mid_pred() of the previous src2 byte,
 * the src1 byte at this position, and (previous src2 + src1 - previous src1)
 * masked to 8 bits. The running "previous" bytes are carried in and out
 * through left / left_top.
 *
 * @param dst      residual output (src2[i] - prediction, stored as uint8_t)
 * @param src1     bytes used as the second predictor input
 * @param src2     bytes being predicted
 * @param w        number of bytes to process
 * @param left     in: initial previous src2 byte; out: last src2 byte processed
 * @param left_top in: initial previous src1 byte; out: last src1 byte processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top)
{
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int pred = mid_pred(prev, src1[n], (prev + src1[n] - prev_top)&0xFF);

        prev_top = src1[n];
        prev     = src2[n];
        dst[n]   = prev - pred;
    }

    *left     = prev;
    *left_top = prev_top;
}
3270
/*
 * BUTTERFLY2: store the sum of two inputs in o1 and their difference in o2.
 * Expands to two bare statements (no enclosing braces) and evaluates each
 * of i1 and i2 twice, so it must not be used as the unbraced body of an
 * if/for and its inputs should be free of side effects.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/*
 * BUTTERFLY1: in-place butterfly; x becomes x+y and y becomes x-y (both
 * computed from the original values). Expands to a braced block that
 * declares temporaries named a and b, so the arguments must not be
 * expressions that refer to identifiers with those names.
 */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/*
 * BUTTERFLYA: FFABS of the sum plus FFABS of the difference, as a single
 * expression. FFABS is defined outside this chunk (presumably absolute
 * value - confirm). x and y are each evaluated more than once.
 */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3285
/**
 * Score an 8x8 block pair by running three butterfly stages along the rows
 * and three along the columns of the pixel difference (src - dst); the last
 * column stage is folded into the summation through BUTTERFLYA.
 *
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride byte distance between consecutive rows of both buffers
 * @param h      must be 8 (asserted)
 * @return the accumulated BUTTERFLYA terms
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass: difference + three butterfly stages per row */
    for (i = 0; i < 8; i++) {
        int     *row = temp + 8*i;
        uint8_t *ps  = src + stride*i;
        uint8_t *pd  = dst + stride*i;

        BUTTERFLY2(row[0], row[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(row[2], row[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(row[4], row[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(row[6], row[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two explicit stages, the third happens inside BUTTERFLYA */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3337
/**
 * Score a single 8x8 block by running three butterfly stages along the rows
 * and three along the columns of its pixels, summing the BUTTERFLYA terms,
 * and finally subtracting FFABS(temp[0] + temp[32]) ("-mean").
 *
 * @param s      unused
 * @param src    block to analyse
 * @param dummy  unused
 * @param stride byte distance between consecutive rows
 * @param h      must be 8 (asserted)
 * @return the accumulated score
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass: three butterfly stages per row of pixels */
    for (i = 0; i < 8; i++) {
        int     *row = temp + 8*i;
        uint8_t *px  = src + stride*i;

        BUTTERFLY2(row[0], row[1], px[0], px[1]);
        BUTTERFLY2(row[2], row[3], px[2], px[3]);
        BUTTERFLY2(row[4], row[5], px[4], px[5]);
        BUTTERFLY2(row[6], row[7], px[6], px[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two explicit stages, the third happens inside BUTTERFLYA */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3385
3386 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3387     MpegEncContext * const s= (MpegEncContext *)c;
3388     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3389     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3390     int sum=0, i;
3391
3392     assert(h==8);
3393
3394     s->dsp.diff_pixels(temp, src1, src2, stride);
3395     s->dsp.fdct(temp);
3396
3397     for(i=0; i<64; i++)
3398         sum+= FFABS(temp[i]);
3399
3400     return sum;
3401 }
3402
3403 #ifdef CONFIG_GPL
/*
 * DCT8_1D: one 8-point, one-dimensional integer transform pass built only
 * from additions, subtractions and right shifts.
 *
 * The macro is parameterised through two helper macros that the user must
 * define before expanding it:
 *   SRC(x)    - yields input sample x (0..7); some samples are read twice
 *   DST(x, v) - consumes output value v for output index x (0..7)
 * It expands to a single braced block, so it can serve directly as the body
 * of a loop.
 *
 * NOTE(review): judging by the "264" in the name of its user this is
 * presumably the H.264-style 8x8 integer transform - confirm.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3430
3431 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3432     MpegEncContext * const s= (MpegEncContext *)c;
3433     int16_t dct[8][8];
3434     int i;
3435     int sum=0;
3436
3437     s->dsp.diff_pixels(dct, src1, src2, stride);
3438
3439 #define SRC(x) dct[i][x]
3440 #define DST(x,v) dct[i][x]= v
3441     for( i = 0; i < 8; i++ )
3442         DCT8_1D
3443 #undef SRC
3444 #undef DST
3445
3446 #define SRC(x) dct[x][i]
3447 #define DST(x,v) sum += FFABS(v)
3448     for( i = 0; i < 8; i++ )
3449         DCT8_1D
3450 #undef SRC
3451 #undef DST
3452     return sum;
3453 }
3454 #endif
3455
3456 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3457     MpegEncContext * const s= (MpegEncContext *)c;
3458     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3459     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3460     int sum=0, i;
3461
3462     assert(h==8);
3463
3464     s->dsp.diff_pixels(temp, src1, src2, stride);
3465     s->dsp.fdct(temp);
3466
3467     for(i=0; i<64; i++)
3468         sum= FFMAX(sum, FFABS(temp[i]));
3469
3470     return sum;
3471 }
3472
3473 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474     MpegEncContext * const s= (MpegEncContext *)c;
3475     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3476     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3477     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3478     int sum=0, i;
3479
3480     assert(h==8);
3481     s->mb_intra=0;
3482
3483     s->dsp.diff_pixels(temp, src1, src2, stride);
3484
3485     memcpy(bak, temp, 64*sizeof(DCTELEM));
3486
3487     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3488     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3489     simple_idct(temp); //FIXME
3490
3491     for(i=0; i<64; i++)
3492         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3493
3494     return sum;
3495 }
3496
3497 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3498     MpegEncContext * const s= (MpegEncContext *)c;
3499     const uint8_t *scantable= s->intra_scantable.permutated;
3500     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3501     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3502     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3503     uint8_t * const bak= (uint8_t*)aligned_bak;
3504     int i, last, run, bits, level, distoration, start_i;
3505     const int esc_length= s->ac_esc_length;
3506     uint8_t * length;
3507     uint8_t * last_length;
3508
3509     assert(h==8);
3510
3511     for(i=0; i<8; i++){
3512         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3513         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3514     }
3515
3516     s->dsp.diff_pixels(temp, src1, src2, stride);
3517
3518     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3519
3520     bits=0;
3521
3522     if (s->mb_intra) {
3523         start_i = 1;
3524         length     = s->intra_ac_vlc_length;
3525         last_length= s->intra_ac_vlc_last_length;
3526         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3527     } else {
3528         start_i = 0;
3529         length     = s->inter_ac_vlc_length;
3530         last_length= s->inter_ac_vlc_last_length;
3531     }
3532
3533     if(last>=start_i){
3534         run=0;
3535         for(i=start_i; i<last; i++){
3536             int j= scantable[i];
3537             level= temp[j];
3538
3539             if(level){
3540                 level+=64;
3541                 if((level&(~127)) == 0){
3542                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3543                 }else
3544                     bits+= esc_length;
3545                 run=0;
3546             }else
3547                 run++;
3548         }
3549         i= scantable[last];
3550
3551         level= temp[i] + 64;
3552
3553         assert(level - 64);
3554
3555         if((level&(~127)) == 0){
3556             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3557         }else
3558             bits+= esc_length;
3559
3560     }
3561
3562     if(last>=0){
3563         if(s->mb_intra)
3564             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3565         else
3566             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3567     }
3568
3569     s->dsp.idct_add(bak, stride, temp);
3570
3571     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3572
3573     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3574 }
3575
3576 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3577     MpegEncContext * const s= (MpegEncContext *)c;
3578     const uint8_t *scantable= s->intra_scantable.permutated;
3579     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3580     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3581     int i, last, run, bits, level, start_i;
3582     const int esc_length= s->ac_esc_length;
3583     uint8_t * length;
3584     uint8_t * last_length;
3585
3586     assert(h==8);
3587
3588     s->dsp.diff_pixels(temp, src1, src2, stride);
3589
3590     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3591
3592     bits=0;
3593
3594     if (s->mb_intra) {
3595         start_i = 1;
3596         length     = s->intra_ac_vlc_length;
3597         last_length= s->intra_ac_vlc_last_length;
3598         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3599     } else {
3600         start_i = 0;
3601         length     = s->inter_ac_vlc_length;
3602         last_length= s->inter_ac_vlc_last_length;
3603     }
3604
3605     if(last>=start_i){
3606         run=0;
3607         for(i=start_i; i<last; i++){
3608             int j= scantable[i];
3609             level= temp[j];
3610
3611             if(level){
3612                 level+=64;
3613                 if((level&(~127)) == 0){
3614                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3615                 }else
3616                     bits+= esc_length;
3617                 run=0;
3618             }else
3619                 run++;
3620         }
3621         i= scantable[last];
3622
3623         level= temp[i] + 64;
3624
3625         assert(level - 64);
3626
3627         if((level&(~127)) == 0){
3628             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3629         }else
3630             bits+= esc_length;
3631     }
3632
3633     return bits;
3634 }
3635
/**
 * Sum of FFABS differences between vertically adjacent pixels of a
 * 16-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s      block to analyse
 * @param dummy  unused
 * @param stride byte distance between consecutive rows
 * @param h      number of rows
 * @return accumulated score
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *below = s + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(s[col] - below[col]);

        s = below;
    }

    return total;
}
3650
/**
 * Sum of FFABS vertical differences of the pixel difference s1 - s2 for a
 * 16-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride byte distance between consecutive rows of both buffers
 * @param h      number of rows
 * @return accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += FFABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3665
#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s      block to analyse
 * @param dummy  unused
 * @param stride byte distance between consecutive rows
 * @param h      number of rows
 * @return accumulated score
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        uint8_t *below = s + stride;

        for (col = 0; col < 16; col++)
            total += SQ(s[col] - below[col]);

        s = below;
    }

    return total;
}
3681
/**
 * Sum of squared vertical differences of the pixel difference s1 - s2 for
 * a 16-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride byte distance between consecutive rows of both buffers
 * @param h      number of rows
 * @return accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];
            total += SQ(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3696
/*
 * Instantiate the companion functions named in the second argument from the
 * 8x8 functions named in the first one. WARPER8_16_SQ is defined outside
 * this chunk.
 * NOTE(review): judging by the names it presumably builds a 16x16 variant
 * by applying the 8x8 function to each quadrant - confirm against the macro.
 */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* dct264_sad8x8_c only exists when CONFIG_GPL is defined */
#ifdef CONFIG_GPL
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3707
/**
 * Multiply dst element-wise by src, in place.
 *
 * @param dst vector that is scaled (dst[i] *= src[i])
 * @param src multipliers
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len)
{
    int n = 0;

    while (n < len) {
        dst[n] *= src[n];
        n++;
    }
}
3713
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len-1-i].
 *
 * @param dst  output vector
 * @param src0 first factor, read forwards
 * @param src1 second factor, read backwards
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len)
{
    const int tail = len - 1;
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[tail - n];
}
3720
/**
 * Strided multiply-add: dst[i*step] = src0[i]*src1[i] + src2[i] + src3.
 *
 * @param dst  output, written every step elements
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 per-element addend
 * @param src3 constant integer addend
 * @param len  number of elements to produce
 * @param step distance (in floats) between consecutive outputs
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step)
{
    int n = 0;

    while (n < len) {
        dst[n*step] = src0[n] * src1[n] + src2[n] + src3;
        n++;
    }
}
3726
/**
 * Convert floats to 16-bit integers by operating on the IEEE-754 bit
 * pattern instead of doing a float-to-int conversion.
 *
 * The bit pattern of each input is read as a 32-bit integer. If none of
 * bits 16..19 is set, the low 16 bits minus 0x8000 form the output, so
 * 385.0f (pattern 0x43c08000) maps to 0 and 385.5f to 16384. If any of
 * those bits is set, the value is treated as out of range: patterns above
 * 0x43c0ffff yield 32767, others yield -32768.
 *
 * NOTE(review): callers are presumably expected to supply samples already
 * biased into the [384, 386) range - confirm against the users.
 *
 * @param dst output samples
 * @param src input floats (see above for the expected range)
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int32_t bits;
        int_fast32_t tmp;

        /* Fetch the bit pattern with memcpy: dereferencing a float through
           an int32_t pointer breaks the strict-aliasing rule. */
        memcpy(&bits, &src[i], sizeof(bits));
        tmp = bits;
        if(tmp & 0xf0000){
            /* sign of the subtraction selects all-ones (too large) or 0 */
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}
3740
3741 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3742  converted */
3743 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3744 {
3745     j_rev_dct (block);
3746     put_pixels_clamped_c(block, dest, line_size);
3747 }
3748 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3749 {
3750     j_rev_dct (block);
3751     add_pixels_clamped_c(block, dest, line_size);
3752 }
3753
3754 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3755 {
3756     j_rev_dct4 (block);
3757     put_pixels_clamped4_c(block, dest, line_size);
3758 }
3759 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3760 {
3761     j_rev_dct4 (block);
3762     add_pixels_clamped4_c(block, dest, line_size);
3763 }
3764
3765 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3766 {
3767     j_rev_dct2 (block);
3768     put_pixels_clamped2_c(block, dest, line_size);
3769 }
3770 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3771 {
3772     j_rev_dct2 (block);
3773     add_pixels_clamped2_c(block, dest, line_size);
3774 }
3775
3776 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3777 {
3778     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3779
3780     dest[0] = cm[(block[0] + 4)>>3];
3781 }
3782 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3783 {
3784     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3785
3786     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3787 }
3788
/* Do-nothing stub.
 * NOTE(review): the empty, unprototyped parameter list looks deliberate so
 * the stub can be stored in function pointers of differing signatures -
 * confirm against its users before changing it to (void). */
static void just_return()
{
}
3790
3791 /* init static data */
3792 void dsputil_static_init(void)
3793 {
3794     int i;
3795
3796     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3797     for(i=0;i<MAX_NEG_CROP;i++) {
3798         ff_cropTbl[i] = 0;
3799         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3800     }
3801
3802     for(i=0;i<512;i++) {
3803         ff_squareTbl[i] = (i - 256) * (i - 256);
3804     }
3805
3806     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3807 }
3808
3809 int ff_check_alignment(void){
3810     static int did_fail=0;
3811     DECLARE_ALIGNED_16(int, aligned);
3812
3813     if((int)&aligned & 15){
3814         if(!did_fail){
3815 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3816             av_log(NULL, AV_LOG_ERROR,
3817                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3818                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3819                 "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3820 #endif
3821             did_fail=1;
3822         }
3823         return -1;
3824     }
3825     return 0;
3826 }
3827
3828 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3829 {
3830     int i;
3831
3832     ff_check_alignment();
3833
3834 #ifdef CONFIG_ENCODERS
3835     if(avctx->dct_algo==FF_DCT_FASTINT) {
3836         c->fdct = fdct_ifast;
3837         c->fdct248 = fdct_ifast248;
3838     }
3839     else if(avctx->dct_algo==FF_DCT_FAAN) {
3840         c->fdct = ff_faandct;
3841         c->fdct248 = ff_faandct248;
3842     }
3843     else {
3844         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3845         c->fdct248 = ff_fdct248_islow;
3846     }
3847 #endif //CONFIG_ENCODERS
3848
3849     if(avctx->lowres==1){
3850         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3851             c->idct_put= ff_jref_idct4_put;
3852             c->idct_add= ff_jref_idct4_add;
3853         }else{
3854             c->idct_put= ff_h264_lowres_idct_put_c;
3855             c->idct_add= ff_h264_lowres_idct_add_c;
3856         }
3857         c->idct    = j_rev_dct4;
3858         c->idct_permutation_type= FF_NO_IDCT_PERM;
3859     }else if(avctx->lowres==2){
3860         c->idct_put= ff_jref_idct2_put;
3861         c->idct_add= ff_jref_idct2_add;
3862         c->idct    = j_rev_dct2;
3863         c->idct_permutation_type= FF_NO_IDCT_PERM;
3864     }else if(avctx->lowres==3){
3865         c->idct_put= ff_jref_idct1_put;
3866         c->idct_add= ff_jref_idct1_add;
3867         c->idct    = j_rev_dct1;
3868         c->idct_permutation_type= FF_NO_IDCT_PERM;
3869     }else{
3870         if(avctx->idct_algo==FF_IDCT_INT){
3871             c->idct_put= ff_jref_idct_put;
3872             c->idct_add= ff_jref_idct_add;
3873             c->idct    = j_rev_dct;
3874             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3875         }else if(avctx->idct_algo==FF_IDCT_VP3){
3876             c->idct_put= ff_vp3_idct_put_c;
3877             c->idct_add= ff_vp3_idct_add_c;
3878             c->idct    = ff_vp3_idct_c;
3879             c->idct_permutation_type= FF_NO_IDCT_PERM;
3880         }else{ //accurate/default
3881             c->idct_put= simple_idct_put;
3882             c->idct_add= simple_idct_add;
3883             c->idct    = simple_idct;
3884             c->idct_permutation_type= FF_NO_IDCT_PERM;
3885         }
3886     }
3887
3888     c->h264_idct_add= ff_h264_idct_add_c;
3889     c->h264_idct8_add= ff_h264_idct8_add_c;
3890     c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3891     c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3892
3893     c->get_pixels = get_pixels_c;
3894     c->diff_pixels = diff_pixels_c;
3895     c->put_pixels_clamped = put_pixels_clamped_c;
3896     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3897     c->add_pixels_clamped = add_pixels_clamped_c;
3898     c->add_pixels8 = add_pixels8_c;
3899     c->add_pixels4 = add_pixels4_c;
3900     c->gmc1 = gmc1_c;
3901     c->gmc = ff_gmc_c;
3902     c->clear_blocks = clear_blocks_c;
3903     c->pix_sum = pix_sum_c;
3904     c->pix_norm1 = pix_norm1_c;
3905
3906     /* TODO [0] 16  [1] 8 */
3907     c->pix_abs[0][0] = pix_abs16_c;
3908     c->pix_abs[0][1] = pix_abs16_x2_c;
3909     c->pix_abs[0][2] = pix_abs16_y2_c;
3910     c->pix_abs[0][3] = pix_abs16_xy2_c;
3911     c->pix_abs[1][0] = pix_abs8_c;
3912     c->pix_abs[1][1] = pix_abs8_x2_c;
3913     c->pix_abs[1][2] = pix_abs8_y2_c;
3914     c->pix_abs[1][3] = pix_abs8_xy2_c;
3915
3916 #define dspfunc(PFX, IDX, NUM) \
3917     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3918     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3919     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3920     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3921
3922     dspfunc(put, 0, 16);
3923     dspfunc(put_no_rnd, 0, 16);
3924     dspfunc(put, 1, 8);
3925     dspfunc(put_no_rnd, 1, 8);
3926     dspfunc(put, 2, 4);
3927     dspfunc(put, 3, 2);
3928
3929     dspfunc(avg, 0, 16);
3930     dspfunc(avg_no_rnd, 0, 16);
3931     dspfunc(avg, 1, 8);
3932     dspfunc(avg_no_rnd, 1, 8);
3933     dspfunc(avg, 2, 4);
3934     dspfunc(avg, 3, 2);
3935 #undef dspfunc
3936
3937     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3938     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3939
3940     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3941     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3942     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3943     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3944     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3945     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3946     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3947     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3948     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3949
3950     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3951     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3952     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3953     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3954     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3955     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3956     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3957     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3958     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3959
3960 #define dspfunc(PFX, IDX, NUM) \
3961     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3962     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3963     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3964     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3965     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3966     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3967     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3968     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3969     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3970     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3971     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3972     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3973     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3974     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3975     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3976     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3977
3978     dspfunc(put_qpel, 0, 16);
3979     dspfunc(put_no_rnd_qpel, 0, 16);
3980
3981     dspfunc(avg_qpel, 0, 16);
3982     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3983
3984     dspfunc(put_qpel, 1, 8);
3985     dspfunc(put_no_rnd_qpel, 1, 8);
3986
3987     dspfunc(avg_qpel, 1, 8);
3988     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3989
3990     dspfunc(put_h264_qpel, 0, 16);
3991     dspfunc(put_h264_qpel, 1, 8);
3992     dspfunc(put_h264_qpel, 2, 4);
3993     dspfunc(put_h264_qpel, 3, 2);
3994     dspfunc(avg_h264_qpel, 0, 16);
3995     dspfunc(avg_h264_qpel, 1, 8);
3996     dspfunc(avg_h264_qpel, 2, 4);
3997
3998 #undef dspfunc
3999     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4000     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4001     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4002     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4003     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4004     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4005     c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4006
4007     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4008     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4009     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4010     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4011     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4012     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4013     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4014     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4015     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4016     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4017     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4018     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4019     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4020     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4021     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4022     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4023     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4024     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4025     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4026     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4027
4028 #ifdef CONFIG_CAVS_DECODER
4029     ff_cavsdsp_init(c,avctx);
4030 #endif
4031 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4032     ff_vc1dsp_init(c,avctx);
4033 #endif
4034 #if defined(CONFIG_H264_ENCODER)
4035     ff_h264dsp_init(c,avctx);
4036 #endif
4037
4038     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4039     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4040     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4041     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4042     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4043     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4044     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4045     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4046
4047 #define SET_CMP_FUNC(name) \
4048     c->name[0]= name ## 16_c;\
4049     c->name[1]= name ## 8x8_c;
4050
4051     SET_CMP_FUNC(hadamard8_diff)
4052     c->hadamard8_diff[4]= hadamard8_intra16_c;
4053     SET_CMP_FUNC(dct_sad)
4054     SET_CMP_FUNC(dct_max)
4055 #ifdef CONFIG_GPL
4056     SET_CMP_FUNC(dct264_sad)
4057 #endif
4058     c->sad[0]= pix_abs16_c;
4059     c->sad[1]= pix_abs8_c;
4060     c->sse[0]= sse16_c;
4061     c->sse[1]= sse8_c;
4062     c->sse[2]= sse4_c;
4063     SET_CMP_FUNC(quant_psnr)
4064     SET_CMP_FUNC(rd)
4065     SET_CMP_FUNC(bit)
4066     c->vsad[0]= vsad16_c;
4067     c->vsad[4]= vsad_intra16_c;
4068     c->vsse[0]= vsse16_c;
4069     c->vsse[4]= vsse_intra16_c;
4070     c->nsse[0]= nsse16_c;
4071     c->nsse[1]= nsse8_c;
4072 #ifdef CONFIG_SNOW_ENCODER
4073     c->w53[0]= w53_16_c;
4074     c->w53[1]= w53_8_c;
4075     c->w97[0]= w97_16_c;
4076     c->w97[1]= w97_8_c;
4077 #endif
4078
4079     c->add_bytes= add_bytes_c;
4080     c->diff_bytes= diff_bytes_c;
4081     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4082     c->bswap_buf= bswap_buf;
4083
4084     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4085     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4086     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4087     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4088     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4089     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4090     c->h264_loop_filter_strength= NULL;
4091
4092     c->h263_h_loop_filter= h263_h_loop_filter_c;
4093     c->h263_v_loop_filter= h263_v_loop_filter_c;
4094
4095     c->h261_loop_filter= h261_loop_filter_c;
4096
4097     c->try_8x8basis= try_8x8basis_c;
4098     c->add_8x8basis= add_8x8basis_c;
4099
4100 #ifdef CONFIG_SNOW_ENCODER
4101     c->vertical_compose97i = ff_snow_vertical_compose97i;
4102     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4103     c->inner_add_yblock = ff_snow_inner_add_yblock;
4104 #endif
4105
4106 #ifdef CONFIG_VORBIS_DECODER
4107     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4108 #endif
4109     c->vector_fmul = vector_fmul_c;
4110     c->vector_fmul_reverse = vector_fmul_reverse_c;
4111     c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4112     c->float_to_int16 = ff_float_to_int16_c;
4113
4114     c->shrink[0]= ff_img_copy_plane;
4115     c->shrink[1]= ff_shrink22;
4116     c->shrink[2]= ff_shrink44;
4117     c->shrink[3]= ff_shrink88;
4118
4119     c->prefetch= just_return;
4120
4121     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4122     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4123
4124 #ifdef HAVE_MMX
4125     dsputil_init_mmx(c, avctx);
4126 #endif
4127 #ifdef ARCH_ARMV4L
4128     dsputil_init_armv4l(c, avctx);
4129 #endif
4130 #ifdef HAVE_MLIB
4131     dsputil_init_mlib(c, avctx);
4132 #endif
4133 #ifdef ARCH_SPARC
4134    dsputil_init_vis(c,avctx);
4135 #endif
4136 #ifdef ARCH_ALPHA
4137     dsputil_init_alpha(c, avctx);
4138 #endif
4139 #ifdef ARCH_POWERPC
4140     dsputil_init_ppc(c, avctx);
4141 #endif
4142 #ifdef HAVE_MMI
4143     dsputil_init_mmi(c, avctx);
4144 #endif
4145 #ifdef ARCH_SH4
4146     dsputil_init_sh4(c,avctx);
4147 #endif
4148 #ifdef ARCH_BFIN
4149     dsputil_init_bfin(c,avctx);
4150 #endif
4151
4152     for(i=0; i<64; i++){
4153         if(!c->put_2tap_qpel_pixels_tab[0][i])
4154             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4155         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4156             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4157     }
4158
4159     switch(c->idct_permutation_type){
4160     case FF_NO_IDCT_PERM:
4161         for(i=0; i<64; i++)
4162             c->idct_permutation[i]= i;
4163         break;
4164     case FF_LIBMPEG2_IDCT_PERM:
4165         for(i=0; i<64; i++)
4166             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4167         break;
4168     case FF_SIMPLE_IDCT_PERM:
4169         for(i=0; i<64; i++)
4170             c->idct_permutation[i]= simple_mmx_permutation[i];
4171         break;
4172     case FF_TRANSPOSE_IDCT_PERM:
4173         for(i=0; i<64; i++)
4174             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4175         break;
4176     case FF_PARTTRANS_IDCT_PERM:
4177         for(i=0; i<64; i++)
4178             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4179         break;
4180     default:
4181         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4182     }
4183 }
4184