libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "mpegvideo.h"
  33 #include "simple_idct.h"
  34 #include "faandct.h"
  35 #include "h263.h"
  36 #include "snow.h"
  37
  38 /* snow.c */
  39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  40
  41 /* vorbis.c */
  42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  43
  44 /* flacenc.c */
  45 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  46
  47 /* pngdec.c */
  48 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  49
/* Lookup table used for clamping. The functions in this file index it
 * through `ff_cropTbl + MAX_NEG_CROP`, so indices in the range
 * [-MAX_NEG_CROP, 256 + MAX_NEG_CROP) are valid.
 * It is only zero-initialized here.
 * NOTE(review): presumably filled by the DSP init code elsewhere - confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table indexed through `ff_squareTbl + 256` with a signed value in
 * [-256, 255] (pixel values and pixel differences in this file).
 * It is only zero-initialized here.
 * NOTE(review): presumably entry 256 + d holds d*d once initialized - confirm. */
uint32_t ff_squareTbl[512] = {0, };
  52
  53 const uint8_t ff_zigzag_direct[64] = {
  54     0,   1,  8, 16,  9,  2,  3, 10,
  55     17, 24, 32, 25, 18, 11,  4,  5,
  56     12, 19, 26, 33, 40, 48, 41, 34,
  57     27, 20, 13,  6,  7, 14, 21, 28,
  58     35, 42, 49, 56, 57, 50, 43, 36,
  59     29, 22, 15, 23, 30, 37, 44, 51,
  60     58, 59, 52, 45, 38, 31, 39, 46,
  61     53, 60, 61, 54, 47, 55, 62, 63
  62 };
  63
  64 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  65    specification, we interleave the fields */
  66 const uint8_t ff_zigzag248_direct[64] = {
  67      0,  8,  1,  9, 16, 24,  2, 10,
  68     17, 25, 32, 40, 48, 56, 33, 41,
  69     18, 26,  3, 11,  4, 12, 19, 27,
  70     34, 42, 49, 57, 50, 58, 35, 43,
  71     20, 28,  5, 13,  6, 14, 21, 29,
  72     36, 44, 51, 59, 52, 60, 37, 45,
  73     22, 30,  7, 15, 23, 31, 38, 46,
  74     53, 61, 54, 62, 39, 47, 55, 63,
  75 };
  76
/* Inverse of ff_zigzag_direct, plus 1, without any IDCT permutation applied;
 * intended for the MMX quantizer. Declared 8-byte aligned and only
 * zero-initialized here.
 * NOTE(review): presumably filled at init time elsewhere - confirm. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  79
  80 const uint8_t ff_alternate_horizontal_scan[64] = {
  81     0,  1,   2,  3,  8,  9, 16, 17,
  82     10, 11,  4,  5,  6,  7, 15, 14,
  83     13, 12, 19, 18, 24, 25, 32, 33,
  84     26, 27, 20, 21, 22, 23, 28, 29,
  85     30, 31, 34, 35, 40, 41, 48, 49,
  86     42, 43, 36, 37, 38, 39, 44, 45,
  87     46, 47, 50, 51, 56, 57, 58, 59,
  88     52, 53, 54, 55, 60, 61, 62, 63,
  89 };
  90
  91 const uint8_t ff_alternate_vertical_scan[64] = {
  92     0,  8,  16, 24,  1,  9,  2, 10,
  93     17, 25, 32, 40, 48, 56, 57, 49,
  94     41, 33, 26, 18,  3, 11,  4, 12,
  95     19, 27, 34, 42, 50, 58, 35, 43,
  96     51, 59, 20, 28,  5, 13,  6, 14,
  97     21, 29, 36, 44, 52, 60, 37, 45,
  98     53, 61, 22, 30,  7, 15, 23, 31,
  99     38, 46, 54, 62, 39, 47, 55, 63,
 100 };
 101
 102 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
 103 const uint32_t ff_inverse[256]={
 104          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 105  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 106  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 107  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 108  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 109  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 110   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 111   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 112   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 113   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 114   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 115   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 116   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 117   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 118   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 119   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 120   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 121   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 122   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 123   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 124   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 125   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 126   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 127   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 128   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 129   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 130   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 131   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 132   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 133   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 134   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 135   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 136 };
 137
 138 /* Input permutation for the simple_idct_mmx */
 139 static const uint8_t simple_mmx_permutation[64]={
 140         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 141         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 142         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 143         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 144         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 145         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 146         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 147         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 148 };
 149
 150 static int pix_sum_c(uint8_t * pix, int line_size)
 151 {
 152     int s, i, j;
 153
 154     s = 0;
 155     for (i = 0; i < 16; i++) {
 156         for (j = 0; j < 16; j += 8) {
 157             s += pix[0];
 158             s += pix[1];
 159             s += pix[2];
 160             s += pix[3];
 161             s += pix[4];
 162             s += pix[5];
 163             s += pix[6];
 164             s += pix[7];
 165             pix += 8;
 166         }
 167         pix += line_size - 16;
 168     }
 169     return s;
 170 }
 171
 172 static int pix_norm1_c(uint8_t * pix, int line_size)
 173 {
 174     int s, i, j;
 175     uint32_t *sq = ff_squareTbl + 256;
 176
 177     s = 0;
 178     for (i = 0; i < 16; i++) {
 179         for (j = 0; j < 16; j += 8) {
 180 #if 0
 181             s += sq[pix[0]];
 182             s += sq[pix[1]];
 183             s += sq[pix[2]];
 184             s += sq[pix[3]];
 185             s += sq[pix[4]];
 186             s += sq[pix[5]];
 187             s += sq[pix[6]];
 188             s += sq[pix[7]];
 189 #else
 190 #if LONG_MAX > 2147483647
 191             register uint64_t x=*(uint64_t*)pix;
 192             s += sq[x&0xff];
 193             s += sq[(x>>8)&0xff];
 194             s += sq[(x>>16)&0xff];
 195             s += sq[(x>>24)&0xff];
 196             s += sq[(x>>32)&0xff];
 197             s += sq[(x>>40)&0xff];
 198             s += sq[(x>>48)&0xff];
 199             s += sq[(x>>56)&0xff];
 200 #else
 201             register uint32_t x=*(uint32_t*)pix;
 202             s += sq[x&0xff];
 203             s += sq[(x>>8)&0xff];
 204             s += sq[(x>>16)&0xff];
 205             s += sq[(x>>24)&0xff];
 206             x=*(uint32_t*)(pix+4);
 207             s += sq[x&0xff];
 208             s += sq[(x>>8)&0xff];
 209             s += sq[(x>>16)&0xff];
 210             s += sq[(x>>24)&0xff];
 211 #endif
 212 #endif
 213             pix += 8;
 214         }
 215         pix += line_size - 16;
 216     }
 217     return s;
 218 }
 219
 220 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 221     int i;
 222
 223     for(i=0; i+8<=w; i+=8){
 224         dst[i+0]= bswap_32(src[i+0]);
 225         dst[i+1]= bswap_32(src[i+1]);
 226         dst[i+2]= bswap_32(src[i+2]);
 227         dst[i+3]= bswap_32(src[i+3]);
 228         dst[i+4]= bswap_32(src[i+4]);
 229         dst[i+5]= bswap_32(src[i+5]);
 230         dst[i+6]= bswap_32(src[i+6]);
 231         dst[i+7]= bswap_32(src[i+7]);
 232     }
 233     for(;i<w; i++){
 234         dst[i+0]= bswap_32(src[i+0]);
 235     }
 236 }
 237
 238 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 239 {
 240     int s, i;
 241     uint32_t *sq = ff_squareTbl + 256;
 242
 243     s = 0;
 244     for (i = 0; i < h; i++) {
 245         s += sq[pix1[0] - pix2[0]];
 246         s += sq[pix1[1] - pix2[1]];
 247         s += sq[pix1[2] - pix2[2]];
 248         s += sq[pix1[3] - pix2[3]];
 249         pix1 += line_size;
 250         pix2 += line_size;
 251     }
 252     return s;
 253 }
 254
 255 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 256 {
 257     int s, i;
 258     uint32_t *sq = ff_squareTbl + 256;
 259
 260     s = 0;
 261     for (i = 0; i < h; i++) {
 262         s += sq[pix1[0] - pix2[0]];
 263         s += sq[pix1[1] - pix2[1]];
 264         s += sq[pix1[2] - pix2[2]];
 265         s += sq[pix1[3] - pix2[3]];
 266         s += sq[pix1[4] - pix2[4]];
 267         s += sq[pix1[5] - pix2[5]];
 268         s += sq[pix1[6] - pix2[6]];
 269         s += sq[pix1[7] - pix2[7]];
 270         pix1 += line_size;
 271         pix2 += line_size;
 272     }
 273     return s;
 274 }
 275
 276 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 277 {
 278     int s, i;
 279     uint32_t *sq = ff_squareTbl + 256;
 280
 281     s = 0;
 282     for (i = 0; i < h; i++) {
 283         s += sq[pix1[ 0] - pix2[ 0]];
 284         s += sq[pix1[ 1] - pix2[ 1]];
 285         s += sq[pix1[ 2] - pix2[ 2]];
 286         s += sq[pix1[ 3] - pix2[ 3]];
 287         s += sq[pix1[ 4] - pix2[ 4]];
 288         s += sq[pix1[ 5] - pix2[ 5]];
 289         s += sq[pix1[ 6] - pix2[ 6]];
 290         s += sq[pix1[ 7] - pix2[ 7]];
 291         s += sq[pix1[ 8] - pix2[ 8]];
 292         s += sq[pix1[ 9] - pix2[ 9]];
 293         s += sq[pix1[10] - pix2[10]];
 294         s += sq[pix1[11] - pix2[11]];
 295         s += sq[pix1[12] - pix2[12]];
 296         s += sq[pix1[13] - pix2[13]];
 297         s += sq[pix1[14] - pix2[14]];
 298         s += sq[pix1[15] - pix2[15]];
 299
 300         pix1 += line_size;
 301         pix2 += line_size;
 302     }
 303     return s;
 304 }
 305
 306
 307 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 308 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 309     int s, i, j;
 310     const int dec_count= w==8 ? 3 : 4;
 311     int tmp[32*32];
 312     int level, ori;
 313     static const int scale[2][2][4][4]={
 314       {
 315         {
 316             // 9/7 8x8 dec=3
 317             {268, 239, 239, 213},
 318             {  0, 224, 224, 152},
 319             {  0, 135, 135, 110},
 320         },{
 321             // 9/7 16x16 or 32x32 dec=4
 322             {344, 310, 310, 280},
 323             {  0, 320, 320, 228},
 324             {  0, 175, 175, 136},
 325             {  0, 129, 129, 102},
 326         }
 327       },{
 328         {
 329             // 5/3 8x8 dec=3
 330             {275, 245, 245, 218},
 331             {  0, 230, 230, 156},
 332             {  0, 138, 138, 113},
 333         },{
 334             // 5/3 16x16 or 32x32 dec=4
 335             {352, 317, 317, 286},
 336             {  0, 328, 328, 233},
 337             {  0, 180, 180, 140},
 338             {  0, 132, 132, 105},
 339         }
 340       }
 341     };
 342
 343     for (i = 0; i < h; i++) {
 344         for (j = 0; j < w; j+=4) {
 345             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 346             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 347             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 348             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 349         }
 350         pix1 += line_size;
 351         pix2 += line_size;
 352     }
 353
 354     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 355
 356     s=0;
 357     assert(w==h);
 358     for(level=0; level<dec_count; level++){
 359         for(ori= level ? 1 : 0; ori<4; ori++){
 360             int size= w>>(dec_count-level);
 361             int sx= (ori&1) ? size : 0;
 362             int stride= 32<<(dec_count-level);
 363             int sy= (ori&2) ? stride>>1 : 0;
 364
 365             for(i=0; i<size; i++){
 366                 for(j=0; j<size; j++){
 367                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 368                     s += FFABS(v);
 369                 }
 370             }
 371         }
 372     }
 373     assert(s>=0);
 374     return s>>9;
 375 }
 376
 377 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 378     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 379 }
 380
 381 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 382     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 383 }
 384
 385 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 386     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 387 }
 388
 389 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 390     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 391 }
 392
 393 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 394     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 395 }
 396
 397 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 398     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 399 }
 400 #endif
 401
 402 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 403 {
 404     int i;
 405
 406     /* read the pixels */
 407     for(i=0;i<8;i++) {
 408         block[0] = pixels[0];
 409         block[1] = pixels[1];
 410         block[2] = pixels[2];
 411         block[3] = pixels[3];
 412         block[4] = pixels[4];
 413         block[5] = pixels[5];
 414         block[6] = pixels[6];
 415         block[7] = pixels[7];
 416         pixels += line_size;
 417         block += 8;
 418     }
 419 }
 420
 421 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 422                           const uint8_t *s2, int stride){
 423     int i;
 424
 425     /* read the pixels */
 426     for(i=0;i<8;i++) {
 427         block[0] = s1[0] - s2[0];
 428         block[1] = s1[1] - s2[1];
 429         block[2] = s1[2] - s2[2];
 430         block[3] = s1[3] - s2[3];
 431         block[4] = s1[4] - s2[4];
 432         block[5] = s1[5] - s2[5];
 433         block[6] = s1[6] - s2[6];
 434         block[7] = s1[7] - s2[7];
 435         s1 += stride;
 436         s2 += stride;
 437         block += 8;
 438     }
 439 }
 440
 441
 442 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 443                                  int line_size)
 444 {
 445     int i;
 446     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 447
 448     /* read the pixels */
 449     for(i=0;i<8;i++) {
 450         pixels[0] = cm[block[0]];
 451         pixels[1] = cm[block[1]];
 452         pixels[2] = cm[block[2]];
 453         pixels[3] = cm[block[3]];
 454         pixels[4] = cm[block[4]];
 455         pixels[5] = cm[block[5]];
 456         pixels[6] = cm[block[6]];
 457         pixels[7] = cm[block[7]];
 458
 459         pixels += line_size;
 460         block += 8;
 461     }
 462 }
 463
 464 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 465                                  int line_size)
 466 {
 467     int i;
 468     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 469
 470     /* read the pixels */
 471     for(i=0;i<4;i++) {
 472         pixels[0] = cm[block[0]];
 473         pixels[1] = cm[block[1]];
 474         pixels[2] = cm[block[2]];
 475         pixels[3] = cm[block[3]];
 476
 477         pixels += line_size;
 478         block += 8;
 479     }
 480 }
 481
 482 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 483                                  int line_size)
 484 {
 485     int i;
 486     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 487
 488     /* read the pixels */
 489     for(i=0;i<2;i++) {
 490         pixels[0] = cm[block[0]];
 491         pixels[1] = cm[block[1]];
 492
 493         pixels += line_size;
 494         block += 8;
 495     }
 496 }
 497
 498 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 499                                         uint8_t *restrict pixels,
 500                                         int line_size)
 501 {
 502     int i, j;
 503
 504     for (i = 0; i < 8; i++) {
 505         for (j = 0; j < 8; j++) {
 506             if (*block < -128)
 507                 *pixels = 0;
 508             else if (*block > 127)
 509                 *pixels = 255;
 510             else
 511                 *pixels = (uint8_t)(*block + 128);
 512             block++;
 513             pixels++;
 514         }
 515         pixels += (line_size - 8);
 516     }
 517 }
 518
 519 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 520                           int line_size)
 521 {
 522     int i;
 523     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 524
 525     /* read the pixels */
 526     for(i=0;i<8;i++) {
 527         pixels[0] = cm[pixels[0] + block[0]];
 528         pixels[1] = cm[pixels[1] + block[1]];
 529         pixels[2] = cm[pixels[2] + block[2]];
 530         pixels[3] = cm[pixels[3] + block[3]];
 531         pixels[4] = cm[pixels[4] + block[4]];
 532         pixels[5] = cm[pixels[5] + block[5]];
 533         pixels[6] = cm[pixels[6] + block[6]];
 534         pixels[7] = cm[pixels[7] + block[7]];
 535         pixels += line_size;
 536         block += 8;
 537     }
 538 }
 539
 540 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 541                           int line_size)
 542 {
 543     int i;
 544     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 545
 546     /* read the pixels */
 547     for(i=0;i<4;i++) {
 548         pixels[0] = cm[pixels[0] + block[0]];
 549         pixels[1] = cm[pixels[1] + block[1]];
 550         pixels[2] = cm[pixels[2] + block[2]];
 551         pixels[3] = cm[pixels[3] + block[3]];
 552         pixels += line_size;
 553         block += 8;
 554     }
 555 }
 556
 557 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 558                           int line_size)
 559 {
 560     int i;
 561     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 562
 563     /* read the pixels */
 564     for(i=0;i<2;i++) {
 565         pixels[0] = cm[pixels[0] + block[0]];
 566         pixels[1] = cm[pixels[1] + block[1]];
 567         pixels += line_size;
 568         block += 8;
 569     }
 570 }
 571
 572 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 573 {
 574     int i;
 575     for(i=0;i<8;i++) {
 576         pixels[0] += block[0];
 577         pixels[1] += block[1];
 578         pixels[2] += block[2];
 579         pixels[3] += block[3];
 580         pixels[4] += block[4];
 581         pixels[5] += block[5];
 582         pixels[6] += block[6];
 583         pixels[7] += block[7];
 584         pixels += line_size;
 585         block += 8;
 586     }
 587 }
 588
 589 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 590 {
 591     int i;
 592     for(i=0;i<4;i++) {
 593         pixels[0] += block[0];
 594         pixels[1] += block[1];
 595         pixels[2] += block[2];
 596         pixels[3] += block[3];
 597         pixels += line_size;
 598         block += 4;
 599     }
 600 }
 601
 602 static int sum_abs_dctelem_c(DCTELEM *block)
 603 {
 604     int sum=0, i;
 605     for(i=0; i<64; i++)
 606         sum+= FFABS(block[i]);
 607     return sum;
 608 }
 609
 610 #if 0
 611
/*
 * Disabled (#if 0) variant of PIXOP2 that handles 8 pixels per row with a
 * single uint64_t operation.
 *
 * PIXOP2(OPNAME, OP) expands to a family of OPNAME-prefixed functions that
 * combine h rows of `pixels` into `block` through OP(dst, value):
 *   _pixels           plain copy of each row
 *   _pixels_x2/_y2    per-byte average of each pixel with its right
 *                     (pixels+1) / lower (pixels+line_size) neighbour,
 *                     computed as (a|b) - (((a^b)&0xFE..)>>1)
 *   _no_rnd_..._x2/_y2  same neighbours, computed as
 *                     (a&b) + (((a^b)&0xFE..)>>1)
 *   _pixels_xy2       combination of a 2x2 neighbourhood; the low 2 bits and
 *                     the high 6 bits of every byte are summed separately
 *                     (masks 0x03 / 0xFC) so bytes cannot carry into each
 *                     other; bias 0x02 per byte
 *   _no_rnd_pixels_xy2  same with bias 0x01 per byte
 * The xy2 loops process two rows per iteration (i += 2).
 * CALL_2X_PIXELS then builds the 16 pixel wide versions from two 8 pixel
 * wide calls.
 *
 * NOTE(review): the first function is named OPNAME ## _pixels while
 * CALL_2X_PIXELS refers to OPNAME ## _pixels_c, so this branch would
 * presumably not build if it were enabled - confirm before re-enabling.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* 64-bit "average into destination" operation for the disabled variant:
 * a becomes the per-byte average of a and b, using the same
 * (a|b) - (((a^b)&0xFE..)>>1) form as the _x2/_y2 functions above. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 752 #else // 64 bit variant
 753
 754 #define PIXOP2(OPNAME, OP) \
 755 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 756     int i;\
 757     for(i=0; i<h; i++){\
 758         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 759         pixels+=line_size;\
 760         block +=line_size;\
 761     }\
 762 }\
 763 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 764     int i;\
 765     for(i=0; i<h; i++){\
 766         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 767         pixels+=line_size;\
 768         block +=line_size;\
 769     }\
 770 }\
 771 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 772     int i;\
 773     for(i=0; i<h; i++){\
 774         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 775         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 776         pixels+=line_size;\
 777         block +=line_size;\
 778     }\
 779 }\
 780 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 781     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 782 }\
 783 \
 784 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 785                                                 int src_stride1, int src_stride2, int h){\
 786     int i;\
 787     for(i=0; i<h; i++){\
 788         uint32_t a,b;\
 789         a= AV_RN32(&src1[i*src_stride1  ]);\
 790         b= AV_RN32(&src2[i*src_stride2  ]);\
 791         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 792         a= AV_RN32(&src1[i*src_stride1+4]);\
 793         b= AV_RN32(&src2[i*src_stride2+4]);\
 794         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 795     }\
 796 }\
 797 \
 798 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 799                                                 int src_stride1, int src_stride2, int h){\
 800     int i;\
 801     for(i=0; i<h; i++){\
 802         uint32_t a,b;\
 803         a= AV_RN32(&src1[i*src_stride1  ]);\
 804         b= AV_RN32(&src2[i*src_stride2  ]);\
 805         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 806         a= AV_RN32(&src1[i*src_stride1+4]);\
 807         b= AV_RN32(&src2[i*src_stride2+4]);\
 808         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 809     }\
 810 }\
 811 \
 812 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 813                                                 int src_stride1, int src_stride2, int h){\
 814     int i;\
 815     for(i=0; i<h; i++){\
 816         uint32_t a,b;\
 817         a= AV_RN32(&src1[i*src_stride1  ]);\
 818         b= AV_RN32(&src2[i*src_stride2  ]);\
 819         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 820     }\
 821 }\
 822 \
 823 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 824                                                 int src_stride1, int src_stride2, int h){\
 825     int i;\
 826     for(i=0; i<h; i++){\
 827         uint32_t a,b;\
 828         a= AV_RN16(&src1[i*src_stride1  ]);\
 829         b= AV_RN16(&src2[i*src_stride2  ]);\
 830         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 831     }\
 832 }\
 833 \
 834 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 835                                                 int src_stride1, int src_stride2, int h){\
 836     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 837     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 838 }\
 839 \
 840 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 841                                                 int src_stride1, int src_stride2, int h){\
 842     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 843     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 844 }\
 845 \
 846 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 847     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 848 }\
 849 \
 850 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 851     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 852 }\
 853 \
 854 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 855     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 856 }\
 857 \
 858 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 859     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 860 }\
 861 \
 862 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 863                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 864     int i;\
 865     for(i=0; i<h; i++){\
 866         uint32_t a, b, c, d, l0, l1, h0, h1;\
 867         a= AV_RN32(&src1[i*src_stride1]);\
 868         b= AV_RN32(&src2[i*src_stride2]);\
 869         c= AV_RN32(&src3[i*src_stride3]);\
 870         d= AV_RN32(&src4[i*src_stride4]);\
 871         l0=  (a&0x03030303UL)\
 872            + (b&0x03030303UL)\
 873            + 0x02020202UL;\
 874         h0= ((a&0xFCFCFCFCUL)>>2)\
 875           + ((b&0xFCFCFCFCUL)>>2);\
 876         l1=  (c&0x03030303UL)\
 877            + (d&0x03030303UL);\
 878         h1= ((c&0xFCFCFCFCUL)>>2)\
 879           + ((d&0xFCFCFCFCUL)>>2);\
 880         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 881         a= AV_RN32(&src1[i*src_stride1+4]);\
 882         b= AV_RN32(&src2[i*src_stride2+4]);\
 883         c= AV_RN32(&src3[i*src_stride3+4]);\
 884         d= AV_RN32(&src4[i*src_stride4+4]);\
 885         l0=  (a&0x03030303UL)\
 886            + (b&0x03030303UL)\
 887            + 0x02020202UL;\
 888         h0= ((a&0xFCFCFCFCUL)>>2)\
 889           + ((b&0xFCFCFCFCUL)>>2);\
 890         l1=  (c&0x03030303UL)\
 891            + (d&0x03030303UL);\
 892         h1= ((c&0xFCFCFCFCUL)>>2)\
 893           + ((d&0xFCFCFCFCUL)>>2);\
 894         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 895     }\
 896 }\
 897 \
 898 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 899     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 900 }\
 901 \
 902 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 903     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 904 }\
 905 \
 906 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 907     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 908 }\
 909 \
 910 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 911     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 912 }\
 913 \
 914 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 915                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 916     int i;\
 917     for(i=0; i<h; i++){\
 918         uint32_t a, b, c, d, l0, l1, h0, h1;\
 919         a= AV_RN32(&src1[i*src_stride1]);\
 920         b= AV_RN32(&src2[i*src_stride2]);\
 921         c= AV_RN32(&src3[i*src_stride3]);\
 922         d= AV_RN32(&src4[i*src_stride4]);\
 923         l0=  (a&0x03030303UL)\
 924            + (b&0x03030303UL)\
 925            + 0x01010101UL;\
 926         h0= ((a&0xFCFCFCFCUL)>>2)\
 927           + ((b&0xFCFCFCFCUL)>>2);\
 928         l1=  (c&0x03030303UL)\
 929            + (d&0x03030303UL);\
 930         h1= ((c&0xFCFCFCFCUL)>>2)\
 931           + ((d&0xFCFCFCFCUL)>>2);\
 932         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 933         a= AV_RN32(&src1[i*src_stride1+4]);\
 934         b= AV_RN32(&src2[i*src_stride2+4]);\
 935         c= AV_RN32(&src3[i*src_stride3+4]);\
 936         d= AV_RN32(&src4[i*src_stride4+4]);\
 937         l0=  (a&0x03030303UL)\
 938            + (b&0x03030303UL)\
 939            + 0x01010101UL;\
 940         h0= ((a&0xFCFCFCFCUL)>>2)\
 941           + ((b&0xFCFCFCFCUL)>>2);\
 942         l1=  (c&0x03030303UL)\
 943            + (d&0x03030303UL);\
 944         h1= ((c&0xFCFCFCFCUL)>>2)\
 945           + ((d&0xFCFCFCFCUL)>>2);\
 946         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 947     }\
 948 }\
 949 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 950                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 951     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 952     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 953 }\
 954 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 955                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 956     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 957     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 958 }\
 959 \
 960 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 961 {\
 962         int i, a0, b0, a1, b1;\
 963         a0= pixels[0];\
 964         b0= pixels[1] + 2;\
 965         a0 += b0;\
 966         b0 += pixels[2];\
 967 \
 968         pixels+=line_size;\
 969         for(i=0; i<h; i+=2){\
 970             a1= pixels[0];\
 971             b1= pixels[1];\
 972             a1 += b1;\
 973             b1 += pixels[2];\
 974 \
 975             block[0]= (a1+a0)>>2; /* FIXME non put */\
 976             block[1]= (b1+b0)>>2;\
 977 \
 978             pixels+=line_size;\
 979             block +=line_size;\
 980 \
 981             a0= pixels[0];\
 982             b0= pixels[1] + 2;\
 983             a0 += b0;\
 984             b0 += pixels[2];\
 985 \
 986             block[0]= (a1+a0)>>2;\
 987             block[1]= (b1+b0)>>2;\
 988             pixels+=line_size;\
 989             block +=line_size;\
 990         }\
 991 }\
 992 \
 993 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 994 {\
 995         int i;\
 996         const uint32_t a= AV_RN32(pixels  );\
 997         const uint32_t b= AV_RN32(pixels+1);\
 998         uint32_t l0=  (a&0x03030303UL)\
 999                     + (b&0x03030303UL)\
1000                     + 0x02020202UL;\
1001         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1002                    + ((b&0xFCFCFCFCUL)>>2);\
1003         uint32_t l1,h1;\
1004 \
1005         pixels+=line_size;\
1006         for(i=0; i<h; i+=2){\
1007             uint32_t a= AV_RN32(pixels  );\
1008             uint32_t b= AV_RN32(pixels+1);\
1009             l1=  (a&0x03030303UL)\
1010                + (b&0x03030303UL);\
1011             h1= ((a&0xFCFCFCFCUL)>>2)\
1012               + ((b&0xFCFCFCFCUL)>>2);\
1013             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1014             pixels+=line_size;\
1015             block +=line_size;\
1016             a= AV_RN32(pixels  );\
1017             b= AV_RN32(pixels+1);\
1018             l0=  (a&0x03030303UL)\
1019                + (b&0x03030303UL)\
1020                + 0x02020202UL;\
1021             h0= ((a&0xFCFCFCFCUL)>>2)\
1022               + ((b&0xFCFCFCFCUL)>>2);\
1023             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1024             pixels+=line_size;\
1025             block +=line_size;\
1026         }\
1027 }\
1028 \
1029 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1030 {\
1031     int j;\
1032     for(j=0; j<2; j++){\
1033         int i;\
1034         const uint32_t a= AV_RN32(pixels  );\
1035         const uint32_t b= AV_RN32(pixels+1);\
1036         uint32_t l0=  (a&0x03030303UL)\
1037                     + (b&0x03030303UL)\
1038                     + 0x02020202UL;\
1039         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1040                    + ((b&0xFCFCFCFCUL)>>2);\
1041         uint32_t l1,h1;\
1042 \
1043         pixels+=line_size;\
1044         for(i=0; i<h; i+=2){\
1045             uint32_t a= AV_RN32(pixels  );\
1046             uint32_t b= AV_RN32(pixels+1);\
1047             l1=  (a&0x03030303UL)\
1048                + (b&0x03030303UL);\
1049             h1= ((a&0xFCFCFCFCUL)>>2)\
1050               + ((b&0xFCFCFCFCUL)>>2);\
1051             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1052             pixels+=line_size;\
1053             block +=line_size;\
1054             a= AV_RN32(pixels  );\
1055             b= AV_RN32(pixels+1);\
1056             l0=  (a&0x03030303UL)\
1057                + (b&0x03030303UL)\
1058                + 0x02020202UL;\
1059             h0= ((a&0xFCFCFCFCUL)>>2)\
1060               + ((b&0xFCFCFCFCUL)>>2);\
1061             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1062             pixels+=line_size;\
1063             block +=line_size;\
1064         }\
1065         pixels+=4-line_size*(h+1);\
1066         block +=4-line_size*h;\
1067     }\
1068 }\
1069 \
1070 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1071 {\
1072     int j;\
1073     for(j=0; j<2; j++){\
1074         int i;\
1075         const uint32_t a= AV_RN32(pixels  );\
1076         const uint32_t b= AV_RN32(pixels+1);\
1077         uint32_t l0=  (a&0x03030303UL)\
1078                     + (b&0x03030303UL)\
1079                     + 0x01010101UL;\
1080         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1081                    + ((b&0xFCFCFCFCUL)>>2);\
1082         uint32_t l1,h1;\
1083 \
1084         pixels+=line_size;\
1085         for(i=0; i<h; i+=2){\
1086             uint32_t a= AV_RN32(pixels  );\
1087             uint32_t b= AV_RN32(pixels+1);\
1088             l1=  (a&0x03030303UL)\
1089                + (b&0x03030303UL);\
1090             h1= ((a&0xFCFCFCFCUL)>>2)\
1091               + ((b&0xFCFCFCFCUL)>>2);\
1092             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1093             pixels+=line_size;\
1094             block +=line_size;\
1095             a= AV_RN32(pixels  );\
1096             b= AV_RN32(pixels+1);\
1097             l0=  (a&0x03030303UL)\
1098                + (b&0x03030303UL)\
1099                + 0x01010101UL;\
1100             h0= ((a&0xFCFCFCFCUL)>>2)\
1101               + ((b&0xFCFCFCFCUL)>>2);\
1102             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1103             pixels+=line_size;\
1104             block +=line_size;\
1105         }\
1106         pixels+=4-line_size*(h+1);\
1107         block +=4-line_size*h;\
1108     }\
1109 }\
1110 \
1111 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1112 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1113 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1114 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1115 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1116 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1117 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1118 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1119
1120 #define op_avg(a, b) a = rnd_avg32(a, b)
1121 #endif
1122 #define op_put(a, b) a = b
1123
1124 PIXOP2(avg, op_avg)
1125 PIXOP2(put, op_put)
1126 #undef op_avg
1127 #undef op_put
1128
/* Rounded average of two values: (a + b + 1) >> 1.
 * Every argument is parenthesized so that operands containing operators of
 * lower precedence than '+' (e.g. `x & 3`) expand as intended. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded average of four values: (a + b + c + d + 2) >> 2. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1131
/**
 * Single-stride entry point for put_no_rnd_pixels16_l2(): the one stride
 * argument is passed on as the destination stride and as the stride of
 * both source blocks.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1135
/**
 * Single-stride entry point for put_no_rnd_pixels8_l2(): the one stride
 * argument is passed on as the destination stride and as the stride of
 * both source blocks.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1139
/**
 * Bilinear interpolation of an 8-pixel-wide block with one fixed
 * fractional offset.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; 9 bytes are read from each of rows i and i+1
 * @param stride  line size shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbour, in 1/16 units
 * @param y16     vertical weight of the row below, in 1/16 units
 * @param rounder constant added to the weighted sum before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* The four weights always add up to 16*16 = 256, hence the final >>8. */
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr =       x16  * (16 - y16);
    const int w_bl = (16 - x16) *       y16;
    const int w_br =       x16  *       y16;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1162
/**
 * Global motion compensation of an 8-pixel-wide block using an affine
 * source position that is re-evaluated for every output pixel.
 *
 * The running position (vx, vy) is shifted right by 16; the low `shift`
 * bits of the result form the bilinear fraction and the remaining bits the
 * integer source coordinate.
 *
 * @param dst     destination block, 8 bytes written per row
 * @param src     source picture
 * @param stride  line size shared by dst and src
 * @param h       number of rows to produce
 * @param ox,oy   position used for the first pixel of the first row
 * @param dxx,dyx increment applied to (vx, vy) for every column
 * @param dxy,dyy increment applied to (ox, oy) for every row
 * @param shift   number of fractional bits kept after the >>16
 * @param r       constant added before the final >>(2*shift)
 * @param width   coordinates with 0 <= x < width-1 are interpolated
 *                horizontally; others are clamped to [0, width-1]
 * @param height  same as width, for the vertical direction
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1<<shift;
    const int frac_mask = s-1;
    const int last_x    = width -1;
    const int last_y    = height-1;
    int row;

    for(row=0; row<h; row++){
        uint8_t *out= dst + row*stride;
        int vx= ox;
        int vy= oy;
        int col;

        for(col=0; col<8; col++){ //XXX FIXME optimize
            int src_x= vx>>16;
            int src_y= vy>>16;
            const int frac_x= src_x & frac_mask;
            const int frac_y= src_y & frac_mask;
            int x_inside, y_inside, index;

            src_x>>=shift;
            src_y>>=shift;

            /* The unsigned compare rejects negative coordinates as well. */
            x_inside= (unsigned)src_x < last_x;
            y_inside= (unsigned)src_y < last_y;

            if(x_inside && y_inside){
                /* Fully inside: 2-D bilinear interpolation. */
                int top, bottom;
                index = src_x + src_y*stride;
                top   = src[index         ]*(s-frac_x)
                      + src[index       +1]*   frac_x;
                bottom= src[index+stride  ]*(s-frac_x)
                      + src[index+stride+1]*   frac_x;
                out[col]= (top*(s-frac_y) + bottom*frac_y + r)>>(shift*2);
            }else if(x_inside){
                /* Row clamped: interpolate horizontally only. */
                index= src_x + av_clip(src_y, 0, last_y)*stride;
                out[col]= ( (  src[index         ]*(s-frac_x)
                             + src[index       +1]*   frac_x )*s
                           + r)>>(shift*2);
            }else if(y_inside){
                /* Column clamped: interpolate vertically only. */
                index= av_clip(src_x, 0, last_x) + src_y*stride;
                out[col]= ( (  src[index         ]*(s-frac_y)
                             + src[index+stride  ]*   frac_y )*s
                           + r)>>(shift*2);
            }else{
                /* Both clamped: copy the nearest picture sample. */
                index= av_clip(src_x, 0, last_x) + av_clip(src_y, 0, last_y)*stride;
                out[col]= src[index];
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1220
/**
 * tpel position (0,0): forwards to the put_pixelsN_c routine whose N
 * matches width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_c(dst, src, stride, height);
}
1229
/**
 * tpel position (1,0): horizontal blend with weights 2:1 between a pixel
 * and its right neighbour. The division by 3 is done as *683 >> 11
 * (683/2048 ~= 1/3).
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1240
/**
 * tpel position (2,0): horizontal blend with weights 1:2 between a pixel
 * and its right neighbour. The division by 3 is done as *683 >> 11
 * (683/2048 ~= 1/3).
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1251
/**
 * tpel position (0,1): vertical blend with weights 2:1 between a pixel and
 * the pixel one line below. The division by 3 is done as *683 >> 11
 * (683/2048 ~= 1/3).
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1262
/**
 * tpel position (1,1): 2x2 blend with weights 4,3 (current line) and
 * 3,2 (line below). The weights sum to 12; the division by 12 is done as
 * *2731 >> 15 (2731/32768 ~= 1/12) after adding 6 for rounding.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1273
/**
 * tpel position (1,2): 2x2 blend with weights 3,2 (current line) and
 * 4,3 (line below). The weights sum to 12; the division by 12 is done as
 * *2731 >> 15 (2731/32768 ~= 1/12) after adding 6 for rounding.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1284
/**
 * tpel position (0,2): vertical blend with weights 1:2 between a pixel and
 * the pixel one line below. The division by 3 is done as *683 >> 11
 * (683/2048 ~= 1/3).
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1295
/**
 * tpel position (2,1): 2x2 blend with weights 3,4 (current line) and
 * 2,3 (line below). The weights sum to 12; the division by 12 is done as
 * *2731 >> 15 (2731/32768 ~= 1/12) after adding 6 for rounding.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1306
/**
 * tpel position (2,2): 2x2 blend with weights 2,3 (current line) and
 * 3,4 (line below). The weights sum to 12; the division by 12 is done as
 * *2731 >> 15 (2731/32768 ~= 1/12) after adding 6 for rounding.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
        src  = below;
        dst += stride;
    }
}
1317
/**
 * tpel position (0,0), averaging variant: forwards to the avg_pixelsN_c
 * routine whose N matches width. Widths other than 2, 4, 8 and 16 do
 * nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_c(dst, src, stride, height);
}
1326
/**
 * tpel position (1,0), averaging variant: computes the 2:1 horizontal
 * blend (division by 3 as *683 >> 11) and stores its rounded average with
 * the value already present in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1337
/**
 * tpel position (2,0), averaging variant: computes the 1:2 horizontal
 * blend (division by 3 as *683 >> 11) and stores its rounded average with
 * the value already present in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1348
/**
 * tpel position (0,1), averaging variant: computes the 2:1 vertical blend
 * (division by 3 as *683 >> 11) and stores its rounded average with the
 * value already present in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1359
/**
 * tpel position (1,1), averaging variant: computes the 2x2 blend with
 * weights 4,3 / 3,2 (division by 12 as *2731 >> 15) and stores its rounded
 * average with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        for (col = 0; col < width; col++) {
            const int pred = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1370
/**
 * tpel position (1,2), averaging variant: computes the 2x2 blend with
 * weights 3,2 / 4,3 (division by 12 as *2731 >> 15) and stores its rounded
 * average with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1381
/**
 * tpel position (0,2), averaging variant: computes the 1:2 vertical blend
 * (division by 3 as *683 >> 11) and stores its rounded average with the
 * value already present in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1392
/**
 * tpel position (2,1), averaging variant: computes the 2x2 blend with
 * weights 3,4 / 2,3 (division by 12 as *2731 >> 15) and stores its rounded
 * average with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        for (col = 0; col < width; col++) {
            const int pred = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
1403
/**
 * tpel position (2,2), averaging variant: computes the 2x2 blend with
 * weights 2,3 / 3,4 (division by 12 as *2731 >> 15) and stores its rounded
 * average with the value already present in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;
        for (col = 0; col < width; col++) {
            const int pred = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        src  = below;
        dst += stride;
    }
}
#if 0
/* Disabled code. TPEL_WIDTH(width) would generate fixed-width wrappers
 * (put_tpel_pixels<width>_mcXY_c) around the generic put_tpel_pixels_mcXY_c
 * functions, passing `width` as a constant. Each wrapper body is a plain
 * call; the stray `void` that used to precede every call turned it into an
 * invalid declaration and has been removed. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1435
/*
 * Chroma motion compensation with 1/8-pel bilinear weights.
 *
 * H264_CHROMA_MC(OPNAME, OP) generates OPNAME##h264_chroma_mc{2,4,8}_c for
 * block widths 2, 4 and 8. x and y (each 0..7, asserted) select the weights
 *     (8-x)(8-y), x(8-y), (8-x)y, xy
 * which always sum to 64. OP(dst, v) receives the unnormalized weighted sum
 * v and is responsible for the +32 >> 6 normalization and for storing.
 *
 * When x*y == 0 at most one neighbour contributes, so a 2-tap filter is
 * used: the second tap is one line below if y != 0, otherwise one pixel to
 * the right (whose weight is 0 when x == 0 as well).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w11){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<2; col++){\
                OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int w1d= w01+w10;\
        const int step= w10 ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<2; col++){\
                OP(dst[col], (w00*src[col] + w1d*src[step+col]));\
            }\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w11){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<4; col++){\
                OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int w1d= w01+w10;\
        const int step= w10 ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<4; col++){\
                OP(dst[col], (w00*src[col] + w1d*src[step+col]));\
            }\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w00=(8-x)*(8-y);\
    const int w01=(  x)*(8-y);\
    const int w10=(8-x)*(  y);\
    const int w11=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(w11){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<8; col++){\
                OP(dst[col], (w00*src[col] + w01*src[col+1] + w10*src[stride+col] + w11*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int w1d= w01+w10;\
        const int step= w10 ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<8; col++){\
                OP(dst[col], (w00*src[col] + w1d*src[step+col]));\
            }\
        }\
    }\
}

/* op_put stores the normalized sum; op_avg stores its rounded average with
 * the value already in the destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1544
/**
 * 8-pixel-wide chroma motion compensation with 1/8-pel bilinear weights
 * and a reduced rounding constant (32 - 4 instead of 32) before the >>6.
 *
 * @param dst    destination, 8 bytes written per row
 * @param src    source; 9 bytes are read from each of rows i and i+1
 * @param stride line size shared by dst and src
 * @param h      number of rows to produce
 * @param x,y    fractional offsets in 1/8 units, each in 0..7 (asserted)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w00 = (8-x)*(8-y);
    const int w01 = (  x)*(8-y);
    const int w10 = (8-x)*(  y);
    const int w11 = (  x)*(  y);
    const int bias = 32 - 4;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w00*src[col]        + w01*src[col+1] +
                        w10*src[stride+col] + w11*src[stride+col+1] + bias) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1568
/**
 * QPEL_MC(r, OPNAME, RND, OP): expands to one complete family of C
 * mpeg4_qpel lowpass filters and qpel8/qpel16 mcXY entry points.
 *
 *   r       not referenced anywhere in the macro body.
 *   OPNAME  name prefix of every generated function, and of the
 *           pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers that perform the
 *           final store into dst.
 *   RND     pasted between "put" and a helper name; selects the helper
 *           variant used to build the intermediate half/halfH/halfV/halfHV
 *           planes.
 *   OP      two-argument store macro, OP(dst_pixel, filter_sum). The local
 *           `cm` declared in each lowpass function is there for OP to use
 *           (the op_* macros passed in by this file index it).
 *
 * Generated code:
 *   - OPNAME mpeg4_qpel{8,16}_h_lowpass: for each of h rows, N outputs from
 *     the N+1 samples src[0]..src[N]; taps 20, -6, 3, -1 applied to
 *     symmetric sample pairs, with indices folded back at both ends so
 *     nothing outside src[0]..src[N] is read.
 *   - OPNAME mpeg4_qpel{8,16}_v_lowpass: the same kernel applied down each
 *     of N columns, reading N+1 rows.
 *   - OPNAME qpel{8,16}_mcXY_c: static wrappers that combine the source
 *     block with lowpass planes through the pixelsN_l2 helpers.
 *     NOTE(review): X and Y presumably are the horizontal/vertical
 *     quarter-sample offsets (0..3) -- confirm against the callers.
 *   - ff_ OPNAME qpel{8,16}_mcXY_old_c: non-static variants that compute
 *     halfH, halfV and halfHV separately and blend them with pixelsN_l4
 *     (or pixelsN_l2 for mc12/mc32).
 *
 * The wrappers stage the source through copy_block9 / copy_block17 into a
 * local buffer with row stride 16 / 24 (presumably a 9x9 / 17x17 copy --
 * the helpers are defined elsewhere).
 */
1569 #define QPEL_MC(r, OPNAME, RND, OP) \
     /* Horizontal kernel, 8 outputs per row from src[0..8], h rows. */\
1570 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1571     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1572     int i;\
1573     for(i=0; i<h; i++)\
1574     {\
         /* dst[0..2] fold indices below 0 back into the row; dst[5..7] fold indices above 8 back. */\
1575         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1576         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1577         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1578         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1579         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1580         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1581         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1582         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1583         dst+=dstStride;\
1584         src+=srcStride;\
1585     }\
1586 }\
1587 \
     /* Vertical kernel, 8 columns; each column reads 9 rows and writes 8. */\
1588 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1589     const int w=8;\
1590     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1591     int i;\
1592     for(i=0; i<w; i++)\
1593     {\
1594         const int src0= src[0*srcStride];\
1595         const int src1= src[1*srcStride];\
1596         const int src2= src[2*srcStride];\
1597         const int src3= src[3*srcStride];\
1598         const int src4= src[4*srcStride];\
1599         const int src5= src[5*srcStride];\
1600         const int src6= src[6*srcStride];\
1601         const int src7= src[7*srcStride];\
1602         const int src8= src[8*srcStride];\
1603         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1604         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1605         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1606         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1607         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1608         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1609         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1610         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1611         dst++;\
1612         src++;\
1613     }\
1614 }\
1615 \
     /* Horizontal kernel, 16 outputs per row from src[0..16], h rows. */\
1616 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1617     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1618     int i;\
1619     \
1620     for(i=0; i<h; i++)\
1621     {\
1622         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1623         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1624         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1625         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1626         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1627         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1628         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1629         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1630         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1631         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1632         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1633         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1634         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1635         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1636         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1637         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1638         dst+=dstStride;\
1639         src+=srcStride;\
1640     }\
1641 }\
1642 \
     /* Vertical kernel, 16 columns; each column reads 17 rows and writes 16. */\
1643 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1644     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1645     int i;\
1646     const int w=16;\
1647     for(i=0; i<w; i++)\
1648     {\
1649         const int src0= src[0*srcStride];\
1650         const int src1= src[1*srcStride];\
1651         const int src2= src[2*srcStride];\
1652         const int src3= src[3*srcStride];\
1653         const int src4= src[4*srcStride];\
1654         const int src5= src[5*srcStride];\
1655         const int src6= src[6*srcStride];\
1656         const int src7= src[7*srcStride];\
1657         const int src8= src[8*srcStride];\
1658         const int src9= src[9*srcStride];\
1659         const int src10= src[10*srcStride];\
1660         const int src11= src[11*srcStride];\
1661         const int src12= src[12*srcStride];\
1662         const int src13= src[13*srcStride];\
1663         const int src14= src[14*srcStride];\
1664         const int src15= src[15*srcStride];\
1665         const int src16= src[16*srcStride];\
1666         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1667         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1668         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1669         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1670         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1671         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1672         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1673         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1674         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1675         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1676         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1677         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1678         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1679         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1680         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1681         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1682         dst++;\
1683         src++;\
1684     }\
1685 }\
1686 \
     /* ---- 8x8 entry points. mc00 forwards to pixels8_c; the others mix src with lowpass planes. ---- */\
1687 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1688     OPNAME ## pixels8_c(dst, src, stride, 8);\
1689 }\
1690 \
1691 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1692     uint8_t half[64];\
1693     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1694     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1695 }\
1696 \
1697 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1698     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1699 }\
1700 \
1701 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1702     uint8_t half[64];\
1703     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1704     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1705 }\
1706 \
1707 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1708     uint8_t full[16*9];\
1709     uint8_t half[64];\
1710     copy_block9(full, src, 16, stride, 9);\
1711     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1712     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1713 }\
1714 \
1715 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1716     uint8_t full[16*9];\
1717     copy_block9(full, src, 16, stride, 9);\
1718     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1719 }\
1720 \
1721 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1722     uint8_t full[16*9];\
1723     uint8_t half[64];\
1724     copy_block9(full, src, 16, stride, 9);\
1725     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1726     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1727 }\
     /* From here on each position has an exported ff_..._old_c variant followed by the static variant. */\
1728 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1729     uint8_t full[16*9];\
1730     uint8_t halfH[72];\
1731     uint8_t halfV[64];\
1732     uint8_t halfHV[64];\
1733     copy_block9(full, src, 16, stride, 9);\
1734     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1735     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1736     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1737     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1738 }\
1739 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1740     uint8_t full[16*9];\
1741     uint8_t halfH[72];\
1742     uint8_t halfHV[64];\
1743     copy_block9(full, src, 16, stride, 9);\
1744     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1745     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1746     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1747     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1748 }\
1749 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750     uint8_t full[16*9];\
1751     uint8_t halfH[72];\
1752     uint8_t halfV[64];\
1753     uint8_t halfHV[64];\
1754     copy_block9(full, src, 16, stride, 9);\
1755     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1756     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1757     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1758     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1759 }\
1760 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1761     uint8_t full[16*9];\
1762     uint8_t halfH[72];\
1763     uint8_t halfHV[64];\
1764     copy_block9(full, src, 16, stride, 9);\
1765     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1766     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1767     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1768     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1769 }\
1770 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1771     uint8_t full[16*9];\
1772     uint8_t halfH[72];\
1773     uint8_t halfV[64];\
1774     uint8_t halfHV[64];\
1775     copy_block9(full, src, 16, stride, 9);\
1776     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1777     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1778     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1779     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1780 }\
1781 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1782     uint8_t full[16*9];\
1783     uint8_t halfH[72];\
1784     uint8_t halfHV[64];\
1785     copy_block9(full, src, 16, stride, 9);\
1786     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1787     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1788     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1789     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1790 }\
1791 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1792     uint8_t full[16*9];\
1793     uint8_t halfH[72];\
1794     uint8_t halfV[64];\
1795     uint8_t halfHV[64];\
1796     copy_block9(full, src, 16, stride, 9);\
1797     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1798     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1799     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1800     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1801 }\
1802 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1803     uint8_t full[16*9];\
1804     uint8_t halfH[72];\
1805     uint8_t halfHV[64];\
1806     copy_block9(full, src, 16, stride, 9);\
1807     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1809     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1811 }\
1812 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1813     uint8_t halfH[72];\
1814     uint8_t halfHV[64];\
1815     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1816     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1817     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1818 }\
1819 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1820     uint8_t halfH[72];\
1821     uint8_t halfHV[64];\
1822     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1824     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1825 }\
1826 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1827     uint8_t full[16*9];\
1828     uint8_t halfH[72];\
1829     uint8_t halfV[64];\
1830     uint8_t halfHV[64];\
1831     copy_block9(full, src, 16, stride, 9);\
1832     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1833     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1834     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1835     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1836 }\
1837 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1838     uint8_t full[16*9];\
1839     uint8_t halfH[72];\
1840     copy_block9(full, src, 16, stride, 9);\
1841     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1842     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1843     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1844 }\
1845 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1846     uint8_t full[16*9];\
1847     uint8_t halfH[72];\
1848     uint8_t halfV[64];\
1849     uint8_t halfHV[64];\
1850     copy_block9(full, src, 16, stride, 9);\
1851     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1852     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1853     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1854     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1855 }\
1856 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1857     uint8_t full[16*9];\
1858     uint8_t halfH[72];\
1859     copy_block9(full, src, 16, stride, 9);\
1860     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1861     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1862     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1863 }\
1864 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1865     uint8_t halfH[72];\
1866     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1867     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1868 }\
     /* ---- 16x16 entry points: same structure as the 8x8 set, with 24-stride staging buffers of 17 rows. ---- */\
1869 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1870     OPNAME ## pixels16_c(dst, src, stride, 16);\
1871 }\
1872 \
1873 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1874     uint8_t half[256];\
1875     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1876     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1877 }\
1878 \
1879 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1880     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1881 }\
1882 \
1883 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1884     uint8_t half[256];\
1885     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1886     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1887 }\
1888 \
1889 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1890     uint8_t full[24*17];\
1891     uint8_t half[256];\
1892     copy_block17(full, src, 24, stride, 17);\
1893     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1894     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1895 }\
1896 \
1897 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1898     uint8_t full[24*17];\
1899     copy_block17(full, src, 24, stride, 17);\
1900     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1901 }\
1902 \
1903 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1904     uint8_t full[24*17];\
1905     uint8_t half[256];\
1906     copy_block17(full, src, 24, stride, 17);\
1907     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1908     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1909 }\
1910 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1911     uint8_t full[24*17];\
1912     uint8_t halfH[272];\
1913     uint8_t halfV[256];\
1914     uint8_t halfHV[256];\
1915     copy_block17(full, src, 24, stride, 17);\
1916     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1917     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1918     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1919     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1920 }\
1921 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1922     uint8_t full[24*17];\
1923     uint8_t halfH[272];\
1924     uint8_t halfHV[256];\
1925     copy_block17(full, src, 24, stride, 17);\
1926     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1927     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1928     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1929     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1930 }\
1931 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1932     uint8_t full[24*17];\
1933     uint8_t halfH[272];\
1934     uint8_t halfV[256];\
1935     uint8_t halfHV[256];\
1936     copy_block17(full, src, 24, stride, 17);\
1937     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1938     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1939     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1940     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1941 }\
1942 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1943     uint8_t full[24*17];\
1944     uint8_t halfH[272];\
1945     uint8_t halfHV[256];\
1946     copy_block17(full, src, 24, stride, 17);\
1947     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1948     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1949     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1950     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1951 }\
1952 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1953     uint8_t full[24*17];\
1954     uint8_t halfH[272];\
1955     uint8_t halfV[256];\
1956     uint8_t halfHV[256];\
1957     copy_block17(full, src, 24, stride, 17);\
1958     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1959     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1960     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1961     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1962 }\
1963 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1964     uint8_t full[24*17];\
1965     uint8_t halfH[272];\
1966     uint8_t halfHV[256];\
1967     copy_block17(full, src, 24, stride, 17);\
1968     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1969     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1970     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1971     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1972 }\
1973 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1974     uint8_t full[24*17];\
1975     uint8_t halfH[272];\
1976     uint8_t halfV[256];\
1977     uint8_t halfHV[256];\
1978     copy_block17(full, src, 24, stride, 17);\
1979     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1980     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1981     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1982     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1983 }\
1984 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1985     uint8_t full[24*17];\
1986     uint8_t halfH[272];\
1987     uint8_t halfHV[256];\
1988     copy_block17(full, src, 24, stride, 17);\
1989     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1991     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1993 }\
1994 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1995     uint8_t halfH[272];\
1996     uint8_t halfHV[256];\
1997     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1998     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1999     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2000 }\
2001 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2002     uint8_t halfH[272];\
2003     uint8_t halfHV[256];\
2004     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2006     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2007 }\
2008 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2009     uint8_t full[24*17];\
2010     uint8_t halfH[272];\
2011     uint8_t halfV[256];\
2012     uint8_t halfHV[256];\
2013     copy_block17(full, src, 24, stride, 17);\
2014     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2015     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2016     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2017     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2018 }\
2019 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2020     uint8_t full[24*17];\
2021     uint8_t halfH[272];\
2022     copy_block17(full, src, 24, stride, 17);\
2023     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2024     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2025     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2026 }\
2027 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2028     uint8_t full[24*17];\
2029     uint8_t halfH[272];\
2030     uint8_t halfV[256];\
2031     uint8_t halfHV[256];\
2032     copy_block17(full, src, 24, stride, 17);\
2033     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2034     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2035     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2036     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2037 }\
2038 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2039     uint8_t full[24*17];\
2040     uint8_t halfH[272];\
2041     copy_block17(full, src, 24, stride, 17);\
2042     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2043     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2044     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2045 }\
2046 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2047     uint8_t halfH[272];\
2048     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2049     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2050 }
2051
2052 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2053 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2054 #define op_put(a, b) a = cm[((b) + 16)>>5]
2055 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2056
2057 QPEL_MC(0, put_       , _       , op_put)
2058 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2059 QPEL_MC(0, avg_       , _       , op_avg)
2060 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2061 #undef op_avg
2062 #undef op_avg_no_rnd
2063 #undef op_put
2064 #undef op_put_no_rnd
2065
2066 #if 1
2067 #define H264_LOWPASS(OPNAME, OP, OP2) \
2068 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2069     const int h=2;\
2070     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2071     int i;\
2072     for(i=0; i<h; i++)\
2073     {\
2074         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2075         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2076         dst+=dstStride;\
2077         src+=srcStride;\
2078     }\
2079 }\
2080 \
2081 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2082     const int w=2;\
2083     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2084     int i;\
2085     for(i=0; i<w; i++)\
2086     {\
2087         const int srcB= src[-2*srcStride];\
2088         const int srcA= src[-1*srcStride];\
2089         const int src0= src[0 *srcStride];\
2090         const int src1= src[1 *srcStride];\
2091         const int src2= src[2 *srcStride];\
2092         const int src3= src[3 *srcStride];\
2093         const int src4= src[4 *srcStride];\
2094         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2095         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2096         dst++;\
2097         src++;\
2098     }\
2099 }\
2100 \
2101 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2102     const int h=2;\
2103     const int w=2;\
2104     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2105     int i;\
2106     src -= 2*srcStride;\
2107     for(i=0; i<h+5; i++)\
2108     {\
2109         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2110         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2111         tmp+=tmpStride;\
2112         src+=srcStride;\
2113     }\
2114     tmp -= tmpStride*(h+5-2);\
2115     for(i=0; i<w; i++)\
2116     {\
2117         const int tmpB= tmp[-2*tmpStride];\
2118         const int tmpA= tmp[-1*tmpStride];\
2119         const int tmp0= tmp[0 *tmpStride];\
2120         const int tmp1= tmp[1 *tmpStride];\
2121         const int tmp2= tmp[2 *tmpStride];\
2122         const int tmp3= tmp[3 *tmpStride];\
2123         const int tmp4= tmp[4 *tmpStride];\
2124         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2125         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2126         dst++;\
2127         tmp++;\
2128     }\
2129 }\
2130 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2131     const int h=4;\
2132     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2133     int i;\
2134     for(i=0; i<h; i++)\
2135     {\
2136         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2137         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2138         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2139         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2140         dst+=dstStride;\
2141         src+=srcStride;\
2142     }\
2143 }\
2144 \
2145 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2146     const int w=4;\
2147     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2148     int i;\
2149     for(i=0; i<w; i++)\
2150     {\
2151         const int srcB= src[-2*srcStride];\
2152         const int srcA= src[-1*srcStride];\
2153         const int src0= src[0 *srcStride];\
2154         const int src1= src[1 *srcStride];\
2155         const int src2= src[2 *srcStride];\
2156         const int src3= src[3 *srcStride];\
2157         const int src4= src[4 *srcStride];\
2158         const int src5= src[5 *srcStride];\
2159         const int src6= src[6 *srcStride];\
2160         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2161         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2162         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2163         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2164         dst++;\
2165         src++;\
2166     }\
2167 }\
2168 \
2169 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2170     const int h=4;\
2171     const int w=4;\
2172     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2173     int i;\
2174     src -= 2*srcStride;\
2175     for(i=0; i<h+5; i++)\
2176     {\
2177         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2178         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2179         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2180         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2181         tmp+=tmpStride;\
2182         src+=srcStride;\
2183     }\
2184     tmp -= tmpStride*(h+5-2);\
2185     for(i=0; i<w; i++)\
2186     {\
2187         const int tmpB= tmp[-2*tmpStride];\
2188         const int tmpA= tmp[-1*tmpStride];\
2189         const int tmp0= tmp[0 *tmpStride];\
2190         const int tmp1= tmp[1 *tmpStride];\
2191         const int tmp2= tmp[2 *tmpStride];\
2192         const int tmp3= tmp[3 *tmpStride];\
2193         const int tmp4= tmp[4 *tmpStride];\
2194         const int tmp5= tmp[5 *tmpStride];\
2195         const int tmp6= tmp[6 *tmpStride];\
2196         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2197         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2198         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2199         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2200         dst++;\
2201         tmp++;\
2202     }\
2203 }\
2204 \
2205 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206     const int h=8;\
2207     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208     int i;\
2209     for(i=0; i<h; i++)\
2210     {\
2211         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2212         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2213         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2214         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2215         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2216         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2217         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2218         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2219         dst+=dstStride;\
2220         src+=srcStride;\
2221     }\
2222 }\
2223 \
2224 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2225     const int w=8;\
2226     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2227     int i;\
2228     for(i=0; i<w; i++)\
2229     {\
2230         const int srcB= src[-2*srcStride];\
2231         const int srcA= src[-1*srcStride];\
2232         const int src0= src[0 *srcStride];\
2233         const int src1= src[1 *srcStride];\
2234         const int src2= src[2 *srcStride];\
2235         const int src3= src[3 *srcStride];\
2236         const int src4= src[4 *srcStride];\
2237         const int src5= src[5 *srcStride];\
2238         const int src6= src[6 *srcStride];\
2239         const int src7= src[7 *srcStride];\
2240         const int src8= src[8 *srcStride];\
2241         const int src9= src[9 *srcStride];\
2242         const int src10=src[10*srcStride];\
2243         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2244         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2245         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2246         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2247         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2248         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2249         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2250         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2251         dst++;\
2252         src++;\
2253     }\
2254 }\
2255 \
2256 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2257     const int h=8;\
2258     const int w=8;\
2259     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2260     int i;\
2261     src -= 2*srcStride;\
2262     for(i=0; i<h+5; i++)\
2263     {\
2264         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2265         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2266         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2267         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2268         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2269         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2270         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2271         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2272         tmp+=tmpStride;\
2273         src+=srcStride;\
2274     }\
2275     tmp -= tmpStride*(h+5-2);\
2276     for(i=0; i<w; i++)\
2277     {\
2278         const int tmpB= tmp[-2*tmpStride];\
2279         const int tmpA= tmp[-1*tmpStride];\
2280         const int tmp0= tmp[0 *tmpStride];\
2281         const int tmp1= tmp[1 *tmpStride];\
2282         const int tmp2= tmp[2 *tmpStride];\
2283         const int tmp3= tmp[3 *tmpStride];\
2284         const int tmp4= tmp[4 *tmpStride];\
2285         const int tmp5= tmp[5 *tmpStride];\
2286         const int tmp6= tmp[6 *tmpStride];\
2287         const int tmp7= tmp[7 *tmpStride];\
2288         const int tmp8= tmp[8 *tmpStride];\
2289         const int tmp9= tmp[9 *tmpStride];\
2290         const int tmp10=tmp[10*tmpStride];\
2291         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2292         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2293         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2294         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2295         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2296         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2297         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2298         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2299         dst++;\
2300         tmp++;\
2301     }\
2302 }\
2303 \
2304 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2305     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2306     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2307     src += 8*srcStride;\
2308     dst += 8*dstStride;\
2309     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2310     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2311 }\
2312 \
2313 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2314     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2315     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2316     src += 8*srcStride;\
2317     dst += 8*dstStride;\
2318     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2319     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2320 }\
2321 \
2322 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2323     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2324     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2325     src += 8*srcStride;\
2326     dst += 8*dstStride;\
2327     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2328     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2329 }\
2330
/*
 * H264_MC(OPNAME, SIZE) expands to the sixteen motion-compensation entry
 * points OPNAME##h264_qpel##SIZE##_mcXY_c for a SIZE x SIZE block.
 *
 * Judging by which filters each variant uses, X selects the horizontal and
 * Y the vertical sub-sample position (0..3): variants with Y == 0 only use
 * the horizontal lowpass, variants with X == 0 only the vertical one.
 *
 * Intermediate planes are always produced with the put_ filters; only the
 * final store (or final two-source merge via OPNAME##pixels##SIZE##_l2)
 * uses OPNAME.
 *
 * Vertical filtering works on a private copy of SIZE+5 source rows that
 * starts two rows above the block; "origin" points at the row matching
 * src inside that copy.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hbuf, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hbuf, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t * const origin= padded + SIZE*2;\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, origin, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, origin, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const origin= padded + SIZE*2;\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, origin, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t * const origin= padded + SIZE*2;\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, origin, SIZE, SIZE);\
    /* merge with the row below the block origin */\
    OPNAME ## pixels ## SIZE ## _l2(dst, origin+SIZE, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t * const origin= padded + SIZE*2;\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, origin, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t * const origin= padded + SIZE*2;\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    /* vertical filter runs one column to the right */\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, origin, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t * const origin= padded + SIZE*2;\
\
    /* horizontal filter runs one row down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, origin, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t * const origin= padded + SIZE*2;\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, origin, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, vbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t acc[SIZE*(SIZE+5)];\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, acc, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t acc[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, acc, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t acc[SIZE*(SIZE+5)];\
    uint8_t hbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hbuf, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, acc, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    int16_t acc[SIZE*(SIZE+5)];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
    uint8_t * const origin= padded + SIZE*2;\
\
    copy_block ## SIZE (padded, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, origin, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, acc, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t padded[SIZE*(SIZE+5)];\
    int16_t acc[SIZE*(SIZE+5)];\
    uint8_t vbuf[SIZE*SIZE];\
    uint8_t hvbuf[SIZE*SIZE];\
    uint8_t * const origin= padded + SIZE*2;\
\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vbuf, origin, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvbuf, acc, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vbuf, hvbuf, stride, SIZE, SIZE, SIZE);\
}
2467
2468 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2469 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2470 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2471 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2472 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2473
2474 H264_LOWPASS(put_       , op_put, op2_put)
2475 H264_LOWPASS(avg_       , op_avg, op2_avg)
2476 H264_MC(put_, 2)
2477 H264_MC(put_, 4)
2478 H264_MC(put_, 8)
2479 H264_MC(put_, 16)
2480 H264_MC(avg_, 4)
2481 H264_MC(avg_, 8)
2482 H264_MC(avg_, 16)
2483
2484 #undef op_avg
2485 #undef op_put
2486 #undef op2_avg
2487 #undef op2_put
2488 #endif
2489
/*
 * Explicit weighted prediction for W x H pixel blocks.
 *
 * op_scale1(x): in-place single-source weighting of block[x].
 * op_scale2(x): two-source weighting of src[x] and dst[x], stored to dst[x];
 *               note the extra bit in the final shift (log2_denom + 1).
 * Both clamp the result with av_clip_uint8().
 *
 * H264_WEIGHT(W,H) generates
 *   weight_h264_pixelsWxH_c(block, stride, log2_denom, weight, offset)
 *   biweight_h264_pixelsWxH_c(dst, src, stride, log2_denom,
 *                             weightd, weights, offset)
 *
 * The offset is pre-scaled by 2^log2_denom once per call. This is done with a
 * multiplication rather than "<<" because offset may be negative, and
 * left-shifting a negative signed value is undefined behaviour in C. For
 * non-negative offsets the result is unchanged.
 *
 * weight_*:   a rounding term of 2^(log2_denom-1) is added when log2_denom
 *             is non-zero.
 * biweight_*: (offset + 1) | 1 is formed before scaling; presumably this
 *             folds the rounding for the (log2_denom + 1) shift into the
 *             offset -- confirm against the weighted-prediction formulas.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int col, y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        for(col=0; col<W; col++){ \
            op_scale1(col); \
        } \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int col, y; \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(col=0; col<W; col++){ \
            op_scale2(col); \
        } \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2559
2560 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2561     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2562     int i;
2563
2564     for(i=0; i<h; i++){
2565         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2566         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2567         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2568         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2569         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2570         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2571         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2572         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2573         dst+=dstStride;
2574         src+=srcStride;
2575     }
2576 }
2577
2578 #ifdef CONFIG_CAVS_DECODER
2579 /* AVS specific */
2580 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2581
/** Full-sample 8x8 case: forwards to put_pixels8_c() with a height of 8. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
/** Full-sample 8x8 case: forwards to avg_pixels8_c() with a height of 8. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}
/** Full-sample 16x16 case: forwards to put_pixels16_c() with a height of 16. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}
/** Full-sample 16x16 case: forwards to avg_pixels16_c() with a height of 16. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
2594 #endif /* CONFIG_CAVS_DECODER */
2595
2596 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2597 /* VC-1 specific */
2598 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2599
/**
 * Full-sample 8x8 case: forwards to put_pixels8_c() with a height of 8.
 * The rnd argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    put_pixels8_c(dst, src, stride, 8);
}
2603 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2604
2605 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2606
2607 /* H264 specific */
2608 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2609
2610 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2611     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2612     int i;
2613
2614     for(i=0; i<w; i++){
2615         const int src_1= src[ -srcStride];
2616         const int src0 = src[0          ];
2617         const int src1 = src[  srcStride];
2618         const int src2 = src[2*srcStride];
2619         const int src3 = src[3*srcStride];
2620         const int src4 = src[4*srcStride];
2621         const int src5 = src[5*srcStride];
2622         const int src6 = src[6*srcStride];
2623         const int src7 = src[7*srcStride];
2624         const int src8 = src[8*srcStride];
2625         const int src9 = src[9*srcStride];
2626         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2627         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2628         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2629         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2630         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2631         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2632         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2633         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2634         src++;
2635         dst++;
2636     }
2637 }
2638
/** Full-sample position: plain 8x8 copy via put_pixels8_c(). */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2642
/**
 * Merges the source block with its horizontally filtered version
 * using put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hbuf[8*8];

    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hbuf, stride, stride, 8, 8);
}
2648
/** Writes the horizontally filtered 8x8 block straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2652
/**
 * Merges the source block shifted one pixel to the right with the
 * horizontally filtered block using put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hbuf[8*8];

    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hbuf, stride, stride, 8, 8);
}
2658
/** Writes the vertically filtered 8x8 block straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2662
/**
 * Merges the vertically filtered block with the horizontally-then-vertically
 * filtered block.
 * The horizontal pass covers 11 rows starting one row above src, so that
 * the vertical pass over its output has the rows it needs.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];   /* rows -1 .. 9 after horizontal filtering */
    uint8_t vbuf[8*8];
    uint8_t hvbuf[8*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src, 8, stride, 8);
    /* hrows+8 is the filtered row that corresponds to src row 0 */
    wmv2_mspel8_v_lowpass(hvbuf, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
/**
 * Same as put_mspel8_mc12_c() except that the vertical-only filter is fed
 * from the column one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];   /* rows -1 .. 9 after horizontal filtering */
    uint8_t vbuf[8*8];
    uint8_t hvbuf[8*8];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src+1, 8, stride, 8);
    /* hrows+8 is the filtered row that corresponds to src row 0 */
    wmv2_mspel8_v_lowpass(hvbuf, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
/**
 * Horizontal filter over 11 rows (starting one row above src) followed by
 * the vertical filter, written directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[11*8];   /* rows -1 .. 9 after horizontal filtering */

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2686
2687 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2688     if(ENABLE_ANY_H263) {
2689     int x;
2690     const int strength= ff_h263_loop_filter_strength[qscale];
2691
2692     for(x=0; x<8; x++){
2693         int d1, d2, ad1;
2694         int p0= src[x-2*stride];
2695         int p1= src[x-1*stride];
2696         int p2= src[x+0*stride];
2697         int p3= src[x+1*stride];
2698         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2699
2700         if     (d<-2*strength) d1= 0;
2701         else if(d<-  strength) d1=-2*strength - d;
2702         else if(d<   strength) d1= d;
2703         else if(d< 2*strength) d1= 2*strength - d;
2704         else                   d1= 0;
2705
2706         p1 += d1;
2707         p2 -= d1;
2708         if(p1&256) p1= ~(p1>>31);
2709         if(p2&256) p2= ~(p2>>31);
2710
2711         src[x-1*stride] = p1;
2712         src[x+0*stride] = p2;
2713
2714         ad1= FFABS(d1)>>1;
2715
2716         d2= av_clip((p0-p3)/4, -ad1, ad1);
2717
2718         src[x-2*stride] = p0 - d2;
2719         src[x+  stride] = p3 + d2;
2720     }
2721     }
2722 }
2723
2724 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2725     if(ENABLE_ANY_H263) {
2726     int y;
2727     const int strength= ff_h263_loop_filter_strength[qscale];
2728
2729     for(y=0; y<8; y++){
2730         int d1, d2, ad1;
2731         int p0= src[y*stride-2];
2732         int p1= src[y*stride-1];
2733         int p2= src[y*stride+0];
2734         int p3= src[y*stride+1];
2735         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2736
2737         if     (d<-2*strength) d1= 0;
2738         else if(d<-  strength) d1=-2*strength - d;
2739         else if(d<   strength) d1= d;
2740         else if(d< 2*strength) d1= 2*strength - d;
2741         else                   d1= 0;
2742
2743         p1 += d1;
2744         p2 -= d1;
2745         if(p1&256) p1= ~(p1>>31);
2746         if(p2&256) p2= ~(p2>>31);
2747
2748         src[y*stride-1] = p1;
2749         src[y*stride+0] = p2;
2750
2751         ad1= FFABS(d1)>>1;
2752
2753         d2= av_clip((p0-p3)/4, -ad1, ad1);
2754
2755         src[y*stride-2] = p0 - d2;
2756         src[y*stride+1] = p3 + d2;
2757     }
2758     }
2759 }
2760
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 * The vertical pass is stored in an int scratch plane; the first and last
 * rows are not filtered vertically (they are scaled by 4 instead), and the
 * first and last columns are not filtered horizontally.
 * @param src    top-left sample of the 8x8 block, modified in place
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64];
    int row, col;

    /* vertical pass into the scratch plane */
    for(col=0; col<8; col++){
        vert[col      ] = 4*src[col           ];
        vert[col + 7*8] = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        const uint8_t *cur = src + row*stride;
        int *out = vert + row*8;

        for(col=0; col<8; col++)
            out[col] = cur[col - stride] + 2*cur[col] + cur[col + stride];
    }

    /* horizontal pass back into the picture */
    for(row=0; row<8; row++){
        uint8_t *out = src + row*stride;
        const int *in = vert + row*8;

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2787
/**
 * H.264 luma deblocking along one 16-sample edge (non-intra strength).
 *
 * The edge is handled as 4 groups of 4 positions. Group i is skipped when
 * tc0[i] is negative. At each position three samples on either side of the
 * edge are read (p2,p1,p0 | q0,q1,q2) and up to four are rewritten
 * (p1, p0, q0, q1).
 *
 * @param pix     points at q0 of the first position
 * @param xstride step between neighbouring samples across the edge
 * @param ystride step from one position to the next along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0|, |q2 - q0|
 * @param tc0     4 clipping limits, one per group
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc_base = tc0[i];

        if( tc_base < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                const int edge_avg = ( p0 + q0 + 1 ) >> 1;
                int tc = tc_base;
                int i_delta;

                /* each side that is smooth enough gets its second sample
                 * adjusted and widens the clip range for p0/q0 by one */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + edge_avg ) >> 1) - p1, -tc_base, tc_base );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + edge_avg ) >> 1) - q1, -tc_base, tc_base );
                    tc++;
                }

                /* "* 4" instead of "<< 2": q0 - p0 may be negative and a left
                 * shift of a negative value is undefined behaviour */
                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * Luma deblocking where the samples across the edge are stride apart and
 * consecutive positions along the edge are one byte apart.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/**
 * Luma deblocking where the samples across the edge are one byte apart and
 * consecutive positions along the edge are stride apart.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2836
/**
 * H.264 chroma deblocking along one 8-sample edge (non-intra strength).
 *
 * The edge is handled as 4 groups of 2 positions. Group i is skipped when
 * tc0[i] is zero or negative. At each position two samples on either side
 * of the edge are read (p1,p0 | q0,q1); only p0 and q0 are rewritten.
 *
 * @param pix     points at q0 of the first position
 * @param xstride step between neighbouring samples across the edge
 * @param ystride step from one position to the next along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     4 clipping limits, one per group
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* "* 4" instead of "<< 2": q0 - p0 may be negative and a left
                 * shift of a negative value is undefined behaviour */
                int delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * Chroma deblocking where the samples across the edge are stride apart and
 * consecutive positions along the edge are one byte apart.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/**
 * Chroma deblocking where the samples across the edge are one byte apart and
 * consecutive positions along the edge are stride apart.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0){
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2873
/**
 * H.264 chroma deblocking along one 8-sample edge, intra variant: no tc0
 * table, and p0/q0 are replaced by fixed weighted averages instead of a
 * clipped delta.
 *
 * @param pix     points at q0 of the first position
 * @param xstride step between neighbouring samples across the edge
 * @param ystride step from one position to the next along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int pos;

    for( pos = 0; pos < 8; pos++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* leave the position alone unless all three activity checks pass */
        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma deblocking where the samples across the edge are stride apart
 * and consecutive positions along the edge are one byte apart.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta){
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/**
 * Intra chroma deblocking where the samples across the edge are one byte
 * apart and consecutive positions along the edge are stride apart.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta){
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2901
/**
 * Sum of absolute differences between two blocks 16 pixels wide.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference (0 when h <= 0)
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2929
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2
 * sample with its right-hand neighbour, for blocks 16 pixels wide.
 * Reads 17 samples per row of pix2.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block, combined with its right neighbour
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2957
/**
 * Sum of absolute differences over a 16-pixel-wide block, where every
 * reference sample is avg2() of a pix2 pixel and the pixel one row below it.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; rows 0..h are read
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2987
/**
 * Sum of absolute differences over a 16-pixel-wide block, where every
 * reference sample is avg4() of a 2x2 neighbourhood of pix2 (the pixel, its
 * right neighbour and the two pixels below those).
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; columns 0..16 of rows 0..h are read
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3017
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3037
/**
 * Sum of absolute differences over an 8-pixel-wide block, where every
 * reference sample is avg2() of a pix2 pixel and its right-hand neighbour.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; columns 0..8 of each row are read
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3057
/**
 * Sum of absolute differences over an 8-pixel-wide block, where every
 * reference sample is avg2() of a pix2 pixel and the pixel one row below it.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; rows 0..h are read
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3079
/**
 * Sum of absolute differences over an 8-pixel-wide block, where every
 * reference sample is avg4() of a 2x2 neighbourhood of pix2 (the pixel, its
 * right neighbour and the two pixels below those).
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; columns 0..8 of rows 0..h are read
 * @param line_size offset between consecutive rows of both blocks
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3101
3102 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3103     MpegEncContext *c = v;
3104     int score1=0;
3105     int score2=0;
3106     int x,y;
3107
3108     for(y=0; y<h; y++){
3109         for(x=0; x<16; x++){
3110             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3111         }
3112         if(y+1<h){
3113             for(x=0; x<15; x++){
3114                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3115                              - s1[x+1] + s1[x+1+stride])
3116                         -FFABS(  s2[x  ] - s2[x  +stride]
3117                              - s2[x+1] + s2[x+1+stride]);
3118             }
3119         }
3120         s1+= stride;
3121         s2+= stride;
3122     }
3123
3124     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3125     else  return score1 + FFABS(score2)*8;
3126 }
3127
3128 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3129     MpegEncContext *c = v;
3130     int score1=0;
3131     int score2=0;
3132     int x,y;
3133
3134     for(y=0; y<h; y++){
3135         for(x=0; x<8; x++){
3136             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3137         }
3138         if(y+1<h){
3139             for(x=0; x<7; x++){
3140                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3141                              - s1[x+1] + s1[x+1+stride])
3142                         -FFABS(  s2[x  ] - s2[x  +stride]
3143                              - s2[x+1] + s2[x+1+stride]);
3144             }
3145         }
3146         s1+= stride;
3147         s2+= stride;
3148     }
3149
3150     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3151     else  return score1 + FFABS(score2)*8;
3152 }
3153
3154 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3155     int i;
3156     unsigned int sum=0;
3157
3158     for(i=0; i<8*8; i++){
3159         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3160         int w= weight[i];
3161         b>>= RECON_SHIFT;
3162         assert(-512<b && b<512);
3163
3164         sum += (w*b)*(w*b)>>4;
3165     }
3166     return sum>>2;
3167 }
3168
3169 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3170     int i;
3171
3172     for(i=0; i<8*8; i++){
3173         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3174     }
3175 }
3176
3177 /**
3178  * permutes an 8x8 block.
3179  * @param block the block which will be permuted according to the given permutation vector
3180  * @param permutation the permutation vector
3181  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3182  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3183  *                  (inverse) permutated to scantable order!
3184  */
3185 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3186 {
3187     int i;
3188     DCTELEM temp[64];
3189
3190     if(last<=0) return;
3191     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3192
3193     for(i=0; i<=last; i++){
3194         const int j= scantable[i];
3195         temp[j]= block[j];
3196         block[j]=0;
3197     }
3198
3199     for(i=0; i<=last; i++){
3200         const int j= scantable[i];
3201         const int perm_j= permutation[j];
3202         block[perm_j]= temp[j];
3203     }
3204 }
3205
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3209
3210 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3211     int i;
3212
3213     memset(cmp, 0, sizeof(void*)*5);
3214
3215     for(i=0; i<5; i++){
3216         switch(type&0xFF){
3217         case FF_CMP_SAD:
3218             cmp[i]= c->sad[i];
3219             break;
3220         case FF_CMP_SATD:
3221             cmp[i]= c->hadamard8_diff[i];
3222             break;
3223         case FF_CMP_SSE:
3224             cmp[i]= c->sse[i];
3225             break;
3226         case FF_CMP_DCT:
3227             cmp[i]= c->dct_sad[i];
3228             break;
3229         case FF_CMP_DCT264:
3230             cmp[i]= c->dct264_sad[i];
3231             break;
3232         case FF_CMP_DCTMAX:
3233             cmp[i]= c->dct_max[i];
3234             break;
3235         case FF_CMP_PSNR:
3236             cmp[i]= c->quant_psnr[i];
3237             break;
3238         case FF_CMP_BIT:
3239             cmp[i]= c->bit[i];
3240             break;
3241         case FF_CMP_RD:
3242             cmp[i]= c->rd[i];
3243             break;
3244         case FF_CMP_VSAD:
3245             cmp[i]= c->vsad[i];
3246             break;
3247         case FF_CMP_VSSE:
3248             cmp[i]= c->vsse[i];
3249             break;
3250         case FF_CMP_ZERO:
3251             cmp[i]= zero_cmp;
3252             break;
3253         case FF_CMP_NSSE:
3254             cmp[i]= c->nsse[i];
3255             break;
3256 #ifdef CONFIG_SNOW_ENCODER
3257         case FF_CMP_W53:
3258             cmp[i]= c->w53[i];
3259             break;
3260         case FF_CMP_W97:
3261             cmp[i]= c->w97[i];
3262             break;
3263 #endif
3264         default:
3265             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3266         }
3267     }
3268 }
3269
3270 /**
3271  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3272  */
3273 static void clear_blocks_c(DCTELEM *blocks)
3274 {
3275     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3276 }
3277
/**
 * Adds src to dst byte by byte, modulo 256.
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] += src[pos];
        pos++;
    }
}
3293
/**
 * Stores the byte-wise sum (modulo 256) of src1 and src2 into dst.
 *
 * Whole machine words are handled first: the low 7 bits of every byte are
 * added with the top bits masked off, so no carry can cross a byte boundary,
 * and the top bit of each byte is then restored with an XOR. The remaining
 * bytes are added one at a time.
 *
 * @param dst  destination buffer
 * @param src1 first operand
 * @param src2 second operand
 * @param w    number of bytes to process; values <= 0 do nothing
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f.. / 0x80.. replicated into every byte, whatever the width of long */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    /* keep the bound signed: "w - sizeof(long)" would be evaluated as
     * unsigned and wrap to a huge value when w < sizeof(long) */
    const int step = (int)sizeof(unsigned long);
    int i;

    for(i=0; i<=w-step; i+=step){
        unsigned long a, b, sum;

        /* memcpy keeps the accesses valid for unaligned buffers */
        memcpy(&a, src1+i, sizeof(a));
        memcpy(&b, src2+i, sizeof(b));
        sum = ((a&low7) + (b&low7)) ^ ((a^b)&top1);
        memcpy(dst+i, &sum, sizeof(sum));
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
3304
/**
 * Stores the byte-wise difference src1 - src2 (modulo 256) into dst.
 * @param dst  destination buffer
 * @param src1 minuend
 * @param src2 subtrahend
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos = 0;

    while (pos < w) {
        dst[pos] = src1[pos] - src2[pos];
        pos++;
    }
}
3320
/**
 * Writes to dst the difference between each src2 byte and a prediction formed
 * with mid_pred() from the left value, the src1 byte at the same position and
 * (left + src1 byte - top-left) & 0xFF.
 * @param dst      destination for the residual bytes
 * @param src1     row supplying the "top" values
 * @param src2     row being predicted; its bytes become the next "left" values
 * @param w        number of bytes to process
 * @param left     in: initial left value; out: last src2 byte processed
 * @param left_top in: initial top-left value; out: last src1 byte processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top = src1[x];
        const int pred = mid_pred(cur_left, top, (cur_left + top - cur_top_left)&0xFF);

        cur_top_left = top;
        cur_left     = src2[x];
        dst[x]       = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_top_left;
}
3338
/**
 * Butterfly with separate outputs: o1 = i1 + i2, o2 = i1 - i2.
 * Wrapped in do { } while(0) so that it behaves as a single statement, for
 * example as the body of an unbraced if/else. i1 and i2 are evaluated twice.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while(0)

/**
 * In-place butterfly: x becomes x + y and y becomes x - y, both computed
 * from the values held on entry. The temporaries carry a prefix so that
 * arguments named a or b cannot be captured.
 */
#define BUTTERFLY1(x,y) \
do {\
    const int butterfly_a= (x);\
    const int butterfly_b= (y);\
    (x)= butterfly_a+butterfly_b;\
    (y)= butterfly_a-butterfly_b;\
} while(0)

/** Final butterfly stage folded into a magnitude: |x + y| + |x - y|. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3353
/**
 * Applies an 8x8 butterfly (Hadamard-style) transform to the difference
 * src - dst and returns the sum of the absolute transformed values.
 * @param s      unused
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride offset between consecutive rows of both blocks
 * @param h      must be 8
 * @return sum of absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total = 0;
    int k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row of the difference */
    for (k = 0; k < 8; k++) {
        int *row = coef + 8*k;
        const uint8_t *sp = src + stride*k;
        const uint8_t *dp = dst + stride*k;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two stages in place, the third folded into the sum */
    for (k = 0; k < 8; k++) {
        int *col = coef + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return total;
}
3405
/**
 * Applies an 8x8 butterfly (Hadamard-style) transform to the pixels of src
 * and returns the sum of the absolute transformed values, minus the
 * magnitude of the term at position 0 (the mean).
 * @param s      unused
 * @param src    8x8 block
 * @param dummy  unused
 * @param stride offset between consecutive rows of src
 * @param h      must be 8
 * @return sum of absolute transform coefficients without the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total = 0;
    int k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row */
    for (k = 0; k < 8; k++) {
        int *row = coef + 8*k;
        const uint8_t *sp = src + stride*k;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two stages in place, the third folded into the sum */
    for (k = 0; k < 8; k++) {
        int *col = coef + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= FFABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
3453
3454 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3455     MpegEncContext * const s= (MpegEncContext *)c;
3456     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3457     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3458
3459     assert(h==8);
3460
3461     s->dsp.diff_pixels(temp, src1, src2, stride);
3462     s->dsp.fdct(temp);
3463     return s->dsp.sum_abs_dctelem(temp);
3464 }
3465
3466 #ifdef CONFIG_GPL
/**
 * One 8-point 1-D forward transform built only from additions, subtractions
 * and right shifts (presumably the H.264 8x8 integer transform -- confirm).
 *
 * The user must define SRC(x) to read input sample x (0..7) and DST(x,v) to
 * consume output coefficient x with value v before expanding the macro.
 * Every SRC() is read into a local before the first DST() is issued, so SRC
 * and DST may refer to the same storage.
 */
3467 #define DCT8_1D {\
3468     const int s07 = SRC(0) + SRC(7);\
3469     const int s16 = SRC(1) + SRC(6);\
3470     const int s25 = SRC(2) + SRC(5);\
3471     const int s34 = SRC(3) + SRC(4);\
3472     const int a0 = s07 + s34;\
3473     const int a1 = s16 + s25;\
3474     const int a2 = s07 - s34;\
3475     const int a3 = s16 - s25;\
3476     const int d07 = SRC(0) - SRC(7);\
3477     const int d16 = SRC(1) - SRC(6);\
3478     const int d25 = SRC(2) - SRC(5);\
3479     const int d34 = SRC(3) - SRC(4);\
3480     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3481     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3482     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3483     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3484     DST(0,  a0 + a1     ) ;\
3485     DST(1,  a4 + (a7>>2)) ;\
3486     DST(2,  a2 + (a3>>1)) ;\
3487     DST(3,  a5 + (a6>>2)) ;\
3488     DST(4,  a0 - a1     ) ;\
3489     DST(5,  a6 - (a5>>2)) ;\
3490     DST(6, (a2>>1) - a3 ) ;\
3491     DST(7, (a4>>2) - a7 ) ;\
3492 }
3493
3494 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3495     MpegEncContext * const s= (MpegEncContext *)c;
3496     DCTELEM dct[8][8];
3497     int i;
3498     int sum=0;
3499
3500     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3501
3502 #define SRC(x) dct[i][x]
3503 #define DST(x,v) dct[i][x]= v
3504     for( i = 0; i < 8; i++ )
3505         DCT8_1D
3506 #undef SRC
3507 #undef DST
3508
3509 #define SRC(x) dct[x][i]
3510 #define DST(x,v) sum += FFABS(v)
3511     for( i = 0; i < 8; i++ )
3512         DCT8_1D
3513 #undef SRC
3514 #undef DST
3515     return sum;
3516 }
3517 #endif
3518
3519 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3520     MpegEncContext * const s= (MpegEncContext *)c;
3521     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3522     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3523     int sum=0, i;
3524
3525     assert(h==8);
3526
3527     s->dsp.diff_pixels(temp, src1, src2, stride);
3528     s->dsp.fdct(temp);
3529
3530     for(i=0; i<64; i++)
3531         sum= FFMAX(sum, FFABS(temp[i]));
3532
3533     return sum;
3534 }
3535
3536 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3537     MpegEncContext * const s= (MpegEncContext *)c;
3538     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3539     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3540     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3541     int sum=0, i;
3542
3543     assert(h==8);
3544     s->mb_intra=0;
3545
3546     s->dsp.diff_pixels(temp, src1, src2, stride);
3547
3548     memcpy(bak, temp, 64*sizeof(DCTELEM));
3549
3550     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3551     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3552     ff_simple_idct(temp); //FIXME
3553
3554     for(i=0; i<64; i++)
3555         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3556
3557     return sum;
3558 }
3559
3560 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3561     MpegEncContext * const s= (MpegEncContext *)c;
3562     const uint8_t *scantable= s->intra_scantable.permutated;
3563     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3564     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3565     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3566     uint8_t * const bak= (uint8_t*)aligned_bak;
3567     int i, last, run, bits, level, distoration, start_i;
3568     const int esc_length= s->ac_esc_length;
3569     uint8_t * length;
3570     uint8_t * last_length;
3571
3572     assert(h==8);
3573
3574     for(i=0; i<8; i++){
3575         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3576         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3577     }
3578
3579     s->dsp.diff_pixels(temp, src1, src2, stride);
3580
3581     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3582
3583     bits=0;
3584
3585     if (s->mb_intra) {
3586         start_i = 1;
3587         length     = s->intra_ac_vlc_length;
3588         last_length= s->intra_ac_vlc_last_length;
3589         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3590     } else {
3591         start_i = 0;
3592         length     = s->inter_ac_vlc_length;
3593         last_length= s->inter_ac_vlc_last_length;
3594     }
3595
3596     if(last>=start_i){
3597         run=0;
3598         for(i=start_i; i<last; i++){
3599             int j= scantable[i];
3600             level= temp[j];
3601
3602             if(level){
3603                 level+=64;
3604                 if((level&(~127)) == 0){
3605                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3606                 }else
3607                     bits+= esc_length;
3608                 run=0;
3609             }else
3610                 run++;
3611         }
3612         i= scantable[last];
3613
3614         level= temp[i] + 64;
3615
3616         assert(level - 64);
3617
3618         if((level&(~127)) == 0){
3619             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3620         }else
3621             bits+= esc_length;
3622
3623     }
3624
3625     if(last>=0){
3626         if(s->mb_intra)
3627             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3628         else
3629             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3630     }
3631
3632     s->dsp.idct_add(bak, stride, temp);
3633
3634     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3635
3636     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3637 }
3638
3639 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3640     MpegEncContext * const s= (MpegEncContext *)c;
3641     const uint8_t *scantable= s->intra_scantable.permutated;
3642     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3643     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3644     int i, last, run, bits, level, start_i;
3645     const int esc_length= s->ac_esc_length;
3646     uint8_t * length;
3647     uint8_t * last_length;
3648
3649     assert(h==8);
3650
3651     s->dsp.diff_pixels(temp, src1, src2, stride);
3652
3653     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3654
3655     bits=0;
3656
3657     if (s->mb_intra) {
3658         start_i = 1;
3659         length     = s->intra_ac_vlc_length;
3660         last_length= s->intra_ac_vlc_last_length;
3661         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3662     } else {
3663         start_i = 0;
3664         length     = s->inter_ac_vlc_length;
3665         last_length= s->inter_ac_vlc_last_length;
3666     }
3667
3668     if(last>=start_i){
3669         run=0;
3670         for(i=start_i; i<last; i++){
3671             int j= scantable[i];
3672             level= temp[j];
3673
3674             if(level){
3675                 level+=64;
3676                 if((level&(~127)) == 0){
3677                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3678                 }else
3679                     bits+= esc_length;
3680                 run=0;
3681             }else
3682                 run++;
3683         }
3684         i= scantable[last];
3685
3686         level= temp[i] + 64;
3687
3688         assert(level - 64);
3689
3690         if((level&(~127)) == 0){
3691             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3692         }else
3693             bits+= esc_length;
3694     }
3695
3696     return bits;
3697 }
3698
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s += stride) {
        for (col = 0; col < 16; col++)
            total += FFABS(s[col] - s[col + stride]);
    }

    return total;
}
3713
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int grad = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += FFABS(grad);
        }
    }

    return total;
}
3728
#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride offset between consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s += stride) {
        for (col = 0; col < 16; col++)
            total += SQ(s[col] - s[col + stride]);
    }

    return total;
}
3744
/**
 * Sum of squared vertical gradients of the difference s1 - s2 over a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride offset between consecutive rows of both blocks
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int grad = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += grad*grad;
        }
    }

    return total;
}
3759
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d*d;
    }
    return total;
}
3768
/* 16-pixel variants of the 8x8 metrics above. WRAPPER8_16_SQ is defined
 * outside this section; each line passes an 8x8 function and the name of the
 * function to generate from it -- presumably by summing the 8x8 metric over
 * the four quadrants of a 16x16 block (TODO confirm against the macro). */
3769 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3770 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3771 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3772 #ifdef CONFIG_GPL
3773 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3774 #endif
3775 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3776 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3777 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3778 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3779
/**
 * Multiplies dst element-wise by src, in place.
 * @param dst vector updated in place
 * @param src multiplier vector
 * @param len number of elements
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k = 0;

    while (k < len) {
        dst[k] *= src[k];
        k++;
    }
}
3785
/**
 * Stores in dst the element-wise product of src0 with src1 read backwards:
 * dst[i] = src0[i] * src1[len-1-i].
 * @param dst  destination vector
 * @param src0 first factor, read front to back
 * @param src1 second factor, read back to front
 * @param len  number of elements
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int mirrored = len - 1 - k;

        dst[k] = src0[k] * src1[mirrored];
    }
}
3792
/**
 * Computes src0[i] * src1[i] + src2[i] + src3 for every i and stores the
 * results step elements apart in dst.
 * @param dst  destination; element i is written at dst[i*step]
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 per-element addend
 * @param src3 constant addend
 * @param len  number of elements
 * @param step distance between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int out = 0;
    int k;

    for (k = 0; k < len; k++) {
        dst[out] = src0[k] * src1[k] + src2[k] + src3;
        out += step;
    }
}
3798
/**
 * Converts biased floats to 16-bit integers by reading their bit patterns.
 *
 * A float whose bit pattern has the form 0x43c0xxxx (IEEE-754 single
 * precision: 384.0 <= value < 386.0, with 385.0 in the middle) yields
 * xxxx - 0x8000. Any pattern with one of bits 16..19 set is treated as out
 * of range and replaced by 0 or -1 before the 0x8000 offset is subtracted,
 * depending on the sign of (0x43c0ffff - pattern).
 *
 * @param dst destination, len elements
 * @param src biased input samples, len elements
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int32_t bits;
        int_fast32_t tmp;

        /* copy the representation instead of reading the float through an
         * int32_t pointer, which the aliasing rules do not allow */
        memcpy(&bits, &src[i], sizeof(bits));
        tmp = bits;

        if(tmp & 0xf0000){
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}
3812
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 inverse DCT: transforms the 8 values at b in place.
 * All inputs are read before the first output is written.
 */
static void wmv2_idct_row(short * b)
{
    const int rnd = 1<<7;

    /* step 1: weighted input pairs */
    const int a1 = W1*b[1]+W7*b[7];
    const int a7 = W7*b[1]-W1*b[7];
    const int a5 = W5*b[5]+W3*b[3];
    const int a3 = W3*b[5]-W5*b[3];
    const int a2 = W2*b[2]+W6*b[6];
    const int a6 = W6*b[2]-W2*b[6];
    const int a0 = W0*b[0]+W0*b[4];
    const int a4 = W0*b[0]-W0*b[4];

    /* step 2: combine the odd terms (inputs 1,3,5,7) */
    const int s1 = (181*(a1-a5+a7-a3)+128)>>8;
    const int s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /* step 3: outputs, rounded and scaled down by 8 bits */
    b[0] = (a0+a2+a1+a5 + rnd)>>8;
    b[1] = (a4+a6 +s1   + rnd)>>8;
    b[2] = (a4-a6 +s2   + rnd)>>8;
    b[3] = (a0-a2+a7+a3 + rnd)>>8;
    b[4] = (a0-a2-a7-a3 + rnd)>>8;
    b[5] = (a4-a6 -s2   + rnd)>>8;
    b[6] = (a4+a6 -s1   + rnd)>>8;
    b[7] = (a0+a2-a1-a5 + rnd)>>8;
}
3848 static void wmv2_idct_col(short * b)
3849 {
3850     int s1,s2;
3851     int a0,a1,a2,a3,a4,a5,a6,a7;
3852     /*step 1, with extended precision*/
3853     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3854     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3855     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3856     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3857     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3858     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3859     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3860     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3861     /*step 2*/
3862     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3863     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3864     /*step 3*/
3865     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3866     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3867     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3868     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3869
3870     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3871     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3872     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3873     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
3874 }
/**
 * WMV2 inverse DCT of a 64-coefficient block, in place: the row pass is run
 * on each of the 8 rows, then the column pass on each of the 8 columns.
 */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 8; n++)
        wmv2_idct_row(block + 8*n);

    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
3885 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3886  converted */
3887 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3888 {
3889     ff_wmv2_idct_c(block);
3890     put_pixels_clamped_c(block, dest, line_size);
3891 }
3892 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3893 {
3894     ff_wmv2_idct_c(block);
3895     add_pixels_clamped_c(block, dest, line_size);
3896 }
3897 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3898 {
3899     j_rev_dct (block);
3900     put_pixels_clamped_c(block, dest, line_size);
3901 }
3902 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3903 {
3904     j_rev_dct (block);
3905     add_pixels_clamped_c(block, dest, line_size);
3906 }
3907
3908 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3909 {
3910     j_rev_dct4 (block);
3911     put_pixels_clamped4_c(block, dest, line_size);
3912 }
3913 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3914 {
3915     j_rev_dct4 (block);
3916     add_pixels_clamped4_c(block, dest, line_size);
3917 }
3918
3919 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3920 {
3921     j_rev_dct2 (block);
3922     put_pixels_clamped2_c(block, dest, line_size);
3923 }
3924 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3925 {
3926     j_rev_dct2 (block);
3927     add_pixels_clamped2_c(block, dest, line_size);
3928 }
3929
3930 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3931 {
3932     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3933
3934     dest[0] = cm[(block[0] + 4)>>3];
3935 }
3936 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3937 {
3938     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3939
3940     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3941 }
3942
3943 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3944
3945 /* init static data */
3946 void dsputil_static_init(void)
3947 {
3948     int i;
3949
3950     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3951     for(i=0;i<MAX_NEG_CROP;i++) {
3952         ff_cropTbl[i] = 0;
3953         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3954     }
3955
3956     for(i=0;i<512;i++) {
3957         ff_squareTbl[i] = (i - 256) * (i - 256);
3958     }
3959
3960     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3961 }
3962
3963 int ff_check_alignment(void){
3964     static int did_fail=0;
3965     DECLARE_ALIGNED_16(int, aligned);
3966
3967     if((long)&aligned & 15){
3968         if(!did_fail){
3969 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3970             av_log(NULL, AV_LOG_ERROR,
3971                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3972                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3973                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
3974                 "Do not report crashes to FFmpeg developers.\n");
3975 #endif
3976             did_fail=1;
3977         }
3978         return -1;
3979     }
3980     return 0;
3981 }
3982
3983 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3984 {
3985     int i;
3986
3987     ff_check_alignment();
3988
3989 #ifdef CONFIG_ENCODERS
3990     if(avctx->dct_algo==FF_DCT_FASTINT) {
3991         c->fdct = fdct_ifast;
3992         c->fdct248 = fdct_ifast248;
3993     }
3994     else if(avctx->dct_algo==FF_DCT_FAAN) {
3995         c->fdct = ff_faandct;
3996         c->fdct248 = ff_faandct248;
3997     }
3998     else {
3999         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4000         c->fdct248 = ff_fdct248_islow;
4001     }
4002 #endif //CONFIG_ENCODERS
4003
4004     if(avctx->lowres==1){
4005         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4006             c->idct_put= ff_jref_idct4_put;
4007             c->idct_add= ff_jref_idct4_add;
4008         }else{
4009             c->idct_put= ff_h264_lowres_idct_put_c;
4010             c->idct_add= ff_h264_lowres_idct_add_c;
4011         }
4012         c->idct    = j_rev_dct4;
4013         c->idct_permutation_type= FF_NO_IDCT_PERM;
4014     }else if(avctx->lowres==2){
4015         c->idct_put= ff_jref_idct2_put;
4016         c->idct_add= ff_jref_idct2_add;
4017         c->idct    = j_rev_dct2;
4018         c->idct_permutation_type= FF_NO_IDCT_PERM;
4019     }else if(avctx->lowres==3){
4020         c->idct_put= ff_jref_idct1_put;
4021         c->idct_add= ff_jref_idct1_add;
4022         c->idct    = j_rev_dct1;
4023         c->idct_permutation_type= FF_NO_IDCT_PERM;
4024     }else{
4025         if(avctx->idct_algo==FF_IDCT_INT){
4026             c->idct_put= ff_jref_idct_put;
4027             c->idct_add= ff_jref_idct_add;
4028             c->idct    = j_rev_dct;
4029             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4030         }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4031                 avctx->idct_algo==FF_IDCT_VP3){
4032             c->idct_put= ff_vp3_idct_put_c;
4033             c->idct_add= ff_vp3_idct_add_c;
4034             c->idct    = ff_vp3_idct_c;
4035             c->idct_permutation_type= FF_NO_IDCT_PERM;
4036         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4037             c->idct_put= ff_wmv2_idct_put_c;
4038             c->idct_add= ff_wmv2_idct_add_c;
4039             c->idct    = ff_wmv2_idct_c;
4040             c->idct_permutation_type= FF_NO_IDCT_PERM;
4041         }else{ //accurate/default
4042             c->idct_put= ff_simple_idct_put;
4043             c->idct_add= ff_simple_idct_add;
4044             c->idct    = ff_simple_idct;
4045             c->idct_permutation_type= FF_NO_IDCT_PERM;
4046         }
4047     }
4048
4049     if (ENABLE_H264_DECODER) {
4050         c->h264_idct_add= ff_h264_idct_add_c;
4051         c->h264_idct8_add= ff_h264_idct8_add_c;
4052         c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4053         c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4054     }
4055
4056     c->get_pixels = get_pixels_c;
4057     c->diff_pixels = diff_pixels_c;
4058     c->put_pixels_clamped = put_pixels_clamped_c;
4059     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4060     c->add_pixels_clamped = add_pixels_clamped_c;
4061     c->add_pixels8 = add_pixels8_c;
4062     c->add_pixels4 = add_pixels4_c;
4063     c->sum_abs_dctelem = sum_abs_dctelem_c;
4064     c->gmc1 = gmc1_c;
4065     c->gmc = ff_gmc_c;
4066     c->clear_blocks = clear_blocks_c;
4067     c->pix_sum = pix_sum_c;
4068     c->pix_norm1 = pix_norm1_c;
4069
4070     /* TODO [0] 16  [1] 8 */
4071     c->pix_abs[0][0] = pix_abs16_c;
4072     c->pix_abs[0][1] = pix_abs16_x2_c;
4073     c->pix_abs[0][2] = pix_abs16_y2_c;
4074     c->pix_abs[0][3] = pix_abs16_xy2_c;
4075     c->pix_abs[1][0] = pix_abs8_c;
4076     c->pix_abs[1][1] = pix_abs8_x2_c;
4077     c->pix_abs[1][2] = pix_abs8_y2_c;
4078     c->pix_abs[1][3] = pix_abs8_xy2_c;
4079
4080 #define dspfunc(PFX, IDX, NUM) \
4081     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4082     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4083     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4084     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4085
4086     dspfunc(put, 0, 16);
4087     dspfunc(put_no_rnd, 0, 16);
4088     dspfunc(put, 1, 8);
4089     dspfunc(put_no_rnd, 1, 8);
4090     dspfunc(put, 2, 4);
4091     dspfunc(put, 3, 2);
4092
4093     dspfunc(avg, 0, 16);
4094     dspfunc(avg_no_rnd, 0, 16);
4095     dspfunc(avg, 1, 8);
4096     dspfunc(avg_no_rnd, 1, 8);
4097     dspfunc(avg, 2, 4);
4098     dspfunc(avg, 3, 2);
4099 #undef dspfunc
4100
4101     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4102     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4103
4104     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4105     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4106     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4107     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4108     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4109     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4110     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4111     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4112     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4113
4114     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4115     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4116     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4117     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4118     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4119     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4120     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4121     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4122     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4123
4124 #define dspfunc(PFX, IDX, NUM) \
4125     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4126     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4127     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4128     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4129     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4130     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4131     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4132     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4133     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4134     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4135     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4136     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4137     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4138     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4139     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4140     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4141
4142     dspfunc(put_qpel, 0, 16);
4143     dspfunc(put_no_rnd_qpel, 0, 16);
4144
4145     dspfunc(avg_qpel, 0, 16);
4146     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4147
4148     dspfunc(put_qpel, 1, 8);
4149     dspfunc(put_no_rnd_qpel, 1, 8);
4150
4151     dspfunc(avg_qpel, 1, 8);
4152     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4153
4154     dspfunc(put_h264_qpel, 0, 16);
4155     dspfunc(put_h264_qpel, 1, 8);
4156     dspfunc(put_h264_qpel, 2, 4);
4157     dspfunc(put_h264_qpel, 3, 2);
4158     dspfunc(avg_h264_qpel, 0, 16);
4159     dspfunc(avg_h264_qpel, 1, 8);
4160     dspfunc(avg_h264_qpel, 2, 4);
4161
4162 #undef dspfunc
4163     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4164     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4165     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4166     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4167     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4168     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4169     c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4170
4171     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4172     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4173     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4174     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4175     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4176     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4177     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4178     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4179     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4180     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4181     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4182     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4183     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4184     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4185     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4186     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4187     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4188     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4189     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4190     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4191
4192 #ifdef CONFIG_CAVS_DECODER
4193     ff_cavsdsp_init(c,avctx);
4194 #endif
4195 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4196     ff_vc1dsp_init(c,avctx);
4197 #endif
4198 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4199     ff_intrax8dsp_init(c,avctx);
4200 #endif
4201 #if defined(CONFIG_H264_ENCODER)
4202     ff_h264dspenc_init(c,avctx);
4203 #endif
4204
4205     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4206     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4207     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4208     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4209     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4210     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4211     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4212     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4213
4214 #define SET_CMP_FUNC(name) \
4215     c->name[0]= name ## 16_c;\
4216     c->name[1]= name ## 8x8_c;
4217
4218     SET_CMP_FUNC(hadamard8_diff)
4219     c->hadamard8_diff[4]= hadamard8_intra16_c;
4220     SET_CMP_FUNC(dct_sad)
4221     SET_CMP_FUNC(dct_max)
4222 #ifdef CONFIG_GPL
4223     SET_CMP_FUNC(dct264_sad)
4224 #endif
4225     c->sad[0]= pix_abs16_c;
4226     c->sad[1]= pix_abs8_c;
4227     c->sse[0]= sse16_c;
4228     c->sse[1]= sse8_c;
4229     c->sse[2]= sse4_c;
4230     SET_CMP_FUNC(quant_psnr)
4231     SET_CMP_FUNC(rd)
4232     SET_CMP_FUNC(bit)
4233     c->vsad[0]= vsad16_c;
4234     c->vsad[4]= vsad_intra16_c;
4235     c->vsse[0]= vsse16_c;
4236     c->vsse[4]= vsse_intra16_c;
4237     c->nsse[0]= nsse16_c;
4238     c->nsse[1]= nsse8_c;
4239 #ifdef CONFIG_SNOW_ENCODER
4240     c->w53[0]= w53_16_c;
4241     c->w53[1]= w53_8_c;
4242     c->w97[0]= w97_16_c;
4243     c->w97[1]= w97_8_c;
4244 #endif
4245
4246     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4247
4248     c->add_bytes= add_bytes_c;
4249     c->add_bytes_l2= add_bytes_l2_c;
4250     c->diff_bytes= diff_bytes_c;
4251     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4252     c->bswap_buf= bswap_buf;
4253 #ifdef CONFIG_PNG_DECODER
4254     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4255 #endif
4256
4257     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4258     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4259     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4260     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4261     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4262     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4263     c->h264_loop_filter_strength= NULL;
4264
4265     if (ENABLE_ANY_H263) {
4266         c->h263_h_loop_filter= h263_h_loop_filter_c;
4267         c->h263_v_loop_filter= h263_v_loop_filter_c;
4268     }
4269
4270     c->h261_loop_filter= h261_loop_filter_c;
4271
4272     c->try_8x8basis= try_8x8basis_c;
4273     c->add_8x8basis= add_8x8basis_c;
4274
4275 #ifdef CONFIG_SNOW_DECODER
4276     c->vertical_compose97i = ff_snow_vertical_compose97i;
4277     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4278     c->inner_add_yblock = ff_snow_inner_add_yblock;
4279 #endif
4280
4281 #ifdef CONFIG_VORBIS_DECODER
4282     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4283 #endif
4284 #ifdef CONFIG_FLAC_ENCODER
4285     c->flac_compute_autocorr = ff_flac_compute_autocorr;
4286 #endif
4287     c->vector_fmul = vector_fmul_c;
4288     c->vector_fmul_reverse = vector_fmul_reverse_c;
4289     c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4290     c->float_to_int16 = ff_float_to_int16_c;
4291
4292     c->shrink[0]= ff_img_copy_plane;
4293     c->shrink[1]= ff_shrink22;
4294     c->shrink[2]= ff_shrink44;
4295     c->shrink[3]= ff_shrink88;
4296
4297     c->prefetch= just_return;
4298
4299     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4300     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4301
4302     if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
4303     if (ENABLE_ARMV4L)   dsputil_init_armv4l(c, avctx);
4304     if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
4305     if (ENABLE_VIS)      dsputil_init_vis   (c, avctx);
4306     if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
4307     if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
4308     if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
4309     if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
4310     if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);
4311
4312     for(i=0; i<64; i++){
4313         if(!c->put_2tap_qpel_pixels_tab[0][i])
4314             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4315         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4316             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4317     }
4318
4319     switch(c->idct_permutation_type){
4320     case FF_NO_IDCT_PERM:
4321         for(i=0; i<64; i++)
4322             c->idct_permutation[i]= i;
4323         break;
4324     case FF_LIBMPEG2_IDCT_PERM:
4325         for(i=0; i<64; i++)
4326             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4327         break;
4328     case FF_SIMPLE_IDCT_PERM:
4329         for(i=0; i<64; i++)
4330             c->idct_permutation[i]= simple_mmx_permutation[i];
4331         break;
4332     case FF_TRANSPOSE_IDCT_PERM:
4333         for(i=0; i<64; i++)
4334             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4335         break;
4336     case FF_PARTTRANS_IDCT_PERM:
4337         for(i=0; i<64; i++)
4338             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4339         break;
4340     default:
4341         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4342     }
4343 }
4344