libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "h263.h"
  36 #include "snow.h"
  37
  38 /* snow.c */
  39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  40
  41 /* vorbis.c */
  42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  43
  44 /* ac3dec.c */
  45 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
  46
  47 /* flacenc.c */
  48 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  49
  50 /* pngdec.c */
  51 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  52
  53 /* eaidct.c */
  54 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
  55
  56 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  57 uint32_t ff_squareTbl[512] = {0, };
  58
  59 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  60 #define pb_7f (~0UL/255 * 0x7f)
  61 #define pb_80 (~0UL/255 * 0x80)
  62
  63 const uint8_t ff_zigzag_direct[64] = {
  64     0,   1,  8, 16,  9,  2,  3, 10,
  65     17, 24, 32, 25, 18, 11,  4,  5,
  66     12, 19, 26, 33, 40, 48, 41, 34,
  67     27, 20, 13,  6,  7, 14, 21, 28,
  68     35, 42, 49, 56, 57, 50, 43, 36,
  69     29, 22, 15, 23, 30, 37, 44, 51,
  70     58, 59, 52, 45, 38, 31, 39, 46,
  71     53, 60, 61, 54, 47, 55, 62, 63
  72 };
  73
  74 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  75    specification, we interleave the fields */
  76 const uint8_t ff_zigzag248_direct[64] = {
  77      0,  8,  1,  9, 16, 24,  2, 10,
  78     17, 25, 32, 40, 48, 56, 33, 41,
  79     18, 26,  3, 11,  4, 12, 19, 27,
  80     34, 42, 49, 57, 50, 58, 35, 43,
  81     20, 28,  5, 13,  6, 14, 21, 29,
  82     36, 44, 51, 59, 52, 60, 37, 45,
  83     22, 30,  7, 15, 23, 31, 38, 46,
  84     53, 61, 54, 62, 39, 47, 55, 63,
  85 };
  86
  87 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  88 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  89
  90 const uint8_t ff_alternate_horizontal_scan[64] = {
  91     0,  1,   2,  3,  8,  9, 16, 17,
  92     10, 11,  4,  5,  6,  7, 15, 14,
  93     13, 12, 19, 18, 24, 25, 32, 33,
  94     26, 27, 20, 21, 22, 23, 28, 29,
  95     30, 31, 34, 35, 40, 41, 48, 49,
  96     42, 43, 36, 37, 38, 39, 44, 45,
  97     46, 47, 50, 51, 56, 57, 58, 59,
  98     52, 53, 54, 55, 60, 61, 62, 63,
  99 };
 100
 101 const uint8_t ff_alternate_vertical_scan[64] = {
 102     0,  8,  16, 24,  1,  9,  2, 10,
 103     17, 25, 32, 40, 48, 56, 57, 49,
 104     41, 33, 26, 18,  3, 11,  4, 12,
 105     19, 27, 34, 42, 50, 58, 35, 43,
 106     51, 59, 20, 28,  5, 13,  6, 14,
 107     21, 29, 36, 44, 52, 60, 37, 45,
 108     53, 61, 22, 30,  7, 15, 23, 31,
 109     38, 46, 54, 62, 39, 47, 55, 63,
 110 };
 111
 112 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
 113 const uint32_t ff_inverse[256]={
 114          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 115  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 116  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 117  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 118  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 119  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 120   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 121   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 122   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 123   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 124   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 125   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 126   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 127   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 128   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 129   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 130   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 131   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 132   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 133   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 134   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 135   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 136   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 137   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 138   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 139   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 140   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 141   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 142   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 143   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 144   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 145   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 146 };
 147
 148 /* Input permutation for the simple_idct_mmx */
 149 static const uint8_t simple_mmx_permutation[64]={
 150         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 151         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 152         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 153         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 154         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 155         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 156         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 157         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 158 };
 159
 160 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 161
 162 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 163     int i;
 164     int end;
 165
 166     st->scantable= src_scantable;
 167
 168     for(i=0; i<64; i++){
 169         int j;
 170         j = src_scantable[i];
 171         st->permutated[i] = permutation[j];
 172 #ifdef ARCH_POWERPC
 173         st->inverse[j] = i;
 174 #endif
 175     }
 176
 177     end=-1;
 178     for(i=0; i<64; i++){
 179         int j;
 180         j = st->permutated[i];
 181         if(j>end) end=j;
 182         st->raster_end[i]= end;
 183     }
 184 }
 185
 186 static int pix_sum_c(uint8_t * pix, int line_size)
 187 {
 188     int s, i, j;
 189
 190     s = 0;
 191     for (i = 0; i < 16; i++) {
 192         for (j = 0; j < 16; j += 8) {
 193             s += pix[0];
 194             s += pix[1];
 195             s += pix[2];
 196             s += pix[3];
 197             s += pix[4];
 198             s += pix[5];
 199             s += pix[6];
 200             s += pix[7];
 201             pix += 8;
 202         }
 203         pix += line_size - 16;
 204     }
 205     return s;
 206 }
 207
 208 static int pix_norm1_c(uint8_t * pix, int line_size)
 209 {
 210     int s, i, j;
 211     uint32_t *sq = ff_squareTbl + 256;
 212
 213     s = 0;
 214     for (i = 0; i < 16; i++) {
 215         for (j = 0; j < 16; j += 8) {
 216 #if 0
 217             s += sq[pix[0]];
 218             s += sq[pix[1]];
 219             s += sq[pix[2]];
 220             s += sq[pix[3]];
 221             s += sq[pix[4]];
 222             s += sq[pix[5]];
 223             s += sq[pix[6]];
 224             s += sq[pix[7]];
 225 #else
 226 #if LONG_MAX > 2147483647
 227             register uint64_t x=*(uint64_t*)pix;
 228             s += sq[x&0xff];
 229             s += sq[(x>>8)&0xff];
 230             s += sq[(x>>16)&0xff];
 231             s += sq[(x>>24)&0xff];
 232             s += sq[(x>>32)&0xff];
 233             s += sq[(x>>40)&0xff];
 234             s += sq[(x>>48)&0xff];
 235             s += sq[(x>>56)&0xff];
 236 #else
 237             register uint32_t x=*(uint32_t*)pix;
 238             s += sq[x&0xff];
 239             s += sq[(x>>8)&0xff];
 240             s += sq[(x>>16)&0xff];
 241             s += sq[(x>>24)&0xff];
 242             x=*(uint32_t*)(pix+4);
 243             s += sq[x&0xff];
 244             s += sq[(x>>8)&0xff];
 245             s += sq[(x>>16)&0xff];
 246             s += sq[(x>>24)&0xff];
 247 #endif
 248 #endif
 249             pix += 8;
 250         }
 251         pix += line_size - 16;
 252     }
 253     return s;
 254 }
 255
 256 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 257     int i;
 258
 259     for(i=0; i+8<=w; i+=8){
 260         dst[i+0]= bswap_32(src[i+0]);
 261         dst[i+1]= bswap_32(src[i+1]);
 262         dst[i+2]= bswap_32(src[i+2]);
 263         dst[i+3]= bswap_32(src[i+3]);
 264         dst[i+4]= bswap_32(src[i+4]);
 265         dst[i+5]= bswap_32(src[i+5]);
 266         dst[i+6]= bswap_32(src[i+6]);
 267         dst[i+7]= bswap_32(src[i+7]);
 268     }
 269     for(;i<w; i++){
 270         dst[i+0]= bswap_32(src[i+0]);
 271     }
 272 }
 273
 274 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 275 {
 276     int s, i;
 277     uint32_t *sq = ff_squareTbl + 256;
 278
 279     s = 0;
 280     for (i = 0; i < h; i++) {
 281         s += sq[pix1[0] - pix2[0]];
 282         s += sq[pix1[1] - pix2[1]];
 283         s += sq[pix1[2] - pix2[2]];
 284         s += sq[pix1[3] - pix2[3]];
 285         pix1 += line_size;
 286         pix2 += line_size;
 287     }
 288     return s;
 289 }
 290
 291 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 292 {
 293     int s, i;
 294     uint32_t *sq = ff_squareTbl + 256;
 295
 296     s = 0;
 297     for (i = 0; i < h; i++) {
 298         s += sq[pix1[0] - pix2[0]];
 299         s += sq[pix1[1] - pix2[1]];
 300         s += sq[pix1[2] - pix2[2]];
 301         s += sq[pix1[3] - pix2[3]];
 302         s += sq[pix1[4] - pix2[4]];
 303         s += sq[pix1[5] - pix2[5]];
 304         s += sq[pix1[6] - pix2[6]];
 305         s += sq[pix1[7] - pix2[7]];
 306         pix1 += line_size;
 307         pix2 += line_size;
 308     }
 309     return s;
 310 }
 311
 312 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 313 {
 314     int s, i;
 315     uint32_t *sq = ff_squareTbl + 256;
 316
 317     s = 0;
 318     for (i = 0; i < h; i++) {
 319         s += sq[pix1[ 0] - pix2[ 0]];
 320         s += sq[pix1[ 1] - pix2[ 1]];
 321         s += sq[pix1[ 2] - pix2[ 2]];
 322         s += sq[pix1[ 3] - pix2[ 3]];
 323         s += sq[pix1[ 4] - pix2[ 4]];
 324         s += sq[pix1[ 5] - pix2[ 5]];
 325         s += sq[pix1[ 6] - pix2[ 6]];
 326         s += sq[pix1[ 7] - pix2[ 7]];
 327         s += sq[pix1[ 8] - pix2[ 8]];
 328         s += sq[pix1[ 9] - pix2[ 9]];
 329         s += sq[pix1[10] - pix2[10]];
 330         s += sq[pix1[11] - pix2[11]];
 331         s += sq[pix1[12] - pix2[12]];
 332         s += sq[pix1[13] - pix2[13]];
 333         s += sq[pix1[14] - pix2[14]];
 334         s += sq[pix1[15] - pix2[15]];
 335
 336         pix1 += line_size;
 337         pix2 += line_size;
 338     }
 339     return s;
 340 }
 341
 342
 343 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 344 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 345     int s, i, j;
 346     const int dec_count= w==8 ? 3 : 4;
 347     int tmp[32*32];
 348     int level, ori;
 349     static const int scale[2][2][4][4]={
 350       {
 351         {
 352             // 9/7 8x8 dec=3
 353             {268, 239, 239, 213},
 354             {  0, 224, 224, 152},
 355             {  0, 135, 135, 110},
 356         },{
 357             // 9/7 16x16 or 32x32 dec=4
 358             {344, 310, 310, 280},
 359             {  0, 320, 320, 228},
 360             {  0, 175, 175, 136},
 361             {  0, 129, 129, 102},
 362         }
 363       },{
 364         {
 365             // 5/3 8x8 dec=3
 366             {275, 245, 245, 218},
 367             {  0, 230, 230, 156},
 368             {  0, 138, 138, 113},
 369         },{
 370             // 5/3 16x16 or 32x32 dec=4
 371             {352, 317, 317, 286},
 372             {  0, 328, 328, 233},
 373             {  0, 180, 180, 140},
 374             {  0, 132, 132, 105},
 375         }
 376       }
 377     };
 378
 379     for (i = 0; i < h; i++) {
 380         for (j = 0; j < w; j+=4) {
 381             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 382             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 383             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 384             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 385         }
 386         pix1 += line_size;
 387         pix2 += line_size;
 388     }
 389
 390     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 391
 392     s=0;
 393     assert(w==h);
 394     for(level=0; level<dec_count; level++){
 395         for(ori= level ? 1 : 0; ori<4; ori++){
 396             int size= w>>(dec_count-level);
 397             int sx= (ori&1) ? size : 0;
 398             int stride= 32<<(dec_count-level);
 399             int sy= (ori&2) ? stride>>1 : 0;
 400
 401             for(i=0; i<size; i++){
 402                 for(j=0; j<size; j++){
 403                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 404                     s += FFABS(v);
 405                 }
 406             }
 407         }
 408     }
 409     assert(s>=0);
 410     return s>>9;
 411 }
 412
 413 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 414     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 415 }
 416
 417 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 418     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 419 }
 420
 421 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 422     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 423 }
 424
 425 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 426     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 427 }
 428
 429 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 430     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 431 }
 432
 433 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 434     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 435 }
 436 #endif
 437
 438 /* draw the edges of width 'w' of an image of size width, height */
 439 //FIXME check that this is ok for mpeg4 interlaced
 440 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 441 {
 442     uint8_t *ptr, *last_line;
 443     int i;
 444
 445     last_line = buf + (height - 1) * wrap;
 446     for(i=0;i<w;i++) {
 447         /* top and bottom */
 448         memcpy(buf - (i + 1) * wrap, buf, width);
 449         memcpy(last_line + (i + 1) * wrap, last_line, width);
 450     }
 451     /* left and right */
 452     ptr = buf;
 453     for(i=0;i<height;i++) {
 454         memset(ptr - w, ptr[0], w);
 455         memset(ptr + width, ptr[width-1], w);
 456         ptr += wrap;
 457     }
 458     /* corners */
 459     for(i=0;i<w;i++) {
 460         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 461         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 462         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 463         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 464     }
 465 }
 466
 467 /**
 468  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 469  * @param buf destination buffer
 470  * @param src source buffer
 471  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 472  * @param block_w width of block
 473  * @param block_h height of block
 474  * @param src_x x coordinate of the top left sample of the block in the source buffer
 475  * @param src_y y coordinate of the top left sample of the block in the source buffer
 476  * @param w width of the source buffer
 477  * @param h height of the source buffer
 478  */
 479 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 480                                     int src_x, int src_y, int w, int h){
 481     int x, y;
 482     int start_y, start_x, end_y, end_x;
 483
 484     if(src_y>= h){
 485         src+= (h-1-src_y)*linesize;
 486         src_y=h-1;
 487     }else if(src_y<=-block_h){
 488         src+= (1-block_h-src_y)*linesize;
 489         src_y=1-block_h;
 490     }
 491     if(src_x>= w){
 492         src+= (w-1-src_x);
 493         src_x=w-1;
 494     }else if(src_x<=-block_w){
 495         src+= (1-block_w-src_x);
 496         src_x=1-block_w;
 497     }
 498
 499     start_y= FFMAX(0, -src_y);
 500     start_x= FFMAX(0, -src_x);
 501     end_y= FFMIN(block_h, h-src_y);
 502     end_x= FFMIN(block_w, w-src_x);
 503
 504     // copy existing part
 505     for(y=start_y; y<end_y; y++){
 506         for(x=start_x; x<end_x; x++){
 507             buf[x + y*linesize]= src[x + y*linesize];
 508         }
 509     }
 510
 511     //top
 512     for(y=0; y<start_y; y++){
 513         for(x=start_x; x<end_x; x++){
 514             buf[x + y*linesize]= buf[x + start_y*linesize];
 515         }
 516     }
 517
 518     //bottom
 519     for(y=end_y; y<block_h; y++){
 520         for(x=start_x; x<end_x; x++){
 521             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 522         }
 523     }
 524
 525     for(y=0; y<block_h; y++){
 526        //left
 527         for(x=0; x<start_x; x++){
 528             buf[x + y*linesize]= buf[start_x + y*linesize];
 529         }
 530
 531        //right
 532         for(x=end_x; x<block_w; x++){
 533             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 534         }
 535     }
 536 }
 537
 538 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 539 {
 540     int i;
 541
 542     /* read the pixels */
 543     for(i=0;i<8;i++) {
 544         block[0] = pixels[0];
 545         block[1] = pixels[1];
 546         block[2] = pixels[2];
 547         block[3] = pixels[3];
 548         block[4] = pixels[4];
 549         block[5] = pixels[5];
 550         block[6] = pixels[6];
 551         block[7] = pixels[7];
 552         pixels += line_size;
 553         block += 8;
 554     }
 555 }
 556
 557 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 558                           const uint8_t *s2, int stride){
 559     int i;
 560
 561     /* read the pixels */
 562     for(i=0;i<8;i++) {
 563         block[0] = s1[0] - s2[0];
 564         block[1] = s1[1] - s2[1];
 565         block[2] = s1[2] - s2[2];
 566         block[3] = s1[3] - s2[3];
 567         block[4] = s1[4] - s2[4];
 568         block[5] = s1[5] - s2[5];
 569         block[6] = s1[6] - s2[6];
 570         block[7] = s1[7] - s2[7];
 571         s1 += stride;
 572         s2 += stride;
 573         block += 8;
 574     }
 575 }
 576
 577
 578 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 579                                  int line_size)
 580 {
 581     int i;
 582     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 583
 584     /* read the pixels */
 585     for(i=0;i<8;i++) {
 586         pixels[0] = cm[block[0]];
 587         pixels[1] = cm[block[1]];
 588         pixels[2] = cm[block[2]];
 589         pixels[3] = cm[block[3]];
 590         pixels[4] = cm[block[4]];
 591         pixels[5] = cm[block[5]];
 592         pixels[6] = cm[block[6]];
 593         pixels[7] = cm[block[7]];
 594
 595         pixels += line_size;
 596         block += 8;
 597     }
 598 }
 599
 600 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 601                                  int line_size)
 602 {
 603     int i;
 604     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 605
 606     /* read the pixels */
 607     for(i=0;i<4;i++) {
 608         pixels[0] = cm[block[0]];
 609         pixels[1] = cm[block[1]];
 610         pixels[2] = cm[block[2]];
 611         pixels[3] = cm[block[3]];
 612
 613         pixels += line_size;
 614         block += 8;
 615     }
 616 }
 617
 618 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 619                                  int line_size)
 620 {
 621     int i;
 622     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 623
 624     /* read the pixels */
 625     for(i=0;i<2;i++) {
 626         pixels[0] = cm[block[0]];
 627         pixels[1] = cm[block[1]];
 628
 629         pixels += line_size;
 630         block += 8;
 631     }
 632 }
 633
 634 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 635                                         uint8_t *restrict pixels,
 636                                         int line_size)
 637 {
 638     int i, j;
 639
 640     for (i = 0; i < 8; i++) {
 641         for (j = 0; j < 8; j++) {
 642             if (*block < -128)
 643                 *pixels = 0;
 644             else if (*block > 127)
 645                 *pixels = 255;
 646             else
 647                 *pixels = (uint8_t)(*block + 128);
 648             block++;
 649             pixels++;
 650         }
 651         pixels += (line_size - 8);
 652     }
 653 }
 654
 655 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 656                           int line_size)
 657 {
 658     int i;
 659     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 660
 661     /* read the pixels */
 662     for(i=0;i<8;i++) {
 663         pixels[0] = cm[pixels[0] + block[0]];
 664         pixels[1] = cm[pixels[1] + block[1]];
 665         pixels[2] = cm[pixels[2] + block[2]];
 666         pixels[3] = cm[pixels[3] + block[3]];
 667         pixels[4] = cm[pixels[4] + block[4]];
 668         pixels[5] = cm[pixels[5] + block[5]];
 669         pixels[6] = cm[pixels[6] + block[6]];
 670         pixels[7] = cm[pixels[7] + block[7]];
 671         pixels += line_size;
 672         block += 8;
 673     }
 674 }
 675
 676 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 677                           int line_size)
 678 {
 679     int i;
 680     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 681
 682     /* read the pixels */
 683     for(i=0;i<4;i++) {
 684         pixels[0] = cm[pixels[0] + block[0]];
 685         pixels[1] = cm[pixels[1] + block[1]];
 686         pixels[2] = cm[pixels[2] + block[2]];
 687         pixels[3] = cm[pixels[3] + block[3]];
 688         pixels += line_size;
 689         block += 8;
 690     }
 691 }
 692
 693 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 694                           int line_size)
 695 {
 696     int i;
 697     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 698
 699     /* read the pixels */
 700     for(i=0;i<2;i++) {
 701         pixels[0] = cm[pixels[0] + block[0]];
 702         pixels[1] = cm[pixels[1] + block[1]];
 703         pixels += line_size;
 704         block += 8;
 705     }
 706 }
 707
 708 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 709 {
 710     int i;
 711     for(i=0;i<8;i++) {
 712         pixels[0] += block[0];
 713         pixels[1] += block[1];
 714         pixels[2] += block[2];
 715         pixels[3] += block[3];
 716         pixels[4] += block[4];
 717         pixels[5] += block[5];
 718         pixels[6] += block[6];
 719         pixels[7] += block[7];
 720         pixels += line_size;
 721         block += 8;
 722     }
 723 }
 724
 725 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 726 {
 727     int i;
 728     for(i=0;i<4;i++) {
 729         pixels[0] += block[0];
 730         pixels[1] += block[1];
 731         pixels[2] += block[2];
 732         pixels[3] += block[3];
 733         pixels += line_size;
 734         block += 4;
 735     }
 736 }
 737
 738 static int sum_abs_dctelem_c(DCTELEM *block)
 739 {
 740     int sum=0, i;
 741     for(i=0; i<64; i++)
 742         sum+= FFABS(block[i]);
 743     return sum;
 744 }
 745
 746 #if 0
 747
 748 #define PIXOP2(OPNAME, OP) \
 749 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 750 {\
 751     int i;\
 752     for(i=0; i<h; i++){\
 753         OP(*((uint64_t*)block), AV_RN64(pixels));\
 754         pixels+=line_size;\
 755         block +=line_size;\
 756     }\
 757 }\
 758 \
 759 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 760 {\
 761     int i;\
 762     for(i=0; i<h; i++){\
 763         const uint64_t a= AV_RN64(pixels  );\
 764         const uint64_t b= AV_RN64(pixels+1);\
 765         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 766         pixels+=line_size;\
 767         block +=line_size;\
 768     }\
 769 }\
 770 \
 771 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 772 {\
 773     int i;\
 774     for(i=0; i<h; i++){\
 775         const uint64_t a= AV_RN64(pixels  );\
 776         const uint64_t b= AV_RN64(pixels+1);\
 777         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 778         pixels+=line_size;\
 779         block +=line_size;\
 780     }\
 781 }\
 782 \
 783 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 784 {\
 785     int i;\
 786     for(i=0; i<h; i++){\
 787         const uint64_t a= AV_RN64(pixels          );\
 788         const uint64_t b= AV_RN64(pixels+line_size);\
 789         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 790         pixels+=line_size;\
 791         block +=line_size;\
 792     }\
 793 }\
 794 \
 795 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 796 {\
 797     int i;\
 798     for(i=0; i<h; i++){\
 799         const uint64_t a= AV_RN64(pixels          );\
 800         const uint64_t b= AV_RN64(pixels+line_size);\
 801         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 802         pixels+=line_size;\
 803         block +=line_size;\
 804     }\
 805 }\
 806 \
 807 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 808 {\
 809         int i;\
 810         const uint64_t a= AV_RN64(pixels  );\
 811         const uint64_t b= AV_RN64(pixels+1);\
 812         uint64_t l0=  (a&0x0303030303030303ULL)\
 813                     + (b&0x0303030303030303ULL)\
 814                     + 0x0202020202020202ULL;\
 815         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 816                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 817         uint64_t l1,h1;\
 818 \
 819         pixels+=line_size;\
 820         for(i=0; i<h; i+=2){\
 821             uint64_t a= AV_RN64(pixels  );\
 822             uint64_t b= AV_RN64(pixels+1);\
 823             l1=  (a&0x0303030303030303ULL)\
 824                + (b&0x0303030303030303ULL);\
 825             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 826               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 827             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 828             pixels+=line_size;\
 829             block +=line_size;\
 830             a= AV_RN64(pixels  );\
 831             b= AV_RN64(pixels+1);\
 832             l0=  (a&0x0303030303030303ULL)\
 833                + (b&0x0303030303030303ULL)\
 834                + 0x0202020202020202ULL;\
 835             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 836               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 837             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 838             pixels+=line_size;\
 839             block +=line_size;\
 840         }\
 841 }\
 842 \
/* 2x2 neighbourhood mean with the smaller rounding term, 8 pixels (one uint64_t) per row: */\
/* every output byte is (p[x] + p[x+1] + q[x] + q[x+1] + 1) >> 2, q being the row below p. */\
/* Each byte is split into its low 2 bits (l0/l1) and its high 6 bits pre-shifted by 2 */\
/* (h0/h1), so the four-way sums stay inside their own byte lane. */\
/* Two rows are written per loop iteration, so an odd h still produces h+1 rows. */\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        /* horizontal pair sums of the first row; the 0x01 per byte is the rounding term */\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            /* odd source row: sums go to l1/h1 (no rounding term, l0 already carries it) */\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            /* the 0x0F mask drops the bits shifted in from the neighbouring byte lane */\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            /* even source row: refresh l0/h0, then combine with the l1/h1 kept from above */\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
 878 \
/* Bind the 16-wide function names to the 8-wide functions of this variant. */\
/* CALL_2X_PIXELS is defined outside this chunk; presumably it calls the narrow */\
/* function twice, the second time at the given offset (8) -- confirm. */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 886
/* Averaging store for 64-bit words: a becomes the per-byte mean of a and b, rounded up.
 * (a|b) - ((a^b)>>1) equals (a+b+1)>>1 for each byte; masking a^b with 0xFE before the
 * shift keeps the low bit of one byte from sliding into the byte below it. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 888 #else // 64 bit variant
 889
 890 #define PIXOP2(OPNAME, OP) \
 891 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 892     int i;\
 893     for(i=0; i<h; i++){\
 894         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 895         pixels+=line_size;\
 896         block +=line_size;\
 897     }\
 898 }\
 899 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 900     int i;\
 901     for(i=0; i<h; i++){\
 902         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 903         pixels+=line_size;\
 904         block +=line_size;\
 905     }\
 906 }\
 907 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 908     int i;\
 909     for(i=0; i<h; i++){\
 910         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 911         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 912         pixels+=line_size;\
 913         block +=line_size;\
 914     }\
 915 }\
 916 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 917     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 918 }\
 919 \
 920 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 921                                                 int src_stride1, int src_stride2, int h){\
 922     int i;\
 923     for(i=0; i<h; i++){\
 924         uint32_t a,b;\
 925         a= AV_RN32(&src1[i*src_stride1  ]);\
 926         b= AV_RN32(&src2[i*src_stride2  ]);\
 927         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 928         a= AV_RN32(&src1[i*src_stride1+4]);\
 929         b= AV_RN32(&src2[i*src_stride2+4]);\
 930         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 931     }\
 932 }\
 933 \
 934 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 935                                                 int src_stride1, int src_stride2, int h){\
 936     int i;\
 937     for(i=0; i<h; i++){\
 938         uint32_t a,b;\
 939         a= AV_RN32(&src1[i*src_stride1  ]);\
 940         b= AV_RN32(&src2[i*src_stride2  ]);\
 941         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 942         a= AV_RN32(&src1[i*src_stride1+4]);\
 943         b= AV_RN32(&src2[i*src_stride2+4]);\
 944         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 945     }\
 946 }\
 947 \
 948 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 949                                                 int src_stride1, int src_stride2, int h){\
 950     int i;\
 951     for(i=0; i<h; i++){\
 952         uint32_t a,b;\
 953         a= AV_RN32(&src1[i*src_stride1  ]);\
 954         b= AV_RN32(&src2[i*src_stride2  ]);\
 955         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 956     }\
 957 }\
 958 \
 959 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 960                                                 int src_stride1, int src_stride2, int h){\
 961     int i;\
 962     for(i=0; i<h; i++){\
 963         uint32_t a,b;\
 964         a= AV_RN16(&src1[i*src_stride1  ]);\
 965         b= AV_RN16(&src2[i*src_stride2  ]);\
 966         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 967     }\
 968 }\
 969 \
 970 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 971                                                 int src_stride1, int src_stride2, int h){\
 972     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 973     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 974 }\
 975 \
 976 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 977                                                 int src_stride1, int src_stride2, int h){\
 978     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 979     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 980 }\
 981 \
 982 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 983     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 984 }\
 985 \
 986 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 987     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 988 }\
 989 \
 990 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 991     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 992 }\
 993 \
 994 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 995     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 996 }\
 997 \
 998 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 999                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1000     int i;\
1001     for(i=0; i<h; i++){\
1002         uint32_t a, b, c, d, l0, l1, h0, h1;\
1003         a= AV_RN32(&src1[i*src_stride1]);\
1004         b= AV_RN32(&src2[i*src_stride2]);\
1005         c= AV_RN32(&src3[i*src_stride3]);\
1006         d= AV_RN32(&src4[i*src_stride4]);\
1007         l0=  (a&0x03030303UL)\
1008            + (b&0x03030303UL)\
1009            + 0x02020202UL;\
1010         h0= ((a&0xFCFCFCFCUL)>>2)\
1011           + ((b&0xFCFCFCFCUL)>>2);\
1012         l1=  (c&0x03030303UL)\
1013            + (d&0x03030303UL);\
1014         h1= ((c&0xFCFCFCFCUL)>>2)\
1015           + ((d&0xFCFCFCFCUL)>>2);\
1016         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1017         a= AV_RN32(&src1[i*src_stride1+4]);\
1018         b= AV_RN32(&src2[i*src_stride2+4]);\
1019         c= AV_RN32(&src3[i*src_stride3+4]);\
1020         d= AV_RN32(&src4[i*src_stride4+4]);\
1021         l0=  (a&0x03030303UL)\
1022            + (b&0x03030303UL)\
1023            + 0x02020202UL;\
1024         h0= ((a&0xFCFCFCFCUL)>>2)\
1025           + ((b&0xFCFCFCFCUL)>>2);\
1026         l1=  (c&0x03030303UL)\
1027            + (d&0x03030303UL);\
1028         h1= ((c&0xFCFCFCFCUL)>>2)\
1029           + ((d&0xFCFCFCFCUL)>>2);\
1030         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1031     }\
1032 }\
1033 \
1034 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1035     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1036 }\
1037 \
1038 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1039     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1040 }\
1041 \
1042 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1043     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1044 }\
1045 \
1046 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1047     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1048 }\
1049 \
1050 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1051                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1052     int i;\
1053     for(i=0; i<h; i++){\
1054         uint32_t a, b, c, d, l0, l1, h0, h1;\
1055         a= AV_RN32(&src1[i*src_stride1]);\
1056         b= AV_RN32(&src2[i*src_stride2]);\
1057         c= AV_RN32(&src3[i*src_stride3]);\
1058         d= AV_RN32(&src4[i*src_stride4]);\
1059         l0=  (a&0x03030303UL)\
1060            + (b&0x03030303UL)\
1061            + 0x01010101UL;\
1062         h0= ((a&0xFCFCFCFCUL)>>2)\
1063           + ((b&0xFCFCFCFCUL)>>2);\
1064         l1=  (c&0x03030303UL)\
1065            + (d&0x03030303UL);\
1066         h1= ((c&0xFCFCFCFCUL)>>2)\
1067           + ((d&0xFCFCFCFCUL)>>2);\
1068         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1069         a= AV_RN32(&src1[i*src_stride1+4]);\
1070         b= AV_RN32(&src2[i*src_stride2+4]);\
1071         c= AV_RN32(&src3[i*src_stride3+4]);\
1072         d= AV_RN32(&src4[i*src_stride4+4]);\
1073         l0=  (a&0x03030303UL)\
1074            + (b&0x03030303UL)\
1075            + 0x01010101UL;\
1076         h0= ((a&0xFCFCFCFCUL)>>2)\
1077           + ((b&0xFCFCFCFCUL)>>2);\
1078         l1=  (c&0x03030303UL)\
1079            + (d&0x03030303UL);\
1080         h1= ((c&0xFCFCFCFCUL)>>2)\
1081           + ((d&0xFCFCFCFCUL)>>2);\
1082         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1083     }\
1084 }\
1085 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1086                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1087     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1088     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1089 }\
1090 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1091                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1092     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1093     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1094 }\
1095 \
1096 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1097 {\
1098         int i, a0, b0, a1, b1;\
1099         a0= pixels[0];\
1100         b0= pixels[1] + 2;\
1101         a0 += b0;\
1102         b0 += pixels[2];\
1103 \
1104         pixels+=line_size;\
1105         for(i=0; i<h; i+=2){\
1106             a1= pixels[0];\
1107             b1= pixels[1];\
1108             a1 += b1;\
1109             b1 += pixels[2];\
1110 \
1111             block[0]= (a1+a0)>>2; /* FIXME non put */\
1112             block[1]= (b1+b0)>>2;\
1113 \
1114             pixels+=line_size;\
1115             block +=line_size;\
1116 \
1117             a0= pixels[0];\
1118             b0= pixels[1] + 2;\
1119             a0 += b0;\
1120             b0 += pixels[2];\
1121 \
1122             block[0]= (a1+a0)>>2;\
1123             block[1]= (b1+b0)>>2;\
1124             pixels+=line_size;\
1125             block +=line_size;\
1126         }\
1127 }\
1128 \
1129 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1130 {\
1131         int i;\
1132         const uint32_t a= AV_RN32(pixels  );\
1133         const uint32_t b= AV_RN32(pixels+1);\
1134         uint32_t l0=  (a&0x03030303UL)\
1135                     + (b&0x03030303UL)\
1136                     + 0x02020202UL;\
1137         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1138                    + ((b&0xFCFCFCFCUL)>>2);\
1139         uint32_t l1,h1;\
1140 \
1141         pixels+=line_size;\
1142         for(i=0; i<h; i+=2){\
1143             uint32_t a= AV_RN32(pixels  );\
1144             uint32_t b= AV_RN32(pixels+1);\
1145             l1=  (a&0x03030303UL)\
1146                + (b&0x03030303UL);\
1147             h1= ((a&0xFCFCFCFCUL)>>2)\
1148               + ((b&0xFCFCFCFCUL)>>2);\
1149             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1150             pixels+=line_size;\
1151             block +=line_size;\
1152             a= AV_RN32(pixels  );\
1153             b= AV_RN32(pixels+1);\
1154             l0=  (a&0x03030303UL)\
1155                + (b&0x03030303UL)\
1156                + 0x02020202UL;\
1157             h0= ((a&0xFCFCFCFCUL)>>2)\
1158               + ((b&0xFCFCFCFCUL)>>2);\
1159             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1160             pixels+=line_size;\
1161             block +=line_size;\
1162         }\
1163 }\
1164 \
1165 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1166 {\
1167     int j;\
1168     for(j=0; j<2; j++){\
1169         int i;\
1170         const uint32_t a= AV_RN32(pixels  );\
1171         const uint32_t b= AV_RN32(pixels+1);\
1172         uint32_t l0=  (a&0x03030303UL)\
1173                     + (b&0x03030303UL)\
1174                     + 0x02020202UL;\
1175         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1176                    + ((b&0xFCFCFCFCUL)>>2);\
1177         uint32_t l1,h1;\
1178 \
1179         pixels+=line_size;\
1180         for(i=0; i<h; i+=2){\
1181             uint32_t a= AV_RN32(pixels  );\
1182             uint32_t b= AV_RN32(pixels+1);\
1183             l1=  (a&0x03030303UL)\
1184                + (b&0x03030303UL);\
1185             h1= ((a&0xFCFCFCFCUL)>>2)\
1186               + ((b&0xFCFCFCFCUL)>>2);\
1187             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1188             pixels+=line_size;\
1189             block +=line_size;\
1190             a= AV_RN32(pixels  );\
1191             b= AV_RN32(pixels+1);\
1192             l0=  (a&0x03030303UL)\
1193                + (b&0x03030303UL)\
1194                + 0x02020202UL;\
1195             h0= ((a&0xFCFCFCFCUL)>>2)\
1196               + ((b&0xFCFCFCFCUL)>>2);\
1197             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1198             pixels+=line_size;\
1199             block +=line_size;\
1200         }\
1201         pixels+=4-line_size*(h+1);\
1202         block +=4-line_size*h;\
1203     }\
1204 }\
1205 \
1206 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1207 {\
1208     int j;\
1209     for(j=0; j<2; j++){\
1210         int i;\
1211         const uint32_t a= AV_RN32(pixels  );\
1212         const uint32_t b= AV_RN32(pixels+1);\
1213         uint32_t l0=  (a&0x03030303UL)\
1214                     + (b&0x03030303UL)\
1215                     + 0x01010101UL;\
1216         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1217                    + ((b&0xFCFCFCFCUL)>>2);\
1218         uint32_t l1,h1;\
1219 \
1220         pixels+=line_size;\
1221         for(i=0; i<h; i+=2){\
1222             uint32_t a= AV_RN32(pixels  );\
1223             uint32_t b= AV_RN32(pixels+1);\
1224             l1=  (a&0x03030303UL)\
1225                + (b&0x03030303UL);\
1226             h1= ((a&0xFCFCFCFCUL)>>2)\
1227               + ((b&0xFCFCFCFCUL)>>2);\
1228             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1229             pixels+=line_size;\
1230             block +=line_size;\
1231             a= AV_RN32(pixels  );\
1232             b= AV_RN32(pixels+1);\
1233             l0=  (a&0x03030303UL)\
1234                + (b&0x03030303UL)\
1235                + 0x01010101UL;\
1236             h0= ((a&0xFCFCFCFCUL)>>2)\
1237               + ((b&0xFCFCFCFCUL)>>2);\
1238             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1239             pixels+=line_size;\
1240             block +=line_size;\
1241         }\
1242         pixels+=4-line_size*(h+1);\
1243         block +=4-line_size*h;\
1244     }\
1245 }\
1246 \
1247 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1248 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1249 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1250 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1251 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1252 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1253 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1254 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1255
/* Averaging store for the 32-bit variant: a is combined with b through rnd_avg32()
 * (declared outside this chunk; presumably a per-byte rounded mean -- confirm). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* Plain overwriting store; defined after the #endif, so it applies to either variant. */
#define op_put(a, b) a = b

/* Emit the function family twice: avg_* with the averaging store, put_* with the plain one. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
/* The store macros are not needed past the two instantiations. */
#undef op_avg
#undef op_put
1264
/* Rounded integer mean of two (avg2) or four (avg4) values.
 * Every argument is parenthesized so that operator expressions such as
 * x|y or c?x:y can be passed without being torn apart by the additions;
 * each argument is still evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1267
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources are stepped with the same stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1271
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources are stepped with the same stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
1275
/**
 * Bilinear interpolation of an 8-pixel-wide, h-row block at a fractional
 * offset given in 1/16 pixel units.
 *
 * The four tap weights add up to 16*16 = 256, hence the final shift by 8.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; 9 bytes of the current row and of the row below are read
 * @param stride  byte distance between rows, shared by dst and src
 * @param h       number of rows
 * @param x16     horizontal weight of the right-hand samples
 * @param y16     vertical weight of the lower samples
 * @param rounder constant added before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int wx = 16 - x16;
    const int wy = 16 - y16;
    const int A  = wx  * wy;   /* top-left     */
    const int B  = x16 * wy;   /* top-right    */
    const int C  = wx  * y16;  /* bottom-left  */
    const int D  = x16 * y16;  /* bottom-right */
    int row;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++)
            dst[col] = (A*src[col] + B*src[col+1] + C*below[col] + D*below[col+1] + rounder) >> 8;
    }
}
1298
/**
 * Affine-warped bilinear fetch of an 8-pixel-wide, h-row block.
 *
 * For every destination pixel a source position (vx, vy) is tracked: it
 * starts at (ox, oy), moves by (dxx, dyx) per column and the row start moves
 * by (dxy, dyy) per row. The low 16 bits of vx/vy are discarded; of what
 * remains, the low `shift` bits are the interpolation fraction and the rest
 * is the integer sample coordinate.
 *
 * Coordinates outside [0, width-2] x [0, height-2] are clamped with
 * av_clip(); interpolation is then only done along the axis that is still
 * inside, or not at all when both are outside.
 *
 * @param r      rounding constant added before the shift by 2*shift
 * @param width  source width; internally reduced by one
 * @param height source height; internally reduced by one
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int norm      = shift * 2;
    int y;

    /* largest coordinate that still has a right / lower neighbour */
    width--;
    height--;

    for (y = 0; y < h; y++, ox += dxy, oy += dyy) {
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            uint8_t *out     = &dst[y*stride + x];
            int src_x        = vx >> 16;
            int src_y        = vy >> 16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;
            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)src_x < width;
            y_inside = (unsigned)src_y < height;

            if (x_inside && y_inside) {
                /* full bilinear blend of the 2x2 neighbourhood */
                index = src_x + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_x)
                           + src[index       +1]*   frac_x )*(s-frac_y)
                        + (  src[index+stride  ]*(s-frac_x)
                           + src[index+stride+1]*   frac_x )*   frac_y
                        + r) >> norm;
            } else if (x_inside) {
                /* row clamped: horizontal blend only */
                index = src_x + av_clip(src_y, 0, height)*stride;
                *out = ( (  src[index         ]*(s-frac_x)
                          + src[index       +1]*   frac_x )*s
                        + r) >> norm;
            } else if (y_inside) {
                /* column clamped: vertical blend only */
                index = av_clip(src_x, 0, width) + src_y*stride;
                *out = (  (  src[index         ]*(s-frac_y)
                           + src[index+stride  ]*   frac_y )*s
                        + r) >> norm;
            } else {
                /* both clamped: nearest sample */
                index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                *out = src[index];
            }
        }
    }
}
1356
/**
 * Full-pel case: forward to the plain copy routine matching the block width.
 * Widths other than 2, 4, 8 or 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1365
/**
 * Horizontal interpolation weighting the left sample twice as much as the
 * right one: dst = (2*left + right + 1) / 3, with the division done as a
 * multiply by 683 and a shift by 11 (683/2048 is roughly 1/3).
 * Reads width+1 bytes per source row.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int left  = src[col];
            const int right = src[col+1];
            dst[col] = (683*(2*left + right + 1)) >> 11;
        }
    }
}
1376
/**
 * Horizontal interpolation weighting the right sample twice as much as the
 * left one: dst = (left + 2*right + 1) / 3, with the division done as a
 * multiply by 683 and a shift by 11 (683/2048 is roughly 1/3).
 * Reads width+1 bytes per source row.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int left  = src[col];
            const int right = src[col+1];
            dst[col] = (683*(left + 2*right + 1)) >> 11;
        }
    }
}
1387
/**
 * Vertical interpolation weighting the current row twice as much as the row
 * below: dst = (2*upper + lower + 1) / 3, with the division done as a
 * multiply by 683 and a shift by 11 (683/2048 is roughly 1/3).
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int upper = src[col];
            const int lower = src[col+stride];
            dst[col] = (683*(2*upper + lower + 1)) >> 11;
        }
    }
}
1398
/**
 * Two-dimensional interpolation over a 2x2 neighbourhood with weights
 * 4 (top-left), 3 (top-right), 3 (bottom-left), 2 (bottom-right).
 * The weights add up to 12; the division by 12 is done as a multiply by
 * 2731 and a shift by 15 (2731/32768 is roughly 1/12), +6 being the rounding.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col+1];
            const int bl = src[col+stride];
            const int br = src[col+stride+1];
            dst[col] = (2731*(4*tl + 3*tr + 3*bl + 2*br + 6)) >> 15;
        }
    }
}
1409
/**
 * Two-dimensional interpolation over a 2x2 neighbourhood with weights
 * 3 (top-left), 2 (top-right), 4 (bottom-left), 3 (bottom-right).
 * The weights add up to 12; the division by 12 is done as a multiply by
 * 2731 and a shift by 15 (2731/32768 is roughly 1/12), +6 being the rounding.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col+1];
            const int bl = src[col+stride];
            const int br = src[col+stride+1];
            dst[col] = (2731*(3*tl + 2*tr + 4*bl + 3*br + 6)) >> 15;
        }
    }
}
1420
/**
 * Vertical interpolation weighting the row below twice as much as the
 * current row: dst = (upper + 2*lower + 1) / 3, with the division done as a
 * multiply by 683 and a shift by 11 (683/2048 is roughly 1/3).
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int upper = src[col];
            const int lower = src[col+stride];
            dst[col] = (683*(upper + 2*lower + 1)) >> 11;
        }
    }
}
1431
/**
 * Two-dimensional interpolation over a 2x2 neighbourhood with weights
 * 3 (top-left), 4 (top-right), 2 (bottom-left), 3 (bottom-right).
 * The weights add up to 12; the division by 12 is done as a multiply by
 * 2731 and a shift by 15 (2731/32768 is roughly 1/12), +6 being the rounding.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col+1];
            const int bl = src[col+stride];
            const int br = src[col+stride+1];
            dst[col] = (2731*(3*tl + 4*tr + 2*bl + 3*br + 6)) >> 15;
        }
    }
}
1442
/**
 * Two-dimensional interpolation over a 2x2 neighbourhood with weights
 * 2 (top-left), 3 (top-right), 3 (bottom-left), 4 (bottom-right).
 * The weights add up to 12; the division by 12 is done as a multiply by
 * 2731 and a shift by 15 (2731/32768 is roughly 1/12), +6 being the rounding.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col+1];
            const int bl = src[col+stride];
            const int br = src[col+stride+1];
            dst[col] = (2731*(2*tl + 3*tr + 3*bl + 4*br + 6)) >> 15;
        }
    }
}
1453
/**
 * Full-pel case of the averaging family: forward to the avg routine matching
 * the block width. Widths other than 2, 4, 8 or 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1462
/**
 * Averaging counterpart of the (2*left + right)/3 horizontal interpolation:
 * the interpolated value is computed first (multiply by 683, shift by 11)
 * and then averaged, rounding up, with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1473
/**
 * Averaging counterpart of the (left + 2*right)/3 horizontal interpolation:
 * the interpolated value is computed first (multiply by 683, shift by 11)
 * and then averaged, rounding up, with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1484
/**
 * Averaging counterpart of the (2*upper + lower)/3 vertical interpolation:
 * the interpolated value is computed first (multiply by 683, shift by 11)
 * and then averaged, rounding up, with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1495
/**
 * Averaging counterpart of the 2x2 interpolation with weights 4/3/3/2
 * (top-left, top-right, bottom-left, bottom-right; sum 12): the interpolated
 * value is computed first (multiply by 2731, shift by 15) and then averaged,
 * rounding up, with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int taps   = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1];
            const int interp = (2731*(taps + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1506
/**
 * Averaging counterpart of the 2x2 interpolation with weights 3/2/4/3
 * (top-left, top-right, bottom-left, bottom-right; sum 12): the interpolated
 * value is computed first (multiply by 2731, shift by 15) and then averaged,
 * rounding up, with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int taps   = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1];
            const int interp = (2731*(taps + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1517
/**
 * Averaging counterpart of the (upper + 2*lower)/3 vertical interpolation:
 * the interpolated value is computed first (multiply by 683, shift by 11)
 * and then averaged, rounding up, with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1528
/**
 * Averaging counterpart of the 2x2 interpolation with weights 3/4/2/3
 * (top-left, top-right, bottom-left, bottom-right; sum 12): the interpolated
 * value is computed first (multiply by 2731, shift by 15) and then averaged,
 * rounding up, with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int taps   = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1];
            const int interp = (2731*(taps + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1539
/**
 * Averaging counterpart of the 2x2 interpolation with weights 2/3/3/4
 * (top-left, top-right, bottom-left, bottom-right; sum 12): the interpolated
 * value is computed first (multiply by 2731, shift by 15) and then averaged,
 * rounding up, with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int taps   = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1];
            const int interp = (2731*(taps + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator: TPEL_WIDTH(width) would emit fixed-width wrappers named
 * put_tpel_pixels<width>_mcXY_c() around the generic put_tpel_pixels_mcXY_c()
 * functions, passing `width` through as a constant.
 * NOTE(review): every wrapper body begins with `void`, so as written the text
 * is not a function call; it would have to be corrected before enabling this. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1571
/**
 * H.264 chroma motion-compensation templates (bilinear, 1/8-pel weights).
 *
 * H264_CHROMA_MC(OPNAME, OP) emits three static functions,
 *     OPNAME##h264_chroma_mc2_c, OPNAME##h264_chroma_mc4_c and
 *     OPNAME##h264_chroma_mc8_c,
 * handling rows of 2, 4 and 8 pixels respectively.
 *
 * For every output pixel the 2x2 source neighbourhood is blended with
 *     w_tl=(8-x)(8-y)  w_tr=x(8-y)  w_bl=(8-x)y  w_br=xy
 * (the four weights always add up to 64) and the un-normalised sum is
 * handed to OP(dst_pixel, sum), which performs the rounding, the >>6 and
 * the store (or the average with the value already in dst).
 *
 * When w_br == 0 at least one of x, y is zero, so besides src[col] only one
 * neighbour can carry weight: the pixel one row down when w_bl != 0,
 * otherwise the pixel one column to the right. That path uses two
 * multiplications per pixel instead of four.
 *
 * @param dst    destination, advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride line size shared by dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fraction, 0..7 (asserted)
 * @param y      vertical fraction, 0..7 (asserted)
 */
#define H264_CHROMA_MC_W(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int w_tl = (8-x)*(8-y);\
    const int w_tr = (  x)*(8-y);\
    const int w_bl = (8-x)*(  y);\
    const int w_br = (  x)*(  y);\
    int row, col;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(!w_br){\
        /* x==0 or y==0: a single neighbour (below or right) is blended in */\
        const int w_nb = w_tr + w_bl;\
        const int step = w_bl ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<W; col++){\
                OP(dst[col], (w_tl*src[col] + w_nb*src[step+col]));\
            }\
            dst += stride;\
            src += stride;\
        }\
        return;\
    }\
\
    /* general case: full 2x2 bilinear blend */\
    for(row=0; row<h; row++){\
        for(col=0; col<W; col++){\
            OP(dst[col], (w_tl*src[col] + w_tr*src[col+1] + w_bl*src[stride+col] + w_br*src[stride+col+1]));\
        }\
        dst += stride;\
        src += stride;\
    }\
}

/* Public template: one function per supported block width. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_W(OPNAME, OP, 2)\
H264_CHROMA_MC_W(OPNAME, OP, 4)\
H264_CHROMA_MC_W(OPNAME, OP, 8)

/* b is the weighted sum (weights total 64): +32 rounds, >>6 normalises. */
#define op_put(a, b) a = (((b) + 32)>>6)
/* Same normalisation, then a rounded-up average with the pixel already in a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1680
/**
 * 8-pixel-wide bilinear chroma interpolation with a reduced rounding bias.
 *
 * Each output pixel is the weighted sum of its 2x2 source neighbourhood,
 * using the weights (8-x)(8-y), x(8-y), (8-x)y and xy (which total 64).
 * The sum is biased by 32 - 4 = 28 instead of 32 before the >>6, so results
 * lying just above a half step round down.
 *
 * The 4-tap form is used unconditionally: src[col+1] and the row below are
 * read even when their weight is zero.
 *
 * @param dst    destination, advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride line size shared by dst and src
 * @param h      number of rows to produce
 * @param x      horizontal fraction, 0..7 (asserted)
 * @param y      vertical fraction, 0..7 (asserted)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_tl = (8-x)*(8-y);
    const int w_tr = (  x)*(8-y);
    const int w_bl = (8-x)*(  y);
    const int w_br = (  x)*(  y);
    const int bias = 32 - 4;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++){
        const uint8_t *below = src + stride;

        for(col=0; col<8; col++){
            dst[col] = (w_tl*src[col] + w_tr*src[col+1] + w_bl*below[col] + w_br*below[col+1] + bias) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1704
/**
 * QPEL_MC(r, OPNAME, RND, OP): template that generates the MPEG-4
 * quarter-pel motion-compensation functions for 8x8 and 16x16 blocks.
 *
 * Template parameters:
 *   r      - not referenced anywhere in the expansion.
 *   OPNAME - prefix pasted onto every generated function name.
 *   RND    - pasted between "put" and the helper name, so the intermediate
 *            passes call either put_xxx or put_no_rnd_xxx helpers.
 *   OP     - OP(dst_pixel, sum) stores one filtered sample. The op_* macros
 *            passed in by this file index the local table `cm`.
 *
 * Generated for each block size N (8 and 16):
 *   OPNAME mpeg4_qpelN_h_lowpass(dst, src, dstStride, srcStride, h)
 *       filters h rows horizontally; each row reads src[0..N] (N+1 samples)
 *       and writes N samples.
 *   OPNAME mpeg4_qpelN_v_lowpass(dst, src, dstStride, srcStride)
 *       applies the same filter down N columns; each column reads rows
 *       0..N of src and writes N rows of dst.
 *   OPNAME qpelN_mcXY_c(dst, src, stride)
 *       one static function per XY in {0..3}x{0..3}, built from the two
 *       low-pass filters plus pixelsN_l2.
 *   ff_ OPNAME qpelN_mcXY_old_c(dst, src, stride)
 *       non-static variants for XY in {11,31,13,33,12,32}; they also
 *       compute a vertically filtered plane (halfV) from the unfiltered
 *       copy and combine planes with pixelsN_l4 (diagonal cases) or
 *       pixelsN_l2 (mc12/mc32).
 *
 * Filter: taps (-1, 3, -6, 20, 20, -6, 3, -1), which sum to 32. For the
 * first three and last three outputs the taps that would fall outside
 * 0..N are taken from mirrored in-range positions, so nothing outside
 * src[0..N] is read.
 *
 * Scratch buffers: the two-pass cases filter N+1 rows horizontally
 * (halfH is 9*8 = 72 or 17*16 = 272 bytes) because the vertical pass reads
 * rows 0..N. `full` is a local copy with stride 16 (N=8) or 24 (N=16).
 *
 * NOTE(review): copy_block9/copy_block17 and pixelsN_l2/pixelsN_l4 are
 * defined outside this chunk; presumably the former copy (N+1)-wide rows
 * into the scratch buffer and the latter average 2 / 4 source planes into
 * dst -- confirm against their definitions.
 */
1705 #define QPEL_MC(r, OPNAME, RND, OP) \
1706 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1707     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* table indexed by the op_* macros */\
1708     int i;\
1709     for(i=0; i<h; i++) /* one output row per iteration */\
1710     { /* dst[0..2] and dst[5..7] use mirrored taps so only src[0..8] is read */\
1711         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1712         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1713         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1714         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1715         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1716         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1717         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1718         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1719         dst+=dstStride;\
1720         src+=srcStride;\
1721     }\
1722 }\
1723 \
1724 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1725     const int w=8;\
1726     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1727     int i;\
1728     for(i=0; i<w; i++) /* one output column per iteration */\
1729     {\
1730         const int src0= src[0*srcStride]; /* load the 9 rows of this column once */\
1731         const int src1= src[1*srcStride];\
1732         const int src2= src[2*srcStride];\
1733         const int src3= src[3*srcStride];\
1734         const int src4= src[4*srcStride];\
1735         const int src5= src[5*srcStride];\
1736         const int src6= src[6*srcStride];\
1737         const int src7= src[7*srcStride];\
1738         const int src8= src[8*srcStride];\
1739         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1740         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1741         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1742         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1743         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1744         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1745         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1746         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1747         dst++;\
1748         src++;\
1749     }\
1750 }\
1751 \
1752 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1753     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1754     int i;\
1755     \
1756     for(i=0; i<h; i++)\
1757     { /* dst[0..2] and dst[13..15] use mirrored taps so only src[0..16] is read */\
1758         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1759         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1760         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1761         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1762         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1763         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1764         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1765         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1766         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1767         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1768         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1769         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1770         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1771         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1772         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1773         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1774         dst+=dstStride;\
1775         src+=srcStride;\
1776     }\
1777 }\
1778 \
1779 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1780     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1781     int i;\
1782     const int w=16;\
1783     for(i=0; i<w; i++) /* one output column per iteration */\
1784     {\
1785         const int src0= src[0*srcStride]; /* load the 17 rows of this column once */\
1786         const int src1= src[1*srcStride];\
1787         const int src2= src[2*srcStride];\
1788         const int src3= src[3*srcStride];\
1789         const int src4= src[4*srcStride];\
1790         const int src5= src[5*srcStride];\
1791         const int src6= src[6*srcStride];\
1792         const int src7= src[7*srcStride];\
1793         const int src8= src[8*srcStride];\
1794         const int src9= src[9*srcStride];\
1795         const int src10= src[10*srcStride];\
1796         const int src11= src[11*srcStride];\
1797         const int src12= src[12*srcStride];\
1798         const int src13= src[13*srcStride];\
1799         const int src14= src[14*srcStride];\
1800         const int src15= src[15*srcStride];\
1801         const int src16= src[16*srcStride];\
1802         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1803         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1804         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1805         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1806         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1807         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1808         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1809         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1810         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1811         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1812         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1813         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1814         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1815         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1816         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1817         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1818         dst++;\
1819         src++;\
1820     }\
1821 }\
1822 \
1823 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* no filtering */\
1824     OPNAME ## pixels8_c(dst, src, stride, 8);\
1825 }\
1826 \
1827 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* pixels8_l2 of src and the H-filtered block */\
1828     uint8_t half[64];\
1829     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1830     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1831 }\
1832 \
1833 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* H filter written straight to dst through OP */\
1834     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1835 }\
1836 \
1837 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc10 but paired with src+1 */\
1838     uint8_t half[64];\
1839     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1840     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1841 }\
1842 \
1843 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* pixels8_l2 of the copy and the V-filtered block */\
1844     uint8_t full[16*9]; /* scratch copy of src, stride 16 */\
1845     uint8_t half[64];\
1846     copy_block9(full, src, 16, stride, 9);\
1847     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1848     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1849 }\
1850 \
1851 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* V filter written straight to dst through OP */\
1852     uint8_t full[16*9];\
1853     copy_block9(full, src, 16, stride, 9);\
1854     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1855 }\
1856 \
1857 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc01 but paired with the copy one row down (full+16) */\
1858     uint8_t full[16*9];\
1859     uint8_t half[64];\
1860     copy_block9(full, src, 16, stride, 9);\
1861     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1862     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1863 }\
1864 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){ /* non-static "old" variant: pixels8_l4 of full, halfH, halfV, halfHV */\
1865     uint8_t full[16*9];\
1866     uint8_t halfH[72];\
1867     uint8_t halfV[64];\
1868     uint8_t halfHV[64];\
1869     copy_block9(full, src, 16, stride, 9);\
1870     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1871     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1872     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1874 }\
1875 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1876     uint8_t full[16*9];\
1877     uint8_t halfH[72]; /* 9 rows of 8: the V pass below reads rows 0..8 */\
1878     uint8_t halfHV[64];\
1879     copy_block9(full, src, 16, stride, 9);\
1880     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9); /* halfH is overwritten in place, combined with full */\
1882     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1884 }\
1885 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){ /* old variant: as mc11_old with full+1 */\
1886     uint8_t full[16*9];\
1887     uint8_t halfH[72];\
1888     uint8_t halfV[64];\
1889     uint8_t halfHV[64];\
1890     copy_block9(full, src, 16, stride, 9);\
1891     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1892     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1893     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1894     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1895 }\
1896 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc11 but halfH is combined with full+1 */\
1897     uint8_t full[16*9];\
1898     uint8_t halfH[72];\
1899     uint8_t halfHV[64];\
1900     copy_block9(full, src, 16, stride, 9);\
1901     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1902     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1903     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1904     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1905 }\
1906 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /* old variant: l4 uses full+16 and halfH+8 (one row down) */\
1907     uint8_t full[16*9];\
1908     uint8_t halfH[72];\
1909     uint8_t halfV[64];\
1910     uint8_t halfHV[64];\
1911     copy_block9(full, src, 16, stride, 9);\
1912     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1913     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1914     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1915     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1916 }\
1917 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc11 but the final l2 starts at halfH+8 (second row) */\
1918     uint8_t full[16*9];\
1919     uint8_t halfH[72];\
1920     uint8_t halfHV[64];\
1921     copy_block9(full, src, 16, stride, 9);\
1922     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1923     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1924     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1925     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1926 }\
1927 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /* old variant: l4 uses full+17 (one row down, one column right) and halfH+8 */\
1928     uint8_t full[16*9];\
1929     uint8_t halfH[72];\
1930     uint8_t halfV[64];\
1931     uint8_t halfHV[64];\
1932     copy_block9(full, src, 16, stride, 9);\
1933     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1934     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1935     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1936     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1937 }\
1938 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc31 but the final l2 starts at halfH+8 */\
1939     uint8_t full[16*9];\
1940     uint8_t halfH[72];\
1941     uint8_t halfHV[64];\
1942     copy_block9(full, src, 16, stride, 9);\
1943     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1944     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1945     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1946     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1947 }\
1948 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* l2 of halfH and halfHV; filters src directly, no copy */\
1949     uint8_t halfH[72];\
1950     uint8_t halfHV[64];\
1951     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1952     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1953     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1954 }\
1955 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc21 but the l2 starts at halfH+8 */\
1956     uint8_t halfH[72];\
1957     uint8_t halfHV[64];\
1958     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1959     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1960     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1961 }\
1962 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /* old variant: l2 of halfV and halfHV */\
1963     uint8_t full[16*9];\
1964     uint8_t halfH[72];\
1965     uint8_t halfV[64];\
1966     uint8_t halfHV[64];\
1967     copy_block9(full, src, 16, stride, 9);\
1968     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1969     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1970     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1971     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1972 }\
1973 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* V filter of (halfH combined with full), written through OP */\
1974     uint8_t full[16*9];\
1975     uint8_t halfH[72];\
1976     copy_block9(full, src, 16, stride, 9);\
1977     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1978     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1979     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1980 }\
1981 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /* old variant: as mc12_old with halfV taken from full+1 */\
1982     uint8_t full[16*9];\
1983     uint8_t halfH[72];\
1984     uint8_t halfV[64];\
1985     uint8_t halfHV[64];\
1986     copy_block9(full, src, 16, stride, 9);\
1987     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1988     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1989     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1990     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1991 }\
1992 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc12 but halfH is combined with full+1 */\
1993     uint8_t full[16*9];\
1994     uint8_t halfH[72];\
1995     copy_block9(full, src, 16, stride, 9);\
1996     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1997     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1998     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1999 }\
2000 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* H filter (9 rows) then V filter written through OP */\
2001     uint8_t halfH[72];\
2002     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2003     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2004 }\
2005 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* 16x16 counterparts start here: same structure, scratch copy is 17 rows with stride 24 */\
2006     OPNAME ## pixels16_c(dst, src, stride, 16);\
2007 }\
2008 \
2009 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2010     uint8_t half[256];\
2011     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2012     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2013 }\
2014 \
2015 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2016     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2017 }\
2018 \
2019 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2020     uint8_t half[256];\
2021     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2022     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2023 }\
2024 \
2025 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2026     uint8_t full[24*17]; /* scratch copy of src, stride 24 */\
2027     uint8_t half[256];\
2028     copy_block17(full, src, 24, stride, 17);\
2029     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2030     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2031 }\
2032 \
2033 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2034     uint8_t full[24*17];\
2035     copy_block17(full, src, 24, stride, 17);\
2036     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2037 }\
2038 \
2039 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2040     uint8_t full[24*17];\
2041     uint8_t half[256];\
2042     copy_block17(full, src, 24, stride, 17);\
2043     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2044     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2045 }\
2046 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047     uint8_t full[24*17];\
2048     uint8_t halfH[272];\
2049     uint8_t halfV[256];\
2050     uint8_t halfHV[256];\
2051     copy_block17(full, src, 24, stride, 17);\
2052     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2053     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2054     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2056 }\
2057 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058     uint8_t full[24*17];\
2059     uint8_t halfH[272]; /* 17 rows of 16: the V pass below reads rows 0..16 */\
2060     uint8_t halfHV[256];\
2061     copy_block17(full, src, 24, stride, 17);\
2062     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2063     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2064     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2065     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2066 }\
2067 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2068     uint8_t full[24*17];\
2069     uint8_t halfH[272];\
2070     uint8_t halfV[256];\
2071     uint8_t halfHV[256];\
2072     copy_block17(full, src, 24, stride, 17);\
2073     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2074     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2075     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2076     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2077 }\
2078 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2079     uint8_t full[24*17];\
2080     uint8_t halfH[272];\
2081     uint8_t halfHV[256];\
2082     copy_block17(full, src, 24, stride, 17);\
2083     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2084     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2085     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2086     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2087 }\
2088 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2089     uint8_t full[24*17];\
2090     uint8_t halfH[272];\
2091     uint8_t halfV[256];\
2092     uint8_t halfHV[256];\
2093     copy_block17(full, src, 24, stride, 17);\
2094     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2095     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2096     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2097     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2098 }\
2099 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2100     uint8_t full[24*17];\
2101     uint8_t halfH[272];\
2102     uint8_t halfHV[256];\
2103     copy_block17(full, src, 24, stride, 17);\
2104     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2105     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2106     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2107     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2108 }\
2109 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2110     uint8_t full[24*17];\
2111     uint8_t halfH[272];\
2112     uint8_t halfV[256];\
2113     uint8_t halfHV[256];\
2114     copy_block17(full, src, 24, stride, 17);\
2115     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2116     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2117     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2118     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2119 }\
2120 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2121     uint8_t full[24*17];\
2122     uint8_t halfH[272];\
2123     uint8_t halfHV[256];\
2124     copy_block17(full, src, 24, stride, 17);\
2125     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2126     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2127     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2128     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2129 }\
2130 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2131     uint8_t halfH[272];\
2132     uint8_t halfHV[256];\
2133     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2134     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2135     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2136 }\
2137 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2138     uint8_t halfH[272];\
2139     uint8_t halfHV[256];\
2140     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2141     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2142     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2143 }\
2144 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2145     uint8_t full[24*17];\
2146     uint8_t halfH[272];\
2147     uint8_t halfV[256];\
2148     uint8_t halfHV[256];\
2149     copy_block17(full, src, 24, stride, 17);\
2150     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2151     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2152     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2153     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2154 }\
2155 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2156     uint8_t full[24*17];\
2157     uint8_t halfH[272];\
2158     copy_block17(full, src, 24, stride, 17);\
2159     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2160     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2161     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2162 }\
2163 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2164     uint8_t full[24*17];\
2165     uint8_t halfH[272];\
2166     uint8_t halfV[256];\
2167     uint8_t halfHV[256];\
2168     copy_block17(full, src, 24, stride, 17);\
2169     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2170     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2171     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2172     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2173 }\
2174 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2175     uint8_t full[24*17];\
2176     uint8_t halfH[272];\
2177     copy_block17(full, src, 24, stride, 17);\
2178     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2179     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2180     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2181 }\
2182 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2183     uint8_t halfH[272];\
2184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2185     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2186 }
2187
/*
 * Store operators handed to QPEL_MC as its OP argument.
 *
 * `b` is the raw 8-tap filter sum; the taps total 32, hence the >>5.
 * Adding 16 rounds to nearest, adding 15 is the "no rounding" flavour.
 * The shifted value indexes `cm`, the local each low-pass function sets to
 * ff_cropTbl + MAX_NEG_CROP.
 * The avg forms then average that result with the value already in `a`
 * (with or without the extra +1).
 */
2188 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2189 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1) /* not passed to any active instantiation below */
2190 #define op_put(a, b) a = cm[((b) + 16)>>5]
2191 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2192
/* Instantiate the put_, put_no_rnd_ and avg_ families. The first argument is not used by the macro body. */
2193 QPEL_MC(0, put_       , _       , op_put)
2194 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2195 QPEL_MC(0, avg_       , _       , op_avg)
/* The avg_no_rnd family is disabled (instantiation commented out). */
2196 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are local to this template; drop them so later sections can redefine the names. */
2197 #undef op_avg
2198 #undef op_avg_no_rnd
2199 #undef op_put
2200 #undef op_put_no_rnd
2201
#if 1
/**
 * Generates the H.264 quarter-pel lowpass filter functions for one store mode.
 *
 * OPNAME is pasted in front of every generated function name.
 * OP(dst, sum) stores the result of a single 6-tap pass; OP2(dst, sum) stores
 * the result of the two-pass (hv) filter, whose sums are built from the
 * unscaled first-pass values held in the int16_t tmp buffer.
 * Both operators are expanded inside functions that declare a local named
 * 'cm' (ff_cropTbl + MAX_NEG_CROP); the operators defined in this file refer
 * to that name, so it must not be renamed here.
 *
 * For each block size N in {2, 4, 8} the macro emits:
 *  - qpelN_h_lowpass:  taps (1,-5,20,20,-5,1) along a row, N rows;
 *                      reads src[-2] .. src[N+2] on every row.
 *  - qpelN_v_lowpass:  the same taps down a column, N columns;
 *                      reads rows -2 .. N+2 of src. All source samples of a
 *                      column are loaded before any destination is written.
 *  - qpelN_hv_lowpass: horizontal pass over N+5 rows (starting two rows above
 *                      src) into tmp, then tmp is rewound to the row that
 *                      corresponds to src row 0 and filtered vertically.
 *                      tmp must hold N+5 rows of tmpStride elements.
 * The 16-wide variants are composed from four calls to the 8-wide ones; the
 * hv variant reuses the same tmp area for the upper and lower halves.
 *
 * The 2-wide functions carry av_unused since not every instantiation uses
 * them.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
2466
/**
 * Generates the sixteen OPNAME h264_qpel SIZE _mcXY_c entry points for one
 * block size.  X selects the horizontal and Y the vertical sub-position
 * (0..3).  Each entry point builds at most two intermediate SIZE x SIZE
 * planes with the put_ lowpass filters and merges them into dst through
 * OPNAME pixels SIZE _l2, or writes a single filter result straight to dst.
 *
 * Vertical filtering works on a private copy of the source: copy_blockSIZE
 * grabs SIZE+5 rows beginning two rows above src into a buffer with a row
 * pitch of SIZE, and the filter is started at the third row of that copy.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride)\
{\
    /* no filtering at all */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    /* merge the unfiltered samples with the horizontally filtered ones */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hfilt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t hfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    /* same as mc10 but merged with the samples one column to the right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hfilt, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t vfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, rows_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t vfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    /* same as mc01 but merged with the samples one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, rows_mid+SIZE, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    /* vertical filter runs on the columns shifted right by one */\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    /* horizontal filter runs on the rows shifted down by one */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t vfilt[SIZE*SIZE];\
\
    /* both filters are shifted: rows down by one, columns right by one */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, vfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE*(SIZE+5)];\
\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    /* as mc21, with the horizontal filter taken one row further down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hfilt, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    copy_block ## SIZE (rows, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride)\
{\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const rows_mid = rows + SIZE*2;\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vfilt[SIZE*SIZE];\
    uint8_t hvfilt[SIZE*SIZE];\
\
    /* as mc12, with the vertical filter taken one column to the right */\
    copy_block ## SIZE (rows, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vfilt, rows_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvfilt, scratch, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vfilt, hvfilt, stride, SIZE, SIZE, SIZE);\
}\
2603
2604 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2605 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2606 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2607 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2608 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2609
2610 H264_LOWPASS(put_       , op_put, op2_put)
2611 H264_LOWPASS(avg_       , op_avg, op2_avg)
2612 H264_MC(put_, 2)
2613 H264_MC(put_, 4)
2614 H264_MC(put_, 8)
2615 H264_MC(put_, 16)
2616 H264_MC(avg_, 4)
2617 H264_MC(avg_, 8)
2618 H264_MC(avg_, 16)
2619
2620 #undef op_avg
2621 #undef op_put
2622 #undef op2_avg
2623 #undef op2_put
2624 #endif
2625
/*
 * H.264 weighted prediction kernels.
 *
 * op_scale1(x) rescales one sample of 'block' in place:
 *     block[x] = clip_uint8((block[x]*weight + offset) >> log2_denom)
 * op_scale2(x) blends one sample of 'src' into 'dst':
 *     dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
 * Both rely on the variable names of the functions generated below.
 *
 * H264_WEIGHT(W,H) emits weight_h264_pixelsWxH_c and biweight_h264_pixelsWxH_c
 * for a W x H block (W is 2, 4, 8 or 16 in the instantiations below; any other
 * W would process 16 columns).  The 'if(W==n) continue;' tests compare
 * compile-time constants and simply cut the unrolled row short.
 *
 * 'offset' may be negative, and left-shifting a negative int is undefined
 * behaviour in C.  The pre-scaling of the offset is therefore carried out on
 * the unsigned representation and converted back, which yields the same value
 * on two's complement targets without invoking undefined behaviour.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* bring the offset to the scale of the weighted sample (no signed shift) */ \
    offset = (int)((unsigned)offset << log2_denom); \
    /* add the rounding term for the final right shift */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* combined offset and rounding term, scaled without a signed shift */ \
    offset = (int)((unsigned)((offset + 1) | 1) << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2695
2696 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2697     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2698     int i;
2699
2700     for(i=0; i<h; i++){
2701         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2702         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2703         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2704         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2705         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2706         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2707         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2708         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2709         dst+=dstStride;
2710         src+=srcStride;
2711     }
2712 }
2713
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry points for the CAVS qpel tables: each one forwards to the
 * generic put/avg pixel routine of the matching width, passing the block
 * size (8 or 16) as the last argument.
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2731
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry point for VC-1: forwards to put_pixels8_c with 8 as the last
 * argument.  The rnd argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2740
2741 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2742
2743 /* H264 specific */
2744 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2745
#if defined(CONFIG_RV40_DECODER)
/*
 * RV40 mc33 entry points: each forwards to the put/avg "xy2" pixel routine
 * of the matching width, passing the block size (16 or 8) as the last
 * argument.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_c(dst, src, stride, 16);
}

static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_c(dst, src, stride, 16);
}

static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_c(dst, src, stride, 8);
}

static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2762
2763 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2764     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2765     int i;
2766
2767     for(i=0; i<w; i++){
2768         const int src_1= src[ -srcStride];
2769         const int src0 = src[0          ];
2770         const int src1 = src[  srcStride];
2771         const int src2 = src[2*srcStride];
2772         const int src3 = src[3*srcStride];
2773         const int src4 = src[4*srcStride];
2774         const int src5 = src[5*srcStride];
2775         const int src6 = src[6*srcStride];
2776         const int src7 = src[7*srcStride];
2777         const int src8 = src[8*srcStride];
2778         const int src9 = src[9*srcStride];
2779         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2780         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2781         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2782         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2783         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2784         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2785         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2786         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2787         src++;
2788         dst++;
2789     }
2790 }
2791
/* No filtering: forwards to put_pixels8_c with 8 as the last argument. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2795
/* Merges the source block with its horizontally filtered version (8x8). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[64];  /* 8x8, row pitch 8 */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2801
/* Writes the horizontally filtered 8x8 block directly to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2805
/*
 * Like put_mspel8_mc10_c, but the filtered block is merged with the source
 * samples one column to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hfilt[64];  /* 8x8, row pitch 8 */

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hfilt, stride, stride, 8, 8);
}
2811
/* Writes the vertically filtered 8x8 block directly to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2815
/*
 * Merges the vertically filtered block with the block filtered in both
 * directions.  The horizontal pass covers 11 rows starting one row above
 * src, so that the following vertical pass (started at the second of those
 * rows) has the rows -1 .. 9 it reads.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[88];   /* 11 rows x 8 */
    uint8_t vfilt[64];
    uint8_t hvfilt[64];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/*
 * Like put_mspel8_mc12_c, but the purely vertical filter is applied to the
 * columns one position to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[88];   /* 11 rows x 8 */
    uint8_t vfilt[64];
    uint8_t hvfilt[64];

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hrows+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/*
 * Filters horizontally into an 11-row scratch area (starting one row above
 * src), then vertically from the second scratch row straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t hrows[88];   /* 11 rows x 8 */

    wmv2_mspel8_h_lowpass(hrows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hrows+8, stride, 8, 8);
}
2839
2840 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2841     if(ENABLE_ANY_H263) {
2842     int x;
2843     const int strength= ff_h263_loop_filter_strength[qscale];
2844
2845     for(x=0; x<8; x++){
2846         int d1, d2, ad1;
2847         int p0= src[x-2*stride];
2848         int p1= src[x-1*stride];
2849         int p2= src[x+0*stride];
2850         int p3= src[x+1*stride];
2851         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2852
2853         if     (d<-2*strength) d1= 0;
2854         else if(d<-  strength) d1=-2*strength - d;
2855         else if(d<   strength) d1= d;
2856         else if(d< 2*strength) d1= 2*strength - d;
2857         else                   d1= 0;
2858
2859         p1 += d1;
2860         p2 -= d1;
2861         if(p1&256) p1= ~(p1>>31);
2862         if(p2&256) p2= ~(p2>>31);
2863
2864         src[x-1*stride] = p1;
2865         src[x+0*stride] = p2;
2866
2867         ad1= FFABS(d1)>>1;
2868
2869         d2= av_clip((p0-p3)/4, -ad1, ad1);
2870
2871         src[x-2*stride] = p0 - d2;
2872         src[x+  stride] = p3 + d2;
2873     }
2874     }
2875 }
2876
2877 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2878     if(ENABLE_ANY_H263) {
2879     int y;
2880     const int strength= ff_h263_loop_filter_strength[qscale];
2881
2882     for(y=0; y<8; y++){
2883         int d1, d2, ad1;
2884         int p0= src[y*stride-2];
2885         int p1= src[y*stride-1];
2886         int p2= src[y*stride+0];
2887         int p3= src[y*stride+1];
2888         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2889
2890         if     (d<-2*strength) d1= 0;
2891         else if(d<-  strength) d1=-2*strength - d;
2892         else if(d<   strength) d1= d;
2893         else if(d< 2*strength) d1= 2*strength - d;
2894         else                   d1= 0;
2895
2896         p1 += d1;
2897         p2 -= d1;
2898         if(p1&256) p1= ~(p1>>31);
2899         if(p2&256) p2= ~(p2>>31);
2900
2901         src[y*stride-1] = p1;
2902         src[y*stride+0] = p2;
2903
2904         ad1= FFABS(d1)>>1;
2905
2906         d2= av_clip((p0-p3)/4, -ad1, ad1);
2907
2908         src[y*stride-2] = p0 - d2;
2909         src[y*stride+1] = p3 + d2;
2910     }
2911     }
2912 }
2913
/**
 * Separable (1,2,1) smoothing of an 8x8 block, in place.
 *
 * Pass 1 filters vertically into a scratch array; the top and bottom rows are
 * not filtered but multiplied by 4 so that every scratch value has the same
 * scale.  Pass 2 filters horizontally and writes back: the left and right
 * columns only undo the vertical scale ((v + 2) >> 2), the inner columns apply
 * the horizontal taps and undo both scales ((sum + 8) >> 4).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vfilt[64];
    int row, col;

    /* pass 1: vertical */
    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row*stride;
        int *out = vfilt + row*8;
        const int border = (row == 0 || row == 7);

        for (col = 0; col < 8; col++) {
            if (border)
                out[col] = 4*line[col];
            else
                out[col] = line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* pass 2: horizontal, reading only the scratch array */
    for (row = 0; row < 8; row++) {
        uint8_t *line = src + row*stride;
        const int *in = vfilt + row*8;

        line[0] = (in[0] + 2) >> 2;
        for (col = 1; col < 7; col++)
            line[col] = (in[col - 1] + 2*in[col] + in[col + 1] + 8) >> 4;
        line[7] = (in[7] + 2) >> 2;
    }
}
2940
/**
 * H.264 luma deblocking of one 16-sample edge (normal, tc0-clipped filter).
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between neighbouring samples across the edge
 *                (p2 p1 p0 | q0 q1 q2)
 * @param ystride distance between successive positions along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     four clipping values, one per group of four positions;
 *                a negative entry leaves its group untouched
 *
 * The difference q0 - p0 can be negative, and left-shifting a negative value
 * is undefined behaviour in C, so the scaling by four is written as a
 * multiplication.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* group disabled: step over its four positions */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* tc grows by one for every side whose second sample is also filtered */
                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/**
 * Luma loop filter with (xstride, ystride) = (stride, 1): the samples on
 * either side of the edge are stride bytes apart and consecutive filtered
 * positions are adjacent bytes.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Luma loop filter with (xstride, ystride) = (1, stride): the samples on
 * either side of the edge are adjacent bytes and consecutive filtered
 * positions are stride bytes apart.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
2989
/**
 * H.264 chroma loop filter core shared by the vertical and horizontal variants.
 *
 * Processes 8 positions along an edge in 4 groups of 2. Only p0 and q0 are
 * modified; p1 and q1 are read for the threshold tests and the delta.
 *
 * @param pix     q0 sample of the first position
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive filtered positions
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     4 clipping limits, one per group of 2 positions; an entry
 *                <= 0 leaves that group untouched
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];

        if( tc <= 0 ) {
            /* group disabled: step over its 2 positions */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++, pix += ystride ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            /* multiply instead of "<< 2": q0 - p0 may be negative and
             * left-shifting a negative value is undefined in C */
            delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );

            pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Chroma loop filter with (xstride, ystride) = (stride, 1): the samples on
 * either side of the edge are stride bytes apart and consecutive filtered
 * positions are adjacent bytes.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/**
 * Chroma loop filter with (xstride, ystride) = (1, stride): the samples on
 * either side of the edge are adjacent bytes and consecutive filtered
 * positions are stride bytes apart.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
3026
/**
 * Chroma loop filter core without clipping limits, shared by the vertical
 * and horizontal intra variants.
 *
 * Visits 8 positions along an edge. Where all three threshold tests pass,
 * p0 and q0 are replaced by fixed weighted averages of p1, p0, q0 and q1.
 *
 * @param pix     q0 sample of the first position
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive filtered positions
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int remaining;

    for( remaining = 8; remaining > 0; remaining--, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) >= alpha )
            continue;
        if( FFABS( p1 - p0 ) >= beta )
            continue;
        if( FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma loop filter with (xstride, ystride) = (stride, 1): the samples
 * on either side of the edge are stride bytes apart and consecutive filtered
 * positions are adjacent bytes.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
/**
 * Intra chroma loop filter with (xstride, ystride) = (1, stride): the samples
 * on either side of the edge are adjacent bytes and consecutive filtered
 * positions are stride bytes apart.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
3054
/**
 * Sum of absolute differences between two blocks 16 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return sum over all rows and the 16 columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3082
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is combined horizontally: each pix1 pixel is compared with
 * avg2(pix2[x], pix2[x+1]), so 17 pixels are read from every pix2 row.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3110
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is combined vertically: each pix1 pixel is compared with avg2() of the
 * pix2 pixel and the one directly below it, so h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3140
/**
 * Sum of absolute differences between a 16-wide block and a reference that
 * is combined in both directions: each pix1 pixel is compared with avg4()
 * of the 2x2 pix2 neighbourhood starting at the same position, so 17 pixels
 * from each of h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3170
/**
 * Sum of absolute differences between two blocks 8 pixels wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return sum over all rows and the 8 columns of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3190
/**
 * Sum of absolute differences between an 8-wide block and a reference that
 * is combined horizontally: each pix1 pixel is compared with
 * avg2(pix2[x], pix2[x+1]), so 9 pixels are read from every pix2 row.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3210
/**
 * Sum of absolute differences between an 8-wide block and a reference that
 * is combined vertically: each pix1 pixel is compared with avg2() of the
 * pix2 pixel and the one directly below it, so h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3232
/**
 * Sum of absolute differences between an 8-wide block and a reference that
 * is combined in both directions: each pix1 pixel is compared with avg4()
 * of the 2x2 pix2 neighbourhood starting at the same position, so 9 pixels
 * from each of h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      reference block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3254
3255 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3256     MpegEncContext *c = v;
3257     int score1=0;
3258     int score2=0;
3259     int x,y;
3260
3261     for(y=0; y<h; y++){
3262         for(x=0; x<16; x++){
3263             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3264         }
3265         if(y+1<h){
3266             for(x=0; x<15; x++){
3267                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3268                              - s1[x+1] + s1[x+1+stride])
3269                         -FFABS(  s2[x  ] - s2[x  +stride]
3270                              - s2[x+1] + s2[x+1+stride]);
3271             }
3272         }
3273         s1+= stride;
3274         s2+= stride;
3275     }
3276
3277     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3278     else  return score1 + FFABS(score2)*8;
3279 }
3280
3281 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3282     MpegEncContext *c = v;
3283     int score1=0;
3284     int score2=0;
3285     int x,y;
3286
3287     for(y=0; y<h; y++){
3288         for(x=0; x<8; x++){
3289             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3290         }
3291         if(y+1<h){
3292             for(x=0; x<7; x++){
3293                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3294                              - s1[x+1] + s1[x+1+stride])
3295                         -FFABS(  s2[x  ] - s2[x  +stride]
3296                              - s2[x+1] + s2[x+1+stride]);
3297             }
3298         }
3299         s1+= stride;
3300         s2+= stride;
3301     }
3302
3303     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3304     else  return score1 + FFABS(score2)*8;
3305 }
3306
3307 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3308     int i;
3309     unsigned int sum=0;
3310
3311     for(i=0; i<8*8; i++){
3312         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3313         int w= weight[i];
3314         b>>= RECON_SHIFT;
3315         assert(-512<b && b<512);
3316
3317         sum += (w*b)*(w*b)>>4;
3318     }
3319     return sum>>2;
3320 }
3321
3322 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3323     int i;
3324
3325     for(i=0; i<8*8; i++){
3326         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3327     }
3328 }
3329
3330 /**
3331  * permutes an 8x8 block.
3332  * @param block the block which will be permuted according to the given permutation vector
3333  * @param permutation the permutation vector
3334  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3335  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3336  *                  (inverse) permutated to scantable order!
3337  */
3338 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3339 {
3340     int i;
3341     DCTELEM temp[64];
3342
3343     if(last<=0) return;
3344     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3345
3346     for(i=0; i<=last; i++){
3347         const int j= scantable[i];
3348         temp[j]= block[j];
3349         block[j]=0;
3350     }
3351
3352     for(i=0; i<=last; i++){
3353         const int j= scantable[i];
3354         const int perm_j= permutation[j];
3355         block[perm_j]= temp[j];
3356     }
3357 }
3358
/**
 * Comparison function that ignores all of its arguments and always
 * returns a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3362
3363 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3364     int i;
3365
3366     memset(cmp, 0, sizeof(void*)*5);
3367
3368     for(i=0; i<5; i++){
3369         switch(type&0xFF){
3370         case FF_CMP_SAD:
3371             cmp[i]= c->sad[i];
3372             break;
3373         case FF_CMP_SATD:
3374             cmp[i]= c->hadamard8_diff[i];
3375             break;
3376         case FF_CMP_SSE:
3377             cmp[i]= c->sse[i];
3378             break;
3379         case FF_CMP_DCT:
3380             cmp[i]= c->dct_sad[i];
3381             break;
3382         case FF_CMP_DCT264:
3383             cmp[i]= c->dct264_sad[i];
3384             break;
3385         case FF_CMP_DCTMAX:
3386             cmp[i]= c->dct_max[i];
3387             break;
3388         case FF_CMP_PSNR:
3389             cmp[i]= c->quant_psnr[i];
3390             break;
3391         case FF_CMP_BIT:
3392             cmp[i]= c->bit[i];
3393             break;
3394         case FF_CMP_RD:
3395             cmp[i]= c->rd[i];
3396             break;
3397         case FF_CMP_VSAD:
3398             cmp[i]= c->vsad[i];
3399             break;
3400         case FF_CMP_VSSE:
3401             cmp[i]= c->vsse[i];
3402             break;
3403         case FF_CMP_ZERO:
3404             cmp[i]= zero_cmp;
3405             break;
3406         case FF_CMP_NSSE:
3407             cmp[i]= c->nsse[i];
3408             break;
3409 #ifdef CONFIG_SNOW_ENCODER
3410         case FF_CMP_W53:
3411             cmp[i]= c->w53[i];
3412             break;
3413         case FF_CMP_W97:
3414             cmp[i]= c->w97[i];
3415             break;
3416 #endif
3417         default:
3418             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3419         }
3420     }
3421 }
3422
3423 static void clear_block_c(DCTELEM *block)
3424 {
3425     memset(block, 0, sizeof(DCTELEM)*64);
3426 }
3427
3428 /**
3429  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3430  */
3431 static void clear_blocks_c(DCTELEM *blocks)
3432 {
3433     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3434 }
3435
3436 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3437     long i;
3438     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3439         long a = *(long*)(src+i);
3440         long b = *(long*)(dst+i);
3441         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3442     }
3443     for(; i<w; i++)
3444         dst[i+0] += src[i+0];
3445 }
3446
3447 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3448     long i;
3449     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3450         long a = *(long*)(src1+i);
3451         long b = *(long*)(src2+i);
3452         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3453     }
3454     for(; i<w; i++)
3455         dst[i] = src1[i]+src2[i];
3456 }
3457
3458 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3459     long i;
3460 #ifndef HAVE_FAST_UNALIGNED
3461     if((long)src2 & (sizeof(long)-1)){
3462         for(i=0; i+7<w; i+=8){
3463             dst[i+0] = src1[i+0]-src2[i+0];
3464             dst[i+1] = src1[i+1]-src2[i+1];
3465             dst[i+2] = src1[i+2]-src2[i+2];
3466             dst[i+3] = src1[i+3]-src2[i+3];
3467             dst[i+4] = src1[i+4]-src2[i+4];
3468             dst[i+5] = src1[i+5]-src2[i+5];
3469             dst[i+6] = src1[i+6]-src2[i+6];
3470             dst[i+7] = src1[i+7]-src2[i+7];
3471         }
3472     }else
3473 #endif
3474     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3475         long a = *(long*)(src1+i);
3476         long b = *(long*)(src2+i);
3477         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3478     }
3479     for(; i<w; i++)
3480         dst[i+0] = src1[i+0]-src2[i+0];
3481 }
3482
/**
 * Writes the difference between each src2 byte and a median prediction.
 *
 * For every position the prediction is mid_pred() of the previous src2 byte
 * (left), the src1 byte at the same position, and (left + src1 - left_top)
 * reduced to 8 bits, where left_top is the previous src1 byte.
 *
 * @param dst      output residuals, w bytes
 * @param src1     line supplying the second predictor input and left_top
 * @param src2     line being predicted
 * @param w        number of bytes to process
 * @param left     in: left value for position 0; out: last src2 byte
 * @param left_top in: left_top value for position 0; out: last src1 byte
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int n;

    for(n=0; n<w; n++){
        const uint8_t top= src1[n];
        const int pred= mid_pred(prev, top, (prev + top - prev_top)&0xFF);

        prev_top= top;
        prev= src2[n];
        dst[n]= prev - pred;
    }

    *left= prev;
    *left_top= prev_top;
}
3500
/* Out-of-place butterfly: o1 = i1 + i2 and o2 = i1 - i2.
 * Expands to two statements and evaluates each input twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x + y and y becomes x - y, using the old
 * values of both. Expands to a brace block that declares locals named a
 * and b, so the arguments must not refer to outer variables with those names. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Sum of the absolute values of both butterfly outputs, |x+y| + |x-y|,
 * without storing them. Evaluates each argument more than once. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3515
/**
 * Sum of absolute values of the butterfly-transformed difference between
 * two 8x8 blocks.
 *
 * The pixel difference src - dst goes through three butterfly stages along
 * each row and then along each column; the last column stage is folded into
 * the absolute-value summation.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    block the difference is taken from
 * @param stride distance in bytes between rows, used for both blocks
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* row pass */
    for(i=0; i<8; i++){
        int * const row= temp + 8*i;
        const uint8_t * const sp= src + stride*i;
        const uint8_t * const dp= dst + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0],sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2],sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4],sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6],sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for(i=0; i<8; i++){
        int * const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4]);
        sum += BUTTERFLYA(col[8*1], col[8*5]);
        sum += BUTTERFLYA(col[8*2], col[8*6]);
        sum += BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
3567
/**
 * Sum of absolute values of the butterfly-transformed 8x8 block itself,
 * with the first (mean) term removed.
 *
 * The pixels go through three butterfly stages along each row and then
 * along each column; the last column stage is folded into the
 * absolute-value summation.
 *
 * @param s      unused
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* row pass */
    for(i=0; i<8; i++){
        int * const row= temp + 8*i;
        const uint8_t * const sp= src + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0],sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2],sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4],sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6],sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for(i=0; i<8; i++){
        int * const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4]);
        sum += BUTTERFLYA(col[8*1], col[8*5]);
        sum += BUTTERFLYA(col[8*2], col[8*6]);
        sum += BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* take the first term of column 0 back out again */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3615
3616 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3617     MpegEncContext * const s= (MpegEncContext *)c;
3618     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3619     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3620
3621     assert(h==8);
3622
3623     s->dsp.diff_pixels(temp, src1, src2, stride);
3624     s->dsp.fdct(temp);
3625     return s->dsp.sum_abs_dctelem(temp);
3626 }
3627
3628 #ifdef CONFIG_GPL
/* One 8-point 1-D transform pass built only from additions, subtractions
 * and shifts (presumably the H.264 8x8 integer transform -- TODO confirm).
 *
 * The user must define SRC(x) to read input x (0..7) and DST(x,v) to
 * consume output x with value v. All inputs are read into locals before the
 * first DST, so SRC and DST may refer to the same storage. Expands to one
 * brace block. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3655
3656 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3657     MpegEncContext * const s= (MpegEncContext *)c;
3658     DCTELEM dct[8][8];
3659     int i;
3660     int sum=0;
3661
3662     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3663
3664 #define SRC(x) dct[i][x]
3665 #define DST(x,v) dct[i][x]= v
3666     for( i = 0; i < 8; i++ )
3667         DCT8_1D
3668 #undef SRC
3669 #undef DST
3670
3671 #define SRC(x) dct[x][i]
3672 #define DST(x,v) sum += FFABS(v)
3673     for( i = 0; i < 8; i++ )
3674         DCT8_1D
3675 #undef SRC
3676 #undef DST
3677     return sum;
3678 }
3679 #endif
3680
3681 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3682     MpegEncContext * const s= (MpegEncContext *)c;
3683     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3684     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3685     int sum=0, i;
3686
3687     assert(h==8);
3688
3689     s->dsp.diff_pixels(temp, src1, src2, stride);
3690     s->dsp.fdct(temp);
3691
3692     for(i=0; i<64; i++)
3693         sum= FFMAX(sum, FFABS(temp[i]));
3694
3695     return sum;
3696 }
3697
3698 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3699     MpegEncContext * const s= (MpegEncContext *)c;
3700     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3701     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3702     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3703     int sum=0, i;
3704
3705     assert(h==8);
3706     s->mb_intra=0;
3707
3708     s->dsp.diff_pixels(temp, src1, src2, stride);
3709
3710     memcpy(bak, temp, 64*sizeof(DCTELEM));
3711
3712     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3713     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3714     ff_simple_idct(temp); //FIXME
3715
3716     for(i=0; i<64; i++)
3717         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3718
3719     return sum;
3720 }
3721
/**
 * Combined bit-count and distortion score for coding the difference
 * between two 8x8 blocks.
 *
 * The difference src1 - src2 is quantized, its VLC bit cost is counted from
 * the context's length tables, and it is then dequantized and added back
 * onto a private copy of src2. The distortion is the sse[1] score between
 * that reconstruction and src1.
 * Overwrites s->block_last_index[0].
 *
 * @param c      MpegEncContext
 * @param src1   first block
 * @param src2   second block, also the base of the reconstruction
 * @param stride distance in bytes between rows, used for both blocks
 * @param h      must be 8
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    /* stride uint64_t elements = 8*stride bytes: room for 8 rows at the caller's stride */
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* copy the 8x8 src2 block into bak, 8 bytes per row as two 32-bit words */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* the value written to i through the last argument is not used: i is reassigned below */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the length tables; intra blocks start at index 1 and add the cost from luma_dc_vlc_length */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* every coefficient before the last one, in scan order */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* levels in [-64, 63] map to table indices 0..127, anything else costs esc_length */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate last_length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantize unless the quantizer reported a negative last index */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct on top of the src2 copy */
    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3800
3801 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3802     MpegEncContext * const s= (MpegEncContext *)c;
3803     const uint8_t *scantable= s->intra_scantable.permutated;
3804     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3805     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3806     int i, last, run, bits, level, start_i;
3807     const int esc_length= s->ac_esc_length;
3808     uint8_t * length;
3809     uint8_t * last_length;
3810
3811     assert(h==8);
3812
3813     s->dsp.diff_pixels(temp, src1, src2, stride);
3814
3815     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3816
3817     bits=0;
3818
3819     if (s->mb_intra) {
3820         start_i = 1;
3821         length     = s->intra_ac_vlc_length;
3822         last_length= s->intra_ac_vlc_last_length;
3823         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3824     } else {
3825         start_i = 0;
3826         length     = s->inter_ac_vlc_length;
3827         last_length= s->inter_ac_vlc_last_length;
3828     }
3829
3830     if(last>=start_i){
3831         run=0;
3832         for(i=start_i; i<last; i++){
3833             int j= scantable[i];
3834             level= temp[j];
3835
3836             if(level){
3837                 level+=64;
3838                 if((level&(~127)) == 0){
3839                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3840                 }else
3841                     bits+= esc_length;
3842                 run=0;
3843             }else
3844                 run++;
3845         }
3846         i= scantable[last];
3847
3848         level= temp[i] + 64;
3849
3850         assert(level - 64);
3851
3852         if((level&(~127)) == 0){
3853             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3854         }else
3855             bits+= esc_length;
3856     }
3857
3858     return bits;
3859 }
3860
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * block 16 pixels wide.
 *
 * @param c      unused
 * @param s      block to measure
 * @param dummy  unused
 * @param stride distance in bytes between rows
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++, s+= stride){
        const uint8_t * const below= s + stride;

        for(col=0; col<16; col++)
            total+= FFABS(s[col] - below[col]);
    }

    return total;
}
3875
3876 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3877     int score=0;
3878     int x,y;
3879
3880     for(y=1; y<h; y++){
3881         for(x=0; x<16; x++){
3882             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3883         }
3884         s1+= stride;
3885         s2+= stride;
3886     }
3887
3888     return score;
3889 }
3890
#define SQ(a) ((a)*(a))
/**
 * Vertical SSE of a single 16-pixel-wide block.
 *
 * Sums the squared differences between each pixel and the pixel one row
 * (stride elements) below it, over h-1 consecutive row pairs.
 *
 * @param c      unused
 * @param s      first pixel of the top row
 * @param dummy  unused
 * @param stride distance between vertically adjacent pixels
 * @param h      number of rows; nothing is accumulated when h <= 1
 * @return the accumulated sum of squared differences
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *lower = s + stride;

        for (col = 0; col < 16; col++) {
            const int delta = s[col] - lower[col];

            sum += SQ(delta);
        }

        s = lower;
    }

    return sum;
}
3906
3907 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3908     int score=0;
3909     int x,y;
3910
3911     for(y=1; y<h; y++){
3912         for(x=0; x<16; x++){
3913             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3914         }
3915         s1+= stride;
3916         s2+= stride;
3917     }
3918
3919     return score;
3920 }
3921
/**
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector (size elements)
 * @param pix2 second vector (size elements)
 * @param size number of elements to compare; 0 is returned when size <= 0
 * @return sum over all elements of (pix1[k] - pix2[k])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total = 0;
    int k = 0;

    while (k < size) {
        const int delta = pix1[k] - pix2[k];

        total += delta * delta;
        k++;
    }

    return total;
}
3930
/*
 * Instantiate the *16_c comparison functions that dsputil_init() installs
 * next to their *8x8_c counterparts.
 * NOTE(review): WRAPPER8_16_SQ is defined outside this part of the file;
 * presumably it defines the function named by its second argument in terms
 * of the 8x8 function named by its first -- confirm against the macro.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3941
/**
 * In-place element-wise product: dst[k] = dst[k] * src[k].
 *
 * @param dst vector that is multiplied and overwritten (len elements)
 * @param src multiplier vector (len elements)
 * @param len number of elements; nothing is done when len <= 0
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k = 0;

    while (k < len) {
        dst[k] = dst[k] * src[k];
        k++;
    }
}
3947
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[k] = src0[k] * src1[len-1-k].
 *
 * @param dst  output vector (len elements)
 * @param src0 first factor, read in forward order
 * @param src1 second factor, read in reverse order
 * @param len  number of elements; nothing is written when len <= 0
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const int last = len - 1;
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[last - k];
}
3954
/**
 * Strided multiply-add: dst[k*step] = src0[k] * src1[k] + src2[k] + src3.
 *
 * @param dst  output, written at every step-th element
 * @param src0 first factor (len elements)
 * @param src1 second factor (len elements)
 * @param src2 per-element addend (len elements)
 * @param src3 constant integer addend applied to every element
 * @param len  number of elements to process
 * @param step distance between consecutive output elements in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k = 0;

    while (k < len) {
        dst[k*step] = src0[k] * src1[k] + src2[k] + src3;
        k++;
    }
}
3960
/**
 * Windowed combination of two len-element inputs into 2*len outputs.
 *
 * For k = 0..len-1 and m = len-1-k:
 *   dst[k]       = src0[k]*win[len+m] - src1[m]*win[k]     + add_bias
 *   dst[len + m] = src0[k]*win[k]     + src1[m]*win[len+m] + add_bias
 *
 * All four inputs of an iteration are read before either output is written.
 *
 * @param dst      output, 2*len elements
 * @param src0     first input, len elements
 * @param src1     second input, len elements
 * @param win      window, 2*len elements
 * @param add_bias constant added to every output
 * @param len      half the output length
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    float *dst_hi = dst + len;
    const float *win_hi = win + len;
    int k, m;

    for (k = 0, m = len - 1; k < len; k++, m--) {
        const float in0  = src0[k];
        const float in1  = src1[m];
        const float w_lo = win[k];
        const float w_hi = win_hi[m];

        dst[k]    = in0*w_hi - in1*w_lo + add_bias;
        dst_hi[m] = in0*w_lo + in1*w_hi + add_bias;
    }
}
3975
/**
 * Converts integers to float while scaling: dst[k] = src[k] * mul.
 *
 * @param dst output vector (len elements)
 * @param src integer input vector (len elements)
 * @param mul scale factor applied to every element
 * @param len number of elements; nothing is written when len <= 0
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
3981
3982 static av_always_inline int float_to_int16_one(const float *src){
3983     int_fast32_t tmp = *(const int32_t*)src;
3984     if(tmp & 0xf0000){
3985         tmp = (0x43c0ffff - tmp)>>31;
3986         // is this faster on some gcc/cpu combinations?
3987 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3988 //      else                 tmp = 0;
3989     }
3990     return tmp - 0x8000;
3991 }
3992
/**
 * Converts len floats to int16_t using float_to_int16_one().
 *
 * The loop counter has the same type as len, so the comparison cannot
 * overflow the counter when len exceeds INT_MAX.
 *
 * @param dst output samples (len elements)
 * @param src input floats (len elements)
 * @param len number of samples to convert
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}
3998
/**
 * Converts planar float channels to interleaved int16_t samples.
 *
 * Sample i of channel c is written to dst[i*channels + c]. The two-channel
 * case is handled by a dedicated loop; all other channel counts go through
 * the generic per-channel loop.
 *
 * The sample counter and the output index have the same type as len, so
 * they cannot overflow when len (or len*channels) exceeds INT_MAX.
 *
 * @param dst      interleaved output, len*channels elements
 * @param src      array of per-channel input pointers, each len elements
 * @param len      number of samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    long i, j;
    int c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
4012
/**
 * In-place vector addition: v1[k] += v2[k] for the first order elements.
 *
 * @param v1    vector that is updated
 * @param v2    vector that is added
 * @param order number of elements to process
 */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k;

    for (k = 0; order != 0; order--, k++)
        v1[k] += v2[k];
}
4018
/**
 * In-place vector subtraction: v1[k] -= v2[k] for the first order elements.
 *
 * @param v1    vector that is updated
 * @param v2    vector that is subtracted
 * @param order number of elements to process
 */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k;

    for (k = 0; order != 0; order--, k++)
        v1[k] -= v2[k];
}
4024
/**
 * Scalar product of two int16_t vectors with per-term down-shift.
 *
 * Each product v1[k]*v2[k] is shifted right by shift before it is added
 * to the result.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements to process
 * @param shift right shift applied to every individual product
 * @return the accumulated sum
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k;

    for (k = 0; order != 0; order--, k++) {
        const int product = v1[k] * v2[k];

        acc += product >> shift;
    }

    return acc;
}
4034
/* Fixed-point IDCT coefficients: Wk = 2048*sqrt(2)*cos(k*pi/16), W0 = 2048 */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional 8-point inverse transform of a row, in place.
 * All eight inputs are read before any output is stored.
 *
 * @param b the eight consecutive coefficients of the row
 */
static void wmv2_idct_row(short * b)
{
    /* stage 1: rotate the coefficient pairs */
    const int o1 = W1*b[1]+W7*b[7];
    const int o7 = W7*b[1]-W1*b[7];
    const int o5 = W5*b[5]+W3*b[3];
    const int o3 = W3*b[5]-W5*b[3];
    const int e2 = W2*b[2]+W6*b[6];
    const int e6 = W6*b[2]-W2*b[6];
    const int e0 = W0*b[0]+W0*b[4];
    const int e4 = W0*b[0]-W0*b[4];
    /* stage 2: scale the two mixed odd terms by 181/256 */
    const int mix_a = (181*(o1-o5+o7-o3)+128)>>8;
    const int mix_b = (181*(o1-o5-o7+o3)+128)>>8;
    /* stage 3: output k is even+odd, output 7-k is even-odd */
    const int even[4] = { e0+e2, e4+e6, e4-e6, e0-e2 };
    const int odd[4]  = { o1+o5, mix_a, mix_b, o7+o3 };
    const int rnd = 1<<7;
    int k;

    for (k = 0; k < 4; k++) {
        b[k]     = (even[k] + odd[k] + rnd)>>8;
        b[7 - k] = (even[k] - odd[k] + rnd)>>8;
    }
}

/**
 * One-dimensional 8-point inverse transform of a column, in place.
 * Same structure as the row pass, but the stage 1 products are reduced by
 * 3 bits and the final shift is 14 bits.
 *
 * @param b the top element of the column; elements are 8 shorts apart
 */
static void wmv2_idct_col(short * b)
{
    /* stage 1, with extended precision */
    const int o1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    const int o7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    const int o5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    const int o3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    const int e2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    const int e6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    const int e0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    const int e4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /* stage 2 */
    const int mix_a = (181*(o1-o5+o7-o3)+128)>>8;
    const int mix_b = (181*(o1-o5-o7+o3)+128)>>8;
    /* stage 3 */
    const int even[4] = { e0+e2, e4+e6, e4-e6, e0-e2 };
    const int odd[4]  = { o1+o5, mix_a, mix_b, o7+o3 };
    const int rnd = 1<<13;
    int k;

    for (k = 0; k < 4; k++) {
        b[8*k]       = (even[k] + odd[k] + rnd)>>14;
        b[8*(7 - k)] = (even[k] - odd[k] + rnd)>>14;
    }
}

/**
 * In-place 8x8 inverse transform: all eight rows first, then all eight
 * columns.
 *
 * @param block 64 coefficients, stored row by row
 */
void ff_wmv2_idct_c(short * block){
    int n;

    for (n = 0; n < 8; n++)
        wmv2_idct_row(block + 8*n);

    for (n = 0; n < 8; n++)
        wmv2_idct_col(block + n);
}
4107 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4108  converted */
4109 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4110 {
4111     ff_wmv2_idct_c(block);
4112     put_pixels_clamped_c(block, dest, line_size);
4113 }
4114 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4115 {
4116     ff_wmv2_idct_c(block);
4117     add_pixels_clamped_c(block, dest, line_size);
4118 }
4119 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4120 {
4121     j_rev_dct (block);
4122     put_pixels_clamped_c(block, dest, line_size);
4123 }
4124 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4125 {
4126     j_rev_dct (block);
4127     add_pixels_clamped_c(block, dest, line_size);
4128 }
4129
4130 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4131 {
4132     j_rev_dct4 (block);
4133     put_pixels_clamped4_c(block, dest, line_size);
4134 }
4135 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4136 {
4137     j_rev_dct4 (block);
4138     add_pixels_clamped4_c(block, dest, line_size);
4139 }
4140
4141 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4142 {
4143     j_rev_dct2 (block);
4144     put_pixels_clamped2_c(block, dest, line_size);
4145 }
4146 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4147 {
4148     j_rev_dct2 (block);
4149     add_pixels_clamped2_c(block, dest, line_size);
4150 }
4151
4152 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4153 {
4154     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4155
4156     dest[0] = cm[(block[0] + 4)>>3];
4157 }
4158 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4159 {
4160     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4161
4162     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4163 }
4164
4165 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4166
4167 /* init static data */
4168 void dsputil_static_init(void)
4169 {
4170     int i;
4171
4172     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4173     for(i=0;i<MAX_NEG_CROP;i++) {
4174         ff_cropTbl[i] = 0;
4175         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4176     }
4177
4178     for(i=0;i<512;i++) {
4179         ff_squareTbl[i] = (i - 256) * (i - 256);
4180     }
4181
4182     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4183 }
4184
4185 int ff_check_alignment(void){
4186     static int did_fail=0;
4187     DECLARE_ALIGNED_16(int, aligned);
4188
4189     if((long)&aligned & 15){
4190         if(!did_fail){
4191 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4192             av_log(NULL, AV_LOG_ERROR,
4193                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4194                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4195                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4196                 "Do not report crashes to FFmpeg developers.\n");
4197 #endif
4198             did_fail=1;
4199         }
4200         return -1;
4201     }
4202     return 0;
4203 }
4204
/**
 * Fills a DSPContext with function pointers.
 *
 * The steps, in order:
 *  1. forward DCT selection from avctx->dct_algo (encoders only);
 *  2. inverse DCT selection from avctx->lowres and avctx->idct_algo,
 *     together with the matching idct_permutation_type;
 *  3. installation of the C implementations for every other entry;
 *  4. calls to the enabled architecture-specific init functions, which run
 *     after the C defaults are in place (presumably so that they can replace
 *     entries with optimized versions -- those functions are not visible
 *     here);
 *  5. fallback of unset 2-tap qpel entries to the h264 qpel functions;
 *  6. construction of idct_permutation[] from idct_permutation_type.
 *
 * @param c     context to fill
 * @param avctx codec context providing dct_algo, idct_algo and lowres; also
 *              passed on to the sub-initializers and used for logging
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    /* return value is not used here; the call itself emits the one-time
     * diagnostic when stack alignment is broken */
    ff_check_alignment();

#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT: lowres 1/2/3 select the j_rev_dct4/2/1 variants,
     * otherwise idct_algo decides */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(ENABLE_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            /* NOTE(review): unlike every other branch, only idct_put is set
             * here; idct_add and idct keep whatever value they had before
             * the call -- confirm this is intended */
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }

    /* C implementations of the pixel helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* dspfunc(PFX, IDX, NUM) fills row IDX of the PFX pixel table with the
     * four NUM-wide functions (plain, _x2, _y2, _xy2) */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* tpel tables: indices 3 and 7 are not assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* dspfunc(PFX, IDX, NUM) is redefined to fill the 16 entries mc00..mc33
     * of row IDX of a qpel table */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

    c->draw_edges = draw_edges_c;

    /* codec-specific sub-initializers, compiled in only when the
     * corresponding codec is configured */
#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif
#if defined(CONFIG_RV40_DECODER)
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* SET_CMP_FUNC(name) installs name16_c in slot 0 and name8x8_c in slot 1 */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
#ifdef CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    /* no C implementation is installed for this entry */
    c->h264_loop_filter_strength= NULL;

    if (ENABLE_ANY_H263) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#ifdef CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
#ifdef CONFIG_FLAC_ENCODER
    c->flac_compute_autocorr = ff_flac_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->add_int16 = add_int16_c;
    c->sub_int16 = sub_int16_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;

    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    /* zero the 2-tap qpel tables so that entries left unset by the
     * architecture init calls below can be detected afterwards */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture-specific initializers run last, after all C defaults */
    if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
    if (ENABLE_ARM)      dsputil_init_arm   (c, avctx);
    if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
    if (ENABLE_VIS)      dsputil_init_vis   (c, avctx);
    if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
    if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
    if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
    if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
    if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);

    /* any 2-tap qpel entry that is still NULL falls back to the h264 qpel
     * function at the same position.
     * NOTE(review): the loop indexes [0][i] with i up to 63, which
     * presumably relies on both tables being laid out as 4 contiguous rows
     * of 16 entries -- confirm against the DSPContext declaration */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation for the selected IDCT; an unknown
     * type is only logged and leaves idct_permutation[] unwritten */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
4602