libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "h263.h"
  36 #include "snow.h"
  37
  38 /* snow.c */
  39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  40
  41 /* vorbis.c */
  42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  43
  44 /* flacenc.c */
  45 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  46
  47 /* pngdec.c */
  48 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  49
/* Global lookup tables, zero-initialised here; presumably filled in by an
   init routine outside this part of the file -- TODO confirm.
   ff_cropTbl is read through (ff_cropTbl + MAX_NEG_CROP) by the
   put/add_pixels_clamped functions, ff_squareTbl through (ff_squareTbl + 256)
   by pix_norm1_c and the sse functions, which index it with sample
   differences. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };
  52
  53 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  54 #define pb_7f (~0UL/255 * 0x7f)
  55 #define pb_80 (~0UL/255 * 0x80)
  56
/* Zigzag scan order for an 8x8 block: entry i is the coefficient position
   (0..63) visited at scan step i. The values look like row*8+column
   raster positions -- TODO confirm against the users of this table. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
  67
  68 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  69    specification, we interleave the fields */
  70 const uint8_t ff_zigzag248_direct[64] = {
  71      0,  8,  1,  9, 16, 24,  2, 10,
  72     17, 25, 32, 40, 48, 56, 33, 41,
  73     18, 26,  3, 11,  4, 12, 19, 27,
  74     34, 42, 49, 57, 50, 58, 35, 43,
  75     20, 28,  5, 13,  6, 14, 21, 29,
  76     36, 44, 51, 59, 52, 60, 37, 45,
  77     22, 30,  7, 15, 23, 31, 38, 46,
  78     53, 61, 54, 62, 39, 47, 55, 63,
  79 };
  80
  81 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  82 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  83
/* Alternate scan order that favours horizontally adjacent coefficients:
   entry i is the coefficient position (0..63) visited at scan step i. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
  94
/* Alternate scan order that favours vertically adjacent coefficients:
   entry i is the coefficient position (0..63) visited at scan step i. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
 105
 106 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
 107 const uint32_t ff_inverse[256]={
 108          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 109  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 110  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 111  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 112  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 113  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 114   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 115   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 116   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 117   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 118   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 119   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 120   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 121   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 122   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 123   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 124   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 125   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 126   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 127   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 128   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 129   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 130   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 131   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 132   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 133   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 134   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 135   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 136   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 137   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 138   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 139   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 140 };
 141
 142 /* Input permutation for the simple_idct_mmx */
 143 static const uint8_t simple_mmx_permutation[64]={
 144         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 145         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 146         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 147         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 148         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 149         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 150         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 151         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 152 };
 153
 154 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 155     int i;
 156     int end;
 157
 158     st->scantable= src_scantable;
 159
 160     for(i=0; i<64; i++){
 161         int j;
 162         j = src_scantable[i];
 163         st->permutated[i] = permutation[j];
 164 #ifdef ARCH_POWERPC
 165         st->inverse[j] = i;
 166 #endif
 167     }
 168
 169     end=-1;
 170     for(i=0; i<64; i++){
 171         int j;
 172         j = st->permutated[i];
 173         if(j>end) end=j;
 174         st->raster_end[i]= end;
 175     }
 176 }
 177
 178 static int pix_sum_c(uint8_t * pix, int line_size)
 179 {
 180     int s, i, j;
 181
 182     s = 0;
 183     for (i = 0; i < 16; i++) {
 184         for (j = 0; j < 16; j += 8) {
 185             s += pix[0];
 186             s += pix[1];
 187             s += pix[2];
 188             s += pix[3];
 189             s += pix[4];
 190             s += pix[5];
 191             s += pix[6];
 192             s += pix[7];
 193             pix += 8;
 194         }
 195         pix += line_size - 16;
 196     }
 197     return s;
 198 }
 199
 200 static int pix_norm1_c(uint8_t * pix, int line_size)
 201 {
 202     int s, i, j;
 203     uint32_t *sq = ff_squareTbl + 256;
 204
 205     s = 0;
 206     for (i = 0; i < 16; i++) {
 207         for (j = 0; j < 16; j += 8) {
 208 #if 0
 209             s += sq[pix[0]];
 210             s += sq[pix[1]];
 211             s += sq[pix[2]];
 212             s += sq[pix[3]];
 213             s += sq[pix[4]];
 214             s += sq[pix[5]];
 215             s += sq[pix[6]];
 216             s += sq[pix[7]];
 217 #else
 218 #if LONG_MAX > 2147483647
 219             register uint64_t x=*(uint64_t*)pix;
 220             s += sq[x&0xff];
 221             s += sq[(x>>8)&0xff];
 222             s += sq[(x>>16)&0xff];
 223             s += sq[(x>>24)&0xff];
 224             s += sq[(x>>32)&0xff];
 225             s += sq[(x>>40)&0xff];
 226             s += sq[(x>>48)&0xff];
 227             s += sq[(x>>56)&0xff];
 228 #else
 229             register uint32_t x=*(uint32_t*)pix;
 230             s += sq[x&0xff];
 231             s += sq[(x>>8)&0xff];
 232             s += sq[(x>>16)&0xff];
 233             s += sq[(x>>24)&0xff];
 234             x=*(uint32_t*)(pix+4);
 235             s += sq[x&0xff];
 236             s += sq[(x>>8)&0xff];
 237             s += sq[(x>>16)&0xff];
 238             s += sq[(x>>24)&0xff];
 239 #endif
 240 #endif
 241             pix += 8;
 242         }
 243         pix += line_size - 16;
 244     }
 245     return s;
 246 }
 247
 248 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 249     int i;
 250
 251     for(i=0; i+8<=w; i+=8){
 252         dst[i+0]= bswap_32(src[i+0]);
 253         dst[i+1]= bswap_32(src[i+1]);
 254         dst[i+2]= bswap_32(src[i+2]);
 255         dst[i+3]= bswap_32(src[i+3]);
 256         dst[i+4]= bswap_32(src[i+4]);
 257         dst[i+5]= bswap_32(src[i+5]);
 258         dst[i+6]= bswap_32(src[i+6]);
 259         dst[i+7]= bswap_32(src[i+7]);
 260     }
 261     for(;i<w; i++){
 262         dst[i+0]= bswap_32(src[i+0]);
 263     }
 264 }
 265
 266 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 267 {
 268     int s, i;
 269     uint32_t *sq = ff_squareTbl + 256;
 270
 271     s = 0;
 272     for (i = 0; i < h; i++) {
 273         s += sq[pix1[0] - pix2[0]];
 274         s += sq[pix1[1] - pix2[1]];
 275         s += sq[pix1[2] - pix2[2]];
 276         s += sq[pix1[3] - pix2[3]];
 277         pix1 += line_size;
 278         pix2 += line_size;
 279     }
 280     return s;
 281 }
 282
 283 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 284 {
 285     int s, i;
 286     uint32_t *sq = ff_squareTbl + 256;
 287
 288     s = 0;
 289     for (i = 0; i < h; i++) {
 290         s += sq[pix1[0] - pix2[0]];
 291         s += sq[pix1[1] - pix2[1]];
 292         s += sq[pix1[2] - pix2[2]];
 293         s += sq[pix1[3] - pix2[3]];
 294         s += sq[pix1[4] - pix2[4]];
 295         s += sq[pix1[5] - pix2[5]];
 296         s += sq[pix1[6] - pix2[6]];
 297         s += sq[pix1[7] - pix2[7]];
 298         pix1 += line_size;
 299         pix2 += line_size;
 300     }
 301     return s;
 302 }
 303
 304 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 305 {
 306     int s, i;
 307     uint32_t *sq = ff_squareTbl + 256;
 308
 309     s = 0;
 310     for (i = 0; i < h; i++) {
 311         s += sq[pix1[ 0] - pix2[ 0]];
 312         s += sq[pix1[ 1] - pix2[ 1]];
 313         s += sq[pix1[ 2] - pix2[ 2]];
 314         s += sq[pix1[ 3] - pix2[ 3]];
 315         s += sq[pix1[ 4] - pix2[ 4]];
 316         s += sq[pix1[ 5] - pix2[ 5]];
 317         s += sq[pix1[ 6] - pix2[ 6]];
 318         s += sq[pix1[ 7] - pix2[ 7]];
 319         s += sq[pix1[ 8] - pix2[ 8]];
 320         s += sq[pix1[ 9] - pix2[ 9]];
 321         s += sq[pix1[10] - pix2[10]];
 322         s += sq[pix1[11] - pix2[11]];
 323         s += sq[pix1[12] - pix2[12]];
 324         s += sq[pix1[13] - pix2[13]];
 325         s += sq[pix1[14] - pix2[14]];
 326         s += sq[pix1[15] - pix2[15]];
 327
 328         pix1 += line_size;
 329         pix2 += line_size;
 330     }
 331     return s;
 332 }
 333
 334
 335 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 336 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 337     int s, i, j;
 338     const int dec_count= w==8 ? 3 : 4;
 339     int tmp[32*32];
 340     int level, ori;
 341     static const int scale[2][2][4][4]={
 342       {
 343         {
 344             // 9/7 8x8 dec=3
 345             {268, 239, 239, 213},
 346             {  0, 224, 224, 152},
 347             {  0, 135, 135, 110},
 348         },{
 349             // 9/7 16x16 or 32x32 dec=4
 350             {344, 310, 310, 280},
 351             {  0, 320, 320, 228},
 352             {  0, 175, 175, 136},
 353             {  0, 129, 129, 102},
 354         }
 355       },{
 356         {
 357             // 5/3 8x8 dec=3
 358             {275, 245, 245, 218},
 359             {  0, 230, 230, 156},
 360             {  0, 138, 138, 113},
 361         },{
 362             // 5/3 16x16 or 32x32 dec=4
 363             {352, 317, 317, 286},
 364             {  0, 328, 328, 233},
 365             {  0, 180, 180, 140},
 366             {  0, 132, 132, 105},
 367         }
 368       }
 369     };
 370
 371     for (i = 0; i < h; i++) {
 372         for (j = 0; j < w; j+=4) {
 373             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 374             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 375             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 376             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 377         }
 378         pix1 += line_size;
 379         pix2 += line_size;
 380     }
 381
 382     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 383
 384     s=0;
 385     assert(w==h);
 386     for(level=0; level<dec_count; level++){
 387         for(ori= level ? 1 : 0; ori<4; ori++){
 388             int size= w>>(dec_count-level);
 389             int sx= (ori&1) ? size : 0;
 390             int stride= 32<<(dec_count-level);
 391             int sy= (ori&2) ? stride>>1 : 0;
 392
 393             for(i=0; i<size; i++){
 394                 for(j=0; j<size; j++){
 395                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 396                     s += FFABS(v);
 397                 }
 398             }
 399         }
 400     }
 401     assert(s>=0);
 402     return s>>9;
 403 }
 404
 405 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 406     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 407 }
 408
 409 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 410     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 411 }
 412
 413 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 414     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 415 }
 416
 417 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 418     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 419 }
 420
 421 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 422     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 423 }
 424
 425 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 426     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 427 }
 428 #endif
 429
 430 /* draw the edges of width 'w' of an image of size width, height */
 431 //FIXME check that this is ok for mpeg4 interlaced
 432 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 433 {
 434     uint8_t *ptr, *last_line;
 435     int i;
 436
 437     last_line = buf + (height - 1) * wrap;
 438     for(i=0;i<w;i++) {
 439         /* top and bottom */
 440         memcpy(buf - (i + 1) * wrap, buf, width);
 441         memcpy(last_line + (i + 1) * wrap, last_line, width);
 442     }
 443     /* left and right */
 444     ptr = buf;
 445     for(i=0;i<height;i++) {
 446         memset(ptr - w, ptr[0], w);
 447         memset(ptr + width, ptr[width-1], w);
 448         ptr += wrap;
 449     }
 450     /* corners */
 451     for(i=0;i<w;i++) {
 452         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 453         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 454         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 455         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 456     }
 457 }
 458
 459 /**
 460  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 461  * @param buf destination buffer
 462  * @param src source buffer
 463  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 464  * @param block_w width of block
 465  * @param block_h height of block
 466  * @param src_x x coordinate of the top left sample of the block in the source buffer
 467  * @param src_y y coordinate of the top left sample of the block in the source buffer
 468  * @param w width of the source buffer
 469  * @param h height of the source buffer
 470  */
 471 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 472                                     int src_x, int src_y, int w, int h){
 473     int x, y;
 474     int start_y, start_x, end_y, end_x;
 475
 476     if(src_y>= h){
 477         src+= (h-1-src_y)*linesize;
 478         src_y=h-1;
 479     }else if(src_y<=-block_h){
 480         src+= (1-block_h-src_y)*linesize;
 481         src_y=1-block_h;
 482     }
 483     if(src_x>= w){
 484         src+= (w-1-src_x);
 485         src_x=w-1;
 486     }else if(src_x<=-block_w){
 487         src+= (1-block_w-src_x);
 488         src_x=1-block_w;
 489     }
 490
 491     start_y= FFMAX(0, -src_y);
 492     start_x= FFMAX(0, -src_x);
 493     end_y= FFMIN(block_h, h-src_y);
 494     end_x= FFMIN(block_w, w-src_x);
 495
 496     // copy existing part
 497     for(y=start_y; y<end_y; y++){
 498         for(x=start_x; x<end_x; x++){
 499             buf[x + y*linesize]= src[x + y*linesize];
 500         }
 501     }
 502
 503     //top
 504     for(y=0; y<start_y; y++){
 505         for(x=start_x; x<end_x; x++){
 506             buf[x + y*linesize]= buf[x + start_y*linesize];
 507         }
 508     }
 509
 510     //bottom
 511     for(y=end_y; y<block_h; y++){
 512         for(x=start_x; x<end_x; x++){
 513             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 514         }
 515     }
 516
 517     for(y=0; y<block_h; y++){
 518        //left
 519         for(x=0; x<start_x; x++){
 520             buf[x + y*linesize]= buf[start_x + y*linesize];
 521         }
 522
 523        //right
 524         for(x=end_x; x<block_w; x++){
 525             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 526         }
 527     }
 528 }
 529
 530 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 531 {
 532     int i;
 533
 534     /* read the pixels */
 535     for(i=0;i<8;i++) {
 536         block[0] = pixels[0];
 537         block[1] = pixels[1];
 538         block[2] = pixels[2];
 539         block[3] = pixels[3];
 540         block[4] = pixels[4];
 541         block[5] = pixels[5];
 542         block[6] = pixels[6];
 543         block[7] = pixels[7];
 544         pixels += line_size;
 545         block += 8;
 546     }
 547 }
 548
 549 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 550                           const uint8_t *s2, int stride){
 551     int i;
 552
 553     /* read the pixels */
 554     for(i=0;i<8;i++) {
 555         block[0] = s1[0] - s2[0];
 556         block[1] = s1[1] - s2[1];
 557         block[2] = s1[2] - s2[2];
 558         block[3] = s1[3] - s2[3];
 559         block[4] = s1[4] - s2[4];
 560         block[5] = s1[5] - s2[5];
 561         block[6] = s1[6] - s2[6];
 562         block[7] = s1[7] - s2[7];
 563         s1 += stride;
 564         s2 += stride;
 565         block += 8;
 566     }
 567 }
 568
 569
 570 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 571                                  int line_size)
 572 {
 573     int i;
 574     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 575
 576     /* read the pixels */
 577     for(i=0;i<8;i++) {
 578         pixels[0] = cm[block[0]];
 579         pixels[1] = cm[block[1]];
 580         pixels[2] = cm[block[2]];
 581         pixels[3] = cm[block[3]];
 582         pixels[4] = cm[block[4]];
 583         pixels[5] = cm[block[5]];
 584         pixels[6] = cm[block[6]];
 585         pixels[7] = cm[block[7]];
 586
 587         pixels += line_size;
 588         block += 8;
 589     }
 590 }
 591
 592 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 593                                  int line_size)
 594 {
 595     int i;
 596     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 597
 598     /* read the pixels */
 599     for(i=0;i<4;i++) {
 600         pixels[0] = cm[block[0]];
 601         pixels[1] = cm[block[1]];
 602         pixels[2] = cm[block[2]];
 603         pixels[3] = cm[block[3]];
 604
 605         pixels += line_size;
 606         block += 8;
 607     }
 608 }
 609
 610 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 611                                  int line_size)
 612 {
 613     int i;
 614     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 615
 616     /* read the pixels */
 617     for(i=0;i<2;i++) {
 618         pixels[0] = cm[block[0]];
 619         pixels[1] = cm[block[1]];
 620
 621         pixels += line_size;
 622         block += 8;
 623     }
 624 }
 625
 626 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 627                                         uint8_t *restrict pixels,
 628                                         int line_size)
 629 {
 630     int i, j;
 631
 632     for (i = 0; i < 8; i++) {
 633         for (j = 0; j < 8; j++) {
 634             if (*block < -128)
 635                 *pixels = 0;
 636             else if (*block > 127)
 637                 *pixels = 255;
 638             else
 639                 *pixels = (uint8_t)(*block + 128);
 640             block++;
 641             pixels++;
 642         }
 643         pixels += (line_size - 8);
 644     }
 645 }
 646
 647 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 648                           int line_size)
 649 {
 650     int i;
 651     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 652
 653     /* read the pixels */
 654     for(i=0;i<8;i++) {
 655         pixels[0] = cm[pixels[0] + block[0]];
 656         pixels[1] = cm[pixels[1] + block[1]];
 657         pixels[2] = cm[pixels[2] + block[2]];
 658         pixels[3] = cm[pixels[3] + block[3]];
 659         pixels[4] = cm[pixels[4] + block[4]];
 660         pixels[5] = cm[pixels[5] + block[5]];
 661         pixels[6] = cm[pixels[6] + block[6]];
 662         pixels[7] = cm[pixels[7] + block[7]];
 663         pixels += line_size;
 664         block += 8;
 665     }
 666 }
 667
 668 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 669                           int line_size)
 670 {
 671     int i;
 672     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 673
 674     /* read the pixels */
 675     for(i=0;i<4;i++) {
 676         pixels[0] = cm[pixels[0] + block[0]];
 677         pixels[1] = cm[pixels[1] + block[1]];
 678         pixels[2] = cm[pixels[2] + block[2]];
 679         pixels[3] = cm[pixels[3] + block[3]];
 680         pixels += line_size;
 681         block += 8;
 682     }
 683 }
 684
 685 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 686                           int line_size)
 687 {
 688     int i;
 689     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 690
 691     /* read the pixels */
 692     for(i=0;i<2;i++) {
 693         pixels[0] = cm[pixels[0] + block[0]];
 694         pixels[1] = cm[pixels[1] + block[1]];
 695         pixels += line_size;
 696         block += 8;
 697     }
 698 }
 699
 700 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 701 {
 702     int i;
 703     for(i=0;i<8;i++) {
 704         pixels[0] += block[0];
 705         pixels[1] += block[1];
 706         pixels[2] += block[2];
 707         pixels[3] += block[3];
 708         pixels[4] += block[4];
 709         pixels[5] += block[5];
 710         pixels[6] += block[6];
 711         pixels[7] += block[7];
 712         pixels += line_size;
 713         block += 8;
 714     }
 715 }
 716
 717 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 718 {
 719     int i;
 720     for(i=0;i<4;i++) {
 721         pixels[0] += block[0];
 722         pixels[1] += block[1];
 723         pixels[2] += block[2];
 724         pixels[3] += block[3];
 725         pixels += line_size;
 726         block += 4;
 727     }
 728 }
 729
 730 static int sum_abs_dctelem_c(DCTELEM *block)
 731 {
 732     int sum=0, i;
 733     for(i=0; i<64; i++)
 734         sum+= FFABS(block[i]);
 735     return sum;
 736 }
 737
 738 #if 0
 739
/* Disabled (inside "#if 0") variant of PIXOP2 that handles 8 bytes per row
   with a single 64-bit access. For a given OPNAME/OP pair it generates the
   copy, x2, y2 and xy2 functions plus their no_rnd counterparts, and builds
   the 16-wide versions with CALL_2X_PIXELS.
   NOTE(review): the first generated function is named OPNAME_pixels, while
   CALL_2X_PIXELS below refers to OPNAME_pixels_c, so this variant would need
   fixing before it could be re-enabled. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 878
 879 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 880 #else // 64 bit variant
 881
 882 #define PIXOP2(OPNAME, OP) \
 883 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 884     int i;\
 885     for(i=0; i<h; i++){\
 886         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 887         pixels+=line_size;\
 888         block +=line_size;\
 889     }\
 890 }\
 891 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 892     int i;\
 893     for(i=0; i<h; i++){\
 894         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 895         pixels+=line_size;\
 896         block +=line_size;\
 897     }\
 898 }\
 899 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 900     int i;\
 901     for(i=0; i<h; i++){\
 902         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 903         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 904         pixels+=line_size;\
 905         block +=line_size;\
 906     }\
 907 }\
 908 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 909     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 910 }\
 911 \
 912 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 913                                                 int src_stride1, int src_stride2, int h){\
 914     int i;\
 915     for(i=0; i<h; i++){\
 916         uint32_t a,b;\
 917         a= AV_RN32(&src1[i*src_stride1  ]);\
 918         b= AV_RN32(&src2[i*src_stride2  ]);\
 919         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 920         a= AV_RN32(&src1[i*src_stride1+4]);\
 921         b= AV_RN32(&src2[i*src_stride2+4]);\
 922         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 923     }\
 924 }\
 925 \
 926 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 927                                                 int src_stride1, int src_stride2, int h){\
 928     int i;\
 929     for(i=0; i<h; i++){\
 930         uint32_t a,b;\
 931         a= AV_RN32(&src1[i*src_stride1  ]);\
 932         b= AV_RN32(&src2[i*src_stride2  ]);\
 933         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 934         a= AV_RN32(&src1[i*src_stride1+4]);\
 935         b= AV_RN32(&src2[i*src_stride2+4]);\
 936         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 937     }\
 938 }\
 939 \
 940 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 941                                                 int src_stride1, int src_stride2, int h){\
 942     int i;\
 943     for(i=0; i<h; i++){\
 944         uint32_t a,b;\
 945         a= AV_RN32(&src1[i*src_stride1  ]);\
 946         b= AV_RN32(&src2[i*src_stride2  ]);\
 947         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 948     }\
 949 }\
 950 \
 951 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 952                                                 int src_stride1, int src_stride2, int h){\
 953     int i;\
 954     for(i=0; i<h; i++){\
 955         uint32_t a,b;\
 956         a= AV_RN16(&src1[i*src_stride1  ]);\
 957         b= AV_RN16(&src2[i*src_stride2  ]);\
 958         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 959     }\
 960 }\
 961 \
 962 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 963                                                 int src_stride1, int src_stride2, int h){\
 964     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 965     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 966 }\
 967 \
 968 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 969                                                 int src_stride1, int src_stride2, int h){\
 970     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 971     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 972 }\
 973 \
 974 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 975     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 976 }\
 977 \
 978 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 979     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 980 }\
 981 \
 982 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 983     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 984 }\
 985 \
 986 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 987     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 988 }\
 989 \
 990 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 991                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 992     int i;\
 993     for(i=0; i<h; i++){\
 994         uint32_t a, b, c, d, l0, l1, h0, h1;\
 995         a= AV_RN32(&src1[i*src_stride1]);\
 996         b= AV_RN32(&src2[i*src_stride2]);\
 997         c= AV_RN32(&src3[i*src_stride3]);\
 998         d= AV_RN32(&src4[i*src_stride4]);\
 999         l0=  (a&0x03030303UL)\
1000            + (b&0x03030303UL)\
1001            + 0x02020202UL;\
1002         h0= ((a&0xFCFCFCFCUL)>>2)\
1003           + ((b&0xFCFCFCFCUL)>>2);\
1004         l1=  (c&0x03030303UL)\
1005            + (d&0x03030303UL);\
1006         h1= ((c&0xFCFCFCFCUL)>>2)\
1007           + ((d&0xFCFCFCFCUL)>>2);\
1008         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1009         a= AV_RN32(&src1[i*src_stride1+4]);\
1010         b= AV_RN32(&src2[i*src_stride2+4]);\
1011         c= AV_RN32(&src3[i*src_stride3+4]);\
1012         d= AV_RN32(&src4[i*src_stride4+4]);\
1013         l0=  (a&0x03030303UL)\
1014            + (b&0x03030303UL)\
1015            + 0x02020202UL;\
1016         h0= ((a&0xFCFCFCFCUL)>>2)\
1017           + ((b&0xFCFCFCFCUL)>>2);\
1018         l1=  (c&0x03030303UL)\
1019            + (d&0x03030303UL);\
1020         h1= ((c&0xFCFCFCFCUL)>>2)\
1021           + ((d&0xFCFCFCFCUL)>>2);\
1022         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1023     }\
1024 }\
1025 \
1026 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1027     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1028 }\
1029 \
1030 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1031     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1032 }\
1033 \
1034 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1035     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1036 }\
1037 \
1038 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1039     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1040 }\
1041 \
1042 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1043                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1044     int i;\
1045     for(i=0; i<h; i++){\
1046         uint32_t a, b, c, d, l0, l1, h0, h1;\
1047         a= AV_RN32(&src1[i*src_stride1]);\
1048         b= AV_RN32(&src2[i*src_stride2]);\
1049         c= AV_RN32(&src3[i*src_stride3]);\
1050         d= AV_RN32(&src4[i*src_stride4]);\
1051         l0=  (a&0x03030303UL)\
1052            + (b&0x03030303UL)\
1053            + 0x01010101UL;\
1054         h0= ((a&0xFCFCFCFCUL)>>2)\
1055           + ((b&0xFCFCFCFCUL)>>2);\
1056         l1=  (c&0x03030303UL)\
1057            + (d&0x03030303UL);\
1058         h1= ((c&0xFCFCFCFCUL)>>2)\
1059           + ((d&0xFCFCFCFCUL)>>2);\
1060         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1061         a= AV_RN32(&src1[i*src_stride1+4]);\
1062         b= AV_RN32(&src2[i*src_stride2+4]);\
1063         c= AV_RN32(&src3[i*src_stride3+4]);\
1064         d= AV_RN32(&src4[i*src_stride4+4]);\
1065         l0=  (a&0x03030303UL)\
1066            + (b&0x03030303UL)\
1067            + 0x01010101UL;\
1068         h0= ((a&0xFCFCFCFCUL)>>2)\
1069           + ((b&0xFCFCFCFCUL)>>2);\
1070         l1=  (c&0x03030303UL)\
1071            + (d&0x03030303UL);\
1072         h1= ((c&0xFCFCFCFCUL)>>2)\
1073           + ((d&0xFCFCFCFCUL)>>2);\
1074         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1075     }\
1076 }\
1077 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1078                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1079     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1080     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1081 }\
1082 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1083                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1084     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1085     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1086 }\
1087 \
1088 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1089 {\
1090         int i, a0, b0, a1, b1;\
1091         a0= pixels[0];\
1092         b0= pixels[1] + 2;\
1093         a0 += b0;\
1094         b0 += pixels[2];\
1095 \
1096         pixels+=line_size;\
1097         for(i=0; i<h; i+=2){\
1098             a1= pixels[0];\
1099             b1= pixels[1];\
1100             a1 += b1;\
1101             b1 += pixels[2];\
1102 \
1103             block[0]= (a1+a0)>>2; /* FIXME non put */\
1104             block[1]= (b1+b0)>>2;\
1105 \
1106             pixels+=line_size;\
1107             block +=line_size;\
1108 \
1109             a0= pixels[0];\
1110             b0= pixels[1] + 2;\
1111             a0 += b0;\
1112             b0 += pixels[2];\
1113 \
1114             block[0]= (a1+a0)>>2;\
1115             block[1]= (b1+b0)>>2;\
1116             pixels+=line_size;\
1117             block +=line_size;\
1118         }\
1119 }\
1120 \
1121 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1122 {\
1123         int i;\
1124         const uint32_t a= AV_RN32(pixels  );\
1125         const uint32_t b= AV_RN32(pixels+1);\
1126         uint32_t l0=  (a&0x03030303UL)\
1127                     + (b&0x03030303UL)\
1128                     + 0x02020202UL;\
1129         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1130                    + ((b&0xFCFCFCFCUL)>>2);\
1131         uint32_t l1,h1;\
1132 \
1133         pixels+=line_size;\
1134         for(i=0; i<h; i+=2){\
1135             uint32_t a= AV_RN32(pixels  );\
1136             uint32_t b= AV_RN32(pixels+1);\
1137             l1=  (a&0x03030303UL)\
1138                + (b&0x03030303UL);\
1139             h1= ((a&0xFCFCFCFCUL)>>2)\
1140               + ((b&0xFCFCFCFCUL)>>2);\
1141             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1142             pixels+=line_size;\
1143             block +=line_size;\
1144             a= AV_RN32(pixels  );\
1145             b= AV_RN32(pixels+1);\
1146             l0=  (a&0x03030303UL)\
1147                + (b&0x03030303UL)\
1148                + 0x02020202UL;\
1149             h0= ((a&0xFCFCFCFCUL)>>2)\
1150               + ((b&0xFCFCFCFCUL)>>2);\
1151             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1152             pixels+=line_size;\
1153             block +=line_size;\
1154         }\
1155 }\
1156 \
1157 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1158 {\
1159     int j;\
1160     for(j=0; j<2; j++){\
1161         int i;\
1162         const uint32_t a= AV_RN32(pixels  );\
1163         const uint32_t b= AV_RN32(pixels+1);\
1164         uint32_t l0=  (a&0x03030303UL)\
1165                     + (b&0x03030303UL)\
1166                     + 0x02020202UL;\
1167         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1168                    + ((b&0xFCFCFCFCUL)>>2);\
1169         uint32_t l1,h1;\
1170 \
1171         pixels+=line_size;\
1172         for(i=0; i<h; i+=2){\
1173             uint32_t a= AV_RN32(pixels  );\
1174             uint32_t b= AV_RN32(pixels+1);\
1175             l1=  (a&0x03030303UL)\
1176                + (b&0x03030303UL);\
1177             h1= ((a&0xFCFCFCFCUL)>>2)\
1178               + ((b&0xFCFCFCFCUL)>>2);\
1179             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1180             pixels+=line_size;\
1181             block +=line_size;\
1182             a= AV_RN32(pixels  );\
1183             b= AV_RN32(pixels+1);\
1184             l0=  (a&0x03030303UL)\
1185                + (b&0x03030303UL)\
1186                + 0x02020202UL;\
1187             h0= ((a&0xFCFCFCFCUL)>>2)\
1188               + ((b&0xFCFCFCFCUL)>>2);\
1189             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1190             pixels+=line_size;\
1191             block +=line_size;\
1192         }\
1193         pixels+=4-line_size*(h+1);\
1194         block +=4-line_size*h;\
1195     }\
1196 }\
1197 \
1198 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1199 {\
1200     int j;\
1201     for(j=0; j<2; j++){\
1202         int i;\
1203         const uint32_t a= AV_RN32(pixels  );\
1204         const uint32_t b= AV_RN32(pixels+1);\
1205         uint32_t l0=  (a&0x03030303UL)\
1206                     + (b&0x03030303UL)\
1207                     + 0x01010101UL;\
1208         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1209                    + ((b&0xFCFCFCFCUL)>>2);\
1210         uint32_t l1,h1;\
1211 \
1212         pixels+=line_size;\
1213         for(i=0; i<h; i+=2){\
1214             uint32_t a= AV_RN32(pixels  );\
1215             uint32_t b= AV_RN32(pixels+1);\
1216             l1=  (a&0x03030303UL)\
1217                + (b&0x03030303UL);\
1218             h1= ((a&0xFCFCFCFCUL)>>2)\
1219               + ((b&0xFCFCFCFCUL)>>2);\
1220             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1221             pixels+=line_size;\
1222             block +=line_size;\
1223             a= AV_RN32(pixels  );\
1224             b= AV_RN32(pixels+1);\
1225             l0=  (a&0x03030303UL)\
1226                + (b&0x03030303UL)\
1227                + 0x01010101UL;\
1228             h0= ((a&0xFCFCFCFCUL)>>2)\
1229               + ((b&0xFCFCFCFCUL)>>2);\
1230             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1231             pixels+=line_size;\
1232             block +=line_size;\
1233         }\
1234         pixels+=4-line_size*(h+1);\
1235         block +=4-line_size*h;\
1236     }\
1237 }\
1238 \
1239 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1240 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1241 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1242 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1243 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1244 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1245 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1246 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1247
/* Store operator of the "avg" family for the 32-bit code path: the destination
 * word is replaced by rnd_avg32() of its old contents and the new word. */
#define op_avg(dst, val) dst = rnd_avg32(dst, val)
1249 #endif
1250 #define op_put(a, b) a = b
1251
1252 PIXOP2(avg, op_avg)
1253 PIXOP2(put, op_put)
1254 #undef op_avg
1255 #undef op_put
1256
/* Rounded integer means of two and of four values.
 * Every argument is parenthesized so that operands containing operators of
 * lower precedence than '+' (e.g. x|y, x>>1, c?a:b) are averaged as a whole.
 * Arguments are still evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1259
/**
 * Single-stride front end for put_no_rnd_pixels16_l2().
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line size used for dst, a and b alike
 * @param h      number of rows
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* destination and both sources share one line size */
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride, h);
}
1263
/**
 * Single-stride front end for put_no_rnd_pixels8_l2().
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line size used for dst, a and b alike
 * @param h      number of rows
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    /* destination and both sources share one line size */
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride, h);
}
1267
/**
 * Bilinear interpolation of an 8 pixel wide block with one fixed sub-pixel
 * offset in 1/16 pixel units.
 *
 * Each output pixel is the weighted sum of the source pixel, its right
 * neighbour, the pixel below and the pixel below-right; the four weights add
 * up to 256 and the sum is shifted right by 8 after adding rounder.
 *
 * @param dst     destination, 8 pixels are written per row
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  line size of both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction (weights assume 0..16)
 * @param y16     vertical fraction (weights assume 0..16)
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);   /* top-left     */
    const int w_tr =       x16  * (16 - y16);   /* top-right    */
    const int w_bl = (16 - x16) *       y16;    /* bottom-left  */
    const int w_br =       x16  *       y16;    /* bottom-right */
    int row, col;

    for (row = 0; row < h; row++, dst += stride, src += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1] +
                        w_bl * below[col] + w_br * below[col + 1] + rounder) >> 8;
        }
    }
}
1290
/**
 * Warp an 8 pixel wide, h rows high block out of src using an affine mapping
 * of the sample position and bilinear interpolation.
 *
 * The sample position of output pixel (x, y) is ox + x*dxx + y*dxy
 * (horizontal) and oy + x*dyx + y*dyy (vertical). The low 16 bits of these
 * accumulators are dropped, the next `shift` bits form the interpolation
 * fraction and the remaining bits are the integer source coordinate.
 *
 * A coordinate outside [0, width-1) resp. [0, height-1) is passed through
 * av_clip() (presumably a clamp to the given range -- confirm against its
 * definition) and interpolation along that axis is skipped.
 *
 * @param dst    destination block, line size stride
 * @param src    source picture, line size stride
 * @param stride line size shared by dst and src
 * @param h      number of output rows
 * @param ox,oy  sample position accumulators for output pixel (0, 0)
 * @param dxx,dyx increment of the horizontal/vertical position per output column
 * @param dxy,dyy increment of the horizontal/vertical position per output row
 * @param shift  number of fraction bits used for interpolation
 * @param r      rounding value added before the final shift by 2*shift
 * @param width  source width used for the range test
 * @param height source height used for the range test
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int x_last    = width  - 1;
    const int y_last    = height - 1;
    int row;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            const int frac_x = (vx >> 16) & frac_mask;
            const int frac_y = (vy >> 16) & frac_mask;
            const int src_x  = (vx >> 16) >> shift;
            const int src_y  = (vy >> 16) >> shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_inside = (unsigned)src_x < x_last;
            const int y_inside = (unsigned)src_y < y_last;
            int index;

            if (x_inside && y_inside) {
                /* full bilinear blend of the 2x2 neighbourhood */
                index = src_x + src_y * stride;
                out[col] = (  (  src[index           ] * (s - frac_x)
                               + src[index        + 1] *      frac_x ) * (s - frac_y)
                            + (  src[index + stride    ] * (s - frac_x)
                               + src[index + stride + 1] *      frac_x ) *      frac_y
                            + r) >> (shift * 2);
            } else if (x_inside) {
                /* row out of range: horizontal interpolation only */
                index = src_x + av_clip(src_y, 0, y_last) * stride;
                out[col] = ( (  src[index    ] * (s - frac_x)
                              + src[index + 1] *      frac_x ) * s
                            + r) >> (shift * 2);
            } else if (y_inside) {
                /* column out of range: vertical interpolation only */
                index = av_clip(src_x, 0, x_last) + src_y * stride;
                out[col] = ( (  src[index         ] * (s - frac_y)
                              + src[index + stride] *      frac_y ) * s
                            + r) >> (shift * 2);
            } else {
                /* both out of range: copy the selected pixel unfiltered */
                index = av_clip(src_x, 0, x_last) + av_clip(src_y, 0, y_last) * stride;
                out[col] = src[index];
            }
        }
    }
}
1348
/**
 * Full-pel case: forwards to the put_pixels routine matching the block width.
 * Widths other than 2, 4, 8 and 16 are silently ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1357
/**
 * Horizontal 2:1 blend of each pixel with its right neighbour.
 * dst = (2*cur + right + 1) / 3, the division done as *683 >> 11
 * (683/2048 is about 1/3).
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int taps = 2*src[x] + src[x+1] + 1;
            dst[x] = (683*taps) >> 11;
        }
    }
}
1368
/**
 * Horizontal 1:2 blend of each pixel with its right neighbour.
 * dst = (cur + 2*right + 1) / 3, the division done as *683 >> 11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int taps = src[x] + 2*src[x+1] + 1;
            dst[x] = (683*taps) >> 11;
        }
    }
}
1379
/**
 * Vertical 2:1 blend of each pixel with the one a row below.
 * dst = (2*cur + below + 1) / 3, the division done as *683 >> 11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int taps = 2*src[x] + src[x+stride] + 1;
            dst[x] = (683*taps) >> 11;
        }
    }
}
1390
/**
 * 2x2 blend with weights 4 (cur), 3 (right), 3 (below), 2 (below-right).
 * The weights sum to 12; the division by 12 is done as *2731 >> 15
 * (2731/32768 is about 1/12) after adding 6 for rounding.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int taps = 4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6;
            dst[x] = (2731*taps) >> 15;
        }
    }
}
1401
/**
 * 2x2 blend with weights 3 (cur), 2 (right), 4 (below), 3 (below-right).
 * The weights sum to 12; the division by 12 is done as *2731 >> 15
 * after adding 6 for rounding.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int taps = 3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6;
            dst[x] = (2731*taps) >> 15;
        }
    }
}
1412
/**
 * Vertical 1:2 blend of each pixel with the one a row below.
 * dst = (cur + 2*below + 1) / 3, the division done as *683 >> 11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int taps = src[x] + 2*src[x+stride] + 1;
            dst[x] = (683*taps) >> 11;
        }
    }
}
1423
/**
 * 2x2 blend with weights 3 (cur), 4 (right), 2 (below), 3 (below-right).
 * The weights sum to 12; the division by 12 is done as *2731 >> 15
 * after adding 6 for rounding.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int taps = 3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6;
            dst[x] = (2731*taps) >> 15;
        }
    }
}
1434
/**
 * 2x2 blend with weights 2 (cur), 3 (right), 3 (below), 4 (below-right).
 * The weights sum to 12; the division by 12 is done as *2731 >> 15
 * after adding 6 for rounding.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int taps = 2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6;
            dst[x] = (2731*taps) >> 15;
        }
    }
}
1445
/**
 * Full-pel case: forwards to the avg_pixels routine matching the block width.
 * Widths other than 2, 4, 8 and 16 are silently ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1454
/**
 * Horizontal 2:1 blend (2*cur + right + 1)/3 via *683 >> 11, then averaged
 * with the existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (683*(2*src[x] + src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1465
/**
 * Horizontal 1:2 blend (cur + 2*right + 1)/3 via *683 >> 11, then averaged
 * with the existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1476
/**
 * Vertical 2:1 blend (2*cur + below + 1)/3 via *683 >> 11, then averaged
 * with the existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1487
/**
 * 2x2 blend with weights 4/3/3/2 (cur, right, below, below-right), divided
 * by 12 via *2731 >> 15, then averaged with the existing destination pixel,
 * rounding up.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1498
/**
 * 2x2 blend with weights 3/2/4/3 (cur, right, below, below-right), divided
 * by 12 via *2731 >> 15, then averaged with the existing destination pixel,
 * rounding up.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1509
/**
 * Vertical 1:2 blend (cur + 2*below + 1)/3 via *683 >> 11, then averaged
 * with the existing destination pixel, rounding up.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1520
/**
 * 2x2 blend with weights 3/4/2/3 (cur, right, below, below-right), divided
 * by 12 via *2731 >> 15, then averaged with the existing destination pixel,
 * rounding up.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
1531
/**
 * 2x2 blend with weights 2/3/3/4 (cur, right, below, below-right), divided
 * by 12 via *2731 >> 15, then averaged with the existing destination pixel,
 * rounding up.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;

    for (y = 0; y < height; y++, src += stride, dst += stride) {
        for (x = 0; x < width; x++) {
            const int pred = (2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator for fixed-width put_tpel_pixels<width>_mcXY_c wrappers.
 * Each wrapper forwards to the generic put_tpel_pixels_mcXY_c() with the
 * width baked in. The bodies are plain calls (a leading "void" would turn
 * them into malformed declarations), so the macro compiles if it is ever
 * enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1563
/*
 * Generator for the 2-, 4- and 8-pixel-wide chroma interpolation routines
 * OPNAME##h264_chroma_mc{2,4,8}_c().
 *
 * Each generated function processes h rows; dst and src both advance by
 * stride after every row.  (x, y) is the fractional position in 1/8 sample
 * units (asserted to lie in 0..7).  The four bilinear weights always sum to
 * 64; the weighted sum is handed unnormalised to OP(dst_pixel, sum), which
 * is responsible for rounding, scaling and storing.
 *
 * When the diagonal weight is zero (x==0 or y==0) at most one neighbour
 * contributes, so a two-tap path is used: the neighbour is the sample to
 * the right, or the sample one line below when the vertical weight is
 * non-zero.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y); /* current sample           */\
    const int wTR=(  x)*(8-y); /* right neighbour          */\
    const int wBL=(8-x)*(  y); /* sample one line below    */\
    const int wBR=(  x)*(  y); /* below-right neighbour    */\
    int row;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(!wBR){\
        const int wNb= wTR+wBL;            /* at most one of the two is non-zero */\
        const int nb = wBL ? stride : 1;   /* offset of the contributing neighbour */\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            OP(dst[0], (wTL*src[0] + wNb*src[nb+0]));\
            OP(dst[1], (wTL*src[1] + wNb*src[nb+1]));\
        }\
    }else{\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            const uint8_t *below= src+stride;\
            OP(dst[0], (wTL*src[0] + wTR*src[1] + wBL*below[0] + wBR*below[1]));\
            OP(dst[1], (wTL*src[1] + wTR*src[2] + wBL*below[1] + wBR*below[2]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(!wBR){\
        const int wNb= wTR+wBL;\
        const int nb = wBL ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            OP(dst[0], (wTL*src[0] + wNb*src[nb+0]));\
            OP(dst[1], (wTL*src[1] + wNb*src[nb+1]));\
            OP(dst[2], (wTL*src[2] + wNb*src[nb+2]));\
            OP(dst[3], (wTL*src[3] + wNb*src[nb+3]));\
        }\
    }else{\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            const uint8_t *below= src+stride;\
            OP(dst[0], (wTL*src[0] + wTR*src[1] + wBL*below[0] + wBR*below[1]));\
            OP(dst[1], (wTL*src[1] + wTR*src[2] + wBL*below[1] + wBR*below[2]));\
            OP(dst[2], (wTL*src[2] + wTR*src[3] + wBL*below[2] + wBR*below[3]));\
            OP(dst[3], (wTL*src[3] + wTR*src[4] + wBL*below[3] + wBR*below[4]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(!wBR){\
        const int wNb= wTR+wBL;\
        const int nb = wBL ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            OP(dst[0], (wTL*src[0] + wNb*src[nb+0]));\
            OP(dst[1], (wTL*src[1] + wNb*src[nb+1]));\
            OP(dst[2], (wTL*src[2] + wNb*src[nb+2]));\
            OP(dst[3], (wTL*src[3] + wNb*src[nb+3]));\
            OP(dst[4], (wTL*src[4] + wNb*src[nb+4]));\
            OP(dst[5], (wTL*src[5] + wNb*src[nb+5]));\
            OP(dst[6], (wTL*src[6] + wNb*src[nb+6]));\
            OP(dst[7], (wTL*src[7] + wNb*src[nb+7]));\
        }\
    }else{\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            const uint8_t *below= src+stride;\
            OP(dst[0], (wTL*src[0] + wTR*src[1] + wBL*below[0] + wBR*below[1]));\
            OP(dst[1], (wTL*src[1] + wTR*src[2] + wBL*below[1] + wBR*below[2]));\
            OP(dst[2], (wTL*src[2] + wTR*src[3] + wBL*below[2] + wBR*below[3]));\
            OP(dst[3], (wTL*src[3] + wTR*src[4] + wBL*below[3] + wBR*below[4]));\
            OP(dst[4], (wTL*src[4] + wTR*src[5] + wBL*below[4] + wBR*below[5]));\
            OP(dst[5], (wTL*src[5] + wTR*src[6] + wBL*below[5] + wBR*below[6]));\
            OP(dst[6], (wTL*src[6] + wTR*src[7] + wBL*below[6] + wBR*below[7]));\
            OP(dst[7], (wTL*src[7] + wTR*src[8] + wBL*below[7] + wBR*below[8]));\
        }\
    }\
}

/* Store operations: b is the weighted sum scaled by 64.
 * op_put rounds and stores it; op_avg additionally averages (rounding up)
 * with the value already present in the destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1672
/*
 * 8-pixel-wide bilinear chroma interpolation, "no rounding" flavour.
 *
 * Processes h rows; dst and src both advance by stride per row.  (x, y) is
 * the fractional position in 1/8 sample units (asserted to be 0..7).  The
 * four weights sum to 64; the result is (sum + 28) >> 6, i.e. the bias is
 * 32 - 4 instead of the 32 that exact rounding would use.
 * Each row reads 9 samples from the current line and 9 from the line below.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int wTL=(8-x)*(8-y); /* current sample        */
    const int wTR=(  x)*(8-y); /* right neighbour       */
    const int wBL=(8-x)*(  y); /* sample one line below */
    const int wBR=(  x)*(  y); /* below-right neighbour */
    const int bias= 32 - 4;
    int row;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++, dst+=stride, src+=stride){
        const uint8_t *below= src+stride;

        dst[0] = (wTL*src[0] + wTR*src[1] + wBL*below[0] + wBR*below[1] + bias) >> 6;
        dst[1] = (wTL*src[1] + wTR*src[2] + wBL*below[1] + wBR*below[2] + bias) >> 6;
        dst[2] = (wTL*src[2] + wTR*src[3] + wBL*below[2] + wBR*below[3] + bias) >> 6;
        dst[3] = (wTL*src[3] + wTR*src[4] + wBL*below[3] + wBR*below[4] + bias) >> 6;
        dst[4] = (wTL*src[4] + wTR*src[5] + wBL*below[4] + wBR*below[5] + bias) >> 6;
        dst[5] = (wTL*src[5] + wTR*src[6] + wBL*below[5] + wBR*below[6] + bias) >> 6;
        dst[6] = (wTL*src[6] + wTR*src[7] + wBL*below[6] + wBR*below[7] + bias) >> 6;
        dst[7] = (wTL*src[7] + wTR*src[8] + wBL*below[7] + wBR*below[8] + bias) >> 6;
    }
}
1696
1697 #define QPEL_MC(r, OPNAME, RND, OP) \
1698 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1699     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1700     int i;\
1701     for(i=0; i<h; i++)\
1702     {\
1703         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1704         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1705         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1706         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1707         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1708         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1709         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1710         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1711         dst+=dstStride;\
1712         src+=srcStride;\
1713     }\
1714 }\
1715 \
1716 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1717     const int w=8;\
1718     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1719     int i;\
1720     for(i=0; i<w; i++)\
1721     {\
1722         const int src0= src[0*srcStride];\
1723         const int src1= src[1*srcStride];\
1724         const int src2= src[2*srcStride];\
1725         const int src3= src[3*srcStride];\
1726         const int src4= src[4*srcStride];\
1727         const int src5= src[5*srcStride];\
1728         const int src6= src[6*srcStride];\
1729         const int src7= src[7*srcStride];\
1730         const int src8= src[8*srcStride];\
1731         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1732         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1733         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1734         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1735         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1736         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1737         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1738         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1739         dst++;\
1740         src++;\
1741     }\
1742 }\
1743 \
1744 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1745     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1746     int i;\
1747     \
1748     for(i=0; i<h; i++)\
1749     {\
1750         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1751         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1752         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1753         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1754         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1755         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1756         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1757         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1758         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1759         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1760         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1761         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1762         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1763         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1764         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1765         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1766         dst+=dstStride;\
1767         src+=srcStride;\
1768     }\
1769 }\
1770 \
1771 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1772     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1773     int i;\
1774     const int w=16;\
1775     for(i=0; i<w; i++)\
1776     {\
1777         const int src0= src[0*srcStride];\
1778         const int src1= src[1*srcStride];\
1779         const int src2= src[2*srcStride];\
1780         const int src3= src[3*srcStride];\
1781         const int src4= src[4*srcStride];\
1782         const int src5= src[5*srcStride];\
1783         const int src6= src[6*srcStride];\
1784         const int src7= src[7*srcStride];\
1785         const int src8= src[8*srcStride];\
1786         const int src9= src[9*srcStride];\
1787         const int src10= src[10*srcStride];\
1788         const int src11= src[11*srcStride];\
1789         const int src12= src[12*srcStride];\
1790         const int src13= src[13*srcStride];\
1791         const int src14= src[14*srcStride];\
1792         const int src15= src[15*srcStride];\
1793         const int src16= src[16*srcStride];\
1794         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1795         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1796         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1797         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1798         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1799         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1800         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1801         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1802         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1803         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1804         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1805         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1806         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1807         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1808         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1809         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1810         dst++;\
1811         src++;\
1812     }\
1813 }\
1814 \
1815 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1816     OPNAME ## pixels8_c(dst, src, stride, 8);\
1817 }\
1818 \
1819 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1820     uint8_t half[64];\
1821     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1822     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1823 }\
1824 \
1825 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1826     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1827 }\
1828 \
1829 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1830     uint8_t half[64];\
1831     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1832     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1833 }\
1834 \
1835 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1836     uint8_t full[16*9];\
1837     uint8_t half[64];\
1838     copy_block9(full, src, 16, stride, 9);\
1839     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1840     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1841 }\
1842 \
1843 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1844     uint8_t full[16*9];\
1845     copy_block9(full, src, 16, stride, 9);\
1846     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1847 }\
1848 \
1849 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1850     uint8_t full[16*9];\
1851     uint8_t half[64];\
1852     copy_block9(full, src, 16, stride, 9);\
1853     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1854     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1855 }\
1856 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1857     uint8_t full[16*9];\
1858     uint8_t halfH[72];\
1859     uint8_t halfV[64];\
1860     uint8_t halfHV[64];\
1861     copy_block9(full, src, 16, stride, 9);\
1862     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1863     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1864     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1865     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1866 }\
1867 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1868     uint8_t full[16*9];\
1869     uint8_t halfH[72];\
1870     uint8_t halfHV[64];\
1871     copy_block9(full, src, 16, stride, 9);\
1872     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1873     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1874     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1875     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1876 }\
1877 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1878     uint8_t full[16*9];\
1879     uint8_t halfH[72];\
1880     uint8_t halfV[64];\
1881     uint8_t halfHV[64];\
1882     copy_block9(full, src, 16, stride, 9);\
1883     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1884     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1885     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1886     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1887 }\
1888 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1889     uint8_t full[16*9];\
1890     uint8_t halfH[72];\
1891     uint8_t halfHV[64];\
1892     copy_block9(full, src, 16, stride, 9);\
1893     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1894     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1895     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1896     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1897 }\
1898 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1899     uint8_t full[16*9];\
1900     uint8_t halfH[72];\
1901     uint8_t halfV[64];\
1902     uint8_t halfHV[64];\
1903     copy_block9(full, src, 16, stride, 9);\
1904     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1905     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1906     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1907     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1908 }\
1909 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1910     uint8_t full[16*9];\
1911     uint8_t halfH[72];\
1912     uint8_t halfHV[64];\
1913     copy_block9(full, src, 16, stride, 9);\
1914     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1915     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1916     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1917     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1918 }\
1919 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1920     uint8_t full[16*9];\
1921     uint8_t halfH[72];\
1922     uint8_t halfV[64];\
1923     uint8_t halfHV[64];\
1924     copy_block9(full, src, 16, stride, 9);\
1925     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1926     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1927     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1928     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1929 }\
1930 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1931     uint8_t full[16*9];\
1932     uint8_t halfH[72];\
1933     uint8_t halfHV[64];\
1934     copy_block9(full, src, 16, stride, 9);\
1935     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1936     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1937     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1938     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1939 }\
1940 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1941     uint8_t halfH[72];\
1942     uint8_t halfHV[64];\
1943     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1944     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1945     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1946 }\
1947 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1948     uint8_t halfH[72];\
1949     uint8_t halfHV[64];\
1950     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1951     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1952     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1953 }\
1954 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1955     uint8_t full[16*9];\
1956     uint8_t halfH[72];\
1957     uint8_t halfV[64];\
1958     uint8_t halfHV[64];\
1959     copy_block9(full, src, 16, stride, 9);\
1960     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1961     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1962     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1963     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1964 }\
1965 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1966     uint8_t full[16*9];\
1967     uint8_t halfH[72];\
1968     copy_block9(full, src, 16, stride, 9);\
1969     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1970     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1971     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1972 }\
1973 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1974     uint8_t full[16*9];\
1975     uint8_t halfH[72];\
1976     uint8_t halfV[64];\
1977     uint8_t halfHV[64];\
1978     copy_block9(full, src, 16, stride, 9);\
1979     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1980     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1981     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1982     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1983 }\
1984 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1985     uint8_t full[16*9];\
1986     uint8_t halfH[72];\
1987     copy_block9(full, src, 16, stride, 9);\
1988     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1989     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1990     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1991 }\
1992 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1993     uint8_t halfH[72];\
1994     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1995     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1996 }\
1997 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1998     OPNAME ## pixels16_c(dst, src, stride, 16);\
1999 }\
2000 \
2001 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2002     uint8_t half[256];\
2003     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2004     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2005 }\
2006 \
2007 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2008     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2009 }\
2010 \
2011 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2012     uint8_t half[256];\
2013     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2014     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2015 }\
2016 \
2017 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2018     uint8_t full[24*17];\
2019     uint8_t half[256];\
2020     copy_block17(full, src, 24, stride, 17);\
2021     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2022     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2023 }\
2024 \
2025 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2026     uint8_t full[24*17];\
2027     copy_block17(full, src, 24, stride, 17);\
2028     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2029 }\
2030 \
2031 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2032     uint8_t full[24*17];\
2033     uint8_t half[256];\
2034     copy_block17(full, src, 24, stride, 17);\
2035     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2036     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2037 }\
2038 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2039     uint8_t full[24*17];\
2040     uint8_t halfH[272];\
2041     uint8_t halfV[256];\
2042     uint8_t halfHV[256];\
2043     copy_block17(full, src, 24, stride, 17);\
2044     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2045     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2046     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2047     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2048 }\
2049 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2050     uint8_t full[24*17];\
2051     uint8_t halfH[272];\
2052     uint8_t halfHV[256];\
2053     copy_block17(full, src, 24, stride, 17);\
2054     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2055     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2056     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2057     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2058 }\
2059 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2060     uint8_t full[24*17];\
2061     uint8_t halfH[272];\
2062     uint8_t halfV[256];\
2063     uint8_t halfHV[256];\
2064     copy_block17(full, src, 24, stride, 17);\
2065     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2066     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2067     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2068     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2069 }\
2070 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2071     uint8_t full[24*17];\
2072     uint8_t halfH[272];\
2073     uint8_t halfHV[256];\
2074     copy_block17(full, src, 24, stride, 17);\
2075     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2076     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2077     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2078     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2079 }\
2080 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2081     uint8_t full[24*17];\
2082     uint8_t halfH[272];\
2083     uint8_t halfV[256];\
2084     uint8_t halfHV[256];\
2085     copy_block17(full, src, 24, stride, 17);\
2086     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2087     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2088     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2089     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2090 }\
2091 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2092     uint8_t full[24*17];\
2093     uint8_t halfH[272];\
2094     uint8_t halfHV[256];\
2095     copy_block17(full, src, 24, stride, 17);\
2096     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2097     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2098     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2099     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2100 }\
2101 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2102     uint8_t full[24*17];\
2103     uint8_t halfH[272];\
2104     uint8_t halfV[256];\
2105     uint8_t halfHV[256];\
2106     copy_block17(full, src, 24, stride, 17);\
2107     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2108     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2109     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2110     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2111 }\
2112 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2113     uint8_t full[24*17];\
2114     uint8_t halfH[272];\
2115     uint8_t halfHV[256];\
2116     copy_block17(full, src, 24, stride, 17);\
2117     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2118     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2119     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2120     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2121 }\
2122 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2123     uint8_t halfH[272];\
2124     uint8_t halfHV[256];\
2125     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2126     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2127     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2128 }\
2129 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2130     uint8_t halfH[272];\
2131     uint8_t halfHV[256];\
2132     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2133     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2134     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2135 }\
2136 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2137     uint8_t full[24*17];\
2138     uint8_t halfH[272];\
2139     uint8_t halfV[256];\
2140     uint8_t halfHV[256];\
2141     copy_block17(full, src, 24, stride, 17);\
2142     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2143     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2144     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2145     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2146 }\
2147 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2148     uint8_t full[24*17];\
2149     uint8_t halfH[272];\
2150     copy_block17(full, src, 24, stride, 17);\
2151     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2152     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2153     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2154 }\
2155 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2156     uint8_t full[24*17];\
2157     uint8_t halfH[272];\
2158     uint8_t halfV[256];\
2159     uint8_t halfHV[256];\
2160     copy_block17(full, src, 24, stride, 17);\
2161     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2162     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2163     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2164     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2165 }\
2166 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2167     uint8_t full[24*17];\
2168     uint8_t halfH[272];\
2169     copy_block17(full, src, 24, stride, 17);\
2170     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2171     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2172     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2173 }\
2174 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2175     uint8_t halfH[272];\
2176     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2177     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2178 }
2179
2180 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2181 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2182 #define op_put(a, b) a = cm[((b) + 16)>>5]
2183 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2184
2185 QPEL_MC(0, put_       , _       , op_put)
2186 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2187 QPEL_MC(0, avg_       , _       , op_avg)
2188 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2189 #undef op_avg
2190 #undef op_avg_no_rnd
2191 #undef op_put
2192 #undef op_put_no_rnd
2193
2194 #if 1
/**
 * Template generating the H.264 six-tap lowpass filter functions.
 *
 * For every block size N the macro emits
 *   OPNAME##h264_qpelN_h_lowpass   (filter along a row),
 *   OPNAME##h264_qpelN_v_lowpass   (filter along a column) and
 *   OPNAME##h264_qpelN_hv_lowpass  (row filter into tmp, then column filter
 *                                   over tmp).
 * Sizes 2, 4 and 8 are fully unrolled; size 16 is assembled from four calls
 * to the corresponding size 8 function.
 *
 * Each output value is the unnormalized sum
 *   (c + d)*20 - (b + e)*5 + (a + f)
 * over six consecutive samples a..f along the filter direction, with the
 * output located between c and d.
 *
 * @param OPNAME prefix pasted in front of every generated function name
 * @param OP     statement macro OP(dst, sum) storing the result of a single
 *               filter pass (used by the h and v functions)
 * @param OP2    statement macro OP2(dst, sum) storing the result of two
 *               cascaded passes (used by the hv functions)
 *
 * The h and v functions read 2 samples before and 3 samples after the block
 * along the filter direction. The hv functions read source rows -2..N+2 and
 * columns -2..N+2 and store N+5 rows of N intermediate sums in tmp.
 * The local variable cm is not used directly by the function bodies; it is
 * there for the OP/OP2 macros supplied at instantiation time.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* first pass: row filter over source rows -2..h+2, raw sums go to tmp */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* first pass: row filter over source rows -2..h+2, raw sums go to tmp */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* first pass: row filter over source rows -2..h+2, raw sums go to tmp */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    /* tmp is reused for the lower half; only the left/right split uses tmp+8 */\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
2458
/**
 * Template generating the sixteen motion compensation entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c for one block size.
 *
 * In the _mcXY_c suffix X selects the horizontal and Y the vertical
 * quarter-sample position (0..3):
 *  - mc00 forwards to OPNAME##pixels##SIZE##_c on the unfiltered source;
 *  - mc20, mc02 and mc22 store the h, v and hv lowpass output directly;
 *  - every other position builds two SIZE x SIZE planes (a lowpass output
 *    produced with the put_ variant, or the source itself) and hands both to
 *    OPNAME##pixels##SIZE##_l2 (defined elsewhere; presumably a per-pixel
 *    average of its two inputs - confirm against its definition).
 *
 * For the vertical filter the source is first copied into the packed buffer
 * full (row stride SIZE, SIZE+5 rows starting two rows above the block);
 * full_mid points at the row of that copy matching the first block row.
 *
 * @param OPNAME prefix of the generated functions and of the pixels helpers
 *               they call (the lowpass helpers for intermediate planes always
 *               use the put_ prefix)
 * @param SIZE   block width and height in pixels
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    /* same as mc10 but combined with the pixel to the right */\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    /* same as mc01 but combined with the pixel one row below */\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    /* vertical half-pel plane taken one column to the right */\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    /* horizontal half-pel plane taken one row below */\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2595
/*
 * Store operators handed to H264_LOWPASS. b is the raw filter sum and cm is
 * the lookup table pointer local to every generated function.
 * One filter pass has a gain of 20+20-5-5+1+1 = 32, hence "+16 >> 5"; two
 * cascaded passes have a gain of 32*32 = 1024, hence "+512 >> 10".
 * The avg variants additionally average the result with the value already
 * stored in a, rounding up.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Emit the lowpass helpers for both families, then the mcXY entry points.
 * Note that the avg_ family is not instantiated for 2x2 blocks. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2616 #endif
2617
/*
 * H.264 weighted prediction.
 *
 * op_scale1(x) rescales one pixel of block in place:
 *     block[x] = clip_uint8((block[x]*weight + offset) >> log2_denom)
 * op_scale2(x) blends one pixel of src into dst:
 *     dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset)
 *                         >> (log2_denom+1))
 * In both cases offset is the pre-scaled value computed at the top of the
 * generated function, not the raw argument.
 *
 * H264_WEIGHT(W,H) emits weight_h264_pixelsWxH_c and
 * biweight_h264_pixelsWxH_c for a W pixel wide, H rows high block.
 * The row body is written out for 16 pixels; the "if(W==n) continue;"
 * tests compare against a compile-time constant and cut the row short for
 * narrower blocks.
 *
 * The offsets are scaled by multiplying with (1 << log2_denom) instead of
 * shifting them: offset may be negative and left-shifting a negative int is
 * undefined behaviour in C. The numeric result is unchanged.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* bring offset to the scale of the weighted sample, then add the rounding term */ \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* (offset+1)|1 folds the rounding term into the offset before scaling */ \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2687
2688 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2689     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2690     int i;
2691
2692     for(i=0; i<h; i++){
2693         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2694         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2695         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2696         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2697         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2698         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2699         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2700         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2701         dst+=dstStride;
2702         src+=srcStride;
2703     }
2704 }
2705
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
/* Prototype only; the definition is not part of this section. */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* The four mc00 functions below handle the position without any sub-pel
 * offset: they forward to the generic put/avg pixel helpers with the block
 * height filled in. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2723
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
/* Prototype only; the definition is not part of this section. */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Position without sub-pel offset: forwards to put_pixels8_c for an 8 row
 * block. The rnd argument is accepted but not used here. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2732
/* Prototype only; the definition is not part of this section. */
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
/* Prototype only; the definition is not part of this section. */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2737
2738 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2739     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2740     int i;
2741
2742     for(i=0; i<w; i++){
2743         const int src_1= src[ -srcStride];
2744         const int src0 = src[0          ];
2745         const int src1 = src[  srcStride];
2746         const int src2 = src[2*srcStride];
2747         const int src3 = src[3*srcStride];
2748         const int src4 = src[4*srcStride];
2749         const int src5 = src[5*srcStride];
2750         const int src6 = src[6*srcStride];
2751         const int src7 = src[7*srcStride];
2752         const int src8 = src[8*srcStride];
2753         const int src9 = src[9*srcStride];
2754         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2755         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2756         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2757         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2758         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2759         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2760         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2761         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2762         src++;
2763         dst++;
2764     }
2765 }
2766
/* mspel position (0,0): no filtering, forwards to put_pixels8_c for an
 * 8 row block. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2770
/* mspel position (1,0): the horizontally filtered block is combined with
 * the unfiltered source by put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2776
/* mspel position (2,0): horizontal lowpass written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2780
/* mspel position (3,0): the horizontally filtered block is combined with
 * the source shifted one pixel to the right by put_pixels8_l2. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2786
/* mspel position (0,2): vertical lowpass written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2790
/* mspel position (1,2): the vertically filtered source and the
 * horizontally-then-vertically filtered source are combined by
 * put_pixels8_l2. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];                 /* 11 h-filtered rows, first one is row -1 */
    uint8_t v_only[8*8];
    uint8_t hv[8*8];
    uint8_t * const h_row0= h_rows + 8;   /* h-filtered row 0 */

    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hv, h_row0, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}
/* mspel position (3,2): like (1,2), but the vertically filtered plane is
 * taken one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];                 /* 11 h-filtered rows, first one is row -1 */
    uint8_t v_only[8*8];
    uint8_t hv[8*8];
    uint8_t * const h_row0= h_rows + 8;   /* h-filtered row 0 */

    wmv2_mspel8_v_lowpass(v_only, src+1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hv, h_row0, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}
/* mspel position (2,2): horizontal lowpass into a temporary, followed by a
 * vertical lowpass from the temporary into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];                 /* 11 h-filtered rows, first one is row -1 */
    uint8_t * const h_row0= h_rows + 8;   /* h-filtered row 0 */

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_row0, stride, 8, 8);
}
2814
2815 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2816     if(ENABLE_ANY_H263) {
2817     int x;
2818     const int strength= ff_h263_loop_filter_strength[qscale];
2819
2820     for(x=0; x<8; x++){
2821         int d1, d2, ad1;
2822         int p0= src[x-2*stride];
2823         int p1= src[x-1*stride];
2824         int p2= src[x+0*stride];
2825         int p3= src[x+1*stride];
2826         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2827
2828         if     (d<-2*strength) d1= 0;
2829         else if(d<-  strength) d1=-2*strength - d;
2830         else if(d<   strength) d1= d;
2831         else if(d< 2*strength) d1= 2*strength - d;
2832         else                   d1= 0;
2833
2834         p1 += d1;
2835         p2 -= d1;
2836         if(p1&256) p1= ~(p1>>31);
2837         if(p2&256) p2= ~(p2>>31);
2838
2839         src[x-1*stride] = p1;
2840         src[x+0*stride] = p2;
2841
2842         ad1= FFABS(d1)>>1;
2843
2844         d2= av_clip((p0-p3)/4, -ad1, ad1);
2845
2846         src[x-2*stride] = p0 - d2;
2847         src[x+  stride] = p3 + d2;
2848     }
2849     }
2850 }
2851
2852 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2853     if(ENABLE_ANY_H263) {
2854     int y;
2855     const int strength= ff_h263_loop_filter_strength[qscale];
2856
2857     for(y=0; y<8; y++){
2858         int d1, d2, ad1;
2859         int p0= src[y*stride-2];
2860         int p1= src[y*stride-1];
2861         int p2= src[y*stride+0];
2862         int p3= src[y*stride+1];
2863         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2864
2865         if     (d<-2*strength) d1= 0;
2866         else if(d<-  strength) d1=-2*strength - d;
2867         else if(d<   strength) d1= d;
2868         else if(d< 2*strength) d1= 2*strength - d;
2869         else                   d1= 0;
2870
2871         p1 += d1;
2872         p2 -= d1;
2873         if(p1&256) p1= ~(p1>>31);
2874         if(p2&256) p2= ~(p2>>31);
2875
2876         src[y*stride-1] = p1;
2877         src[y*stride+0] = p2;
2878
2879         ad1= FFABS(d1)>>1;
2880
2881         d2= av_clip((p0-p3)/4, -ad1, ad1);
2882
2883         src[y*stride-2] = p0 - d2;
2884         src[y*stride+1] = p3 + d2;
2885     }
2886     }
2887 }
2888
/**
 * H.261 loop filter: separable (1, 2, 1)/4 smoothing of an 8x8 block,
 * done in place.
 *
 * The vertical pass leaves the top and bottom rows unfiltered and the
 * horizontal pass leaves the left and right columns unfiltered, so the
 * four corner pixels keep their value.
 *
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance in bytes between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int acc[8*8];   /* vertical pass output, scaled by 4 */
    int row, col;

    /* vertical pass: top and bottom rows are only scaled */
    for(col=0; col<8; col++){
        acc[col      ]= 4*src[col           ];
        acc[col + 7*8]= 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        const uint8_t *in = src + row*stride;
        int           *out= acc + row*8;

        for(col=0; col<8; col++)
            out[col]= in[col - stride] + 2*in[col] + in[col + stride];
    }

    /* horizontal pass: first and last column only undo the scaling */
    for(row=0; row<8; row++){
        const int *in = acc + row*8;
        uint8_t   *out= src + row*stride;

        out[0]= (in[0] + 2)>>2;
        out[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2915
/**
 * H.264 normal-strength luma deblocking of one 16 sample long edge.
 *
 * The edge is handled as four segments of four lines; segment i uses
 * tc0[i] as clipping bound and is skipped when tc0[i] is negative.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (from one p/q sample to the next)
 * @param ystride step along the edge (from one line to the next)
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0|, |q2 - q0|
 * @param tc0     four clipping bounds, one per segment
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;
    for( seg = 0; seg < 4; seg++ ) {
        const int tc_base = tc0[seg];

        if( tc_base < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( line = 0; line < 4; line++, pix += ystride ) {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];
            int tc, avg, delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            tc  = tc_base;
            avg = ( p0 + q0 + 1 ) >> 1;

            /* each side that is smooth enough gets p1/q1 filtered as well
             * and widens the bound applied to p0/q0 by one */
            if( FFABS( p2 - p0 ) < beta ) {
                pix[-2*xstride] = p1 + av_clip( (( p2 + avg ) >> 1) - p1, -tc_base, tc_base );
                tc++;
            }
            if( FFABS( q2 - q0 ) < beta ) {
                pix[   xstride] = q1 + av_clip( (( q2 + avg ) >> 1) - q1, -tc_base, tc_base );
                tc++;
            }

            delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/* Luma edge with the samples across the edge one row apart and consecutive
 * lines one byte apart. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Luma edge with the samples across the edge one byte apart and consecutive
 * lines one row apart. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2964
/**
 * Chroma loop filter across one edge, with per-segment clipping.
 *
 * The edge is processed as 4 segments of 2 positions each. Segment i is left
 * untouched when tc0[i] <= 0. Within a segment a position is only modified
 * when |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta; then p0 and q0 are
 * moved towards each other by a delta clipped to [-tc0[i], tc0[i]].
 *
 * @param pix     points at q0 of the first position (p0 is at pix[-xstride])
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   exclusive bound for |p0 - q0|
 * @param beta    exclusive bound for |p1 - p0| and |q1 - q0|
 * @param tc0     4 clipping limits, one per segment
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg;

    for( seg = 0; seg < 4; seg++, pix += 2*ystride ) {
        const int limit = tc0[seg];
        int n;

        if( limit <= 0 )
            continue;

        for( n = 0; n < 2; n++ ) {
            uint8_t *cur = pix + n*ystride;
            const int p1 = cur[-2*xstride];
            const int p0 = cur[-1*xstride];
            const int q0 = cur[0];
            const int q1 = cur[1*xstride];
            int delta;

            /* leave the position alone unless all three thresholds hold */
            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -limit, limit );

            cur[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            cur[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Chroma loop filter entry point: calls h264_loop_filter_chroma_c() with
 * xstride = stride and ystride = 1.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int xstride = stride;
    const int ystride = 1;

    h264_loop_filter_chroma_c(pix, xstride, ystride, alpha, beta, tc0);
}
/**
 * Chroma loop filter entry point: calls h264_loop_filter_chroma_c() with
 * xstride = 1 and ystride = stride.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int xstride = 1;
    const int ystride = stride;

    h264_loop_filter_chroma_c(pix, xstride, ystride, alpha, beta, tc0);
}
3001
/**
 * Chroma loop filter across one edge without clipping values.
 *
 * Visits 8 positions along the edge. A position is modified only when
 * |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta; p0 and q0 are then
 * replaced by rounded weighted averages of p1, p0, q1 and q1, q0, p1.
 *
 * @param pix     points at q0 of the first position (p0 is at pix[-xstride])
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   exclusive bound for |p0 - q0|
 * @param beta    exclusive bound for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    uint8_t *cur = pix;
    int left;

    for( left = 8; left > 0; left--, cur += ystride ) {
        const int p1 = cur[-2*xstride];
        const int p0 = cur[-1*xstride];
        const int q0 = cur[0];
        const int q1 = cur[1*xstride];

        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        cur[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        cur[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma loop filter entry point: calls
 * h264_loop_filter_chroma_intra_c() with xstride = stride and ystride = 1.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int xstride = stride;
    const int ystride = 1;

    h264_loop_filter_chroma_intra_c(pix, xstride, ystride, alpha, beta);
}
/**
 * Intra chroma loop filter entry point: calls
 * h264_loop_filter_chroma_intra_c() with xstride = 1 and ystride = stride.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int xstride = 1;
    const int ystride = stride;

    h264_loop_filter_chroma_intra_c(pix, xstride, ystride, alpha, beta);
}
3029
/**
 * Sum of absolute differences between two blocks, 16 samples wide, h rows.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows
 * @return the accumulated sum of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3057
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2
 * sample with its right-hand neighbour; 16 samples wide, h rows.
 * Reads 17 samples from every row of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3085
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2
 * sample with the sample one row below it; 16 samples wide, h rows.
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3115
/**
 * Sum of absolute differences between pix1 and the avg4() of each 2x2
 * neighbourhood of pix2 (sample, right, below, below-right);
 * 16 samples wide, h rows. Reads 17 samples from h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3145
/**
 * Sum of absolute differences between two blocks, 8 samples wide, h rows.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows
 * @return the accumulated sum of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3165
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2
 * sample with its right-hand neighbour; 8 samples wide, h rows.
 * Reads 9 samples from every row of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3185
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2
 * sample with the sample one row below it; 8 samples wide, h rows.
 * Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3207
/**
 * Sum of absolute differences between pix1 and the avg4() of each 2x2
 * neighbourhood of pix2 (sample, right, below, below-right);
 * 8 samples wide, h rows. Reads 9 samples from h+1 rows of pix2.
 *
 * @param v         unused
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3229
3230 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3231     MpegEncContext *c = v;
3232     int score1=0;
3233     int score2=0;
3234     int x,y;
3235
3236     for(y=0; y<h; y++){
3237         for(x=0; x<16; x++){
3238             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3239         }
3240         if(y+1<h){
3241             for(x=0; x<15; x++){
3242                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3243                              - s1[x+1] + s1[x+1+stride])
3244                         -FFABS(  s2[x  ] - s2[x  +stride]
3245                              - s2[x+1] + s2[x+1+stride]);
3246             }
3247         }
3248         s1+= stride;
3249         s2+= stride;
3250     }
3251
3252     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3253     else  return score1 + FFABS(score2)*8;
3254 }
3255
3256 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3257     MpegEncContext *c = v;
3258     int score1=0;
3259     int score2=0;
3260     int x,y;
3261
3262     for(y=0; y<h; y++){
3263         for(x=0; x<8; x++){
3264             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3265         }
3266         if(y+1<h){
3267             for(x=0; x<7; x++){
3268                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3269                              - s1[x+1] + s1[x+1+stride])
3270                         -FFABS(  s2[x  ] - s2[x  +stride]
3271                              - s2[x+1] + s2[x+1+stride]);
3272             }
3273         }
3274         s1+= stride;
3275         s2+= stride;
3276     }
3277
3278     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3279     else  return score1 + FFABS(score2)*8;
3280 }
3281
3282 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3283     int i;
3284     unsigned int sum=0;
3285
3286     for(i=0; i<8*8; i++){
3287         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3288         int w= weight[i];
3289         b>>= RECON_SHIFT;
3290         assert(-512<b && b<512);
3291
3292         sum += (w*b)*(w*b)>>4;
3293     }
3294     return sum>>2;
3295 }
3296
3297 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3298     int i;
3299
3300     for(i=0; i<8*8; i++){
3301         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3302     }
3303 }
3304
3305 /**
3306  * permutes an 8x8 block.
3307  * @param block the block which will be permuted according to the given permutation vector
3308  * @param permutation the permutation vector
3309  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3310  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3311  *                  (inverse) permutated to scantable order!
3312  */
3313 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3314 {
3315     int i;
3316     DCTELEM temp[64];
3317
3318     if(last<=0) return;
3319     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3320
3321     for(i=0; i<=last; i++){
3322         const int j= scantable[i];
3323         temp[j]= block[j];
3324         block[j]=0;
3325     }
3326
3327     for(i=0; i<=last; i++){
3328         const int j= scantable[i];
3329         const int perm_j= permutation[j];
3330         block[perm_j]= temp[j];
3331     }
3332 }
3333
/**
 * Comparison function that ignores all of its arguments and reports 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
3337
3338 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3339     int i;
3340
3341     memset(cmp, 0, sizeof(void*)*5);
3342
3343     for(i=0; i<5; i++){
3344         switch(type&0xFF){
3345         case FF_CMP_SAD:
3346             cmp[i]= c->sad[i];
3347             break;
3348         case FF_CMP_SATD:
3349             cmp[i]= c->hadamard8_diff[i];
3350             break;
3351         case FF_CMP_SSE:
3352             cmp[i]= c->sse[i];
3353             break;
3354         case FF_CMP_DCT:
3355             cmp[i]= c->dct_sad[i];
3356             break;
3357         case FF_CMP_DCT264:
3358             cmp[i]= c->dct264_sad[i];
3359             break;
3360         case FF_CMP_DCTMAX:
3361             cmp[i]= c->dct_max[i];
3362             break;
3363         case FF_CMP_PSNR:
3364             cmp[i]= c->quant_psnr[i];
3365             break;
3366         case FF_CMP_BIT:
3367             cmp[i]= c->bit[i];
3368             break;
3369         case FF_CMP_RD:
3370             cmp[i]= c->rd[i];
3371             break;
3372         case FF_CMP_VSAD:
3373             cmp[i]= c->vsad[i];
3374             break;
3375         case FF_CMP_VSSE:
3376             cmp[i]= c->vsse[i];
3377             break;
3378         case FF_CMP_ZERO:
3379             cmp[i]= zero_cmp;
3380             break;
3381         case FF_CMP_NSSE:
3382             cmp[i]= c->nsse[i];
3383             break;
3384 #ifdef CONFIG_SNOW_ENCODER
3385         case FF_CMP_W53:
3386             cmp[i]= c->w53[i];
3387             break;
3388         case FF_CMP_W97:
3389             cmp[i]= c->w97[i];
3390             break;
3391 #endif
3392         default:
3393             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3394         }
3395     }
3396 }
3397
3398 /**
3399  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3400  */
3401 static void clear_blocks_c(DCTELEM *blocks)
3402 {
3403     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3404 }
3405
3406 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3407     long i;
3408     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3409         long a = *(long*)(src+i);
3410         long b = *(long*)(dst+i);
3411         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3412     }
3413     for(; i<w; i++)
3414         dst[i+0] += src[i+0];
3415 }
3416
3417 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3418     long i;
3419     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3420         long a = *(long*)(src1+i);
3421         long b = *(long*)(src2+i);
3422         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3423     }
3424     for(; i<w; i++)
3425         dst[i] = src1[i]+src2[i];
3426 }
3427
3428 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3429     long i;
3430 #ifndef HAVE_FAST_UNALIGNED
3431     if((long)src2 & (sizeof(long)-1)){
3432         for(i=0; i+7<w; i+=8){
3433             dst[i+0] = src1[i+0]-src2[i+0];
3434             dst[i+1] = src1[i+1]-src2[i+1];
3435             dst[i+2] = src1[i+2]-src2[i+2];
3436             dst[i+3] = src1[i+3]-src2[i+3];
3437             dst[i+4] = src1[i+4]-src2[i+4];
3438             dst[i+5] = src1[i+5]-src2[i+5];
3439             dst[i+6] = src1[i+6]-src2[i+6];
3440             dst[i+7] = src1[i+7]-src2[i+7];
3441         }
3442     }else
3443 #endif
3444     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3445         long a = *(long*)(src1+i);
3446         long b = *(long*)(src2+i);
3447         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3448     }
3449     for(; i<w; i++)
3450         dst[i+0] = src1[i+0]-src2[i+0];
3451 }
3452
/**
 * Writes to dst the difference between each src2 sample and its prediction.
 *
 * The prediction for position i is mid_pred() of the previous src2 sample,
 * src1[i], and (previous src2 + src1[i] - previous src1) & 0xFF. The
 * running "previous" samples are seeded from *left / *left_top and written
 * back on return.
 *
 * @param dst      receives src2[i] - prediction
 * @param src1     first input row
 * @param src2     second input row
 * @param w        number of samples
 * @param left     in: initial previous src2 sample, out: last src2 sample
 * @param left_top in: initial previous src1 sample, out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const int top      = src1[x];
        const int gradient = (prev + top - prev_top)&0xFF;
        const int pred     = mid_pred(prev, top, gradient);

        prev_top = top;
        prev     = src2[x];
        dst[x]   = prev - pred;
    }

    *left     = prev;
    *left_top = prev_top;
}
3470
/* o1 = i1 + i2, o2 = i1 - i2.
 * Expands to two statements and evaluates each input twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1 = (i1) + (i2);\
    o2 = (i1) - (i2);

/* In-place butterfly: x becomes x + y and y becomes x - y.
 * Expands to a brace-enclosed block with its own temporaries a and b. */
#define BUTTERFLY1(x,y) \
{\
    int a = x;\
    int b = y;\
    x = a + b;\
    y = a - b;\
}

/* |x + y| + |x - y| as an expression; each argument is evaluated more than once. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3485
/**
 * Sum of absolute values of the 8x8 butterfly transform of (src - dst).
 *
 * Each row of the difference goes through three butterfly stages, then each
 * column goes through two stages; the third column stage is folded into the
 * absolute-value accumulation via BUTTERFLYA.
 *
 * @param s      unused
 * @param stride distance in bytes between the starts of consecutive rows
 * @param h      must be 8 (asserted)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;
        const uint8_t *dp = dst + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
3537
/**
 * Sum of absolute values of the 8x8 butterfly transform of src itself,
 * with the |temp[0] + temp[32]| term subtracted again at the end
 * (marked "-mean" in the code).
 *
 * @param s      unused
 * @param src    input block
 * @param dummy  unused
 * @param stride distance in bytes between the starts of consecutive rows
 * @param h      must be 8 (asserted)
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* row pass */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3585
3586 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3587     MpegEncContext * const s= (MpegEncContext *)c;
3588     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3589     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3590
3591     assert(h==8);
3592
3593     s->dsp.diff_pixels(temp, src1, src2, stride);
3594     s->dsp.fdct(temp);
3595     return s->dsp.sum_abs_dctelem(temp);
3596 }
3597
#ifdef CONFIG_GPL
/* One 8-point integer transform pass.
 * The user supplies SRC(x) to read input x and DST(x,v) to consume output x;
 * the sums (s..), differences (d..) and intermediates (a0..a7) are local to
 * the braces of one expansion. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Takes the pixel difference of two 8x8 blocks, applies DCT8_1D to every
 * row in place, then to every column while summing the absolute values of
 * the column outputs.
 *
 * @param c      MpegEncContext pointer (its dsp.diff_pixels is used)
 * @param stride passed on to diff_pixels
 * @param h      not used by this function
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc = (MpegEncContext *)c;
    DCTELEM blk[8][8];
    int sad = 0;
    int i;

    enc->dsp.diff_pixels(blk[0], src1, src2, stride);

    /* rows: results are stored back into the block */
#define SRC(x) blk[i][x]
#define DST(x,v) blk[i][x]= v
    for( i = 0; i < 8; i++ ) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* columns: results are only accumulated */
#define SRC(x) blk[x][i]
#define DST(x,v) sad += FFABS(v)
    for( i = 0; i < 8; i++ ) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sad;
}
#endif
3650
3651 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3652     MpegEncContext * const s= (MpegEncContext *)c;
3653     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3654     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3655     int sum=0, i;
3656
3657     assert(h==8);
3658
3659     s->dsp.diff_pixels(temp, src1, src2, stride);
3660     s->dsp.fdct(temp);
3661
3662     for(i=0; i<64; i++)
3663         sum= FFMAX(sum, FFABS(temp[i]));
3664
3665     return sum;
3666 }
3667
3668 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3669     MpegEncContext * const s= (MpegEncContext *)c;
3670     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3671     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3672     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3673     int sum=0, i;
3674
3675     assert(h==8);
3676     s->mb_intra=0;
3677
3678     s->dsp.diff_pixels(temp, src1, src2, stride);
3679
3680     memcpy(bak, temp, 64*sizeof(DCTELEM));
3681
3682     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3683     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3684     ff_simple_idct(temp); //FIXME
3685
3686     for(i=0; i<64; i++)
3687         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3688
3689     return sum;
3690 }
3691
3692 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3693     MpegEncContext * const s= (MpegEncContext *)c;
3694     const uint8_t *scantable= s->intra_scantable.permutated;
3695     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3696     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3697     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3698     uint8_t * const bak= (uint8_t*)aligned_bak;
3699     int i, last, run, bits, level, distoration, start_i;
3700     const int esc_length= s->ac_esc_length;
3701     uint8_t * length;
3702     uint8_t * last_length;
3703
3704     assert(h==8);
3705
3706     for(i=0; i<8; i++){
3707         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3708         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3709     }
3710
3711     s->dsp.diff_pixels(temp, src1, src2, stride);
3712
3713     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3714
3715     bits=0;
3716
3717     if (s->mb_intra) {
3718         start_i = 1;
3719         length     = s->intra_ac_vlc_length;
3720         last_length= s->intra_ac_vlc_last_length;
3721         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3722     } else {
3723         start_i = 0;
3724         length     = s->inter_ac_vlc_length;
3725         last_length= s->inter_ac_vlc_last_length;
3726     }
3727
3728     if(last>=start_i){
3729         run=0;
3730         for(i=start_i; i<last; i++){
3731             int j= scantable[i];
3732             level= temp[j];
3733
3734             if(level){
3735                 level+=64;
3736                 if((level&(~127)) == 0){
3737                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3738                 }else
3739                     bits+= esc_length;
3740                 run=0;
3741             }else
3742                 run++;
3743         }
3744         i= scantable[last];
3745
3746         level= temp[i] + 64;
3747
3748         assert(level - 64);
3749
3750         if((level&(~127)) == 0){
3751             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3752         }else
3753             bits+= esc_length;
3754
3755     }
3756
3757     if(last>=0){
3758         if(s->mb_intra)
3759             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3760         else
3761             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3762     }
3763
3764     s->dsp.idct_add(bak, stride, temp);
3765
3766     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3767
3768     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3769 }
3770
3771 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3772     MpegEncContext * const s= (MpegEncContext *)c;
3773     const uint8_t *scantable= s->intra_scantable.permutated;
3774     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3775     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3776     int i, last, run, bits, level, start_i;
3777     const int esc_length= s->ac_esc_length;
3778     uint8_t * length;
3779     uint8_t * last_length;
3780
3781     assert(h==8);
3782
3783     s->dsp.diff_pixels(temp, src1, src2, stride);
3784
3785     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3786
3787     bits=0;
3788
3789     if (s->mb_intra) {
3790         start_i = 1;
3791         length     = s->intra_ac_vlc_length;
3792         last_length= s->intra_ac_vlc_last_length;
3793         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3794     } else {
3795         start_i = 0;
3796         length     = s->inter_ac_vlc_length;
3797         last_length= s->inter_ac_vlc_last_length;
3798     }
3799
3800     if(last>=start_i){
3801         run=0;
3802         for(i=start_i; i<last; i++){
3803             int j= scantable[i];
3804             level= temp[j];
3805
3806             if(level){
3807                 level+=64;
3808                 if((level&(~127)) == 0){
3809                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3810                 }else
3811                     bits+= esc_length;
3812                 run=0;
3813             }else
3814                 run++;
3815         }
3816         i= scantable[last];
3817
3818         level= temp[i] + 64;
3819
3820         assert(level - 64);
3821
3822         if((level&(~127)) == 0){
3823             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3824         }else
3825             bits+= esc_length;
3826     }
3827
3828     return bits;
3829 }
3830
/**
 * Sum of absolute differences between each sample and the sample one row
 * below it, over 16 columns and h-1 row pairs.
 *
 * @param c      unused
 * @param s      input block
 * @param dummy  unused
 * @param stride distance in bytes between the starts of consecutive rows
 * @param h      number of rows
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++, s += stride) {
        const uint8_t *next = s + stride;

        for (col = 0; col < 16; col++)
            total += FFABS(s[col] - next[col]);
    }

    return total;
}
3845
/**
 * Sum of absolute vertical differences of (s1 - s2): for each of 16
 * columns and h-1 row pairs, |(s1 - s2) at this row - (s1 - s2) one row
 * below|.
 *
 * @param c      unused
 * @param stride distance in bytes between the starts of consecutive rows
 * @param h      number of rows
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++)
            total += FFABS(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
    }

    return total;
}
3860
/* square of a; the argument is evaluated twice */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between each sample and the sample one row
 * below it, over 16 columns and h-1 row pairs.
 *
 * @param c      unused
 * @param s      input block
 * @param dummy  unused
 * @param stride distance in bytes between the starts of consecutive rows
 * @param h      number of rows
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++, s += stride) {
        const uint8_t *next = s + stride;

        for (col = 0; col < 16; col++)
            total += SQ(s[col] - next[col]);
    }

    return total;
}
3876
/**
 * Vertical sum of squared differences of the residual s1 - s2 over a
 * 16-pixel-wide block: for every pixel of rows 0..h-2 the residual is
 * compared with the residual one row below.
 *
 * @param c      unused
 * @param s1     first pixel of the first block
 * @param s2     first pixel of the second block
 * @param stride offset, in bytes, to the next row (applied to both blocks)
 * @param h      number of rows; h-1 row pairs are examined
 * @return the accumulated squared differences (0 when h <= 1)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row, col;

    for(row = 1; row < h; row++){
        for(col = 0; col < 16; col++){
            const int upper = s1[col]          - s2[col];
            const int lower = s1[col + stride] - s2[col + stride];

            sum += SQ(upper - lower);
        }
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3891
/**
 * Sum of squared differences between an int8_t and an int16_t array.
 *
 * @param pix1 first operand, size elements
 * @param pix2 second operand, size elements
 * @param size number of elements to compare
 * @return sum over all k of (pix1[k] - pix2[k])^2, 0 when size <= 0
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int k   = 0;

    while(k < size){
        const int diff = pix1[k] - pix2[k];

        sum += diff * diff;
        k++;
    }
    return sum;
}
3900
/* Instantiate, for each comparison metric, the 16-wide function (second
 * macro argument) from its 8x8 counterpart (first macro argument).
 * NOTE(review): WRAPPER8_16_SQ is defined outside this excerpt; presumably it
 * applies the 8x8 function to each 8x8 sub-block and sums the scores --
 * confirm against the macro definition.
 * The dct264_sad pair is only built when CONFIG_GPL is defined. */
3901 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3902 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3903 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3904 #ifdef CONFIG_GPL
3905 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3906 #endif
3907 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3908 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3909 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3910 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3911
/**
 * Element-wise in-place multiplication: dst[k] *= src[k] for k in [0, len).
 *
 * @param dst vector that is scaled in place
 * @param src multipliers
 * @param len number of elements; nothing is touched when len <= 0
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k = 0;

    while(k < len){
        dst[k] *= src[k];
        k++;
    }
}
3917
/**
 * Multiply src0 by src1 read back to front:
 * dst[k] = src0[k] * src1[len-1-k] for k in [0, len).
 *
 * @param dst  output vector, len elements
 * @param src0 first factor, read in ascending order
 * @param src1 second factor, read in descending order
 * @param len  number of elements; nothing is written when len <= 0
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for(k = 0; k < len; k++){
        const float mirrored = src1[len - 1 - k];

        dst[k] = src0[k] * mirrored;
    }
}
3924
/**
 * Strided multiply-accumulate with an integer bias:
 * dst[k*step] = src0[k] * src1[k] + src2[k] + src3 for k in [0, len).
 *
 * @param dst  output; only every step-th element is written
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 per-element addend
 * @param src3 constant addend applied to every element
 * @param len  number of elements to produce
 * @param step distance, in floats, between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k   = 0;
    int pos = 0;

    while(k < len){
        dst[pos] = src0[k] * src1[k] + src2[k] + src3;
        pos += step;
        k++;
    }
}
3930
/**
 * Convert floats to int16_t by inspecting their raw 32-bit patterns.
 *
 * With IEEE-754 single precision, patterns 0x43c00000..0x43c0ffff are the
 * values in [384.0, 386.0); for those the low 16 bits of the pattern minus
 * 0x8000 is stored directly. Any pattern with one of bits 16..19 set is
 * saturated: patterns above 0x43c0ffff yield 32767, the others -32768.
 * NOTE(review): callers presumably pre-bias their samples by 385.0 so that
 * full scale maps onto that window -- confirm at the call sites.
 *
 * The bit pattern is obtained through a union instead of dereferencing the
 * float buffer via an int32_t pointer, which violated the strict-aliasing
 * rule.
 *
 * @param dst output samples, len elements
 * @param src input samples, len elements
 * @param len number of samples to convert
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        union { float f; int32_t bits; } pun;
        int_fast32_t tmp;

        pun.f = src[i];
        tmp   = pun.bits;
        if(tmp & 0xf0000){
            /* outside the window: 0 below it, -1 (all ones) above it */
            tmp = (0x43c0ffff - tmp)>>31;
        }
        dst[i] = tmp - 0x8000;
    }
}
3944
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * Row pass of the WMV2 inverse DCT: transforms the eight consecutive
 * coefficients b[0..7] in place. All inputs are read before any output is
 * written.
 *
 * @param b eight coefficients of one row
 */
static void wmv2_idct_row(short * b)
{
    /* step 1: weighted input pairs */
    const int a1 = W1*b[1] + W7*b[7];
    const int a7 = W7*b[1] - W1*b[7];
    const int a5 = W5*b[5] + W3*b[3];
    const int a3 = W3*b[5] - W5*b[3];
    const int a2 = W2*b[2] + W6*b[6];
    const int a6 = W6*b[2] - W2*b[6];
    const int a0 = W0*b[0] + W0*b[4];
    const int a4 = W0*b[0] - W0*b[4];
    /* step 2: odd-part terms scaled by 181/256 */
    const int s1 = (181*(a1-a5+a7-a3) + 128) >> 8;
    const int s2 = (181*(a1-a5-a7+a3) + 128) >> 8;
    /* step 3: combine even and odd parts, round and drop 8 bits */
    const int round = 1 << 7;
    const int even0 = a0 + a2;
    const int even1 = a4 + a6;
    const int even2 = a4 - a6;
    const int even3 = a0 - a2;
    const int odd0  = a1 + a5;
    const int odd3  = a7 + a3;

    b[0] = (even0 + odd0 + round) >> 8;
    b[7] = (even0 - odd0 + round) >> 8;
    b[1] = (even1 + s1   + round) >> 8;
    b[6] = (even1 - s1   + round) >> 8;
    b[2] = (even2 + s2   + round) >> 8;
    b[5] = (even2 - s2   + round) >> 8;
    b[3] = (even3 + odd3 + round) >> 8;
    b[4] = (even3 - odd3 + round) >> 8;
}
3980 static void wmv2_idct_col(short * b)
3981 {
3982     int s1,s2;
3983     int a0,a1,a2,a3,a4,a5,a6,a7;
3984     /*step 1, with extended precision*/
3985     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3986     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3987     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3988     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3989     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3990     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3991     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3992     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3993     /*step 2*/
3994     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3995     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3996     /*step 3*/
3997     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3998     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3999     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4000     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4001
4002     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4003     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4004     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4005     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4006 }
/**
 * In-place WMV2 inverse DCT of an 8x8 block stored row by row: the row pass
 * is applied to all eight rows first, then the column pass to all eight
 * columns.
 *
 * @param block 64 coefficients, overwritten with the result
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for(row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for(col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
4017 /* XXX: these wrapper functions should be removed as soon as all IDCTs have
4018  been converted */
/**
 * Run ff_wmv2_idct_c() on block (in place), then hand the transformed block
 * to put_pixels_clamped_c() together with dest and line_size.
 */
4019 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4020 {
4021     ff_wmv2_idct_c(block);
4022     put_pixels_clamped_c(block, dest, line_size);
4023 }
/**
 * Run ff_wmv2_idct_c() on block (in place), then hand the transformed block
 * to add_pixels_clamped_c() together with dest and line_size.
 */
4024 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4025 {
4026     ff_wmv2_idct_c(block);
4027     add_pixels_clamped_c(block, dest, line_size);
4028 }
/**
 * Call j_rev_dct() on block, then pass block, dest and line_size to
 * put_pixels_clamped_c().
 */
4029 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4030 {
4031     j_rev_dct (block);
4032     put_pixels_clamped_c(block, dest, line_size);
4033 }
/**
 * Call j_rev_dct() on block, then pass block, dest and line_size to
 * add_pixels_clamped_c().
 */
4034 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4035 {
4036     j_rev_dct (block);
4037     add_pixels_clamped_c(block, dest, line_size);
4038 }
4039
/**
 * Call j_rev_dct4() on block, then pass block, dest and line_size to
 * put_pixels_clamped4_c(). dsputil_init() selects this for lowres==1.
 */
4040 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4041 {
4042     j_rev_dct4 (block);
4043     put_pixels_clamped4_c(block, dest, line_size);
4044 }
/**
 * Call j_rev_dct4() on block, then pass block, dest and line_size to
 * add_pixels_clamped4_c(). dsputil_init() selects this for lowres==1.
 */
4045 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4046 {
4047     j_rev_dct4 (block);
4048     add_pixels_clamped4_c(block, dest, line_size);
4049 }
4050
/**
 * Call j_rev_dct2() on block, then pass block, dest and line_size to
 * put_pixels_clamped2_c(). dsputil_init() selects this for lowres==2.
 */
4051 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4052 {
4053     j_rev_dct2 (block);
4054     put_pixels_clamped2_c(block, dest, line_size);
4055 }
/**
 * Call j_rev_dct2() on block, then pass block, dest and line_size to
 * add_pixels_clamped2_c(). dsputil_init() selects this for lowres==2.
 */
4056 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4057 {
4058     j_rev_dct2 (block);
4059     add_pixels_clamped2_c(block, dest, line_size);
4060 }
4061
4062 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4063 {
4064     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4065
4066     dest[0] = cm[(block[0] + 4)>>3];
4067 }
4068 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4069 {
4070     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4071
4072     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4073 }
4074
/* No-op taking (mem, stride, h); dsputil_init() installs it as the default c->prefetch. */
4075 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4076
4077 /* init static data */
4078 void dsputil_static_init(void)
4079 {
4080     int i;
4081
4082     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4083     for(i=0;i<MAX_NEG_CROP;i++) {
4084         ff_cropTbl[i] = 0;
4085         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4086     }
4087
4088     for(i=0;i<512;i++) {
4089         ff_squareTbl[i] = (i - 256) * (i - 256);
4090     }
4091
4092     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4093 }
4094
4095 int ff_check_alignment(void){
4096     static int did_fail=0;
4097     DECLARE_ALIGNED_16(int, aligned);
4098
4099     if((long)&aligned & 15){
4100         if(!did_fail){
4101 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4102             av_log(NULL, AV_LOG_ERROR,
4103                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4104                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4105                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4106                 "Do not report crashes to FFmpeg developers.\n");
4107 #endif
4108             did_fail=1;
4109         }
4110         return -1;
4111     }
4112     return 0;
4113 }
4114
/**
 * Fill a DSPContext with the portable C implementations, then run the
 * enabled architecture-specific initializers on the same context and
 * finally build c->idct_permutation from c->idct_permutation_type.
 *
 * @param c     context whose function pointers and tables are filled in
 * @param avctx codec context; lowres and idct_algo (and dct_algo when
 *              CONFIG_ENCODERS is defined) are read here, and avctx is also
 *              passed on to the per-codec / per-architecture initializers
 */
4115 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4116 {
4117     int i;
4118
    /* logs an error (once) if stack variables are not 16-byte aligned;
     * the return value is deliberately not acted upon here */
4119     ff_check_alignment();
4120
    /* forward DCT selection (encoders only) */
4121 #ifdef CONFIG_ENCODERS
4122     if(avctx->dct_algo==FF_DCT_FASTINT) {
4123         c->fdct = fdct_ifast;
4124         c->fdct248 = fdct_ifast248;
4125     }
4126     else if(avctx->dct_algo==FF_DCT_FAAN) {
4127         c->fdct = ff_faandct;
4128         c->fdct248 = ff_faandct248;
4129     }
4130     else {
4131         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4132         c->fdct248 = ff_fdct248_islow;
4133     }
4134 #endif //CONFIG_ENCODERS
4135
    /* inverse DCT selection: lowres 1/2/3 pick the idct4/idct2/idct1
     * variants, otherwise idct_algo decides; every branch also records the
     * coefficient permutation type consumed by the switch at the end */
4136     if(avctx->lowres==1){
4137         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4138             c->idct_put= ff_jref_idct4_put;
4139             c->idct_add= ff_jref_idct4_add;
4140         }else{
4141             c->idct_put= ff_h264_lowres_idct_put_c;
4142             c->idct_add= ff_h264_lowres_idct_add_c;
4143         }
4144         c->idct    = j_rev_dct4;
4145         c->idct_permutation_type= FF_NO_IDCT_PERM;
4146     }else if(avctx->lowres==2){
4147         c->idct_put= ff_jref_idct2_put;
4148         c->idct_add= ff_jref_idct2_add;
4149         c->idct    = j_rev_dct2;
4150         c->idct_permutation_type= FF_NO_IDCT_PERM;
4151     }else if(avctx->lowres==3){
4152         c->idct_put= ff_jref_idct1_put;
4153         c->idct_add= ff_jref_idct1_add;
4154         c->idct    = j_rev_dct1;
4155         c->idct_permutation_type= FF_NO_IDCT_PERM;
4156     }else{
4157         if(avctx->idct_algo==FF_IDCT_INT){
4158             c->idct_put= ff_jref_idct_put;
4159             c->idct_add= ff_jref_idct_add;
4160             c->idct    = j_rev_dct;
4161             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4162         }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4163                 avctx->idct_algo==FF_IDCT_VP3){
4164             c->idct_put= ff_vp3_idct_put_c;
4165             c->idct_add= ff_vp3_idct_add_c;
4166             c->idct    = ff_vp3_idct_c;
4167             c->idct_permutation_type= FF_NO_IDCT_PERM;
4168         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4169             c->idct_put= ff_wmv2_idct_put_c;
4170             c->idct_add= ff_wmv2_idct_add_c;
4171             c->idct    = ff_wmv2_idct_c;
4172             c->idct_permutation_type= FF_NO_IDCT_PERM;
4173         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4174             c->idct_put= ff_faanidct_put;
4175             c->idct_add= ff_faanidct_add;
4176             c->idct    = ff_faanidct;
4177             c->idct_permutation_type= FF_NO_IDCT_PERM;
4178         }else{ //accurate/default
4179             c->idct_put= ff_simple_idct_put;
4180             c->idct_add= ff_simple_idct_add;
4181             c->idct    = ff_simple_idct;
4182             c->idct_permutation_type= FF_NO_IDCT_PERM;
4183         }
4184     }
4185
4186     if (ENABLE_H264_DECODER) {
4187         c->h264_idct_add= ff_h264_idct_add_c;
4188         c->h264_idct8_add= ff_h264_idct8_add_c;
4189         c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4190         c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4191     }
4192
    /* basic pixel helpers */
4193     c->get_pixels = get_pixels_c;
4194     c->diff_pixels = diff_pixels_c;
4195     c->put_pixels_clamped = put_pixels_clamped_c;
4196     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4197     c->add_pixels_clamped = add_pixels_clamped_c;
4198     c->add_pixels8 = add_pixels8_c;
4199     c->add_pixels4 = add_pixels4_c;
4200     c->sum_abs_dctelem = sum_abs_dctelem_c;
4201     c->gmc1 = gmc1_c;
4202     c->gmc = ff_gmc_c;
4203     c->clear_blocks = clear_blocks_c;
4204     c->pix_sum = pix_sum_c;
4205     c->pix_norm1 = pix_norm1_c;
4206
4207     /* TODO [0] 16  [1] 8 */
4208     c->pix_abs[0][0] = pix_abs16_c;
4209     c->pix_abs[0][1] = pix_abs16_x2_c;
4210     c->pix_abs[0][2] = pix_abs16_y2_c;
4211     c->pix_abs[0][3] = pix_abs16_xy2_c;
4212     c->pix_abs[1][0] = pix_abs8_c;
4213     c->pix_abs[1][1] = pix_abs8_x2_c;
4214     c->pix_abs[1][2] = pix_abs8_y2_c;
4215     c->pix_abs[1][3] = pix_abs8_xy2_c;
4216
    /* local helper: fills the four entries (plain, _x2, _y2, _xy2) of one
     * PFX_pixels_tab row for block size NUM */
4217 #define dspfunc(PFX, IDX, NUM) \
4218     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4219     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4220     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4221     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4222
4223     dspfunc(put, 0, 16);
4224     dspfunc(put_no_rnd, 0, 16);
4225     dspfunc(put, 1, 8);
4226     dspfunc(put_no_rnd, 1, 8);
4227     dspfunc(put, 2, 4);
4228     dspfunc(put, 3, 2);
4229
4230     dspfunc(avg, 0, 16);
4231     dspfunc(avg_no_rnd, 0, 16);
4232     dspfunc(avg, 1, 8);
4233     dspfunc(avg_no_rnd, 1, 8);
4234     dspfunc(avg, 2, 4);
4235     dspfunc(avg, 3, 2);
4236 #undef dspfunc
4237
4238     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4239     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4240
    /* tpel tables: indices 3, 7 and 11 are intentionally not assigned here */
4241     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4242     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4243     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4244     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4245     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4246     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4247     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4248     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4249     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4250
4251     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4252     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4253     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4254     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4255     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4256     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4257     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4258     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4259     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4260
    /* dspfunc redefined: fills the 16 mcXY entries of one qpel table row,
     * index = X + 4*Y */
4261 #define dspfunc(PFX, IDX, NUM) \
4262     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4263     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4264     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4265     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4266     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4267     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4268     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4269     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4270     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4271     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4272     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4273     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4274     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4275     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4276     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4277     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4278
4279     dspfunc(put_qpel, 0, 16);
4280     dspfunc(put_no_rnd_qpel, 0, 16);
4281
4282     dspfunc(avg_qpel, 0, 16);
4283     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4284
4285     dspfunc(put_qpel, 1, 8);
4286     dspfunc(put_no_rnd_qpel, 1, 8);
4287
4288     dspfunc(avg_qpel, 1, 8);
4289     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4290
4291     dspfunc(put_h264_qpel, 0, 16);
4292     dspfunc(put_h264_qpel, 1, 8);
4293     dspfunc(put_h264_qpel, 2, 4);
4294     dspfunc(put_h264_qpel, 3, 2);
4295     dspfunc(avg_h264_qpel, 0, 16);
4296     dspfunc(avg_h264_qpel, 1, 8);
4297     dspfunc(avg_h264_qpel, 2, 4);
4298
4299 #undef dspfunc
4300     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4301     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4302     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4303     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4304     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4305     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4306     c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4307
4308     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4309     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4310     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4311     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4312     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4313     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4314     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4315     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4316     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4317     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4318     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4319     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4320     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4321     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4322     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4323     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4324     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4325     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4326     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4327     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4328
4329     c->draw_edges = draw_edges_c;
4330
    /* codec-specific DSP initializers, compiled in only when the codec is enabled */
4331 #ifdef CONFIG_CAVS_DECODER
4332     ff_cavsdsp_init(c,avctx);
4333 #endif
4334 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4335     ff_vc1dsp_init(c,avctx);
4336 #endif
4337 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4338     ff_intrax8dsp_init(c,avctx);
4339 #endif
4340 #if defined(CONFIG_H264_ENCODER)
4341     ff_h264dspenc_init(c,avctx);
4342 #endif
4343
4344     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4345     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4346     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4347     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4348     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4349     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4350     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4351     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4352
    /* comparison functions: slot 0 gets the 16-wide, slot 1 the 8x8 version */
4353 #define SET_CMP_FUNC(name) \
4354     c->name[0]= name ## 16_c;\
4355     c->name[1]= name ## 8x8_c;
4356
4357     SET_CMP_FUNC(hadamard8_diff)
4358     c->hadamard8_diff[4]= hadamard8_intra16_c;
4359     SET_CMP_FUNC(dct_sad)
4360     SET_CMP_FUNC(dct_max)
4361 #ifdef CONFIG_GPL
4362     SET_CMP_FUNC(dct264_sad)
4363 #endif
4364     c->sad[0]= pix_abs16_c;
4365     c->sad[1]= pix_abs8_c;
4366     c->sse[0]= sse16_c;
4367     c->sse[1]= sse8_c;
4368     c->sse[2]= sse4_c;
4369     SET_CMP_FUNC(quant_psnr)
4370     SET_CMP_FUNC(rd)
4371     SET_CMP_FUNC(bit)
4372     c->vsad[0]= vsad16_c;
4373     c->vsad[4]= vsad_intra16_c;
4374     c->vsse[0]= vsse16_c;
4375     c->vsse[4]= vsse_intra16_c;
4376     c->nsse[0]= nsse16_c;
4377     c->nsse[1]= nsse8_c;
4378 #ifdef CONFIG_SNOW_ENCODER
4379     c->w53[0]= w53_16_c;
4380     c->w53[1]= w53_8_c;
4381     c->w97[0]= w97_16_c;
4382     c->w97[1]= w97_8_c;
4383 #endif
4384
4385     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4386
4387     c->add_bytes= add_bytes_c;
4388     c->add_bytes_l2= add_bytes_l2_c;
4389     c->diff_bytes= diff_bytes_c;
4390     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4391     c->bswap_buf= bswap_buf;
4392 #ifdef CONFIG_PNG_DECODER
4393     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4394 #endif
4395
4396     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4397     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4398     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4399     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4400     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4401     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    /* no C implementation is installed for this hook */
4402     c->h264_loop_filter_strength= NULL;
4403
4404     if (ENABLE_ANY_H263) {
4405         c->h263_h_loop_filter= h263_h_loop_filter_c;
4406         c->h263_v_loop_filter= h263_v_loop_filter_c;
4407     }
4408
4409     c->h261_loop_filter= h261_loop_filter_c;
4410
4411     c->try_8x8basis= try_8x8basis_c;
4412     c->add_8x8basis= add_8x8basis_c;
4413
4414 #ifdef CONFIG_SNOW_DECODER
4415     c->vertical_compose97i = ff_snow_vertical_compose97i;
4416     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4417     c->inner_add_yblock = ff_snow_inner_add_yblock;
4418 #endif
4419
4420 #ifdef CONFIG_VORBIS_DECODER
4421     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4422 #endif
4423 #ifdef CONFIG_FLAC_ENCODER
4424     c->flac_compute_autocorr = ff_flac_compute_autocorr;
4425 #endif
4426     c->vector_fmul = vector_fmul_c;
4427     c->vector_fmul_reverse = vector_fmul_reverse_c;
4428     c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4429     c->float_to_int16 = ff_float_to_int16_c;
4430
4431     c->shrink[0]= ff_img_copy_plane;
4432     c->shrink[1]= ff_shrink22;
4433     c->shrink[2]= ff_shrink44;
4434     c->shrink[3]= ff_shrink88;
4435
4436     c->prefetch= just_return;
4437
    /* cleared so the fallback loop below can tell which 2-tap entries the
     * architecture-specific code filled in */
4438     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4439     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4440
    /* architecture-specific initializers run after the C defaults are in
     * place; presumably they replace entries with optimized versions */
4441     if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
4442     if (ENABLE_ARMV4L)   dsputil_init_armv4l(c, avctx);
4443     if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
4444     if (ENABLE_VIS)      dsputil_init_vis   (c, avctx);
4445     if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
4446     if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
4447     if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
4448     if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
4449     if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);
4450
    /* any 2-tap qpel entry still NULL falls back to the matching h264 qpel
     * function.
     * NOTE(review): indexes [0][i] with i up to 63, which assumes each table
     * is 64 contiguous function pointers (e.g. [4][16]) -- confirm against
     * the DSPContext declaration */
4451     for(i=0; i<64; i++){
4452         if(!c->put_2tap_qpel_pixels_tab[0][i])
4453             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4454         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4455             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4456     }
4457
    /* build the 64-entry coefficient permutation for the permutation type
     * recorded above (or set by an architecture initializer) */
4458     switch(c->idct_permutation_type){
4459     case FF_NO_IDCT_PERM:
4460         for(i=0; i<64; i++)
4461             c->idct_permutation[i]= i;
4462         break;
4463     case FF_LIBMPEG2_IDCT_PERM:
4464         for(i=0; i<64; i++)
4465             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4466         break;
4467     case FF_SIMPLE_IDCT_PERM:
4468         for(i=0; i<64; i++)
4469             c->idct_permutation[i]= simple_mmx_permutation[i];
4470         break;
4471     case FF_TRANSPOSE_IDCT_PERM:
        /* swap row and column index */
4472         for(i=0; i<64; i++)
4473             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4474         break;
4475     case FF_PARTTRANS_IDCT_PERM:
4476         for(i=0; i<64; i++)
4477             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4478         break;
4479     default:
        /* only logged; idct_permutation is left unmodified in this case */
4480         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4481     }
4482 }
4483