libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file libavcodec/dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "mathops.h"
  36 #include "h263.h"
  37 #include "snow.h"
  38
  39 /* snow.c */
  40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  41
  42 /* vorbis.c */
  43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  44
  45 /* ac3dec.c */
  46 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
  47
  48 /* lpc.c */
  49 void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  50
  51 /* pngdec.c */
  52 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  53
  54 /* eaidct.c */
  55 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
  56
  57 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  58 uint32_t ff_squareTbl[512] = {0, };
  59
  60 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  61 #define pb_7f (~0UL/255 * 0x7f)
  62 #define pb_80 (~0UL/255 * 0x80)
  63
  64 const uint8_t ff_zigzag_direct[64] = {
  65     0,   1,  8, 16,  9,  2,  3, 10,
  66     17, 24, 32, 25, 18, 11,  4,  5,
  67     12, 19, 26, 33, 40, 48, 41, 34,
  68     27, 20, 13,  6,  7, 14, 21, 28,
  69     35, 42, 49, 56, 57, 50, 43, 36,
  70     29, 22, 15, 23, 30, 37, 44, 51,
  71     58, 59, 52, 45, 38, 31, 39, 46,
  72     53, 60, 61, 54, 47, 55, 62, 63
  73 };
  74
  75 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  76    specification, we interleave the fields */
  77 const uint8_t ff_zigzag248_direct[64] = {
  78      0,  8,  1,  9, 16, 24,  2, 10,
  79     17, 25, 32, 40, 48, 56, 33, 41,
  80     18, 26,  3, 11,  4, 12, 19, 27,
  81     34, 42, 49, 57, 50, 58, 35, 43,
  82     20, 28,  5, 13,  6, 14, 21, 29,
  83     36, 44, 51, 59, 52, 60, 37, 45,
  84     22, 30,  7, 15, 23, 31, 38, 46,
  85     53, 61, 54, 62, 39, 47, 55, 63,
  86 };
  87
/* Inverse of ff_zigzag_direct plus 1, with no IDCT permutation applied;
 * intended for the MMX quantizer. The array is only declared here (16-byte
 * aligned, no initialiser); its contents are presumably written by an init
 * routine outside this section. */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]);
  90
  91 const uint8_t ff_alternate_horizontal_scan[64] = {
  92     0,  1,   2,  3,  8,  9, 16, 17,
  93     10, 11,  4,  5,  6,  7, 15, 14,
  94     13, 12, 19, 18, 24, 25, 32, 33,
  95     26, 27, 20, 21, 22, 23, 28, 29,
  96     30, 31, 34, 35, 40, 41, 48, 49,
  97     42, 43, 36, 37, 38, 39, 44, 45,
  98     46, 47, 50, 51, 56, 57, 58, 59,
  99     52, 53, 54, 55, 60, 61, 62, 63,
 100 };
 101
 102 const uint8_t ff_alternate_vertical_scan[64] = {
 103     0,  8,  16, 24,  1,  9,  2, 10,
 104     17, 25, 32, 40, 48, 56, 57, 49,
 105     41, 33, 26, 18,  3, 11,  4, 12,
 106     19, 27, 34, 42, 50, 58, 35, 43,
 107     51, 59, 20, 28,  5, 13,  6, 14,
 108     21, 29, 36, 44, 52, 60, 37, 45,
 109     53, 61, 22, 30,  7, 15, 23, 31,
 110     38, 46, 54, 62, 39, 47, 55, 63,
 111 };
 112
 113 /* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 114  * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
 115 const uint32_t ff_inverse[257]={
 116          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 117  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 118  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 119  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 120  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 121  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 122   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 123   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 124   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 125   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 126   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 127   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 128   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 129   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 130   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 131   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 132   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 133   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 134   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 135   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 136   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 137   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 138   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 139   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 140   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 141   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 142   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 143   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 144   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 145   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 146   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 147   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 148   16777216
 149 };
 150
 151 /* Input permutation for the simple_idct_mmx */
 152 static const uint8_t simple_mmx_permutation[64]={
 153         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 154         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 155         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 156         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 157         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 158         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 159         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 160         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 161 };
 162
 163 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 164
 165 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 166     int i;
 167     int end;
 168
 169     st->scantable= src_scantable;
 170
 171     for(i=0; i<64; i++){
 172         int j;
 173         j = src_scantable[i];
 174         st->permutated[i] = permutation[j];
 175 #if ARCH_PPC
 176         st->inverse[j] = i;
 177 #endif
 178     }
 179
 180     end=-1;
 181     for(i=0; i<64; i++){
 182         int j;
 183         j = st->permutated[i];
 184         if(j>end) end=j;
 185         st->raster_end[i]= end;
 186     }
 187 }
 188
 189 static int pix_sum_c(uint8_t * pix, int line_size)
 190 {
 191     int s, i, j;
 192
 193     s = 0;
 194     for (i = 0; i < 16; i++) {
 195         for (j = 0; j < 16; j += 8) {
 196             s += pix[0];
 197             s += pix[1];
 198             s += pix[2];
 199             s += pix[3];
 200             s += pix[4];
 201             s += pix[5];
 202             s += pix[6];
 203             s += pix[7];
 204             pix += 8;
 205         }
 206         pix += line_size - 16;
 207     }
 208     return s;
 209 }
 210
 211 static int pix_norm1_c(uint8_t * pix, int line_size)
 212 {
 213     int s, i, j;
 214     uint32_t *sq = ff_squareTbl + 256;
 215
 216     s = 0;
 217     for (i = 0; i < 16; i++) {
 218         for (j = 0; j < 16; j += 8) {
 219 #if 0
 220             s += sq[pix[0]];
 221             s += sq[pix[1]];
 222             s += sq[pix[2]];
 223             s += sq[pix[3]];
 224             s += sq[pix[4]];
 225             s += sq[pix[5]];
 226             s += sq[pix[6]];
 227             s += sq[pix[7]];
 228 #else
 229 #if LONG_MAX > 2147483647
 230             register uint64_t x=*(uint64_t*)pix;
 231             s += sq[x&0xff];
 232             s += sq[(x>>8)&0xff];
 233             s += sq[(x>>16)&0xff];
 234             s += sq[(x>>24)&0xff];
 235             s += sq[(x>>32)&0xff];
 236             s += sq[(x>>40)&0xff];
 237             s += sq[(x>>48)&0xff];
 238             s += sq[(x>>56)&0xff];
 239 #else
 240             register uint32_t x=*(uint32_t*)pix;
 241             s += sq[x&0xff];
 242             s += sq[(x>>8)&0xff];
 243             s += sq[(x>>16)&0xff];
 244             s += sq[(x>>24)&0xff];
 245             x=*(uint32_t*)(pix+4);
 246             s += sq[x&0xff];
 247             s += sq[(x>>8)&0xff];
 248             s += sq[(x>>16)&0xff];
 249             s += sq[(x>>24)&0xff];
 250 #endif
 251 #endif
 252             pix += 8;
 253         }
 254         pix += line_size - 16;
 255     }
 256     return s;
 257 }
 258
 259 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 260     int i;
 261
 262     for(i=0; i+8<=w; i+=8){
 263         dst[i+0]= bswap_32(src[i+0]);
 264         dst[i+1]= bswap_32(src[i+1]);
 265         dst[i+2]= bswap_32(src[i+2]);
 266         dst[i+3]= bswap_32(src[i+3]);
 267         dst[i+4]= bswap_32(src[i+4]);
 268         dst[i+5]= bswap_32(src[i+5]);
 269         dst[i+6]= bswap_32(src[i+6]);
 270         dst[i+7]= bswap_32(src[i+7]);
 271     }
 272     for(;i<w; i++){
 273         dst[i+0]= bswap_32(src[i+0]);
 274     }
 275 }
 276
 277 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 278 {
 279     int s, i;
 280     uint32_t *sq = ff_squareTbl + 256;
 281
 282     s = 0;
 283     for (i = 0; i < h; i++) {
 284         s += sq[pix1[0] - pix2[0]];
 285         s += sq[pix1[1] - pix2[1]];
 286         s += sq[pix1[2] - pix2[2]];
 287         s += sq[pix1[3] - pix2[3]];
 288         pix1 += line_size;
 289         pix2 += line_size;
 290     }
 291     return s;
 292 }
 293
 294 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 295 {
 296     int s, i;
 297     uint32_t *sq = ff_squareTbl + 256;
 298
 299     s = 0;
 300     for (i = 0; i < h; i++) {
 301         s += sq[pix1[0] - pix2[0]];
 302         s += sq[pix1[1] - pix2[1]];
 303         s += sq[pix1[2] - pix2[2]];
 304         s += sq[pix1[3] - pix2[3]];
 305         s += sq[pix1[4] - pix2[4]];
 306         s += sq[pix1[5] - pix2[5]];
 307         s += sq[pix1[6] - pix2[6]];
 308         s += sq[pix1[7] - pix2[7]];
 309         pix1 += line_size;
 310         pix2 += line_size;
 311     }
 312     return s;
 313 }
 314
 315 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 316 {
 317     int s, i;
 318     uint32_t *sq = ff_squareTbl + 256;
 319
 320     s = 0;
 321     for (i = 0; i < h; i++) {
 322         s += sq[pix1[ 0] - pix2[ 0]];
 323         s += sq[pix1[ 1] - pix2[ 1]];
 324         s += sq[pix1[ 2] - pix2[ 2]];
 325         s += sq[pix1[ 3] - pix2[ 3]];
 326         s += sq[pix1[ 4] - pix2[ 4]];
 327         s += sq[pix1[ 5] - pix2[ 5]];
 328         s += sq[pix1[ 6] - pix2[ 6]];
 329         s += sq[pix1[ 7] - pix2[ 7]];
 330         s += sq[pix1[ 8] - pix2[ 8]];
 331         s += sq[pix1[ 9] - pix2[ 9]];
 332         s += sq[pix1[10] - pix2[10]];
 333         s += sq[pix1[11] - pix2[11]];
 334         s += sq[pix1[12] - pix2[12]];
 335         s += sq[pix1[13] - pix2[13]];
 336         s += sq[pix1[14] - pix2[14]];
 337         s += sq[pix1[15] - pix2[15]];
 338
 339         pix1 += line_size;
 340         pix2 += line_size;
 341     }
 342     return s;
 343 }
 344
 345
 346 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
 347 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 348     int s, i, j;
 349     const int dec_count= w==8 ? 3 : 4;
 350     int tmp[32*32];
 351     int level, ori;
 352     static const int scale[2][2][4][4]={
 353       {
 354         {
 355             // 9/7 8x8 dec=3
 356             {268, 239, 239, 213},
 357             {  0, 224, 224, 152},
 358             {  0, 135, 135, 110},
 359         },{
 360             // 9/7 16x16 or 32x32 dec=4
 361             {344, 310, 310, 280},
 362             {  0, 320, 320, 228},
 363             {  0, 175, 175, 136},
 364             {  0, 129, 129, 102},
 365         }
 366       },{
 367         {
 368             // 5/3 8x8 dec=3
 369             {275, 245, 245, 218},
 370             {  0, 230, 230, 156},
 371             {  0, 138, 138, 113},
 372         },{
 373             // 5/3 16x16 or 32x32 dec=4
 374             {352, 317, 317, 286},
 375             {  0, 328, 328, 233},
 376             {  0, 180, 180, 140},
 377             {  0, 132, 132, 105},
 378         }
 379       }
 380     };
 381
 382     for (i = 0; i < h; i++) {
 383         for (j = 0; j < w; j+=4) {
 384             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 385             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 386             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 387             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 388         }
 389         pix1 += line_size;
 390         pix2 += line_size;
 391     }
 392
 393     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 394
 395     s=0;
 396     assert(w==h);
 397     for(level=0; level<dec_count; level++){
 398         for(ori= level ? 1 : 0; ori<4; ori++){
 399             int size= w>>(dec_count-level);
 400             int sx= (ori&1) ? size : 0;
 401             int stride= 32<<(dec_count-level);
 402             int sy= (ori&2) ? stride>>1 : 0;
 403
 404             for(i=0; i<size; i++){
 405                 for(j=0; j<size; j++){
 406                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 407                     s += FFABS(v);
 408                 }
 409             }
 410         }
 411     }
 412     assert(s>=0);
 413     return s>>9;
 414 }
 415
 416 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 417     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 418 }
 419
 420 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 421     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 422 }
 423
 424 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 425     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 426 }
 427
 428 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 429     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 430 }
 431
 432 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 433     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 434 }
 435
 436 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 437     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 438 }
 439 #endif
 440
 441 /* draw the edges of width 'w' of an image of size width, height */
 442 //FIXME check that this is ok for mpeg4 interlaced
 443 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 444 {
 445     uint8_t *ptr, *last_line;
 446     int i;
 447
 448     last_line = buf + (height - 1) * wrap;
 449     for(i=0;i<w;i++) {
 450         /* top and bottom */
 451         memcpy(buf - (i + 1) * wrap, buf, width);
 452         memcpy(last_line + (i + 1) * wrap, last_line, width);
 453     }
 454     /* left and right */
 455     ptr = buf;
 456     for(i=0;i<height;i++) {
 457         memset(ptr - w, ptr[0], w);
 458         memset(ptr + width, ptr[width-1], w);
 459         ptr += wrap;
 460     }
 461     /* corners */
 462     for(i=0;i<w;i++) {
 463         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 464         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 465         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 466         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 467     }
 468 }
 469
 470 /**
 471  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 472  * @param buf destination buffer
 473  * @param src source buffer
 474  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 475  * @param block_w width of block
 476  * @param block_h height of block
 477  * @param src_x x coordinate of the top left sample of the block in the source buffer
 478  * @param src_y y coordinate of the top left sample of the block in the source buffer
 479  * @param w width of the source buffer
 480  * @param h height of the source buffer
 481  */
 482 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 483                                     int src_x, int src_y, int w, int h){
 484     int x, y;
 485     int start_y, start_x, end_y, end_x;
 486
 487     if(src_y>= h){
 488         src+= (h-1-src_y)*linesize;
 489         src_y=h-1;
 490     }else if(src_y<=-block_h){
 491         src+= (1-block_h-src_y)*linesize;
 492         src_y=1-block_h;
 493     }
 494     if(src_x>= w){
 495         src+= (w-1-src_x);
 496         src_x=w-1;
 497     }else if(src_x<=-block_w){
 498         src+= (1-block_w-src_x);
 499         src_x=1-block_w;
 500     }
 501
 502     start_y= FFMAX(0, -src_y);
 503     start_x= FFMAX(0, -src_x);
 504     end_y= FFMIN(block_h, h-src_y);
 505     end_x= FFMIN(block_w, w-src_x);
 506
 507     // copy existing part
 508     for(y=start_y; y<end_y; y++){
 509         for(x=start_x; x<end_x; x++){
 510             buf[x + y*linesize]= src[x + y*linesize];
 511         }
 512     }
 513
 514     //top
 515     for(y=0; y<start_y; y++){
 516         for(x=start_x; x<end_x; x++){
 517             buf[x + y*linesize]= buf[x + start_y*linesize];
 518         }
 519     }
 520
 521     //bottom
 522     for(y=end_y; y<block_h; y++){
 523         for(x=start_x; x<end_x; x++){
 524             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 525         }
 526     }
 527
 528     for(y=0; y<block_h; y++){
 529        //left
 530         for(x=0; x<start_x; x++){
 531             buf[x + y*linesize]= buf[start_x + y*linesize];
 532         }
 533
 534        //right
 535         for(x=end_x; x<block_w; x++){
 536             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 537         }
 538     }
 539 }
 540
 541 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 542 {
 543     int i;
 544
 545     /* read the pixels */
 546     for(i=0;i<8;i++) {
 547         block[0] = pixels[0];
 548         block[1] = pixels[1];
 549         block[2] = pixels[2];
 550         block[3] = pixels[3];
 551         block[4] = pixels[4];
 552         block[5] = pixels[5];
 553         block[6] = pixels[6];
 554         block[7] = pixels[7];
 555         pixels += line_size;
 556         block += 8;
 557     }
 558 }
 559
 560 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 561                           const uint8_t *s2, int stride){
 562     int i;
 563
 564     /* read the pixels */
 565     for(i=0;i<8;i++) {
 566         block[0] = s1[0] - s2[0];
 567         block[1] = s1[1] - s2[1];
 568         block[2] = s1[2] - s2[2];
 569         block[3] = s1[3] - s2[3];
 570         block[4] = s1[4] - s2[4];
 571         block[5] = s1[5] - s2[5];
 572         block[6] = s1[6] - s2[6];
 573         block[7] = s1[7] - s2[7];
 574         s1 += stride;
 575         s2 += stride;
 576         block += 8;
 577     }
 578 }
 579
 580
 581 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 582                                  int line_size)
 583 {
 584     int i;
 585     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 586
 587     /* read the pixels */
 588     for(i=0;i<8;i++) {
 589         pixels[0] = cm[block[0]];
 590         pixels[1] = cm[block[1]];
 591         pixels[2] = cm[block[2]];
 592         pixels[3] = cm[block[3]];
 593         pixels[4] = cm[block[4]];
 594         pixels[5] = cm[block[5]];
 595         pixels[6] = cm[block[6]];
 596         pixels[7] = cm[block[7]];
 597
 598         pixels += line_size;
 599         block += 8;
 600     }
 601 }
 602
 603 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 604                                  int line_size)
 605 {
 606     int i;
 607     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 608
 609     /* read the pixels */
 610     for(i=0;i<4;i++) {
 611         pixels[0] = cm[block[0]];
 612         pixels[1] = cm[block[1]];
 613         pixels[2] = cm[block[2]];
 614         pixels[3] = cm[block[3]];
 615
 616         pixels += line_size;
 617         block += 8;
 618     }
 619 }
 620
 621 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 622                                  int line_size)
 623 {
 624     int i;
 625     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 626
 627     /* read the pixels */
 628     for(i=0;i<2;i++) {
 629         pixels[0] = cm[block[0]];
 630         pixels[1] = cm[block[1]];
 631
 632         pixels += line_size;
 633         block += 8;
 634     }
 635 }
 636
 637 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 638                                         uint8_t *restrict pixels,
 639                                         int line_size)
 640 {
 641     int i, j;
 642
 643     for (i = 0; i < 8; i++) {
 644         for (j = 0; j < 8; j++) {
 645             if (*block < -128)
 646                 *pixels = 0;
 647             else if (*block > 127)
 648                 *pixels = 255;
 649             else
 650                 *pixels = (uint8_t)(*block + 128);
 651             block++;
 652             pixels++;
 653         }
 654         pixels += (line_size - 8);
 655     }
 656 }
 657
 658 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 659                           int line_size)
 660 {
 661     int i;
 662     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 663
 664     /* read the pixels */
 665     for(i=0;i<8;i++) {
 666         pixels[0] = cm[pixels[0] + block[0]];
 667         pixels[1] = cm[pixels[1] + block[1]];
 668         pixels[2] = cm[pixels[2] + block[2]];
 669         pixels[3] = cm[pixels[3] + block[3]];
 670         pixels[4] = cm[pixels[4] + block[4]];
 671         pixels[5] = cm[pixels[5] + block[5]];
 672         pixels[6] = cm[pixels[6] + block[6]];
 673         pixels[7] = cm[pixels[7] + block[7]];
 674         pixels += line_size;
 675         block += 8;
 676     }
 677 }
 678
 679 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 680                           int line_size)
 681 {
 682     int i;
 683     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 684
 685     /* read the pixels */
 686     for(i=0;i<4;i++) {
 687         pixels[0] = cm[pixels[0] + block[0]];
 688         pixels[1] = cm[pixels[1] + block[1]];
 689         pixels[2] = cm[pixels[2] + block[2]];
 690         pixels[3] = cm[pixels[3] + block[3]];
 691         pixels += line_size;
 692         block += 8;
 693     }
 694 }
 695
 696 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 697                           int line_size)
 698 {
 699     int i;
 700     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 701
 702     /* read the pixels */
 703     for(i=0;i<2;i++) {
 704         pixels[0] = cm[pixels[0] + block[0]];
 705         pixels[1] = cm[pixels[1] + block[1]];
 706         pixels += line_size;
 707         block += 8;
 708     }
 709 }
 710
 711 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 712 {
 713     int i;
 714     for(i=0;i<8;i++) {
 715         pixels[0] += block[0];
 716         pixels[1] += block[1];
 717         pixels[2] += block[2];
 718         pixels[3] += block[3];
 719         pixels[4] += block[4];
 720         pixels[5] += block[5];
 721         pixels[6] += block[6];
 722         pixels[7] += block[7];
 723         pixels += line_size;
 724         block += 8;
 725     }
 726 }
 727
 728 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 729 {
 730     int i;
 731     for(i=0;i<4;i++) {
 732         pixels[0] += block[0];
 733         pixels[1] += block[1];
 734         pixels[2] += block[2];
 735         pixels[3] += block[3];
 736         pixels += line_size;
 737         block += 4;
 738     }
 739 }
 740
 741 static int sum_abs_dctelem_c(DCTELEM *block)
 742 {
 743     int sum=0, i;
 744     for(i=0; i<64; i++)
 745         sum+= FFABS(block[i]);
 746     return sum;
 747 }
 748
 749 #if 0
 750
/*
 * 64-bit variant of the pixel copy/average generators. It sits inside the
 * "#if 0" above and is therefore never compiled.
 *
 * PIXOP2(OPNAME, OP) generates functions that process 8 pixels at a time as
 * one uint64_t: a plain OP, x2/y2 functions combining each pixel with its
 * right/lower neighbour, and xy2 functions combining four neighbours.
 *   - (a|b) - (((a^b)&0xFE..FE)>>1) is the per-byte average rounded up.
 *   - (a&b) + (((a^b)&0xFE..FE)>>1) is the per-byte average rounded down
 *     (the no_rnd functions).
 *   - The xy2 functions keep the low 2 bits (l0/l1) and the high 6 bits
 *     (h0/h1) of every byte in separate sums so that no carry crosses a
 *     byte boundary; the rounding term is 2 per byte, or 1 for no_rnd.
 *
 * NOTE(review): the first function is named OPNAME##_pixels, while the
 * CALL_2X_PIXELS line below refers to OPNAME##_pixels_c; this would probably
 * have to be reconciled before the block could be enabled.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* a = per-byte average of a and b, rounded up, on 8 packed bytes */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 891 #else // 64 bit variant
 892
 893 #define PIXOP2(OPNAME, OP) \
 894 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 895     int i;\
 896     for(i=0; i<h; i++){\
 897         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 898         pixels+=line_size;\
 899         block +=line_size;\
 900     }\
 901 }\
 902 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 903     int i;\
 904     for(i=0; i<h; i++){\
 905         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 906         pixels+=line_size;\
 907         block +=line_size;\
 908     }\
 909 }\
 910 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 911     int i;\
 912     for(i=0; i<h; i++){\
 913         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 914         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 915         pixels+=line_size;\
 916         block +=line_size;\
 917     }\
 918 }\
 919 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 920     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 921 }\
 922 \
 923 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 924                                                 int src_stride1, int src_stride2, int h){\
 925     int i;\
 926     for(i=0; i<h; i++){\
 927         uint32_t a,b;\
 928         a= AV_RN32(&src1[i*src_stride1  ]);\
 929         b= AV_RN32(&src2[i*src_stride2  ]);\
 930         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 931         a= AV_RN32(&src1[i*src_stride1+4]);\
 932         b= AV_RN32(&src2[i*src_stride2+4]);\
 933         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 934     }\
 935 }\
 936 \
 937 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 938                                                 int src_stride1, int src_stride2, int h){\
 939     int i;\
 940     for(i=0; i<h; i++){\
 941         uint32_t a,b;\
 942         a= AV_RN32(&src1[i*src_stride1  ]);\
 943         b= AV_RN32(&src2[i*src_stride2  ]);\
 944         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 945         a= AV_RN32(&src1[i*src_stride1+4]);\
 946         b= AV_RN32(&src2[i*src_stride2+4]);\
 947         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 948     }\
 949 }\
 950 \
 951 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 952                                                 int src_stride1, int src_stride2, int h){\
 953     int i;\
 954     for(i=0; i<h; i++){\
 955         uint32_t a,b;\
 956         a= AV_RN32(&src1[i*src_stride1  ]);\
 957         b= AV_RN32(&src2[i*src_stride2  ]);\
 958         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 959     }\
 960 }\
 961 \
 962 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 963                                                 int src_stride1, int src_stride2, int h){\
 964     int i;\
 965     for(i=0; i<h; i++){\
 966         uint32_t a,b;\
 967         a= AV_RN16(&src1[i*src_stride1  ]);\
 968         b= AV_RN16(&src2[i*src_stride2  ]);\
 969         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 970     }\
 971 }\
 972 \
 973 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 974                                                 int src_stride1, int src_stride2, int h){\
 975     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 976     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 977 }\
 978 \
 979 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 980                                                 int src_stride1, int src_stride2, int h){\
 981     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 982     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 983 }\
 984 \
 985 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 986     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 987 }\
 988 \
 989 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 990     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 991 }\
 992 \
 993 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 994     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 995 }\
 996 \
 997 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 998     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 999 }\
1000 \
1001 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1002                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1003     int i;\
1004     for(i=0; i<h; i++){\
1005         uint32_t a, b, c, d, l0, l1, h0, h1;\
1006         a= AV_RN32(&src1[i*src_stride1]);\
1007         b= AV_RN32(&src2[i*src_stride2]);\
1008         c= AV_RN32(&src3[i*src_stride3]);\
1009         d= AV_RN32(&src4[i*src_stride4]);\
1010         l0=  (a&0x03030303UL)\
1011            + (b&0x03030303UL)\
1012            + 0x02020202UL;\
1013         h0= ((a&0xFCFCFCFCUL)>>2)\
1014           + ((b&0xFCFCFCFCUL)>>2);\
1015         l1=  (c&0x03030303UL)\
1016            + (d&0x03030303UL);\
1017         h1= ((c&0xFCFCFCFCUL)>>2)\
1018           + ((d&0xFCFCFCFCUL)>>2);\
1019         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1020         a= AV_RN32(&src1[i*src_stride1+4]);\
1021         b= AV_RN32(&src2[i*src_stride2+4]);\
1022         c= AV_RN32(&src3[i*src_stride3+4]);\
1023         d= AV_RN32(&src4[i*src_stride4+4]);\
1024         l0=  (a&0x03030303UL)\
1025            + (b&0x03030303UL)\
1026            + 0x02020202UL;\
1027         h0= ((a&0xFCFCFCFCUL)>>2)\
1028           + ((b&0xFCFCFCFCUL)>>2);\
1029         l1=  (c&0x03030303UL)\
1030            + (d&0x03030303UL);\
1031         h1= ((c&0xFCFCFCFCUL)>>2)\
1032           + ((d&0xFCFCFCFCUL)>>2);\
1033         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1034     }\
1035 }\
1036 \
1037 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1038     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1039 }\
1040 \
1041 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1042     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1043 }\
1044 \
1045 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1046     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1047 }\
1048 \
1049 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1050     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1051 }\
1052 \
1053 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1054                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1055     int i;\
1056     for(i=0; i<h; i++){\
1057         uint32_t a, b, c, d, l0, l1, h0, h1;\
1058         a= AV_RN32(&src1[i*src_stride1]);\
1059         b= AV_RN32(&src2[i*src_stride2]);\
1060         c= AV_RN32(&src3[i*src_stride3]);\
1061         d= AV_RN32(&src4[i*src_stride4]);\
1062         l0=  (a&0x03030303UL)\
1063            + (b&0x03030303UL)\
1064            + 0x01010101UL;\
1065         h0= ((a&0xFCFCFCFCUL)>>2)\
1066           + ((b&0xFCFCFCFCUL)>>2);\
1067         l1=  (c&0x03030303UL)\
1068            + (d&0x03030303UL);\
1069         h1= ((c&0xFCFCFCFCUL)>>2)\
1070           + ((d&0xFCFCFCFCUL)>>2);\
1071         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1072         a= AV_RN32(&src1[i*src_stride1+4]);\
1073         b= AV_RN32(&src2[i*src_stride2+4]);\
1074         c= AV_RN32(&src3[i*src_stride3+4]);\
1075         d= AV_RN32(&src4[i*src_stride4+4]);\
1076         l0=  (a&0x03030303UL)\
1077            + (b&0x03030303UL)\
1078            + 0x01010101UL;\
1079         h0= ((a&0xFCFCFCFCUL)>>2)\
1080           + ((b&0xFCFCFCFCUL)>>2);\
1081         l1=  (c&0x03030303UL)\
1082            + (d&0x03030303UL);\
1083         h1= ((c&0xFCFCFCFCUL)>>2)\
1084           + ((d&0xFCFCFCFCUL)>>2);\
1085         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086     }\
1087 }\
1088 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1089                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1090     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1091     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1092 }\
1093 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1094                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1095     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1096     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1097 }\
1098 \
1099 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1100 {\
1101         int i, a0, b0, a1, b1;\
1102         a0= pixels[0];\
1103         b0= pixels[1] + 2;\
1104         a0 += b0;\
1105         b0 += pixels[2];\
1106 \
1107         pixels+=line_size;\
1108         for(i=0; i<h; i+=2){\
1109             a1= pixels[0];\
1110             b1= pixels[1];\
1111             a1 += b1;\
1112             b1 += pixels[2];\
1113 \
1114             block[0]= (a1+a0)>>2; /* FIXME non put */\
1115             block[1]= (b1+b0)>>2;\
1116 \
1117             pixels+=line_size;\
1118             block +=line_size;\
1119 \
1120             a0= pixels[0];\
1121             b0= pixels[1] + 2;\
1122             a0 += b0;\
1123             b0 += pixels[2];\
1124 \
1125             block[0]= (a1+a0)>>2;\
1126             block[1]= (b1+b0)>>2;\
1127             pixels+=line_size;\
1128             block +=line_size;\
1129         }\
1130 }\
1131 \
1132 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1133 {\
1134         int i;\
1135         const uint32_t a= AV_RN32(pixels  );\
1136         const uint32_t b= AV_RN32(pixels+1);\
1137         uint32_t l0=  (a&0x03030303UL)\
1138                     + (b&0x03030303UL)\
1139                     + 0x02020202UL;\
1140         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1141                    + ((b&0xFCFCFCFCUL)>>2);\
1142         uint32_t l1,h1;\
1143 \
1144         pixels+=line_size;\
1145         for(i=0; i<h; i+=2){\
1146             uint32_t a= AV_RN32(pixels  );\
1147             uint32_t b= AV_RN32(pixels+1);\
1148             l1=  (a&0x03030303UL)\
1149                + (b&0x03030303UL);\
1150             h1= ((a&0xFCFCFCFCUL)>>2)\
1151               + ((b&0xFCFCFCFCUL)>>2);\
1152             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1153             pixels+=line_size;\
1154             block +=line_size;\
1155             a= AV_RN32(pixels  );\
1156             b= AV_RN32(pixels+1);\
1157             l0=  (a&0x03030303UL)\
1158                + (b&0x03030303UL)\
1159                + 0x02020202UL;\
1160             h0= ((a&0xFCFCFCFCUL)>>2)\
1161               + ((b&0xFCFCFCFCUL)>>2);\
1162             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1163             pixels+=line_size;\
1164             block +=line_size;\
1165         }\
1166 }\
1167 \
1168 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1169 {\
1170     int j;\
1171     for(j=0; j<2; j++){\
1172         int i;\
1173         const uint32_t a= AV_RN32(pixels  );\
1174         const uint32_t b= AV_RN32(pixels+1);\
1175         uint32_t l0=  (a&0x03030303UL)\
1176                     + (b&0x03030303UL)\
1177                     + 0x02020202UL;\
1178         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1179                    + ((b&0xFCFCFCFCUL)>>2);\
1180         uint32_t l1,h1;\
1181 \
1182         pixels+=line_size;\
1183         for(i=0; i<h; i+=2){\
1184             uint32_t a= AV_RN32(pixels  );\
1185             uint32_t b= AV_RN32(pixels+1);\
1186             l1=  (a&0x03030303UL)\
1187                + (b&0x03030303UL);\
1188             h1= ((a&0xFCFCFCFCUL)>>2)\
1189               + ((b&0xFCFCFCFCUL)>>2);\
1190             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1191             pixels+=line_size;\
1192             block +=line_size;\
1193             a= AV_RN32(pixels  );\
1194             b= AV_RN32(pixels+1);\
1195             l0=  (a&0x03030303UL)\
1196                + (b&0x03030303UL)\
1197                + 0x02020202UL;\
1198             h0= ((a&0xFCFCFCFCUL)>>2)\
1199               + ((b&0xFCFCFCFCUL)>>2);\
1200             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1201             pixels+=line_size;\
1202             block +=line_size;\
1203         }\
1204         pixels+=4-line_size*(h+1);\
1205         block +=4-line_size*h;\
1206     }\
1207 }\
1208 \
1209 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1210 {\
1211     int j;\
1212     for(j=0; j<2; j++){\
1213         int i;\
1214         const uint32_t a= AV_RN32(pixels  );\
1215         const uint32_t b= AV_RN32(pixels+1);\
1216         uint32_t l0=  (a&0x03030303UL)\
1217                     + (b&0x03030303UL)\
1218                     + 0x01010101UL;\
1219         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1220                    + ((b&0xFCFCFCFCUL)>>2);\
1221         uint32_t l1,h1;\
1222 \
1223         pixels+=line_size;\
1224         for(i=0; i<h; i+=2){\
1225             uint32_t a= AV_RN32(pixels  );\
1226             uint32_t b= AV_RN32(pixels+1);\
1227             l1=  (a&0x03030303UL)\
1228                + (b&0x03030303UL);\
1229             h1= ((a&0xFCFCFCFCUL)>>2)\
1230               + ((b&0xFCFCFCFCUL)>>2);\
1231             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1232             pixels+=line_size;\
1233             block +=line_size;\
1234             a= AV_RN32(pixels  );\
1235             b= AV_RN32(pixels+1);\
1236             l0=  (a&0x03030303UL)\
1237                + (b&0x03030303UL)\
1238                + 0x01010101UL;\
1239             h0= ((a&0xFCFCFCFCUL)>>2)\
1240               + ((b&0xFCFCFCFCUL)>>2);\
1241             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1242             pixels+=line_size;\
1243             block +=line_size;\
1244         }\
1245         pixels+=4-line_size*(h+1);\
1246         block +=4-line_size*h;\
1247     }\
1248 }\
1249 \
1250 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1251 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1252 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1253 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1254 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1256 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1257 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1258
/* Store into dst the rnd_avg32() combination of dst's current value and val. */
#define op_avg(dst, val) dst = rnd_avg32(dst, val)
1260 #endif
1261 #define op_put(a, b) a = b
1262
1263 PIXOP2(avg, op_avg)
1264 PIXOP2(put, op_put)
1265 #undef op_avg
1266 #undef op_put
1267
/* Rounded averages of two and of four values. Every argument is parenthesised
 * so that operands containing lower-precedence operators (e.g. x|y) are
 * evaluated as a unit before being summed. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1270
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources are walked with the same stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride, h);
}
1274
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources are walked with the same stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride, h);
}
1278
/**
 * Bilinear interpolation of an 8-pixel-wide block with a 1/16-pel offset.
 *
 * Each output pixel is a weighted sum of the 2x2 source neighbourhood; the
 * four weights are products of (16-x16 | x16) and (16-y16 | y16) and add up
 * to 256, hence the final shift by 8.
 *
 * @param dst     destination, 8 pixels written per row
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  line size shared by dst and src
 * @param h       number of rows
 * @param x16     horizontal weight of the right-hand neighbour
 * @param y16     vertical weight of the row below
 * @param rounder value added to the weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl= (16-x16)*(16-y16);
    const int w_tr= (   x16)*(16-y16);
    const int w_bl= (16-x16)*(   y16);
    const int w_br= (   x16)*(   y16);
    int row, col;

    for(row=0; row<h; row++, dst+= stride, src+= stride){
        const uint8_t *below= src + stride;

        for(col=0; col<8; col++){
            dst[col]= (  w_tl*src[col]   + w_tr*src[col+1]
                       + w_bl*below[col] + w_br*below[col+1]
                       + rounder)>>8;
        }
    }
}
1301
/**
 * Writes an 8-pixel-wide, h-row block to dst, sampling src at a position that
 * changes per output pixel.
 *
 * The sampling position starts at (ox, oy), advances by (dxx, dyx) per output
 * column and by (dxy, dyy) per output row. The low 16 bits of a position are
 * discarded; of the remainder, the low `shift` bits are the fractional part
 * and the rest is the integer source coordinate.
 *
 * Positions with 0 <= x < width-1 and 0 <= y < height-1 are interpolated
 * bilinearly. When one coordinate falls outside that range it is clamped with
 * av_clip() and interpolation is done along the other axis only; when both
 * fall outside, the clamped source pixel is copied.
 *
 * @param r  value added before the final shift by 2*shift
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one=   1<<shift;      /* weight corresponding to a full pixel */
    const int max_x= width  - 1;    /* last column that still has a right neighbour */
    const int max_y= height - 1;    /* last row that still has a row below */
    int row;

    for(row=0; row<h; row++){
        uint8_t *out= dst + row*stride;
        int vx= ox;
        int vy= oy;
        int col;

        for(col=0; col<8; col++){ /* XXX FIXME optimize */
            const int pos_x= vx>>16;
            const int pos_y= vy>>16;
            const int fx= pos_x&(one-1);
            const int fy= pos_y&(one-1);
            const int sx= pos_x>>shift;
            const int sy= pos_y>>shift;
            /* the unsigned compare rejects negative coordinates as well */
            const int x_inside= (unsigned)sx < max_x;
            const int y_inside= (unsigned)sy < max_y;
            int index;

            if(x_inside && y_inside){
                index= sx + sy*stride;
                out[col]= (  (  src[index         ]*(one-fx)
                              + src[index       +1]*     fx )*(one-fy)
                           + (  src[index+stride  ]*(one-fx)
                              + src[index+stride+1]*     fx )*     fy
                           + r)>>(shift*2);
            }else if(x_inside){
                index= sx + av_clip(sy, 0, max_y)*stride;
                out[col]= ( (  src[index         ]*(one-fx)
                             + src[index       +1]*     fx )*one
                           + r)>>(shift*2);
            }else if(y_inside){
                index= av_clip(sx, 0, max_x) + sy*stride;
                out[col]= (  (  src[index         ]*(one-fy)
                              + src[index+stride  ]*     fy )*one
                           + r)>>(shift*2);
            }else{
                index= av_clip(sx, 0, max_x) + av_clip(sy, 0, max_y)*stride;
                out[col]= src[index];
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1359
/* Full-pel copy: dispatch on block width to the matching put_pixels routine.
 * Widths other than 2, 4, 8 and 16 do nothing. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2){
        put_pixels2_c (dst, src, stride, height);
    }else if(width == 4){
        put_pixels4_c (dst, src, stride, height);
    }else if(width == 8){
        put_pixels8_c (dst, src, stride, height);
    }else if(width == 16){
        put_pixels16_c(dst, src, stride, height);
    }
}
1368
/* Third-pel interpolation: weights 2 (current pixel) and 1 (right neighbour).
 * 683/2048 approximates the division of the weighted sum by 3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1379
/* Third-pel interpolation: weights 1 (current pixel) and 2 (right neighbour).
 * 683/2048 approximates the division of the weighted sum by 3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1390
/* Third-pel interpolation: weights 2 (current pixel) and 1 (pixel one line below).
 * 683/2048 approximates the division of the weighted sum by 3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1401
/* Third-pel interpolation over a 2x2 neighbourhood with weights
 * 4 (current), 3 (right), 3 (below), 2 (below-right); the weights sum to 12
 * and 2731/32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1412
/* Third-pel interpolation over a 2x2 neighbourhood with weights
 * 3 (current), 2 (right), 4 (below), 3 (below-right); the weights sum to 12
 * and 2731/32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1423
/* Third-pel interpolation: weights 1 (current pixel) and 2 (pixel one line below).
 * 683/2048 approximates the division of the weighted sum by 3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1434
/* Third-pel interpolation over a 2x2 neighbourhood with weights
 * 3 (current), 4 (right), 2 (below), 3 (below-right); the weights sum to 12
 * and 2731/32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1445
/* Third-pel interpolation over a 2x2 neighbourhood with weights
 * 2 (current), 3 (right), 3 (below), 4 (below-right); the weights sum to 12
 * and 2731/32768 approximates the division by 12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1456
/* Full-pel average: dispatch on block width to the matching avg_pixels routine.
 * Widths other than 2, 4, 8 and 16 do nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2){
        avg_pixels2_c (dst, src, stride, height);
    }else if(width == 4){
        avg_pixels4_c (dst, src, stride, height);
    }else if(width == 8){
        avg_pixels8_c (dst, src, stride, height);
    }else if(width == 16){
        avg_pixels16_c(dst, src, stride, height);
    }
}
1465
/* Third-pel interpolation (weights 2 current, 1 right; 683/2048 ~ 1/3),
 * then a rounded-up average with the value already in dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1476
/* Third-pel interpolation (weights 1 current, 2 right; 683/2048 ~ 1/3),
 * then a rounded-up average with the value already in dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1487
/* Third-pel interpolation (weights 2 current, 1 below; 683/2048 ~ 1/3),
 * then a rounded-up average with the value already in dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1498
/* Third-pel interpolation over a 2x2 neighbourhood (weights 4, 3, 3, 2;
 * 2731/32768 ~ 1/12), then a rounded-up average with the value already in dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1509
/* Third-pel interpolation over a 2x2 neighbourhood (weights 3, 2, 4, 3;
 * 2731/32768 ~ 1/12), then a rounded-up average with the value already in dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1520
/* Third-pel interpolation (weights 1 current, 2 below; 683/2048 ~ 1/3),
 * then a rounded-up average with the value already in dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1531
/* Third-pel interpolation over a 2x2 neighbourhood (weights 3, 4, 2, 3;
 * 2731/32768 ~ 1/12), then a rounded-up average with the value already in dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1542
/* Third-pel interpolation over a 2x2 neighbourhood (weights 2, 3, 3, 4;
 * 2731/32768 ~ 1/12), then a rounded-up average with the value already in dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled. TPEL_WIDTH(width) would define fixed-width put_tpel_pixels<width>_mcXY_c()
 * wrappers that forward to the generic put_tpel_pixels_mcXY_c() routines with
 * the width filled in. Each wrapper body is a plain call; a leading "void"
 * would turn it into an invalid declaration. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1574
/*
 * H.264 chroma motion compensation (C reference implementation).
 *
 * H264_CHROMA_MC(OPNAME, OP) expands to three functions,
 * OPNAME##h264_chroma_mc2_c, ..._mc4_c and ..._mc8_c, which predict a block
 * 2, 4 or 8 pixels wide and h rows high by bilinear interpolation:
 *
 *     A = (8-x)*(8-y)   weight of src[col]
 *     B = (  x)*(8-y)   weight of src[col + 1]
 *     C = (8-x)*(  y)   weight of src[col + stride]
 *     D = (  x)*(  y)   weight of src[col + stride + 1]
 *
 * The four weights always add up to 64. OP(a, b) receives the destination
 * pixel a and the weighted sum b and is responsible for scaling and storing.
 *
 * Parameters of the generated functions:
 *   dst     first pixel of the destination block
 *   src     first pixel of the reference block
 *   stride  distance between two rows, used for both dst and src
 *   h       number of rows to produce
 *   x, y    fractional offset in 1/8 pixel units, each asserted to be in 0..7
 *
 * Only reference pixels that carry a non-zero weight are read:
 *   - x != 0 and y != 0: all four neighbours;
 *   - exactly one of x, y non-zero: the pixel itself plus its right (y == 0)
 *     or lower (x == 0) neighbour;
 *   - x == 0 and y == 0: the pixel itself only, so a block that ends exactly
 *     at the end of its buffer is never read past.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        /* both offsets fractional: full 2x2 bilinear filter */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B+C){\
        /* D == 0 means x == 0 or y == 0, so at most one of B, C is non-zero */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x == 0 && y == 0: A == 64, no neighbour contributes */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0]));\
            OP(dst[1], (A*src[1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        /* both offsets fractional: full 2x2 bilinear filter */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B+C){\
        /* D == 0 means x == 0 or y == 0, so at most one of B, C is non-zero */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x == 0 && y == 0: A == 64, no neighbour contributes */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0]));\
            OP(dst[1], (A*src[1]));\
            OP(dst[2], (A*src[2]));\
            OP(dst[3], (A*src[3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        /* both offsets fractional: full 2x2 bilinear filter */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B+C){\
        /* D == 0 means x == 0 or y == 0, so at most one of B, C is non-zero */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x == 0 && y == 0: A == 64, no neighbour contributes */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0]));\
            OP(dst[1], (A*src[1]));\
            OP(dst[2], (A*src[2]));\
            OP(dst[3], (A*src[3]));\
            OP(dst[4], (A*src[4]));\
            OP(dst[5], (A*src[5]));\
            OP(dst[6], (A*src[6]));\
            OP(dst[7], (A*src[7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* op_put: add 32 and shift right by 6 (round to nearest), then store.
 * op_avg: same rounding, then average with the pixel already in dst, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1683
/**
 * VC-1 chroma motion compensation for an 8 pixel wide block, "no rounding"
 * store variant.
 *
 * Every output pixel is the bilinear blend of its 2x2 reference neighbourhood
 * with weights derived from the 1/8-pel offset (x, y). The weights sum to 64;
 * the sum is biased by 32 - 4 = 28 before the shift by 6.
 *
 * @param dst    first pixel of the destination block
 * @param src    first pixel of the reference block; column 8 and the row
 *               below the last one are always read, whatever the offset
 * @param stride distance between two rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal offset in 1/8 pixel units, asserted to be 0..7
 * @param y      vertical offset in 1/8 pixel units, asserted to be 0..7
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_tl = (8-x)*(8-y);   /* weight of the pixel itself        */
    const int w_tr = (  x)*(8-y);   /* weight of the right neighbour     */
    const int w_bl = (8-x)*(  y);   /* weight of the pixel one row below */
    const int w_br = (  x)*(  y);   /* weight of the lower-right pixel   */
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++, dst+=stride, src+=stride){
        const uint8_t *below = src + stride;

        for(col=0; col<8; col++)
            dst[col] = (w_tl*src[col] + w_tr*src[col+1] + w_bl*below[col] + w_br*below[col+1] + 32 - 4) >> 6;
    }
}
1707
/**
 * VC-1 chroma motion compensation for an 8 pixel wide block, averaging
 * variant of the "no rounding" predictor.
 *
 * The prediction is the bilinear blend of the 2x2 reference neighbourhood
 * (weights sum to 64, bias 32 - 4 = 28, shift by 6). It is then combined with
 * the pixel already stored in dst through avg2(), which is defined elsewhere
 * in this file.
 * NOTE(review): avg2() is presumably a two-value average; confirm its
 * rounding against its definition.
 *
 * @param dst    first pixel of the destination block, read and overwritten
 * @param src    first pixel of the reference block; column 8 and the row
 *               below the last one are always read, whatever the offset
 * @param stride distance between two rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal offset in 1/8 pixel units, asserted to be 0..7
 * @param y      vertical offset in 1/8 pixel units, asserted to be 0..7
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_tl = (8-x)*(8-y);   /* weight of the pixel itself        */
    const int w_tr = (  x)*(8-y);   /* weight of the right neighbour     */
    const int w_bl = (8-x)*(  y);   /* weight of the pixel one row below */
    const int w_br = (  x)*(  y);   /* weight of the lower-right pixel   */
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(row=0; row<h; row++, dst+=stride, src+=stride){
        const uint8_t *below = src + stride;

        for(col=0; col<8; col++){
            const int pred = (w_tl*src[col] + w_tr*src[col+1] + w_bl*below[col] + w_br*below[col+1] + 32 - 4) >> 6;

            dst[col] = avg2(dst[col], pred);
        }
    }
}
1731
/*
 * MPEG-4 quarter-pel motion compensation (C reference code).
 *
 * QPEL_MC(r, OPNAME, RND, OP) generates, for one store operator OP:
 *
 *   - OPNAME##mpeg4_qpel{8,16}_{h,v}_lowpass(): the 8-tap half-sample filter
 *     with coefficients (-1, 3, -6, 20, 20, -6, 3, -1), which sum to 32.
 *     Each filter reads 9 (resp. 17) samples per line. Taps that would fall
 *     outside that window use samples inside it instead, exactly as the
 *     unrolled expressions below spell out. The unscaled sum is handed to
 *     OP(dst_pixel, sum); OP refers to the local table pointer `cm`, so that
 *     name must stay.
 *
 *   - OPNAME##qpel{8,16}_mcXY_c(): one function per quarter-sample position,
 *     X being the horizontal and Y the vertical offset in quarter pixels
 *     (see QPEL_MC_FUNCS).
 *
 * Intermediate planes are always built with the put##RND## flavour; only the
 * final store into dst uses OPNAME. The parameter r is not referenced by the
 * expansion.
 */

/*
 * Per-position entry points for one block size.
 *
 *   SIZE   block width and height (8 or 16)
 *   SIZE1  SIZE + 1, the number of rows/columns the low-pass filters consume
 *   FS     row stride of the local copy `full` of the reference block
 *
 * Local buffers: full = SIZE1 rows of the reference block, halfH = SIZE1 rows
 * filtered horizontally, halfV = SIZE rows filtered vertically, halfHV =
 * SIZE rows filtered in both directions.
 * The ff_..._old_c variants are non-static and blend four planes with
 * pixels##SIZE##_l4; the static variants reach the same positions with
 * two-plane averages only.
 */
#define QPEL_MC_FUNCS(OPNAME, RND, SIZE, SIZE1, FS) \
static void OPNAME ## qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(half, full, SIZE, FS);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full, half, stride, FS, SIZE, SIZE);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    OPNAME ## mpeg4_qpel ## SIZE ## _v_lowpass(dst, full, stride, FS);\
}\
\
static void OPNAME ## qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(half, full, SIZE, FS);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full+FS, half, stride, FS, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full, SIZE, FS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l4(dst, full, halfH, halfV, halfHV, stride, FS, SIZE, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full, SIZE, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full+1, SIZE, FS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l4(dst, full+1, halfH, halfV, halfHV, stride, FS, SIZE, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full+1, SIZE, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full, SIZE, FS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l4(dst, full+FS, halfH+SIZE, halfV, halfHV, stride, FS, SIZE, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full, SIZE, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH+SIZE, halfHV, stride, SIZE, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full  , SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full+1, SIZE, FS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l4(dst, full+FS+1, halfH+SIZE, halfV, halfHV, stride, FS, SIZE, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full+1, SIZE, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH+SIZE, halfHV, stride, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfHV[SIZE*SIZE];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfHV[SIZE*SIZE];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH+SIZE, halfHV, stride, SIZE, SIZE, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full, SIZE, FS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full, SIZE, SIZE, FS, SIZE1);\
    OPNAME ## mpeg4_qpel ## SIZE ## _v_lowpass(dst, halfH, stride, SIZE);\
}\
void ff_ ## OPNAME ## qpel ## SIZE ## _mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfV, full+1, SIZE, FS);\
    put ## RND ## mpeg4_qpel ## SIZE ## _v_lowpass(halfHV, halfH, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[FS*SIZE1];\
    uint8_t halfH[SIZE*SIZE1];\
    copy_block ## SIZE1(full, src, FS, stride, SIZE1);\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, full, SIZE, FS, SIZE1);\
    put ## RND ## pixels ## SIZE ## _l2(halfH, halfH, full+1, SIZE, SIZE, FS, SIZE1);\
    OPNAME ## mpeg4_qpel ## SIZE ## _v_lowpass(dst, halfH, stride, SIZE);\
}\
static void OPNAME ## qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[SIZE*SIZE1];\
    put ## RND ## mpeg4_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride, SIZE1);\
    OPNAME ## mpeg4_qpel ## SIZE ## _v_lowpass(dst, halfH, stride, SIZE);\
}

#define QPEL_MC(r, OPNAME, RND, OP) \
/* horizontal half-sample filter: h rows of 8 outputs, 9 inputs per row */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* referenced by OP */\
    int row;\
    for(row=0; row<h; row++){\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* vertical half-sample filter: 8 columns of 8 outputs, 9 input rows */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* referenced by OP */\
    int col;\
    for(col=0; col<8; col++){\
        /* load the whole input column before anything is stored */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* horizontal half-sample filter: h rows of 16 outputs, 17 inputs per row */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* referenced by OP */\
    int row;\
    for(row=0; row<h; row++){\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* vertical half-sample filter: 16 columns of 16 outputs, 17 input rows */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* referenced by OP */\
    int col;\
    for(col=0; col<16; col++){\
        /* load the whole input column before anything is stored */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
QPEL_MC_FUNCS(OPNAME, RND,  8,  9, 16)\
QPEL_MC_FUNCS(OPNAME, RND, 16, 17, 24)
2214
/*
 * Store operators passed to QPEL_MC as its last argument.
 *
 * In each of them `b` is an unscaled filter sum: a rounding term is added
 * (16, or 15 for the no_rnd variants), the sum is shifted right by 5 and the
 * result is used as an index into `cm`, which must be in scope wherever the
 * operator is expanded (presumably the clipping table -- confirm in QPEL_MC).
 * The put forms assign that value to `a`; the avg forms average it with the
 * value already in `a` (op_avg adds 1 before halving, op_avg_no_rnd does not).
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/*
 * Instantiate the QPEL_MC function family for the put_, put_no_rnd_ and avg_
 * prefixes. The avg_no_rnd instantiation below is disabled, so op_avg_no_rnd
 * is not handed to any live instantiation here.
 */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only meant for the instantiations above. */
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2228
2229 #if 1
/*
 * H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Generates the 6-tap (1, -5, 20, 20, -5, 1) low-pass helpers used by the
 * H.264 quarter-pel functions, named OPNAME##h264_qpel<N>_{h,v,hv}_lowpass
 * for N = 2, 4, 8 and 16.
 *
 *  - _h_lowpass filters along a row: output x reads src[x-2] .. src[x+3].
 *  - _v_lowpass filters along a column: output y reads rows y-2 .. y+3.
 *  - _hv_lowpass runs both passes. The first pass starts two rows above src
 *    and writes N+5 rows of unscaled horizontal sums into the int16_t buffer
 *    tmp (row step tmpStride). tmp is then rewound to its third row and the
 *    same taps are applied vertically on those sums.
 *
 * OP(dst, sum) stores the result of a single pass and OP2(dst, sum) stores
 * the result of the two-pass case; both are expanded where a local `cm`
 * (ff_cropTbl + MAX_NEG_CROP) is in scope.
 *
 * The N = 2, 4 and 8 variants are fully unrolled for their block size. The
 * N = 16 variants are composed of four calls to the N = 8 variant (left and
 * right half, then the same again 8 rows further down); for hv the right
 * half uses tmp+8 and the lower half reuses the same tmp area.
 *
 * The qpel2 helpers are marked av_unused.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
2493
/*
 * H264_MC(OPNAME, SIZE)
 *
 * Generates the sixteen functions OPNAME##h264_qpel##SIZE##_mcXY_c for
 * X, Y in 0..3 (the two digits appear to be the horizontal and vertical
 * quarter-sample offsets -- confirm against the caller's table layout).
 * All of them take (dst, src, stride) and use one stride for both buffers.
 *
 * How the cases are built, as visible below:
 *  - mc00 forwards to OPNAME##pixels##SIZE##_c.
 *  - mc20, mc02 and mc22 call the OPNAME h/v/hv low-pass helper directly so
 *    the filter itself writes dst.
 *  - Every other case first produces one or two SIZE x SIZE intermediates
 *    with the put_ low-pass helpers into stack buffers, then merges two
 *    sources into dst with OPNAME##pixels##SIZE##_l2.
 *
 * Cases that need the vertical filter copy SIZE+5 rows, starting two rows
 * above src, into `full` (row step SIZE); `full_mid` is the row of `full`
 * that corresponds to src. The "+ 1" on the copy source (mc31, mc33, mc32)
 * selects the column one sample to the right, and "src + stride" for the
 * horizontal filter (mc13, mc33, mc23) selects the row one sample down.
 *
 * The hv helper needs an int16_t scratch area of SIZE*(SIZE+5) entries,
 * declared here as `tmp`.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2630
/*
 * Store operators for the H.264 low-pass helpers.
 *
 * op_put / op_avg are used for single-pass results: the sum is rounded with
 * +16 and shifted right by 5 before the cm[] lookup. op2_put / op2_avg are
 * used for the two-pass (hv) results: +512 and a shift by 10. The avg forms
 * additionally average the looked-up value with the current destination
 * value, adding 1 before halving. `cm` must be in scope at the expansion site.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/*
 * Instantiate the low-pass helpers for both prefixes, then the mcXY
 * function sets. Note that there is a put_ set for size 2 but no avg_ set
 * for size 2.
 */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The operators are private to the instantiations above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2651 #endif
2652
/*
 * Per-pixel operators used by H264_WEIGHT.
 *
 * op_scale1(x): block[x] = clip_uint8((block[x]*weight + offset) >> log2_denom)
 * op_scale2(x): dst[x]   = clip_uint8((src[x]*weights + dst[x]*weightd + offset)
 *                                     >> (log2_denom+1))
 * Both rely on the locals of the function they are expanded in.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/*
 * H264_WEIGHT(W, H)
 *
 * Generates two functions for a W x H block (W is one of 2, 4, 8, 16):
 *
 *  - weight_h264_pixels<W>x<H>_c(block, stride, log2_denom, weight, offset)
 *    scales `block` in place. `offset` is first moved up by log2_denom bits
 *    and, when log2_denom is non-zero, the rounding term 1<<(log2_denom-1)
 *    is folded into it.
 *
 *  - biweight_h264_pixels<W>x<H>_c(dst, src, stride, log2_denom, weightd,
 *    weights, offset) blends `src` into `dst`; both use the same stride.
 *    `offset` is replaced by ((offset + 1) | 1) moved up by log2_denom bits.
 *
 * Each row is handled by a fixed run of op_scale calls; the "if(W==n)
 * continue" tests compare compile-time constants and cut the run off after
 * W pixels.
 *
 * The offset pre-scaling is done on an unsigned value: `offset` may be
 * negative, and left-shifting a negative int is undefined behaviour in C.
 * The unsigned shift yields the same bit pattern as the previous code did on
 * two's-complement targets.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset = (unsigned)offset << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* One weight/biweight pair per supported block size. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2722
2723 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2724     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2725     int i;
2726
2727     for(i=0; i<h; i++){
2728         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2729         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2730         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2731         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2732         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2733         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2734         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2735         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2736         dst+=dstStride;
2737         src+=srcStride;
2738     }
2739 }
2740
2741 #if CONFIG_CAVS_DECODER
2742 /* AVS specific */
2743 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2744
/**
 * CAVS 8-wide "put" for the mc00 case.
 * Nothing is filtered here; the arguments go straight to put_pixels8_c()
 * with 8 as the final argument (presumably the row count).
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
/**
 * CAVS 8-wide "avg" for the mc00 case.
 * Nothing is filtered here; the arguments go straight to avg_pixels8_c()
 * with 8 as the final argument (presumably the row count).
 */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}
/**
 * CAVS 16-wide "put" for the mc00 case.
 * Nothing is filtered here; the arguments go straight to put_pixels16_c()
 * with 16 as the final argument (presumably the row count).
 */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}
/**
 * CAVS 16-wide "avg" for the mc00 case.
 * Nothing is filtered here; the arguments go straight to avg_pixels16_c()
 * with 16 as the final argument (presumably the row count).
 */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
2757 #endif /* CONFIG_CAVS_DECODER */
2758
2759 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
2760
2761 #if CONFIG_VC1_DECODER
2762 /* VC-1 specific */
2763 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2764
/**
 * VC-1 "put" for the mc00 case: forwards to put_pixels8_c() with 8 as the
 * final argument (presumably the row count).
 * The rnd parameter is accepted but not used by this function.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    put_pixels8_c(dst, src, stride, 8);
}
/**
 * VC-1 "avg" for the mc00 case: forwards to avg_pixels8_c() with 8 as the
 * final argument (presumably the row count).
 * The rnd parameter is accepted but not used by this function.
 */
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    avg_pixels8_c(dst, src, stride, 8);
}
2771 #endif /* CONFIG_VC1_DECODER */
2772
2773 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2774
2775 /* H264 specific */
2776 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2777
2778 #if CONFIG_RV30_DECODER
2779 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2780 #endif /* CONFIG_RV30_DECODER */
2781
2782 #if CONFIG_RV40_DECODER
/**
 * RV40 16-wide "put" for the mc33 case, implemented by delegating to
 * put_pixels16_xy2_c() with 16 as the final argument (presumably the row
 * count).
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_c(dst, src, stride, 16);
}
/**
 * RV40 16-wide "avg" for the mc33 case, implemented by delegating to
 * avg_pixels16_xy2_c() with 16 as the final argument (presumably the row
 * count).
 */
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
/**
 * RV40 8-wide "put" for the mc33 case, implemented by delegating to
 * put_pixels8_xy2_c() with 8 as the final argument (presumably the row
 * count).
 */
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_c(dst, src, stride, 8);
}
/**
 * RV40 8-wide "avg" for the mc33 case, implemented by delegating to
 * avg_pixels8_xy2_c() with 8 as the final argument (presumably the row
 * count).
 */
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
2795
2796 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2797 #endif /* CONFIG_RV40_DECODER */
2798
2799 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2800     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2801     int i;
2802
2803     for(i=0; i<w; i++){
2804         const int src_1= src[ -srcStride];
2805         const int src0 = src[0          ];
2806         const int src1 = src[  srcStride];
2807         const int src2 = src[2*srcStride];
2808         const int src3 = src[3*srcStride];
2809         const int src4 = src[4*srcStride];
2810         const int src5 = src[5*srcStride];
2811         const int src6 = src[6*srcStride];
2812         const int src7 = src[7*srcStride];
2813         const int src8 = src[8*srcStride];
2814         const int src9 = src[9*srcStride];
2815         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2816         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2817         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2818         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2819         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2820         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2821         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2822         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2823         src++;
2824         dst++;
2825     }
2826 }
2827
/**
 * mspel 8x8 "put", mc00 case: no filter is applied, the call is handed to
 * put_pixels8_c() with 8 as the final argument (presumably the row count).
 */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2831
/**
 * mspel 8x8 "put", mc10 case.
 * Runs the horizontal low-pass over 8 rows into a packed 8x8 scratch block,
 * then hands src and that block to put_pixels8_l2() (presumably a rounded
 * average of the two inputs -- confirm in its definition).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2837
/**
 * mspel 8x8 "put", mc20 case: the horizontal low-pass writes its 8 rows
 * directly into dst, using the same stride for source and destination.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2841
/**
 * mspel 8x8 "put", mc30 case.
 * Same as mc10 except that the unfiltered input given to put_pixels8_l2()
 * is src+1, i.e. the block one sample to the right.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2847
/**
 * mspel 8x8 "put", mc02 case: the vertical low-pass writes its 8 columns
 * directly into dst, using the same stride for source and destination.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2851
/**
 * mspel 8x8 "put", mc12 case.
 *
 * Builds two packed 8x8 intermediates and passes both to put_pixels8_l2()
 * (presumably a rounded average -- confirm in its definition):
 *  - the vertical low-pass of src;
 *  - the vertical low-pass of the horizontally filtered block. The
 *    horizontal pass covers 11 rows starting one row above src, because
 *    the vertical filter reads rows -1 .. 9; rows_h+8 is the row that
 *    lines up with src.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8*11];
    uint8_t filt_v[8*8];
    uint8_t filt_hv[8*8];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(filt_v, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(filt_hv, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, filt_v, filt_hv, stride, 8, 8, 8);
}
/**
 * mspel 8x8 "put", mc32 case.
 *
 * Same construction as mc12, except that the purely vertical intermediate
 * is computed from src+1 (the block one sample to the right). The
 * horizontal pass still covers 11 rows starting one row above src, and
 * rows_h+8 is the row that lines up with src.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8*11];
    uint8_t filt_v[8*8];
    uint8_t filt_hv[8*8];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(filt_v, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(filt_hv, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, filt_v, filt_hv, stride, 8, 8, 8);
}
/**
 * mspel 8x8 "put", mc22 case.
 * Horizontal low-pass over 11 rows (starting one row above src) into a
 * packed scratch area, followed by the vertical low-pass from the second
 * scratch row straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[8*11];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h+8, stride, 8, 8);
}
2875
2876 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2877     if(CONFIG_ANY_H263) {
2878     int x;
2879     const int strength= ff_h263_loop_filter_strength[qscale];
2880
2881     for(x=0; x<8; x++){
2882         int d1, d2, ad1;
2883         int p0= src[x-2*stride];
2884         int p1= src[x-1*stride];
2885         int p2= src[x+0*stride];
2886         int p3= src[x+1*stride];
2887         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2888
2889         if     (d<-2*strength) d1= 0;
2890         else if(d<-  strength) d1=-2*strength - d;
2891         else if(d<   strength) d1= d;
2892         else if(d< 2*strength) d1= 2*strength - d;
2893         else                   d1= 0;
2894
2895         p1 += d1;
2896         p2 -= d1;
2897         if(p1&256) p1= ~(p1>>31);
2898         if(p2&256) p2= ~(p2>>31);
2899
2900         src[x-1*stride] = p1;
2901         src[x+0*stride] = p2;
2902
2903         ad1= FFABS(d1)>>1;
2904
2905         d2= av_clip((p0-p3)/4, -ad1, ad1);
2906
2907         src[x-2*stride] = p0 - d2;
2908         src[x+  stride] = p3 + d2;
2909     }
2910     }
2911 }
2912
2913 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2914     if(CONFIG_ANY_H263) {
2915     int y;
2916     const int strength= ff_h263_loop_filter_strength[qscale];
2917
2918     for(y=0; y<8; y++){
2919         int d1, d2, ad1;
2920         int p0= src[y*stride-2];
2921         int p1= src[y*stride-1];
2922         int p2= src[y*stride+0];
2923         int p3= src[y*stride+1];
2924         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2925
2926         if     (d<-2*strength) d1= 0;
2927         else if(d<-  strength) d1=-2*strength - d;
2928         else if(d<   strength) d1= d;
2929         else if(d< 2*strength) d1= 2*strength - d;
2930         else                   d1= 0;
2931
2932         p1 += d1;
2933         p2 -= d1;
2934         if(p1&256) p1= ~(p1>>31);
2935         if(p2&256) p2= ~(p2>>31);
2936
2937         src[y*stride-1] = p1;
2938         src[y*stride+0] = p2;
2939
2940         ad1= FFABS(d1)>>1;
2941
2942         d2= av_clip((p0-p3)/4, -ad1, ad1);
2943
2944         src[y*stride-2] = p0 - d2;
2945         src[y*stride+1] = p3 + d2;
2946     }
2947     }
2948 }
2949
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 *
 * Pass 1 (vertical) fills a packed 8x8 int buffer: rows 0 and 7 are taken
 * as 4*sample, rows 1..6 as above + 2*centre + below.
 * Pass 2 (horizontal) writes back into src: columns 0 and 7 are the pass-1
 * value rounded and divided by 4, columns 1..6 are
 * (left + 2*centre + right + 8) >> 4 of the pass-1 values.
 * The net effect is that the block border is not filtered across the block
 * boundary.
 *
 * @param src    top-left sample of the 8x8 block
 * @param stride step between rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int acc[64];
    int row, col;

    /* vertical pass: src -> acc */
    for(row=0; row<8; row++){
        const uint8_t *line= src + row*stride;
        int *out= acc + row*8;

        for(col=0; col<8; col++){
            if(row==0 || row==7)
                out[col]= 4*line[col];
            else
                out[col]= line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass: acc -> src */
    for(row=0; row<8; row++){
        uint8_t *line= src + row*stride;
        const int *in= acc + row*8;

        line[0]= (in[0] + 2)>>2;
        line[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2976
/**
 * tc0-clipped H.264 luma edge filter over 16 lines, handled as four groups
 * of four lines. Group i uses tc0[i]; a negative tc0[i] leaves that group
 * untouched.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge (p2 p1 p0 | q0 q1 q2)
 * @param ystride step from one filtered line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     four clipping values, one per group of four lines
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for (seg = 0; seg < 4; seg++) {
        if (tc0[seg] < 0) {
            /* skip the whole group of four lines */
            pix += 4*ystride;
            continue;
        }

        for (line = 0; line < 4; line++, pix += ystride) {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];
            const int tc_base = tc0[seg];
            const int mid     = (p0 + q0 + 1) >> 1;
            int tc = tc_base;
            int delta;

            if (FFABS(p0 - q0) >= alpha ||
                FFABS(p1 - p0) >= beta  ||
                FFABS(q1 - q0) >= beta)
                continue;

            /* each side that is smooth enough gets p1/q1 corrected and
             * widens the clipping range for p0/q0 by one */
            if (FFABS(p2 - p0) < beta) {
                pix[-2*xstride] = p1 + av_clip(((p2 + mid) >> 1) - p1, -tc_base, tc_base);
                tc++;
            }
            if (FFABS(q2 - q0) < beta) {
                pix[   xstride] = q1 + av_clip(((q2 + mid) >> 1) - q1, -tc_base, tc_base);
                tc++;
            }

            delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
            pix[-xstride] = av_clip_uint8(p0 + delta);    /* p0' */
            pix[0]        = av_clip_uint8(q0 - delta);    /* q0' */
        }
    }
}
/* Luma edge filter with xstride = stride and ystride = 1: the p/q samples of
 * each line are taken stride bytes apart (above and below pix) and successive
 * lines are one byte apart, i.e. the filtered edge lies between two rows. */
3017 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3018 {
3019     h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
3020 }
/* Luma edge filter with xstride = 1 and ystride = stride: the p/q samples of
 * each line are adjacent bytes (left and right of pix) and successive lines
 * are stride bytes apart, i.e. the filtered edge lies between two columns. */
3021 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3022 {
3023     h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
3024 }
3025
/**
 * Unclipped H.264 luma edge filter over 16 lines.
 *
 * A line is filtered only if |p0-q0| < alpha, |p1-p0| < beta and
 * |q1-q0| < beta. Each side then gets either the long filter (three samples
 * rewritten, reading a fourth) or the short one (only p0 / q0 rewritten).
 * The long filter requires |p0-q0| < (alpha>>2)+2 and |p2-p0| < beta
 * (resp. |q2-q0| < beta).
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step from one filtered line to the next
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for (line = 0; line < 16; line++, pix += ystride) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];
        int strong;

        if (FFABS(p0 - q0) >= alpha ||
            FFABS(p1 - p0) >= beta  ||
            FFABS(q1 - q0) >= beta)
            continue;

        strong = FFABS(p0 - q0) < ((alpha >> 2) + 2);

        /* p side */
        if (strong && FFABS(p2 - p0) < beta) {
            const int p3 = pix[-4*xstride];
            /* p0', p1', p2' */
            pix[-1*xstride] = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
            pix[-2*xstride] = (p2 + p1 + p0 + q0 + 2) >> 2;
            pix[-3*xstride] = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
        } else {
            /* p0' */
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;
        }

        /* q side */
        if (strong && FFABS(q2 - q0) < beta) {
            const int q3 = pix[3*xstride];
            /* q0', q1', q2' */
            pix[0*xstride] = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3;
            pix[1*xstride] = (p0 + q0 + q1 + q2 + 2) >> 2;
            pix[2*xstride] = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3;
        } else {
            /* q0' */
            pix[0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;
        }
    }
}
/* Unclipped luma edge filter with xstride = stride, ystride = 1: samples of a
 * line are stride bytes apart, successive lines are adjacent bytes. */
3074 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3075 {
3076     h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3077 }
/* Unclipped luma edge filter with xstride = 1, ystride = stride: samples of a
 * line are adjacent bytes, successive lines are stride bytes apart. */
3078 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3079 {
3080     h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
3081 }
3082
/**
 * tc0-clipped H.264 chroma edge filter over 8 lines, handled as four groups
 * of two lines. Group i is clipped to +-tc0[i]; groups with tc0[i] <= 0 are
 * skipped. Only p0 and q0 are modified.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step from one filtered line to the next
 * @param tc0     four clipping values, one per pair of lines
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for (seg = 0; seg < 4; seg++) {
        const int tc = tc0[seg];

        if (tc <= 0) {
            pix += 2*ystride;
            continue;
        }

        for (line = 0; line < 2; line++, pix += ystride) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            if (FFABS(p0 - q0) >= alpha ||
                FFABS(p1 - p0) >= beta  ||
                FFABS(q1 - q0) >= beta)
                continue;

            delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);

            pix[-xstride] = av_clip_uint8(p0 + delta);    /* p0' */
            pix[0]        = av_clip_uint8(q0 - delta);    /* q0' */
        }
    }
}
/* Chroma edge filter with xstride = stride, ystride = 1: samples of a line
 * are stride bytes apart, successive lines are adjacent bytes. */
3111 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3112 {
3113     h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3114 }
/* Chroma edge filter with xstride = 1, ystride = stride: samples of a line
 * are adjacent bytes, successive lines are stride bytes apart. */
3115 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3116 {
3117     h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3118 }
3119
/**
 * Unclipped H.264 chroma edge filter over 8 lines. On every line that passes
 * the alpha/beta threshold tests, p0 and q0 are replaced by a rounded
 * (2,1,1)/4 average of their neighbourhood.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step from one filtered line to the next
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for (line = 0; line < 8; line++, pix += ystride) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if (FFABS(p0 - q0) >= alpha ||
            FFABS(p1 - p0) >= beta  ||
            FFABS(q1 - q0) >= beta)
            continue;

        pix[-xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
        pix[0]        = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
    }
}
/* Unclipped chroma edge filter with xstride = stride, ystride = 1: samples of
 * a line are stride bytes apart, successive lines are adjacent bytes. */
3139 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3140 {
3141     h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3142 }
/* Unclipped chroma edge filter with xstride = 1, ystride = stride: samples of
 * a line are adjacent bytes, successive lines are stride bytes apart. */
3143 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3144 {
3145     h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3146 }
3147
/**
 * Sum of absolute differences between two blocks that are 16 pixels wide
 * and h rows high.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3175
/**
 * 16-wide SAD of pix1 against pix2 interpolated horizontally: every reference
 * sample is avg2() of pix2[x] and pix2[x+1], so 17 bytes per row of pix2 are
 * read.
 *
 * @param v         unused
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3203
/**
 * 16-wide SAD of pix1 against pix2 interpolated vertically: every reference
 * sample is avg2() of pix2[x] and the sample one row below it, so h+1 rows of
 * pix2 are read.
 *
 * @param v         unused
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3233
/**
 * 16-wide SAD of pix1 against pix2 interpolated in both directions: every
 * reference sample is avg4() of a 2x2 neighbourhood of pix2, so 17 bytes per
 * row and h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3263
/**
 * Sum of absolute differences between two blocks that are 8 pixels wide and
 * h rows high.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3283
/**
 * 8-wide SAD of pix1 against pix2 interpolated horizontally: every reference
 * sample is avg2() of pix2[x] and pix2[x+1], so 9 bytes per row of pix2 are
 * read.
 *
 * @param v         unused
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3303
/**
 * 8-wide SAD of pix1 against pix2 interpolated vertically: every reference
 * sample is avg2() of pix2[x] and the sample one row below it, so h+1 rows of
 * pix2 are read.
 *
 * @param v         unused
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3325
/**
 * 8-wide SAD of pix1 against pix2 interpolated in both directions: every
 * reference sample is avg4() of a 2x2 neighbourhood of pix2, so 9 bytes per
 * row and h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3347
3348 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3349     MpegEncContext *c = v;
3350     int score1=0;
3351     int score2=0;
3352     int x,y;
3353
3354     for(y=0; y<h; y++){
3355         for(x=0; x<16; x++){
3356             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3357         }
3358         if(y+1<h){
3359             for(x=0; x<15; x++){
3360                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3361                              - s1[x+1] + s1[x+1+stride])
3362                         -FFABS(  s2[x  ] - s2[x  +stride]
3363                              - s2[x+1] + s2[x+1+stride]);
3364             }
3365         }
3366         s1+= stride;
3367         s2+= stride;
3368     }
3369
3370     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3371     else  return score1 + FFABS(score2)*8;
3372 }
3373
3374 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3375     MpegEncContext *c = v;
3376     int score1=0;
3377     int score2=0;
3378     int x,y;
3379
3380     for(y=0; y<h; y++){
3381         for(x=0; x<8; x++){
3382             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3383         }
3384         if(y+1<h){
3385             for(x=0; x<7; x++){
3386                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3387                              - s1[x+1] + s1[x+1+stride])
3388                         -FFABS(  s2[x  ] - s2[x  +stride]
3389                              - s2[x+1] + s2[x+1+stride]);
3390             }
3391         }
3392         s1+= stride;
3393         s2+= stride;
3394     }
3395
3396     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3397     else  return score1 + FFABS(score2)*8;
3398 }
3399
3400 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3401     int i;
3402     unsigned int sum=0;
3403
3404     for(i=0; i<8*8; i++){
3405         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3406         int w= weight[i];
3407         b>>= RECON_SHIFT;
3408         assert(-512<b && b<512);
3409
3410         sum += (w*b)*(w*b)>>4;
3411     }
3412     return sum>>2;
3413 }
3414
3415 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3416     int i;
3417
3418     for(i=0; i<8*8; i++){
3419         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3420     }
3421 }
3422
3423 /**
3424  * permutes an 8x8 block.
3425  * @param block the block which will be permuted according to the given permutation vector
3426  * @param permutation the permutation vector
3427  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3428  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3429  *                  (inverse) permutated to scantable order!
3430  */
3431 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3432 {
3433     int i;
3434     DCTELEM temp[64];
3435
3436     if(last<=0) return;
3437     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3438
3439     for(i=0; i<=last; i++){
3440         const int j= scantable[i];
3441         temp[j]= block[j];
3442         block[j]=0;
3443     }
3444
3445     for(i=0; i<=last; i++){
3446         const int j= scantable[i];
3447         const int perm_j= permutation[j];
3448         block[perm_j]= temp[j];
3449     }
3450 }
3451
/* Comparison function that ignores all of its arguments and always returns 0;
 * ff_set_cmp() installs it for FF_CMP_ZERO. */
3452 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3453     return 0;
3454 }
3455
3456 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3457     int i;
3458
3459     memset(cmp, 0, sizeof(void*)*6);
3460
3461     for(i=0; i<6; i++){
3462         switch(type&0xFF){
3463         case FF_CMP_SAD:
3464             cmp[i]= c->sad[i];
3465             break;
3466         case FF_CMP_SATD:
3467             cmp[i]= c->hadamard8_diff[i];
3468             break;
3469         case FF_CMP_SSE:
3470             cmp[i]= c->sse[i];
3471             break;
3472         case FF_CMP_DCT:
3473             cmp[i]= c->dct_sad[i];
3474             break;
3475         case FF_CMP_DCT264:
3476             cmp[i]= c->dct264_sad[i];
3477             break;
3478         case FF_CMP_DCTMAX:
3479             cmp[i]= c->dct_max[i];
3480             break;
3481         case FF_CMP_PSNR:
3482             cmp[i]= c->quant_psnr[i];
3483             break;
3484         case FF_CMP_BIT:
3485             cmp[i]= c->bit[i];
3486             break;
3487         case FF_CMP_RD:
3488             cmp[i]= c->rd[i];
3489             break;
3490         case FF_CMP_VSAD:
3491             cmp[i]= c->vsad[i];
3492             break;
3493         case FF_CMP_VSSE:
3494             cmp[i]= c->vsse[i];
3495             break;
3496         case FF_CMP_ZERO:
3497             cmp[i]= zero_cmp;
3498             break;
3499         case FF_CMP_NSSE:
3500             cmp[i]= c->nsse[i];
3501             break;
3502 #if CONFIG_SNOW_ENCODER
3503         case FF_CMP_W53:
3504             cmp[i]= c->w53[i];
3505             break;
3506         case FF_CMP_W97:
3507             cmp[i]= c->w97[i];
3508             break;
3509 #endif
3510         default:
3511             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3512         }
3513     }
3514 }
3515
3516 static void clear_block_c(DCTELEM *block)
3517 {
3518     memset(block, 0, sizeof(DCTELEM)*64);
3519 }
3520
3521 /**
3522  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3523  */
3524 static void clear_blocks_c(DCTELEM *blocks)
3525 {
3526     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3527 }
3528
3529 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3530     long i;
3531     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3532         long a = *(long*)(src+i);
3533         long b = *(long*)(dst+i);
3534         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3535     }
3536     for(; i<w; i++)
3537         dst[i+0] += src[i+0];
3538 }
3539
3540 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3541     long i;
3542     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3543         long a = *(long*)(src1+i);
3544         long b = *(long*)(src2+i);
3545         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3546     }
3547     for(; i<w; i++)
3548         dst[i] = src1[i]+src2[i];
3549 }
3550
3551 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3552     long i;
3553 #if !HAVE_FAST_UNALIGNED
3554     if((long)src2 & (sizeof(long)-1)){
3555         for(i=0; i+7<w; i+=8){
3556             dst[i+0] = src1[i+0]-src2[i+0];
3557             dst[i+1] = src1[i+1]-src2[i+1];
3558             dst[i+2] = src1[i+2]-src2[i+2];
3559             dst[i+3] = src1[i+3]-src2[i+3];
3560             dst[i+4] = src1[i+4]-src2[i+4];
3561             dst[i+5] = src1[i+5]-src2[i+5];
3562             dst[i+6] = src1[i+6]-src2[i+6];
3563             dst[i+7] = src1[i+7]-src2[i+7];
3564         }
3565     }else
3566 #endif
3567     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3568         long a = *(long*)(src1+i);
3569         long b = *(long*)(src2+i);
3570         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3571     }
3572     for(; i<w; i++)
3573         dst[i+0] = src1[i+0]-src2[i+0];
3574 }
3575
/**
 * Reconstructs one row from median-prediction residuals.
 *
 * For every position the predictor is mid_pred() of the previous output
 * sample, the sample above (src1) and their gradient (left + top - topleft,
 * reduced to 8 bits); the residual diff[] is added to it modulo 256.
 *
 * @param dst      reconstructed row
 * @param src1     row above
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: sample left of the row start; out: last sample written
 * @param left_top in: sample above-left of the row start; out: last sample
 *                 of src1 that was consumed
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for(x = 0; x < w; x++){
        const int top  = src1[x];
        const int grad = (cur + top - topleft) & 0xFF;

        cur     = mid_pred(cur, top, grad) + diff[x];
        topleft = top;
        dst[x]  = cur;
    }

    *left     = cur;
    *left_top = topleft;
}
3592
/**
 * Computes median-prediction residuals for one row.
 *
 * For every position the predictor is mid_pred() of the previous input
 * sample (src2), the sample above (src1) and their gradient (left + top -
 * topleft, reduced to 8 bits); dst receives src2 minus that predictor,
 * modulo 256.
 *
 * @param dst      residuals
 * @param src1     row above
 * @param src2     row being coded
 * @param w        number of samples
 * @param left     in: sample left of the row start; out: last sample of src2
 * @param left_top in: sample above-left of the row start; out: last sample
 *                 of src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur     = *left;
    uint8_t topleft = *left_top;
    int x;

    for(x = 0; x < w; x++){
        const int top  = src1[x];
        const int pred = mid_pred(cur, top, (cur + top - topleft) & 0xFF);

        topleft = top;
        cur     = src2[x];
        dst[x]  = cur - pred;
    }

    *left     = cur;
    *left_top = topleft;
}
3610
/**
 * Running sum over a row: dst[i] receives the low 8 bits of
 * acc + src[0] + ... + src[i].
 *
 * @param dst output row
 * @param src input deltas
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the final accumulator (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int x;

    for(x = 0; x < w; x++){
        acc   += src[x];
        dst[x] = acc;
    }

    return acc;
}
3629
/* byte offsets of the colour components inside one 4-byte pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#else
#define B 0
#define G 1
#define R 2
#endif
/**
 * Per-component running sum over a row of 4-byte pixels. The blue, green and
 * red bytes of every pixel are accumulated independently and the low 8 bits
 * of each sum are stored; the fourth byte of each pixel is neither read nor
 * written.
 *
 * @param dst   output row (4 bytes per pixel)
 * @param src   input deltas (4 bytes per pixel)
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: final value
 * @param green in: initial green accumulator; out: final value
 * @param blue  in: initial blue accumulator; out: final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue){
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int n;

    for(n = 0; n < w; n++, src += 4, dst += 4){
        /* read all three components before writing any of them */
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
}
#undef B
#undef G
#undef R
3663
/* BUTTERFLY2: o1 = i1 + i2, o2 = i1 - i2. Expands to two separate statements
 * and evaluates i1 and i2 twice each, so it must be used inside braces and
 * with side-effect-free arguments. */
3664 #define BUTTERFLY2(o1,o2,i1,i2) \
3665 o1= (i1)+(i2);\
3666 o2= (i1)-(i2);
3667
/* BUTTERFLY1: in-place butterfly, x becomes x + y and y becomes x - y (both
 * computed from the old values). Expands to a braced block that declares
 * locals named a and b, so the arguments must not refer to variables with
 * those names. */
3668 #define BUTTERFLY1(x,y) \
3669 {\
3670     int a,b;\
3671     a= x;\
3672     b= y;\
3673     x= a+b;\
3674     y= a-b;\
3675 }
3676
/* BUTTERFLYA: |x + y| + |x - y| as an expression, without storing the sum and
 * difference. */
3677 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3678
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * the difference src - dst.
 *
 * Three butterfly stages are run along every row, then two along every
 * column; the third column stage is folded into the summation through
 * BUTTERFLYA.
 *
 * @param s      unused
 * @param stride distance in bytes between two rows
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i, k;

    assert(h==8);

    /* row pass */
    for(i=0; i<8; i++){
        int *row = temp + 8*i;
        const uint8_t *srow = src + stride*i;
        const uint8_t *drow = dst + stride*i;

        //FIXME try pointer walks
        for(k=0; k<8; k+=2){
            BUTTERFLY2(row[k], row[k+1], srow[k]-drow[k], srow[k+1]-drow[k+1]);
        }

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for(i=0; i<8; i++){
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3730
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * src itself, with the term |temp[0] + temp[32]| subtracted at the end (the
 * original marks this as removing the mean).
 *
 * Three butterfly stages are run along every row, then two along every
 * column; the third column stage is folded into the summation through
 * BUTTERFLYA.
 *
 * @param s      unused
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i, k;

    assert(h==8);

    /* row pass */
    for(i=0; i<8; i++){
        int *row = temp + 8*i;
        const uint8_t *srow = src + stride*i;

        //FIXME try pointer walks
        for(k=0; k<8; k+=2){
            BUTTERFLY2(row[k], row[k+1], srow[k], srow[k+1]);
        }

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass */
    for(i=0; i<8; i++){
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum +=
             BUTTERFLYA(col[8*0], col[8*4])
            +BUTTERFLYA(col[8*1], col[8*5])
            +BUTTERFLYA(col[8*2], col[8*6])
            +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3778
/* Block comparison in the transform domain: runs the context's diff_pixels()
 * on src1/src2 into a 16-byte-aligned scratch block of 64 DCTELEMs, applies
 * the context's fdct() to it and returns sum_abs_dctelem() of the result.
 * c must point to a valid MpegEncContext; h must be 8.
 * NOTE(review): diff_pixels presumably stores src1 - src2 -- confirm against
 * its implementation. */
3779 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3780     MpegEncContext * const s= (MpegEncContext *)c;
3781     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3782     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3783
3784     assert(h==8);
3785
3786     s->dsp.diff_pixels(temp, src1, src2, stride);
3787     s->dsp.fdct(temp);
3788     return s->dsp.sum_abs_dctelem(temp);
3789 }
3790
/* Everything up to the matching #endif is only built when CONFIG_GPL is set. */
3791 #if CONFIG_GPL
/* DCT8_1D: one 8-point 1-D integer transform pass using only additions,
 * subtractions and shifts. Before expanding it the user must define
 *   SRC(x)   - expression reading input x (0..7)
 *   DST(x,v) - statement consuming output x with value v
 * All SRC() reads are captured in local constants before the first DST(), so
 * DST may overwrite the locations SRC reads from. */
3792 #define DCT8_1D {\
3793     const int s07 = SRC(0) + SRC(7);\
3794     const int s16 = SRC(1) + SRC(6);\
3795     const int s25 = SRC(2) + SRC(5);\
3796     const int s34 = SRC(3) + SRC(4);\
3797     const int a0 = s07 + s34;\
3798     const int a1 = s16 + s25;\
3799     const int a2 = s07 - s34;\
3800     const int a3 = s16 - s25;\
3801     const int d07 = SRC(0) - SRC(7);\
3802     const int d16 = SRC(1) - SRC(6);\
3803     const int d25 = SRC(2) - SRC(5);\
3804     const int d34 = SRC(3) - SRC(4);\
3805     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3806     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3807     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3808     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3809     DST(0,  a0 + a1     ) ;\
3810     DST(1,  a4 + (a7>>2)) ;\
3811     DST(2,  a2 + (a3>>1)) ;\
3812     DST(3,  a5 + (a6>>2)) ;\
3813     DST(4,  a0 - a1     ) ;\
3814     DST(5,  a6 - (a5>>2)) ;\
3815     DST(6, (a2>>1) - a3 ) ;\
3816     DST(7, (a4>>2) - a7 ) ;\
3817 }
3818
/* Block comparison based on DCT8_1D: the context's diff_pixels() fills an
 * 8x8 array from src1/src2, DCT8_1D is applied in place to each of its 8 rows
 * and then to each of its 8 columns; the column pass does not store its
 * outputs but adds their absolute values to the returned sum.
 * c must point to a valid MpegEncContext. h is not used.
 * NOTE(review): the name suggests the H.264 8x8 integer transform -- confirm.
 * NOTE(review): unlike dct_sad8x8_c, there is no assert(h==8) here. */
3819 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3820     MpegEncContext * const s= (MpegEncContext *)c;
3821     DCTELEM dct[8][8];
3822     int i;
3823     int sum=0;
3824
3825     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3826
/* row pass: read and write row i in place */
3827 #define SRC(x) dct[i][x]
3828 #define DST(x,v) dct[i][x]= v
3829     for( i = 0; i < 8; i++ )
3830         DCT8_1D
3831 #undef SRC
3832 #undef DST
3833
/* column pass: read column i, accumulate |output| instead of storing it */
3834 #define SRC(x) dct[x][i]
3835 #define DST(x,v) sum += FFABS(v)
3836     for( i = 0; i < 8; i++ )
3837         DCT8_1D
3838 #undef SRC
3839 #undef DST
3840     return sum;
3841 }
3842 #endif
3843
3844 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3845     MpegEncContext * const s= (MpegEncContext *)c;
3846     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3847     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3848     int sum=0, i;
3849
3850     assert(h==8);
3851
3852     s->dsp.diff_pixels(temp, src1, src2, stride);
3853     s->dsp.fdct(temp);
3854
3855     for(i=0; i<64; i++)
3856         sum= FFMAX(sum, FFABS(temp[i]));
3857
3858     return sum;
3859 }
3860
3861 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3862     MpegEncContext * const s= (MpegEncContext *)c;
3863     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3864     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3865     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3866     int sum=0, i;
3867
3868     assert(h==8);
3869     s->mb_intra=0;
3870
3871     s->dsp.diff_pixels(temp, src1, src2, stride);
3872
3873     memcpy(bak, temp, 64*sizeof(DCTELEM));
3874
3875     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3876     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3877     ff_simple_idct(temp); //FIXME
3878
3879     for(i=0; i<64; i++)
3880         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3881
3882     return sum;
3883 }
3884
/**
 * Rate-distortion score of coding the difference between two 8x8 blocks.
 *
 * Both blocks are first copied into local aligned buffers (row stride 8).
 * Their difference is quantized, the bit cost of the quantized coefficients
 * is estimated from the VLC length tables in the context, the coefficients
 * are dequantized and added back onto the local copy of src2 via idct_add,
 * and the SSE against the local copy of src1 is taken as the distortion.
 *
 * Side effect: s->block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext passed as void*
 * @param src1   first pixel block
 * @param src2   second pixel block
 * @param stride row stride of src1/src2 as handed to copy_block8
 * @param h      must be 8 (asserted)
 * @return distortion + ((bits * qscale^2 * 109 + 64) >> 7)
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
    DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
    uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on private 8-byte-stride copies so lsrc2 can be modified below */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    /* i is only the quantizer's last out-parameter here; it is reused as a loop index below */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the length tables; intra blocks cost the DC separately and start the scan at index 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* all coefficients before the last one, in scan order */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* levels in -64..63 are looked up by (run, level+64); anything else costs an escape */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantize only when the quantizer reported a last index >= 0 */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct onto the local copy of src2 and compare with the local copy of src1 */
    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3963
/**
 * Estimated bit cost of coding the difference between two 8x8 blocks.
 *
 * The difference is quantized with fast_dct_quantize and the cost of the
 * resulting coefficients is summed from the VLC length tables stored in the
 * context (escape length for levels outside -64..63).
 *
 * Side effect: s->block_last_index[0] is overwritten.
 *
 * @param c      MpegEncContext passed as void*
 * @param src1   first pixel block
 * @param src2   second pixel block
 * @param stride row stride handed to diff_pixels
 * @param h      must be 8 (asserted)
 * @return estimated number of bits
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* i is only the quantizer's last out-parameter here; it is reused as a loop index below */
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the length tables; intra blocks cost the DC separately and start the scan at index 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* all coefficients before the last one, in scan order */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                /* levels in -64..63 are looked up by (run, level+64); anything else costs an escape */
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the last coefficient uses the separate "last" length table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
4023
/*
 * VSAD_INTRA(size) defines vsad_intra<size>_c(): the sum, over rows 1..h-1,
 * of absolute differences between each pixel of the first <size> columns and
 * the pixel directly below it (one stride further). The c and dummy
 * arguments are not used.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        uint8_t *below = s + stride;                                                                        \
                                                                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += FFABS(s[col] - below[col]);                                                            \
        s = below;                                                                                          \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
4041
/**
 * Vertical SAD of the difference signal s1 - s2 over 16 columns.
 *
 * For every row pair (y, y+1) with y in 0..h-2 and each of the 16 columns,
 * adds |(s1 - s2) at row y  -  (s1 - s2) at row y+1|.
 *
 * @param c      unused
 * @param s1     first image
 * @param s2     second image
 * @param stride offset between consecutive rows in both images
 * @param h      number of rows
 * @return accumulated score
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            total += FFABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        }
    }

    return total;
}
4056
/* Square of an expression; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/*
 * VSSE_INTRA(size) defines vsse_intra<size>_c(): the sum, over rows 1..h-1,
 * of squared differences between each pixel of the first <size> columns and
 * the pixel directly below it (one stride further). The c and dummy
 * arguments are not used.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for (row = 1; row < h; row++) {                                                                         \
        uint8_t *below = s + stride;                                                                        \
                                                                                                            \
        for (col = 0; col < size; col++)                                                                    \
            total += SQ(s[col] - below[col]);                                                               \
        s = below;                                                                                          \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
4075
/**
 * Vertical SSE of the difference signal s1 - s2 over 16 columns.
 *
 * For every row pair (y, y+1) with y in 0..h-2 and each of the 16 columns,
 * adds the square of (s1 - s2) at row y minus (s1 - s2) at row y+1.
 *
 * @param c      unused
 * @param s1     first image
 * @param s2     second image
 * @param stride offset between consecutive rows in both images
 * @param h      number of rows
 * @return accumulated score
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            total += d * d;
        }
    }

    return total;
}
4090
/**
 * Sum of squared differences between an int8 and an int16 array.
 *
 * @param pix1 first array, size elements
 * @param pix2 second array, size elements
 * @param size number of elements to compare
 * @return sum of (pix1[k] - pix2[k])^2 for k in 0..size-1
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int acc = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        acc += d * d;
    }

    return acc;
}
4099
/*
 * Instantiate WRAPPER8_16_SQ once per 8x8 comparison function, pairing each
 * with the name of a *16_c function. dct264 is only instantiated under
 * CONFIG_GPL.
 * NOTE(review): the macro is defined outside this part of the file;
 * presumably it defines the second name in terms of the first -- confirm
 * against its definition.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4110
/**
 * In-place element-wise product: dst[k] = dst[k] * src[k] for k in 0..len-1.
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k;

    for (k = 0; k < len; k++, dst++, src++)
        *dst *= *src;
}
4116
/**
 * Multiply src0 by src1 read backwards:
 * dst[k] = src0[k] * src1[len-1-k] for k in 0..len-1.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, rev;

    for (fwd = 0, rev = len - 1; fwd < len; fwd++, rev--)
        dst[fwd] = src0[fwd] * src1[rev];
}
4123
/**
 * Element-wise multiply-add: dst[k] = src0[k] * src1[k] + src2[k]
 * for k in 0..len-1.
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;

    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
4129
/**
 * Windowed overlap of two len-sample inputs into 2*len outputs.
 *
 * With lo running 0..len-1 and hi = 2*len-1-lo:
 *   dst[lo] = src0[lo]*win[hi] - src1[len-1-lo]*win[lo] + add_bias
 *   dst[hi] = src0[lo]*win[lo] + src1[len-1-lo]*win[hi] + add_bias
 *
 * @param dst      output, 2*len samples
 * @param src0     first input, len samples, read forwards
 * @param src1     second input, len samples, read backwards
 * @param win      window, 2*len coefficients
 * @param add_bias constant added to every output
 * @param len      number of samples in each input
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int lo;

    for (lo = 0; lo < len; lo++) {
        const int hi  = 2*len - 1 - lo;
        const float s0 = src0[lo];
        const float s1 = src1[len - 1 - lo];
        const float wi = win[lo];
        const float wj = win[hi];

        dst[lo] = s0*wj - s1*wi + add_bias;
        dst[hi] = s0*wi + s1*wj + add_bias;
    }
}
4144
/**
 * Scale a vector: dst[k] = src[k] * mul for k in 0..len-1.
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k;

    for (k = 0; k < len; k++, dst++, src++)
        *dst = *src * mul;
}
4152
/**
 * Multiply src by a sequence of 2-element vectors and a scalar.
 *
 * Each entry of sv points to two floats; entry n covers outputs 2n and 2n+1:
 * dst[2n+j] = src[2n+j] * sv[n][j] * mul. Elements are processed in pairs
 * while the pair's start index is below len.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *pair = *sv++;

        dst[base    ] = src[base    ] * pair[0] * mul;
        dst[base + 1] = src[base + 1] * pair[1] * mul;
    }
}
4162
/**
 * Multiply src by a sequence of 4-element vectors and a scalar.
 *
 * Each entry of sv points to four floats; entry n covers outputs 4n..4n+3:
 * dst[4n+j] = src[4n+j] * sv[n][j] * mul. Elements are processed in groups
 * of four while the group's start index is below len.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int base, j;

    for (base = 0; base < len; base += 4) {
        const float *quad = *sv++;

        for (j = 0; j < 4; j++)
            dst[base + j] = src[base + j] * quad[j] * mul;
    }
}
4174
/**
 * Expand a sequence of 2-element vectors scaled by mul:
 * dst[2n+j] = sv[n][j] * mul, processed in pairs while the pair's start
 * index is below len.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base;

    for (base = 0; base < len; base += 2) {
        const float *pair = *sv++;

        dst[base    ] = pair[0] * mul;
        dst[base + 1] = pair[1] * mul;
    }
}
4184
/**
 * Expand a sequence of 4-element vectors scaled by mul:
 * dst[4n+j] = sv[n][j] * mul, processed in groups of four while the group's
 * start index is below len.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int base, j;

    for (base = 0; base < len; base += 4) {
        const float *quad = *sv++;

        for (j = 0; j < 4; j++)
            dst[base + j] = quad[j] * mul;
    }
}
4196
/**
 * In-place butterfly on two non-overlapping vectors:
 * v1[k] becomes v1[k] + v2[k] and v2[k] becomes the old v1[k] - v2[k].
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];

        v1[k] = a + b;
        v2[k] = a - b;
    }
}
4207
/**
 * Dot product of two float vectors, accumulated in a float in index order.
 *
 * @return sum of v1[k] * v2[k] for k in 0..len-1 (0 when len <= 0)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0f;
    int k = 0;

    while (k < len) {
        acc += v1[k] * v2[k];
        k++;
    }

    return acc;
}
4218
/**
 * Convert integers to float and scale: dst[k] = src[k] * mul
 * for k in 0..len-1.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k;

    for (k = 0; k < len; k++) {
        const float sample = src[k];

        dst[k] = sample * mul;
    }
}
4224
/**
 * Clip one float, given as its raw 32-bit pattern, using integer compares.
 *
 * The caller (vector_clipf_c) only uses this when min < 0 < max.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with the sign bit flipped
 * @return mini if a compares above mini as unsigned, maxi if a with its sign
 *         bit flipped compares above maxisign, otherwise a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                   uint32_t maxi, uint32_t maxisign)
{
    /* 1U, not 1: shifting a signed 1 into bit 31 is undefined behaviour */
    const uint32_t sign_bit = 1U << 31;

    if(a > mini) return mini;
    else if((a ^ sign_bit) > maxisign) return maxi;
    else return a;
}
4233
/**
 * Clip a float vector to [*min, *max] by comparing raw bit patterns.
 *
 * Called by vector_clipf_c only when *min < 0 and *max > 0. Elements are
 * processed in groups of 8 while the group's start index is below len, so a
 * len that is not a multiple of 8 touches elements past len.
 *
 * NOTE(review): floats are accessed through uint32_t pointers; this relies
 * on the compiler tolerating that aliasing.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    /* 1U, not 1: shifting a signed 1 into bit 31 is undefined behaviour */
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip every element of src to [min, max] and store it in dst.
 *
 * When min < 0 < max the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Elements are processed in groups of 8 while the group's start
 * index is below len.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
4269
4270 static av_always_inline int float_to_int16_one(const float *src){
4271     int_fast32_t tmp = *(const int32_t*)src;
4272     if(tmp & 0xf0000){
4273         tmp = (0x43c0ffff - tmp)>>31;
4274         // is this faster on some gcc/cpu combinations?
4275 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4276 //      else                 tmp = 0;
4277     }
4278     return tmp - 0x8000;
4279 }
4280
/**
 * Convert len floats to int16 with float_to_int16_one().
 *
 * @param dst output samples, len entries
 * @param src input floats, len entries
 * @param len number of samples
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    long i; /* same type as len, so the counter cannot overflow before reaching it */
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}
4286
/**
 * Convert planar float channels to interleaved int16 samples.
 *
 * Sample i of channel c is written to dst[i*channels + c]. Two channels are
 * handled by a dedicated loop; any other count goes channel by channel.
 *
 * @param dst      interleaved output, len*channels entries
 * @param src      array of per-channel input pointers, len floats each
 * @param len      samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    long i, j; /* same type as len, so the counters cannot overflow before reaching it */
    int c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
4300
/**
 * In-place vector add: v1[k] += v2[k] for the first order elements.
 */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    for (; order; order--) {
        *v1 += *v2;
        v1++;
        v2++;
    }
}
4306
/**
 * In-place vector subtract: v1[k] -= v2[k] for the first order elements.
 */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    for (; order; order--) {
        *v1 -= *v2;
        v1++;
        v2++;
    }
}
4312
/**
 * Dot product of two int16 vectors where each product is shifted right
 * before being accumulated.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @param shift right shift applied to every individual product
 * @return sum of (v1[k] * v2[k]) >> shift
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;

    for (; order; order--, v1++, v2++)
        acc += (*v1 * *v2) >> shift;

    return acc;
}
4322
/*
 * Fixed-point constants used by wmv2_idct_row() and wmv2_idct_col().
 * W1..W7 are 2048*sqrt(2)*cos(k*pi/16) rounded to an integer; W0 is the
 * plain scale 2048 applied to the b[0] and b[4] terms.
 */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
4331
4332 static void wmv2_idct_row(short * b)
4333 {
4334     int s1,s2;
4335     int a0,a1,a2,a3,a4,a5,a6,a7;
4336     /*step 1*/
4337     a1 = W1*b[1]+W7*b[7];
4338     a7 = W7*b[1]-W1*b[7];
4339     a5 = W5*b[5]+W3*b[3];
4340     a3 = W3*b[5]-W5*b[3];
4341     a2 = W2*b[2]+W6*b[6];
4342     a6 = W6*b[2]-W2*b[6];
4343     a0 = W0*b[0]+W0*b[4];
4344     a4 = W0*b[0]-W0*b[4];
4345     /*step 2*/
4346     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4347     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4348     /*step 3*/
4349     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4350     b[1] = (a4+a6 +s1   + (1<<7))>>8;
4351     b[2] = (a4-a6 +s2   + (1<<7))>>8;
4352     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4353     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4354     b[5] = (a4-a6 -s2   + (1<<7))>>8;
4355     b[6] = (a4+a6 -s1   + (1<<7))>>8;
4356     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4357 }
4358 static void wmv2_idct_col(short * b)
4359 {
4360     int s1,s2;
4361     int a0,a1,a2,a3,a4,a5,a6,a7;
4362     /*step 1, with extended precision*/
4363     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4364     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4365     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4366     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4367     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4368     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4369     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4370     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4371     /*step 2*/
4372     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4373     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4374     /*step 3*/
4375     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4376     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4377     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4378     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4379
4380     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4381     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4382     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4383     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4384 }
/**
 * In-place 8x8 WMV2 inverse DCT: all eight rows first, then all eight columns.
 *
 * @param block 64 coefficients, rows stored 8 elements apart
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
/* XXX: these wrapper functions should be removed as soon as all IDCTs are
 converted */
/**
 * Run ff_wmv2_idct_c() on block in place, then pass the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run ff_wmv2_idct_c() on block in place, then pass the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block in place, then pass the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Run j_rev_dct() on block in place, then pass the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4417
/**
 * Run j_rev_dct4() on block in place, then pass the result to
 * put_pixels_clamped4_c() together with dest and line_size.
 * dsputil_init() selects this for lowres==1.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Run j_rev_dct4() on block in place, then pass the result to
 * add_pixels_clamped4_c() together with dest and line_size.
 * dsputil_init() selects this for lowres==1.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4428
/**
 * Run j_rev_dct2() on block in place, then pass the result to
 * put_pixels_clamped2_c() together with dest and line_size.
 * dsputil_init() selects this for lowres==2.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Run j_rev_dct2() on block in place, then pass the result to
 * add_pixels_clamped2_c() together with dest and line_size.
 * dsputil_init() selects this for lowres==2.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4439
4440 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4441 {
4442     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4443
4444     dest[0] = cm[(block[0] + 4)>>3];
4445 }
4446 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4447 {
4448     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4449
4450     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4451 }
4452
/* No-op with a (mem, stride, h) signature; all three arguments are ignored. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4454
4455 /* init static data */
4456 void dsputil_static_init(void)
4457 {
4458     int i;
4459
4460     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4461     for(i=0;i<MAX_NEG_CROP;i++) {
4462         ff_cropTbl[i] = 0;
4463         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4464     }
4465
4466     for(i=0;i<512;i++) {
4467         ff_squareTbl[i] = (i - 256) * (i - 256);
4468     }
4469
4470     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4471 }
4472
4473 int ff_check_alignment(void){
4474     static int did_fail=0;
4475     DECLARE_ALIGNED_16(int, aligned);
4476
4477     if((intptr_t)&aligned & 15){
4478         if(!did_fail){
4479 #if HAVE_MMX || HAVE_ALTIVEC
4480             av_log(NULL, AV_LOG_ERROR,
4481                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4482                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4483                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4484                 "Do not report crashes to FFmpeg developers.\n");
4485 #endif
4486             did_fail=1;
4487         }
4488         return -1;
4489     }
4490     return 0;
4491 }
4492
4493 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4494 {
4495     int i;
4496
4497     ff_check_alignment();
4498
4499 #if CONFIG_ENCODERS
4500     if(avctx->dct_algo==FF_DCT_FASTINT) {
4501         c->fdct = fdct_ifast;
4502         c->fdct248 = fdct_ifast248;
4503     }
4504     else if(avctx->dct_algo==FF_DCT_FAAN) {
4505         c->fdct = ff_faandct;
4506         c->fdct248 = ff_faandct248;
4507     }
4508     else {
4509         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4510         c->fdct248 = ff_fdct248_islow;
4511     }
4512 #endif //CONFIG_ENCODERS
4513
4514     if(avctx->lowres==1){
4515         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4516             c->idct_put= ff_jref_idct4_put;
4517             c->idct_add= ff_jref_idct4_add;
4518         }else{
4519             c->idct_put= ff_h264_lowres_idct_put_c;
4520             c->idct_add= ff_h264_lowres_idct_add_c;
4521         }
4522         c->idct    = j_rev_dct4;
4523         c->idct_permutation_type= FF_NO_IDCT_PERM;
4524     }else if(avctx->lowres==2){
4525         c->idct_put= ff_jref_idct2_put;
4526         c->idct_add= ff_jref_idct2_add;
4527         c->idct    = j_rev_dct2;
4528         c->idct_permutation_type= FF_NO_IDCT_PERM;
4529     }else if(avctx->lowres==3){
4530         c->idct_put= ff_jref_idct1_put;
4531         c->idct_add= ff_jref_idct1_add;
4532         c->idct    = j_rev_dct1;
4533         c->idct_permutation_type= FF_NO_IDCT_PERM;
4534     }else{
4535         if(avctx->idct_algo==FF_IDCT_INT){
4536             c->idct_put= ff_jref_idct_put;
4537             c->idct_add= ff_jref_idct_add;
4538             c->idct    = j_rev_dct;
4539             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4540         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4541                 avctx->idct_algo==FF_IDCT_VP3){
4542             c->idct_put= ff_vp3_idct_put_c;
4543             c->idct_add= ff_vp3_idct_add_c;
4544             c->idct    = ff_vp3_idct_c;
4545             c->idct_permutation_type= FF_NO_IDCT_PERM;
4546         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4547             c->idct_put= ff_wmv2_idct_put_c;
4548             c->idct_add= ff_wmv2_idct_add_c;
4549             c->idct    = ff_wmv2_idct_c;
4550             c->idct_permutation_type= FF_NO_IDCT_PERM;
4551         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4552             c->idct_put= ff_faanidct_put;
4553             c->idct_add= ff_faanidct_add;
4554             c->idct    = ff_faanidct;
4555             c->idct_permutation_type= FF_NO_IDCT_PERM;
4556         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4557             c->idct_put= ff_ea_idct_put_c;
4558             c->idct_permutation_type= FF_NO_IDCT_PERM;
4559         }else{ //accurate/default
4560             c->idct_put= ff_simple_idct_put;
4561             c->idct_add= ff_simple_idct_add;
4562             c->idct    = ff_simple_idct;
4563             c->idct_permutation_type= FF_NO_IDCT_PERM;
4564         }
4565     }
4566
4567     if (CONFIG_H264_DECODER) {
4568         c->h264_idct_add= ff_h264_idct_add_c;
4569         c->h264_idct8_add= ff_h264_idct8_add_c;
4570         c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4571         c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4572         c->h264_idct_add16     = ff_h264_idct_add16_c;
4573         c->h264_idct8_add4     = ff_h264_idct8_add4_c;
4574         c->h264_idct_add8      = ff_h264_idct_add8_c;
4575         c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4576     }
4577
4578     c->get_pixels = get_pixels_c;
4579     c->diff_pixels = diff_pixels_c;
4580     c->put_pixels_clamped = put_pixels_clamped_c;
4581     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4582     c->add_pixels_clamped = add_pixels_clamped_c;
4583     c->add_pixels8 = add_pixels8_c;
4584     c->add_pixels4 = add_pixels4_c;
4585     c->sum_abs_dctelem = sum_abs_dctelem_c;
4586     c->gmc1 = gmc1_c;
4587     c->gmc = ff_gmc_c;
4588     c->clear_block = clear_block_c;
4589     c->clear_blocks = clear_blocks_c;
4590     c->pix_sum = pix_sum_c;
4591     c->pix_norm1 = pix_norm1_c;
4592
4593     /* TODO [0] 16  [1] 8 */
4594     c->pix_abs[0][0] = pix_abs16_c;
4595     c->pix_abs[0][1] = pix_abs16_x2_c;
4596     c->pix_abs[0][2] = pix_abs16_y2_c;
4597     c->pix_abs[0][3] = pix_abs16_xy2_c;
4598     c->pix_abs[1][0] = pix_abs8_c;
4599     c->pix_abs[1][1] = pix_abs8_x2_c;
4600     c->pix_abs[1][2] = pix_abs8_y2_c;
4601     c->pix_abs[1][3] = pix_abs8_xy2_c;
4602
4603 #define dspfunc(PFX, IDX, NUM) \
4604     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4605     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4606     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4607     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4608
4609     dspfunc(put, 0, 16);
4610     dspfunc(put_no_rnd, 0, 16);
4611     dspfunc(put, 1, 8);
4612     dspfunc(put_no_rnd, 1, 8);
4613     dspfunc(put, 2, 4);
4614     dspfunc(put, 3, 2);
4615
4616     dspfunc(avg, 0, 16);
4617     dspfunc(avg_no_rnd, 0, 16);
4618     dspfunc(avg, 1, 8);
4619     dspfunc(avg_no_rnd, 1, 8);
4620     dspfunc(avg, 2, 4);
4621     dspfunc(avg, 3, 2);
4622 #undef dspfunc
4623
4624     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4625     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4626
4627     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4628     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4629     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4630     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4631     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4632     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4633     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4634     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4635     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4636
4637     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4638     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4639     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4640     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4641     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4642     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4643     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4644     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4645     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4646
4647 #define dspfunc(PFX, IDX, NUM) \
4648     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4649     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4650     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4651     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4652     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4653     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4654     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4655     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4656     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4657     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4658     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4659     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4660     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4661     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4662     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4663     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4664
4665     dspfunc(put_qpel, 0, 16);
4666     dspfunc(put_no_rnd_qpel, 0, 16);
4667
4668     dspfunc(avg_qpel, 0, 16);
4669     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4670
4671     dspfunc(put_qpel, 1, 8);
4672     dspfunc(put_no_rnd_qpel, 1, 8);
4673
4674     dspfunc(avg_qpel, 1, 8);
4675     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4676
4677     dspfunc(put_h264_qpel, 0, 16);
4678     dspfunc(put_h264_qpel, 1, 8);
4679     dspfunc(put_h264_qpel, 2, 4);
4680     dspfunc(put_h264_qpel, 3, 2);
4681     dspfunc(avg_h264_qpel, 0, 16);
4682     dspfunc(avg_h264_qpel, 1, 8);
4683     dspfunc(avg_h264_qpel, 2, 4);
4684
4685 #undef dspfunc
4686     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4687     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4688     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4689     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4690     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4691     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4692     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4693     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4694
4695     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4696     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4697     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4698     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4699     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4700     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4701     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4702     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4703     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4704     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4705     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4706     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4707     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4708     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4709     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4710     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4711     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4712     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4713     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4714     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4715
4716     c->draw_edges = draw_edges_c;
4717
4718 #if CONFIG_CAVS_DECODER /* codec-specific init hooks: each call is compiled in only when its CONFIG_* condition is non-zero */
4719     ff_cavsdsp_init(c,avctx);
4720 #endif
4721
4722 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4723     ff_mlp_init(c, avctx);
4724 #endif
4725 #if CONFIG_VC1_DECODER
4726     ff_vc1dsp_init(c,avctx);
4727 #endif
4728 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER /* the intrax8 hook is enabled by either decoder */
4729     ff_intrax8dsp_init(c,avctx);
4730 #endif
4731 #if CONFIG_RV30_DECODER
4732     ff_rv30dsp_init(c,avctx);
4733 #endif
4734 #if CONFIG_RV40_DECODER
4735     ff_rv40dsp_init(c,avctx);
         /* Slot [15] of rows 0 and 1 is assigned after the hook has returned,
          * so these four mc33 functions are what ends up in the tables no
          * matter what ff_rv40dsp_init() stored in those slots. */
4736     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4737     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4738     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4739     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4740 #endif
4741
4742     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4743     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4744     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4745     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4746     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4747     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4748     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4749     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4750
4751 #define SET_CMP_FUNC(name) /* install the plain-C pair for comparison function "name" into c->name[0] and c->name[1] */ \
4752     c->name[0]= name ## 16_c; /* slot 0 <- <name>16_c */ \
4753     c->name[1]= name ## 8x8_c; /* slot 1 <- <name>8x8_c; the expansion already ends in ';', so the uses below are written without one */
4754
4755     SET_CMP_FUNC(hadamard8_diff)
4756     c->hadamard8_diff[4]= hadamard8_intra16_c;
4757     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4758     SET_CMP_FUNC(dct_sad)
4759     SET_CMP_FUNC(dct_max)
4760 #if CONFIG_GPL
4761     SET_CMP_FUNC(dct264_sad)
4762 #endif
4763     c->sad[0]= pix_abs16_c;
4764     c->sad[1]= pix_abs8_c;
4765     c->sse[0]= sse16_c;
4766     c->sse[1]= sse8_c;
4767     c->sse[2]= sse4_c;
4768     SET_CMP_FUNC(quant_psnr)
4769     SET_CMP_FUNC(rd)
4770     SET_CMP_FUNC(bit)
4771     c->vsad[0]= vsad16_c;
4772     c->vsad[4]= vsad_intra16_c;
4773     c->vsad[5]= vsad_intra8_c;
4774     c->vsse[0]= vsse16_c;
4775     c->vsse[4]= vsse_intra16_c;
4776     c->vsse[5]= vsse_intra8_c;
4777     c->nsse[0]= nsse16_c;
4778     c->nsse[1]= nsse8_c;
4779 #if CONFIG_SNOW_ENCODER
4780     c->w53[0]= w53_16_c;
4781     c->w53[1]= w53_8_c;
4782     c->w97[0]= w97_16_c;
4783     c->w97[1]= w97_8_c;
4784 #endif
4785
4786     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4787
4788     c->add_bytes= add_bytes_c;
4789     c->add_bytes_l2= add_bytes_l2_c;
4790     c->diff_bytes= diff_bytes_c;
4791     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4792     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4793     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4794     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4795     c->bswap_buf= bswap_buf;
4796 #if CONFIG_PNG_DECODER
4797     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4798 #endif
4799
4800     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4801     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4802     c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4803     c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4804     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4805     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4806     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4807     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4808     c->h264_loop_filter_strength= NULL;
4809
4810     if (CONFIG_ANY_H263) {
4811         c->h263_h_loop_filter= h263_h_loop_filter_c;
4812         c->h263_v_loop_filter= h263_v_loop_filter_c;
4813     }
4814
4815     if (CONFIG_VP3_DECODER) {
4816         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4817         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4818     }
4819     if (CONFIG_VP6_DECODER) {
4820         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4821     }
4822
4823     c->h261_loop_filter= h261_loop_filter_c;
4824
4825     c->try_8x8basis= try_8x8basis_c;
4826     c->add_8x8basis= add_8x8basis_c;
4827
4828 #if CONFIG_SNOW_DECODER
4829     c->vertical_compose97i = ff_snow_vertical_compose97i;
4830     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4831     c->inner_add_yblock = ff_snow_inner_add_yblock;
4832 #endif
4833
4834 #if CONFIG_VORBIS_DECODER
4835     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4836 #endif
4837 #if CONFIG_AC3_DECODER
4838     c->ac3_downmix = ff_ac3_downmix_c;
4839 #endif
4840 #if CONFIG_LPC
4841     c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4842 #endif
4843     c->vector_fmul = vector_fmul_c;
4844     c->vector_fmul_reverse = vector_fmul_reverse_c;
4845     c->vector_fmul_add = vector_fmul_add_c;
4846     c->vector_fmul_window = ff_vector_fmul_window_c;
4847     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4848     c->vector_clipf = vector_clipf_c;
4849     c->float_to_int16 = ff_float_to_int16_c;
4850     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4851     c->add_int16 = add_int16_c;
4852     c->sub_int16 = sub_int16_c;
4853     c->scalarproduct_int16 = scalarproduct_int16_c;
4854     c->scalarproduct_float = scalarproduct_float_c;
4855     c->butterflies_float = butterflies_float_c;
4856     c->vector_fmul_scalar = vector_fmul_scalar_c;
4857
4858     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4859     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4860
4861     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4862     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4863
4864     c->shrink[0]= ff_img_copy_plane;
4865     c->shrink[1]= ff_shrink22;
4866     c->shrink[2]= ff_shrink44;
4867     c->shrink[3]= ff_shrink88;
4868
4869     c->prefetch= just_return;
4870
4871     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab)); /* clear both 2-tap tables so the fallback loop below can tell which slots nobody filled */
4872     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4873
         /* Platform init hooks, each guarded by its build-time flag. They run
          * after all the plain-C assignments above.
          * NOTE(review): presumably they replace some of those pointers with
          * optimized versions and may fill 2-tap slots -- their bodies are not
          * in this file section, confirm in the per-architecture sources. */
4874     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4875     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4876     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4877     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4878     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4879     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4880     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4881     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4882     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4883
         /* Every 2-tap slot that is still NULL after the hooks inherits the
          * h264 qpel function stored at the same position.
          * NOTE(review): i runs to 63 while indexing row [0]; this stays inside
          * the objects only if each table is one contiguous array holding at
          * least 64 function pointers (e.g. [4][16]) -- confirm against the
          * DSPContext declaration.
          * NOTE(review): the dspfunc() list earlier in this function sets
          * avg_h264_qpel rows 0-2 only, so avg 2-tap slots 48..63 copy whatever
          * row 3 of avg_h264_qpel_pixels_tab holds at this point. */
4884     for(i=0; i<64; i++){
4885         if(!c->put_2tap_qpel_pixels_tab[0][i])
4886             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4887         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4888             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4889     }
4890
4891     switch(c->idct_permutation_type){ /* fill idct_permutation[0..63] according to the selected type; NOTE(review): the type is assigned elsewhere, not in the nearby statements -- confirm which init path sets it */
4892     case FF_NO_IDCT_PERM:
             /* identity mapping: entry i holds i */
4893         for(i=0; i<64; i++)
4894             c->idct_permutation[i]= i;
4895         break;
4896     case FF_LIBMPEG2_IDCT_PERM:
             /* bits 3-5 are kept; the low three bits are rotated right by one
              * (bits 1-2 move down to bits 0-1, bit 0 moves up to bit 2) */
4897         for(i=0; i<64; i++)
4898             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4899         break;
4900     case FF_SIMPLE_IDCT_PERM:
             /* straight copy of the 64-entry simple_mmx_permutation table (defined elsewhere) */
4901         for(i=0; i<64; i++)
4902             c->idct_permutation[i]= simple_mmx_permutation[i];
4903         break;
4904     case FF_TRANSPOSE_IDCT_PERM:
             /* swaps the low and high three bits: index 8*r + k maps to 8*k + r */
4905         for(i=0; i<64; i++)
4906             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4907         break;
4908     case FF_PARTTRANS_IDCT_PERM:
             /* bits 2 and 5 are kept (mask 0x24); the two-bit fields at bits 0-1
              * and bits 3-4 trade places */
4909         for(i=0; i<64; i++)
4910             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4911         break;
4912     case FF_SSE2_IDCT_PERM:
             /* bits 3-5 are kept; the low three bits are replaced by a lookup in
              * idct_sse2_row_perm (defined elsewhere) */
4913         for(i=0; i<64; i++)
4914             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4915         break;
4916     default:
             /* unknown type: only an error is logged; this switch leaves
              * idct_permutation unwritten in that case */
4917         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4918     }
4919 }
4920