libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file libavcodec/dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "mathops.h"
  36 #include "h263.h"
  37 #include "snow.h"
  38
  39 /* snow.c */
  40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  41
  42 /* vorbis.c */
  43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  44
  45 /* ac3dec.c */
  46 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
  47
  48 /* flacenc.c */
  49 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  50
  51 /* pngdec.c */
  52 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  53
  54 /* eaidct.c */
  55 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
  56
  57 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
     /* ff_cropTbl: byte lookup table with MAX_NEG_CROP extra entries on each side
        of the 256 central ones. The *_pixels_clamped*_c helpers in this file read
        it through ff_cropTbl + MAX_NEG_CROP, so offsets from -MAX_NEG_CROP to
        255 + MAX_NEG_CROP are in range. It is only zero-initialized here;
        presumably it is filled by init code outside this view (TODO confirm). */
  58 uint32_t ff_squareTbl[512] = {0, };
     /* ff_squareTbl: 512-entry table read through ff_squareTbl + 256 by
        pix_norm1_c and the sse*_c helpers, i.e. indexed with values from -256 to
        255 (sample values or differences of two 8-bit samples). It is only
        zero-initialized here; presumably entry d holds d*d once initialized
        elsewhere (TODO confirm). */
  59
  60 // Byte-replicated masks: ~0UL/255 is 0x01 in every byte of an unsigned long, so these are 0x7f7f7f7f / 0x80808080 or their 64-bit forms, depending on the width of unsigned long
  61 #define pb_7f (~0UL/255 * 0x7f)
  62 #define pb_80 (~0UL/255 * 0x80)
  63
  64 const uint8_t ff_zigzag_direct[64] = {
     /* Zigzag scan order for an 8x8 block: entry i is the position (row * 8 + column)
        of the i-th coefficient in scan order. Every value 0..63 appears once. */
  65     0,   1,  8, 16,  9,  2,  3, 10,
  66     17, 24, 32, 25, 18, 11,  4,  5,
  67     12, 19, 26, 33, 40, 48, 41, 34,
  68     27, 20, 13,  6,  7, 14, 21, 28,
  69     35, 42, 49, 56, 57, 50, 43, 36,
  70     29, 22, 15, 23, 30, 37, 44, 51,
  71     58, 59, 52, 45, 38, 31, 39, 46,
  72     53, 60, 61, 54, 47, 55, 62, 63
  73 };
  74
  75 /* Specific zigzag scan for the 248 IDCT. NOTE: unlike the
  76    specification, the two fields are interleaved here. The table has 64
        entries, one block position (0..63) per scan index. */
  77 const uint8_t ff_zigzag248_direct[64] = {
  78      0,  8,  1,  9, 16, 24,  2, 10,
  79     17, 25, 32, 40, 48, 56, 33, 41,
  80     18, 26,  3, 11,  4, 12, 19, 27,
  81     34, 42, 49, 57, 50, 58, 35, 43,
  82     20, 28,  5, 13,  6, 14, 21, 29,
  83     36, 44, 51, 59, 52, 60, 37, 45,
  84     22, 30,  7, 15, 23, 31, 38, 46,
  85     53, 61, 54, 62, 39, 47, 55, 63,
  86 };
  87
  88 /* Inverse of ff_zigzag_direct, not permuted, with 1 added to every entry; intended for the MMX quantizer. Only zero-initialized here, presumably filled by init code outside this view (TODO confirm). */
  89 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  90
  91 const uint8_t ff_alternate_horizontal_scan[64] = {
     /* Alternate scan order for an 8x8 block: entry i is the block position (0..63)
        of the i-th coefficient. Compared with ff_zigzag_direct it advances along
        rows first (0, 1, 2, 3, 8, 9, ...). Same layout as the other scan tables, so
        presumably usable as src_scantable for ff_init_scantable (TODO confirm). */
  92     0,  1,   2,  3,  8,  9, 16, 17,
  93     10, 11,  4,  5,  6,  7, 15, 14,
  94     13, 12, 19, 18, 24, 25, 32, 33,
  95     26, 27, 20, 21, 22, 23, 28, 29,
  96     30, 31, 34, 35, 40, 41, 48, 49,
  97     42, 43, 36, 37, 38, 39, 44, 45,
  98     46, 47, 50, 51, 56, 57, 58, 59,
  99     52, 53, 54, 55, 60, 61, 62, 63,
 100 };
 101
 102 const uint8_t ff_alternate_vertical_scan[64] = {
     /* Alternate scan order for an 8x8 block: entry i is the block position (0..63)
        of the i-th coefficient. Compared with ff_zigzag_direct it advances down
        columns first (0, 8, 16, 24, 1, 9, ...). Same layout as the other scan
        tables, so presumably usable as src_scantable for ff_init_scantable
        (TODO confirm). */
 103     0,  8,  16, 24,  1,  9,  2, 10,
 104     17, 25, 32, 40, 48, 56, 57, 49,
 105     41, 33, 26, 18,  3, 11,  4, 12,
 106     19, 27, 34, 42, 50, 58, 35, 43,
 107     51, 59, 20, 28,  5, 13,  6, 14,
 108     21, 29, 36, 44, 52, 60, 37, 45,
 109     53, 61, 22, 30,  7, 15, 23, 31,
 110     38, 46, 54, 62, 39, 47, 55, 63,
 111 };
 112
 113 /* Reciprocal table for division by multiplication: a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255. The product needs more than 32 bits, so it has to be formed in a 64-bit type. Entries 0 and 1 (0 and 2^32-1) are outside the documented range. */
 114 const uint32_t ff_inverse[256]={
 115          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 116  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 117  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 118  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 119  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 120  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 121   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 122   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 123   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 124   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 125   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 126   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 127   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 128   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 129   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 130   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 131   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 132   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 133   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 134   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 135   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 136   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 137   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 138   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 139   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 140   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 141   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 142   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 143   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 144   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 145   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 146   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 147 };
 148
 149 /* Input permutation for the simple_idct_mmx: 64 entries, each value 0x00..0x3F appearing once. Not referenced in the visible part of this file; presumably consumed by the IDCT permutation setup further down (TODO confirm). */
 150 static const uint8_t simple_mmx_permutation[64]={
 151         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 152         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 153         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 154         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 155         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 156         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 157         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 158         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 159 };
 160
 161 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
     /* ^ 8-entry permutation of 0..7 that interleaves the low half (0..3) with the
        high half (4..7). Not referenced in the visible part of this file;
        presumably the within-row reordering expected by the SSE2 IDCT
        (TODO confirm). */
 162
 163 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 164     int i;
 165     int end;
 166
 167     st->scantable= src_scantable;
 168
 169     for(i=0; i<64; i++){
 170         int j;
 171         j = src_scantable[i];
 172         st->permutated[i] = permutation[j];
 173 #if ARCH_PPC
 174         st->inverse[j] = i;
 175 #endif
 176     }
 177
 178     end=-1;
 179     for(i=0; i<64; i++){
 180         int j;
 181         j = st->permutated[i];
 182         if(j>end) end=j;
 183         st->raster_end[i]= end;
 184     }
 185 }
 186
 187 static int pix_sum_c(uint8_t * pix, int line_size)
 188 {
 189     int s, i, j;
 190
 191     s = 0;
 192     for (i = 0; i < 16; i++) {
 193         for (j = 0; j < 16; j += 8) {
 194             s += pix[0];
 195             s += pix[1];
 196             s += pix[2];
 197             s += pix[3];
 198             s += pix[4];
 199             s += pix[5];
 200             s += pix[6];
 201             s += pix[7];
 202             pix += 8;
 203         }
 204         pix += line_size - 16;
 205     }
 206     return s;
 207 }
 208
 209 static int pix_norm1_c(uint8_t * pix, int line_size)
 210 {
 211     int s, i, j;
 212     uint32_t *sq = ff_squareTbl + 256;
 213
 214     s = 0;
 215     for (i = 0; i < 16; i++) {
 216         for (j = 0; j < 16; j += 8) {
 217 #if 0
 218             s += sq[pix[0]];
 219             s += sq[pix[1]];
 220             s += sq[pix[2]];
 221             s += sq[pix[3]];
 222             s += sq[pix[4]];
 223             s += sq[pix[5]];
 224             s += sq[pix[6]];
 225             s += sq[pix[7]];
 226 #else
 227 #if LONG_MAX > 2147483647
 228             register uint64_t x=*(uint64_t*)pix;
 229             s += sq[x&0xff];
 230             s += sq[(x>>8)&0xff];
 231             s += sq[(x>>16)&0xff];
 232             s += sq[(x>>24)&0xff];
 233             s += sq[(x>>32)&0xff];
 234             s += sq[(x>>40)&0xff];
 235             s += sq[(x>>48)&0xff];
 236             s += sq[(x>>56)&0xff];
 237 #else
 238             register uint32_t x=*(uint32_t*)pix;
 239             s += sq[x&0xff];
 240             s += sq[(x>>8)&0xff];
 241             s += sq[(x>>16)&0xff];
 242             s += sq[(x>>24)&0xff];
 243             x=*(uint32_t*)(pix+4);
 244             s += sq[x&0xff];
 245             s += sq[(x>>8)&0xff];
 246             s += sq[(x>>16)&0xff];
 247             s += sq[(x>>24)&0xff];
 248 #endif
 249 #endif
 250             pix += 8;
 251         }
 252         pix += line_size - 16;
 253     }
 254     return s;
 255 }
 256
 257 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 258     int i;
 259
 260     for(i=0; i+8<=w; i+=8){
 261         dst[i+0]= bswap_32(src[i+0]);
 262         dst[i+1]= bswap_32(src[i+1]);
 263         dst[i+2]= bswap_32(src[i+2]);
 264         dst[i+3]= bswap_32(src[i+3]);
 265         dst[i+4]= bswap_32(src[i+4]);
 266         dst[i+5]= bswap_32(src[i+5]);
 267         dst[i+6]= bswap_32(src[i+6]);
 268         dst[i+7]= bswap_32(src[i+7]);
 269     }
 270     for(;i<w; i++){
 271         dst[i+0]= bswap_32(src[i+0]);
 272     }
 273 }
 274
 275 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 276 {
 277     int s, i;
 278     uint32_t *sq = ff_squareTbl + 256;
 279
 280     s = 0;
 281     for (i = 0; i < h; i++) {
 282         s += sq[pix1[0] - pix2[0]];
 283         s += sq[pix1[1] - pix2[1]];
 284         s += sq[pix1[2] - pix2[2]];
 285         s += sq[pix1[3] - pix2[3]];
 286         pix1 += line_size;
 287         pix2 += line_size;
 288     }
 289     return s;
 290 }
 291
 292 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 293 {
 294     int s, i;
 295     uint32_t *sq = ff_squareTbl + 256;
 296
 297     s = 0;
 298     for (i = 0; i < h; i++) {
 299         s += sq[pix1[0] - pix2[0]];
 300         s += sq[pix1[1] - pix2[1]];
 301         s += sq[pix1[2] - pix2[2]];
 302         s += sq[pix1[3] - pix2[3]];
 303         s += sq[pix1[4] - pix2[4]];
 304         s += sq[pix1[5] - pix2[5]];
 305         s += sq[pix1[6] - pix2[6]];
 306         s += sq[pix1[7] - pix2[7]];
 307         pix1 += line_size;
 308         pix2 += line_size;
 309     }
 310     return s;
 311 }
 312
 313 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 314 {
 315     int s, i;
 316     uint32_t *sq = ff_squareTbl + 256;
 317
 318     s = 0;
 319     for (i = 0; i < h; i++) {
 320         s += sq[pix1[ 0] - pix2[ 0]];
 321         s += sq[pix1[ 1] - pix2[ 1]];
 322         s += sq[pix1[ 2] - pix2[ 2]];
 323         s += sq[pix1[ 3] - pix2[ 3]];
 324         s += sq[pix1[ 4] - pix2[ 4]];
 325         s += sq[pix1[ 5] - pix2[ 5]];
 326         s += sq[pix1[ 6] - pix2[ 6]];
 327         s += sq[pix1[ 7] - pix2[ 7]];
 328         s += sq[pix1[ 8] - pix2[ 8]];
 329         s += sq[pix1[ 9] - pix2[ 9]];
 330         s += sq[pix1[10] - pix2[10]];
 331         s += sq[pix1[11] - pix2[11]];
 332         s += sq[pix1[12] - pix2[12]];
 333         s += sq[pix1[13] - pix2[13]];
 334         s += sq[pix1[14] - pix2[14]];
 335         s += sq[pix1[15] - pix2[15]];
 336
 337         pix1 += line_size;
 338         pix2 += line_size;
 339     }
 340     return s;
 341 }
 342
 343
 344 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
 345 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 346     int s, i, j;
 347     const int dec_count= w==8 ? 3 : 4;
 348     int tmp[32*32];
 349     int level, ori;
 350     static const int scale[2][2][4][4]={
 351       {
 352         {
 353             // 9/7 8x8 dec=3
 354             {268, 239, 239, 213},
 355             {  0, 224, 224, 152},
 356             {  0, 135, 135, 110},
 357         },{
 358             // 9/7 16x16 or 32x32 dec=4
 359             {344, 310, 310, 280},
 360             {  0, 320, 320, 228},
 361             {  0, 175, 175, 136},
 362             {  0, 129, 129, 102},
 363         }
 364       },{
 365         {
 366             // 5/3 8x8 dec=3
 367             {275, 245, 245, 218},
 368             {  0, 230, 230, 156},
 369             {  0, 138, 138, 113},
 370         },{
 371             // 5/3 16x16 or 32x32 dec=4
 372             {352, 317, 317, 286},
 373             {  0, 328, 328, 233},
 374             {  0, 180, 180, 140},
 375             {  0, 132, 132, 105},
 376         }
 377       }
 378     };
 379
 380     for (i = 0; i < h; i++) {
 381         for (j = 0; j < w; j+=4) {
 382             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 383             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 384             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 385             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 386         }
 387         pix1 += line_size;
 388         pix2 += line_size;
 389     }
 390
 391     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 392
 393     s=0;
 394     assert(w==h);
 395     for(level=0; level<dec_count; level++){
 396         for(ori= level ? 1 : 0; ori<4; ori++){
 397             int size= w>>(dec_count-level);
 398             int sx= (ori&1) ? size : 0;
 399             int stride= 32<<(dec_count-level);
 400             int sy= (ori&2) ? stride>>1 : 0;
 401
 402             for(i=0; i<size; i++){
 403                 for(j=0; j<size; j++){
 404                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 405                     s += FFABS(v);
 406                 }
 407             }
 408         }
 409     }
 410     assert(s>=0);
 411     return s>>9;
 412 }
 413
 414 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 415     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 416 }
 417
 418 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 419     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 420 }
 421
 422 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 423     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 424 }
 425
 426 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 427     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 428 }
 429
 430 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 431     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 432 }
 433
 434 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 435     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 436 }
 437 #endif
 438
 439 /* draw the edges of width 'w' of an image of size width, height */
 440 //FIXME check that this is ok for mpeg4 interlaced
 441 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 442 {
 443     uint8_t *ptr, *last_line;
 444     int i;
 445
 446     last_line = buf + (height - 1) * wrap;
 447     for(i=0;i<w;i++) {
 448         /* top and bottom */
 449         memcpy(buf - (i + 1) * wrap, buf, width);
 450         memcpy(last_line + (i + 1) * wrap, last_line, width);
 451     }
 452     /* left and right */
 453     ptr = buf;
 454     for(i=0;i<height;i++) {
 455         memset(ptr - w, ptr[0], w);
 456         memset(ptr + width, ptr[width-1], w);
 457         ptr += wrap;
 458     }
 459     /* corners */
 460     for(i=0;i<w;i++) {
 461         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 462         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 463         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 464         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 465     }
 466 }
 467
 468 /**
 469  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 470  * @param buf destination buffer
 471  * @param src source buffer
 472  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 473  * @param block_w width of block
 474  * @param block_h height of block
 475  * @param src_x x coordinate of the top left sample of the block in the source buffer
 476  * @param src_y y coordinate of the top left sample of the block in the source buffer
 477  * @param w width of the source buffer
 478  * @param h height of the source buffer
 479  */
 480 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 481                                     int src_x, int src_y, int w, int h){
 482     int x, y;
 483     int start_y, start_x, end_y, end_x;
 484
 485     if(src_y>= h){
 486         src+= (h-1-src_y)*linesize;
 487         src_y=h-1;
 488     }else if(src_y<=-block_h){
 489         src+= (1-block_h-src_y)*linesize;
 490         src_y=1-block_h;
 491     }
 492     if(src_x>= w){
 493         src+= (w-1-src_x);
 494         src_x=w-1;
 495     }else if(src_x<=-block_w){
 496         src+= (1-block_w-src_x);
 497         src_x=1-block_w;
 498     }
 499
 500     start_y= FFMAX(0, -src_y);
 501     start_x= FFMAX(0, -src_x);
 502     end_y= FFMIN(block_h, h-src_y);
 503     end_x= FFMIN(block_w, w-src_x);
 504
 505     // copy existing part
 506     for(y=start_y; y<end_y; y++){
 507         for(x=start_x; x<end_x; x++){
 508             buf[x + y*linesize]= src[x + y*linesize];
 509         }
 510     }
 511
 512     //top
 513     for(y=0; y<start_y; y++){
 514         for(x=start_x; x<end_x; x++){
 515             buf[x + y*linesize]= buf[x + start_y*linesize];
 516         }
 517     }
 518
 519     //bottom
 520     for(y=end_y; y<block_h; y++){
 521         for(x=start_x; x<end_x; x++){
 522             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 523         }
 524     }
 525
 526     for(y=0; y<block_h; y++){
 527        //left
 528         for(x=0; x<start_x; x++){
 529             buf[x + y*linesize]= buf[start_x + y*linesize];
 530         }
 531
 532        //right
 533         for(x=end_x; x<block_w; x++){
 534             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 535         }
 536     }
 537 }
 538
 539 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 540 {
 541     int i;
 542
 543     /* read the pixels */
 544     for(i=0;i<8;i++) {
 545         block[0] = pixels[0];
 546         block[1] = pixels[1];
 547         block[2] = pixels[2];
 548         block[3] = pixels[3];
 549         block[4] = pixels[4];
 550         block[5] = pixels[5];
 551         block[6] = pixels[6];
 552         block[7] = pixels[7];
 553         pixels += line_size;
 554         block += 8;
 555     }
 556 }
 557
 558 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 559                           const uint8_t *s2, int stride){
 560     int i;
 561
 562     /* read the pixels */
 563     for(i=0;i<8;i++) {
 564         block[0] = s1[0] - s2[0];
 565         block[1] = s1[1] - s2[1];
 566         block[2] = s1[2] - s2[2];
 567         block[3] = s1[3] - s2[3];
 568         block[4] = s1[4] - s2[4];
 569         block[5] = s1[5] - s2[5];
 570         block[6] = s1[6] - s2[6];
 571         block[7] = s1[7] - s2[7];
 572         s1 += stride;
 573         s2 += stride;
 574         block += 8;
 575     }
 576 }
 577
 578
 579 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 580                                  int line_size)
 581 {
 582     int i;
 583     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 584
 585     /* read the pixels */
 586     for(i=0;i<8;i++) {
 587         pixels[0] = cm[block[0]];
 588         pixels[1] = cm[block[1]];
 589         pixels[2] = cm[block[2]];
 590         pixels[3] = cm[block[3]];
 591         pixels[4] = cm[block[4]];
 592         pixels[5] = cm[block[5]];
 593         pixels[6] = cm[block[6]];
 594         pixels[7] = cm[block[7]];
 595
 596         pixels += line_size;
 597         block += 8;
 598     }
 599 }
 600
 601 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 602                                  int line_size)
 603 {
 604     int i;
 605     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 606
 607     /* read the pixels */
 608     for(i=0;i<4;i++) {
 609         pixels[0] = cm[block[0]];
 610         pixels[1] = cm[block[1]];
 611         pixels[2] = cm[block[2]];
 612         pixels[3] = cm[block[3]];
 613
 614         pixels += line_size;
 615         block += 8;
 616     }
 617 }
 618
 619 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 620                                  int line_size)
 621 {
 622     int i;
 623     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 624
 625     /* read the pixels */
 626     for(i=0;i<2;i++) {
 627         pixels[0] = cm[block[0]];
 628         pixels[1] = cm[block[1]];
 629
 630         pixels += line_size;
 631         block += 8;
 632     }
 633 }
 634
 635 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 636                                         uint8_t *restrict pixels,
 637                                         int line_size)
 638 {
 639     int i, j;
 640
 641     for (i = 0; i < 8; i++) {
 642         for (j = 0; j < 8; j++) {
 643             if (*block < -128)
 644                 *pixels = 0;
 645             else if (*block > 127)
 646                 *pixels = 255;
 647             else
 648                 *pixels = (uint8_t)(*block + 128);
 649             block++;
 650             pixels++;
 651         }
 652         pixels += (line_size - 8);
 653     }
 654 }
 655
 656 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 657                           int line_size)
 658 {
 659     int i;
 660     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 661
 662     /* read the pixels */
 663     for(i=0;i<8;i++) {
 664         pixels[0] = cm[pixels[0] + block[0]];
 665         pixels[1] = cm[pixels[1] + block[1]];
 666         pixels[2] = cm[pixels[2] + block[2]];
 667         pixels[3] = cm[pixels[3] + block[3]];
 668         pixels[4] = cm[pixels[4] + block[4]];
 669         pixels[5] = cm[pixels[5] + block[5]];
 670         pixels[6] = cm[pixels[6] + block[6]];
 671         pixels[7] = cm[pixels[7] + block[7]];
 672         pixels += line_size;
 673         block += 8;
 674     }
 675 }
 676
 677 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 678                           int line_size)
 679 {
 680     int i;
 681     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 682
 683     /* read the pixels */
 684     for(i=0;i<4;i++) {
 685         pixels[0] = cm[pixels[0] + block[0]];
 686         pixels[1] = cm[pixels[1] + block[1]];
 687         pixels[2] = cm[pixels[2] + block[2]];
 688         pixels[3] = cm[pixels[3] + block[3]];
 689         pixels += line_size;
 690         block += 8;
 691     }
 692 }
 693
 694 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 695                           int line_size)
 696 {
 697     int i;
 698     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 699
 700     /* read the pixels */
 701     for(i=0;i<2;i++) {
 702         pixels[0] = cm[pixels[0] + block[0]];
 703         pixels[1] = cm[pixels[1] + block[1]];
 704         pixels += line_size;
 705         block += 8;
 706     }
 707 }
 708
 709 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 710 {
 711     int i;
 712     for(i=0;i<8;i++) {
 713         pixels[0] += block[0];
 714         pixels[1] += block[1];
 715         pixels[2] += block[2];
 716         pixels[3] += block[3];
 717         pixels[4] += block[4];
 718         pixels[5] += block[5];
 719         pixels[6] += block[6];
 720         pixels[7] += block[7];
 721         pixels += line_size;
 722         block += 8;
 723     }
 724 }
 725
 726 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 727 {
 728     int i;
 729     for(i=0;i<4;i++) {
 730         pixels[0] += block[0];
 731         pixels[1] += block[1];
 732         pixels[2] += block[2];
 733         pixels[3] += block[3];
 734         pixels += line_size;
 735         block += 4;
 736     }
 737 }
 738
 739 static int sum_abs_dctelem_c(DCTELEM *block)
 740 {
 741     int sum=0, i;
 742     for(i=0; i<64; i++)
 743         sum+= FFABS(block[i]);
 744     return sum;
 745 }
 746
 747 #if 0
 748
 749 #define PIXOP2(OPNAME, OP) \
 750 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 751 {\
 752     int i;\
 753     for(i=0; i<h; i++){\
 754         OP(*((uint64_t*)block), AV_RN64(pixels));\
 755         pixels+=line_size;\
 756         block +=line_size;\
 757     }\
 758 }\
 759 \
 760 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 761 {\
 762     int i;\
 763     for(i=0; i<h; i++){\
 764         const uint64_t a= AV_RN64(pixels  );\
 765         const uint64_t b= AV_RN64(pixels+1);\
 766         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 767         pixels+=line_size;\
 768         block +=line_size;\
 769     }\
 770 }\
 771 \
 772 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 773 {\
 774     int i;\
 775     for(i=0; i<h; i++){\
 776         const uint64_t a= AV_RN64(pixels  );\
 777         const uint64_t b= AV_RN64(pixels+1);\
 778         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 779         pixels+=line_size;\
 780         block +=line_size;\
 781     }\
 782 }\
 783 \
 784 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 785 {\
 786     int i;\
 787     for(i=0; i<h; i++){\
 788         const uint64_t a= AV_RN64(pixels          );\
 789         const uint64_t b= AV_RN64(pixels+line_size);\
 790         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 791         pixels+=line_size;\
 792         block +=line_size;\
 793     }\
 794 }\
 795 \
 796 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 797 {\
 798     int i;\
 799     for(i=0; i<h; i++){\
 800         const uint64_t a= AV_RN64(pixels          );\
 801         const uint64_t b= AV_RN64(pixels+line_size);\
 802         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 803         pixels+=line_size;\
 804         block +=line_size;\
 805     }\
 806 }\
 807 \
 808 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 809 {\
 810         int i;\
 811         const uint64_t a= AV_RN64(pixels  );\
 812         const uint64_t b= AV_RN64(pixels+1);\
 813         uint64_t l0=  (a&0x0303030303030303ULL)\
 814                     + (b&0x0303030303030303ULL)\
 815                     + 0x0202020202020202ULL;\
 816         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 817                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 818         uint64_t l1,h1;\
 819 \
 820         pixels+=line_size;\
 821         for(i=0; i<h; i+=2){\
 822             uint64_t a= AV_RN64(pixels  );\
 823             uint64_t b= AV_RN64(pixels+1);\
 824             l1=  (a&0x0303030303030303ULL)\
 825                + (b&0x0303030303030303ULL);\
 826             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 827               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 828             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 829             pixels+=line_size;\
 830             block +=line_size;\
 831             a= AV_RN64(pixels  );\
 832             b= AV_RN64(pixels+1);\
 833             l0=  (a&0x0303030303030303ULL)\
 834                + (b&0x0303030303030303ULL)\
 835                + 0x0202020202020202ULL;\
 836             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 837               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 838             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 839             pixels+=line_size;\
 840             block +=line_size;\
 841         }\
 842 }\
 843 \
 844 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 845 {\
 846         int i;\
 847         const uint64_t a= AV_RN64(pixels  );\
 848         const uint64_t b= AV_RN64(pixels+1);\
 849         uint64_t l0=  (a&0x0303030303030303ULL)\
 850                     + (b&0x0303030303030303ULL)\
 851                     + 0x0101010101010101ULL;\
 852         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 853                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 854         uint64_t l1,h1;\
 855 \
 856         pixels+=line_size;\
 857         for(i=0; i<h; i+=2){\
 858             uint64_t a= AV_RN64(pixels  );\
 859             uint64_t b= AV_RN64(pixels+1);\
 860             l1=  (a&0x0303030303030303ULL)\
 861                + (b&0x0303030303030303ULL);\
 862             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 863               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 864             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 865             pixels+=line_size;\
 866             block +=line_size;\
 867             a= AV_RN64(pixels  );\
 868             b= AV_RN64(pixels+1);\
 869             l0=  (a&0x0303030303030303ULL)\
 870                + (b&0x0303030303030303ULL)\
 871                + 0x0101010101010101ULL;\
 872             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 873               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 874             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 875             pixels+=line_size;\
 876             block +=line_size;\
 877         }\
 878 }\
 879 \
/* 16-pixel-wide entry points derived from the 8-pixel-wide functions above.  */\
/* CALL_2X_PIXELS is defined outside this chunk; presumably it applies the    */\
/* second-named function at offsets 0 and 8 (the last argument) -- confirm.   */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 887
/* OP for the "avg" family in the 64 bit variant: stores into a the per-byte
 * rounded-up average of a and b, using (a|b) - ((a^b)>>1) == (a+b+1)>>1.
 * The 0xFE mask drops the bit that the shift would otherwise move into the
 * neighbouring byte. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // end of the 64 bit variant; the 32 bit variant follows
 890
 891 #define PIXOP2(OPNAME, OP) \
 892 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 893     int i;\
 894     for(i=0; i<h; i++){\
 895         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 896         pixels+=line_size;\
 897         block +=line_size;\
 898     }\
 899 }\
 900 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 901     int i;\
 902     for(i=0; i<h; i++){\
 903         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 904         pixels+=line_size;\
 905         block +=line_size;\
 906     }\
 907 }\
 908 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 909     int i;\
 910     for(i=0; i<h; i++){\
 911         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 912         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 913         pixels+=line_size;\
 914         block +=line_size;\
 915     }\
 916 }\
 917 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 918     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 919 }\
 920 \
 921 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 922                                                 int src_stride1, int src_stride2, int h){\
 923     int i;\
 924     for(i=0; i<h; i++){\
 925         uint32_t a,b;\
 926         a= AV_RN32(&src1[i*src_stride1  ]);\
 927         b= AV_RN32(&src2[i*src_stride2  ]);\
 928         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 929         a= AV_RN32(&src1[i*src_stride1+4]);\
 930         b= AV_RN32(&src2[i*src_stride2+4]);\
 931         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 932     }\
 933 }\
 934 \
 935 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 936                                                 int src_stride1, int src_stride2, int h){\
 937     int i;\
 938     for(i=0; i<h; i++){\
 939         uint32_t a,b;\
 940         a= AV_RN32(&src1[i*src_stride1  ]);\
 941         b= AV_RN32(&src2[i*src_stride2  ]);\
 942         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 943         a= AV_RN32(&src1[i*src_stride1+4]);\
 944         b= AV_RN32(&src2[i*src_stride2+4]);\
 945         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 946     }\
 947 }\
 948 \
 949 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 950                                                 int src_stride1, int src_stride2, int h){\
 951     int i;\
 952     for(i=0; i<h; i++){\
 953         uint32_t a,b;\
 954         a= AV_RN32(&src1[i*src_stride1  ]);\
 955         b= AV_RN32(&src2[i*src_stride2  ]);\
 956         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 957     }\
 958 }\
 959 \
 960 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 961                                                 int src_stride1, int src_stride2, int h){\
 962     int i;\
 963     for(i=0; i<h; i++){\
 964         uint32_t a,b;\
 965         a= AV_RN16(&src1[i*src_stride1  ]);\
 966         b= AV_RN16(&src2[i*src_stride2  ]);\
 967         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 968     }\
 969 }\
 970 \
 971 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 972                                                 int src_stride1, int src_stride2, int h){\
 973     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 974     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 975 }\
 976 \
 977 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 978                                                 int src_stride1, int src_stride2, int h){\
 979     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 980     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 981 }\
 982 \
 983 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 984     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 985 }\
 986 \
 987 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 988     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 989 }\
 990 \
 991 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 992     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 993 }\
 994 \
 995 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 996     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 997 }\
 998 \
 999 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1000                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1001     int i;\
1002     for(i=0; i<h; i++){\
1003         uint32_t a, b, c, d, l0, l1, h0, h1;\
1004         a= AV_RN32(&src1[i*src_stride1]);\
1005         b= AV_RN32(&src2[i*src_stride2]);\
1006         c= AV_RN32(&src3[i*src_stride3]);\
1007         d= AV_RN32(&src4[i*src_stride4]);\
1008         l0=  (a&0x03030303UL)\
1009            + (b&0x03030303UL)\
1010            + 0x02020202UL;\
1011         h0= ((a&0xFCFCFCFCUL)>>2)\
1012           + ((b&0xFCFCFCFCUL)>>2);\
1013         l1=  (c&0x03030303UL)\
1014            + (d&0x03030303UL);\
1015         h1= ((c&0xFCFCFCFCUL)>>2)\
1016           + ((d&0xFCFCFCFCUL)>>2);\
1017         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1018         a= AV_RN32(&src1[i*src_stride1+4]);\
1019         b= AV_RN32(&src2[i*src_stride2+4]);\
1020         c= AV_RN32(&src3[i*src_stride3+4]);\
1021         d= AV_RN32(&src4[i*src_stride4+4]);\
1022         l0=  (a&0x03030303UL)\
1023            + (b&0x03030303UL)\
1024            + 0x02020202UL;\
1025         h0= ((a&0xFCFCFCFCUL)>>2)\
1026           + ((b&0xFCFCFCFCUL)>>2);\
1027         l1=  (c&0x03030303UL)\
1028            + (d&0x03030303UL);\
1029         h1= ((c&0xFCFCFCFCUL)>>2)\
1030           + ((d&0xFCFCFCFCUL)>>2);\
1031         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032     }\
1033 }\
1034 \
1035 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1036     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1037 }\
1038 \
1039 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1040     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1041 }\
1042 \
1043 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1044     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1045 }\
1046 \
1047 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1048     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1049 }\
1050 \
1051 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1052                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1053     int i;\
1054     for(i=0; i<h; i++){\
1055         uint32_t a, b, c, d, l0, l1, h0, h1;\
1056         a= AV_RN32(&src1[i*src_stride1]);\
1057         b= AV_RN32(&src2[i*src_stride2]);\
1058         c= AV_RN32(&src3[i*src_stride3]);\
1059         d= AV_RN32(&src4[i*src_stride4]);\
1060         l0=  (a&0x03030303UL)\
1061            + (b&0x03030303UL)\
1062            + 0x01010101UL;\
1063         h0= ((a&0xFCFCFCFCUL)>>2)\
1064           + ((b&0xFCFCFCFCUL)>>2);\
1065         l1=  (c&0x03030303UL)\
1066            + (d&0x03030303UL);\
1067         h1= ((c&0xFCFCFCFCUL)>>2)\
1068           + ((d&0xFCFCFCFCUL)>>2);\
1069         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1070         a= AV_RN32(&src1[i*src_stride1+4]);\
1071         b= AV_RN32(&src2[i*src_stride2+4]);\
1072         c= AV_RN32(&src3[i*src_stride3+4]);\
1073         d= AV_RN32(&src4[i*src_stride4+4]);\
1074         l0=  (a&0x03030303UL)\
1075            + (b&0x03030303UL)\
1076            + 0x01010101UL;\
1077         h0= ((a&0xFCFCFCFCUL)>>2)\
1078           + ((b&0xFCFCFCFCUL)>>2);\
1079         l1=  (c&0x03030303UL)\
1080            + (d&0x03030303UL);\
1081         h1= ((c&0xFCFCFCFCUL)>>2)\
1082           + ((d&0xFCFCFCFCUL)>>2);\
1083         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1084     }\
1085 }\
1086 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1087                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1088     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1089     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1090 }\
1091 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1092                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1093     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1094     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1095 }\
1096 \
1097 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1098 {\
1099         int i, a0, b0, a1, b1;\
1100         a0= pixels[0];\
1101         b0= pixels[1] + 2;\
1102         a0 += b0;\
1103         b0 += pixels[2];\
1104 \
1105         pixels+=line_size;\
1106         for(i=0; i<h; i+=2){\
1107             a1= pixels[0];\
1108             b1= pixels[1];\
1109             a1 += b1;\
1110             b1 += pixels[2];\
1111 \
1112             block[0]= (a1+a0)>>2; /* FIXME non put */\
1113             block[1]= (b1+b0)>>2;\
1114 \
1115             pixels+=line_size;\
1116             block +=line_size;\
1117 \
1118             a0= pixels[0];\
1119             b0= pixels[1] + 2;\
1120             a0 += b0;\
1121             b0 += pixels[2];\
1122 \
1123             block[0]= (a1+a0)>>2;\
1124             block[1]= (b1+b0)>>2;\
1125             pixels+=line_size;\
1126             block +=line_size;\
1127         }\
1128 }\
1129 \
1130 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1131 {\
1132         int i;\
1133         const uint32_t a= AV_RN32(pixels  );\
1134         const uint32_t b= AV_RN32(pixels+1);\
1135         uint32_t l0=  (a&0x03030303UL)\
1136                     + (b&0x03030303UL)\
1137                     + 0x02020202UL;\
1138         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1139                    + ((b&0xFCFCFCFCUL)>>2);\
1140         uint32_t l1,h1;\
1141 \
1142         pixels+=line_size;\
1143         for(i=0; i<h; i+=2){\
1144             uint32_t a= AV_RN32(pixels  );\
1145             uint32_t b= AV_RN32(pixels+1);\
1146             l1=  (a&0x03030303UL)\
1147                + (b&0x03030303UL);\
1148             h1= ((a&0xFCFCFCFCUL)>>2)\
1149               + ((b&0xFCFCFCFCUL)>>2);\
1150             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1151             pixels+=line_size;\
1152             block +=line_size;\
1153             a= AV_RN32(pixels  );\
1154             b= AV_RN32(pixels+1);\
1155             l0=  (a&0x03030303UL)\
1156                + (b&0x03030303UL)\
1157                + 0x02020202UL;\
1158             h0= ((a&0xFCFCFCFCUL)>>2)\
1159               + ((b&0xFCFCFCFCUL)>>2);\
1160             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1161             pixels+=line_size;\
1162             block +=line_size;\
1163         }\
1164 }\
1165 \
1166 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1167 {\
1168     int j;\
1169     for(j=0; j<2; j++){\
1170         int i;\
1171         const uint32_t a= AV_RN32(pixels  );\
1172         const uint32_t b= AV_RN32(pixels+1);\
1173         uint32_t l0=  (a&0x03030303UL)\
1174                     + (b&0x03030303UL)\
1175                     + 0x02020202UL;\
1176         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1177                    + ((b&0xFCFCFCFCUL)>>2);\
1178         uint32_t l1,h1;\
1179 \
1180         pixels+=line_size;\
1181         for(i=0; i<h; i+=2){\
1182             uint32_t a= AV_RN32(pixels  );\
1183             uint32_t b= AV_RN32(pixels+1);\
1184             l1=  (a&0x03030303UL)\
1185                + (b&0x03030303UL);\
1186             h1= ((a&0xFCFCFCFCUL)>>2)\
1187               + ((b&0xFCFCFCFCUL)>>2);\
1188             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1189             pixels+=line_size;\
1190             block +=line_size;\
1191             a= AV_RN32(pixels  );\
1192             b= AV_RN32(pixels+1);\
1193             l0=  (a&0x03030303UL)\
1194                + (b&0x03030303UL)\
1195                + 0x02020202UL;\
1196             h0= ((a&0xFCFCFCFCUL)>>2)\
1197               + ((b&0xFCFCFCFCUL)>>2);\
1198             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1199             pixels+=line_size;\
1200             block +=line_size;\
1201         }\
1202         pixels+=4-line_size*(h+1);\
1203         block +=4-line_size*h;\
1204     }\
1205 }\
1206 \
1207 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1208 {\
1209     int j;\
1210     for(j=0; j<2; j++){\
1211         int i;\
1212         const uint32_t a= AV_RN32(pixels  );\
1213         const uint32_t b= AV_RN32(pixels+1);\
1214         uint32_t l0=  (a&0x03030303UL)\
1215                     + (b&0x03030303UL)\
1216                     + 0x01010101UL;\
1217         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1218                    + ((b&0xFCFCFCFCUL)>>2);\
1219         uint32_t l1,h1;\
1220 \
1221         pixels+=line_size;\
1222         for(i=0; i<h; i+=2){\
1223             uint32_t a= AV_RN32(pixels  );\
1224             uint32_t b= AV_RN32(pixels+1);\
1225             l1=  (a&0x03030303UL)\
1226                + (b&0x03030303UL);\
1227             h1= ((a&0xFCFCFCFCUL)>>2)\
1228               + ((b&0xFCFCFCFCUL)>>2);\
1229             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1230             pixels+=line_size;\
1231             block +=line_size;\
1232             a= AV_RN32(pixels  );\
1233             b= AV_RN32(pixels+1);\
1234             l0=  (a&0x03030303UL)\
1235                + (b&0x03030303UL)\
1236                + 0x01010101UL;\
1237             h0= ((a&0xFCFCFCFCUL)>>2)\
1238               + ((b&0xFCFCFCFCUL)>>2);\
1239             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1240             pixels+=line_size;\
1241             block +=line_size;\
1242         }\
1243         pixels+=4-line_size*(h+1);\
1244         block +=4-line_size*h;\
1245     }\
1246 }\
1247 \
/* 16-pixel-wide entry points derived from the 8-pixel-wide functions above.  */\
/* CALL_2X_PIXELS is defined outside this chunk; presumably it applies the    */\
/* second-named function at offsets 0 and 8 (the last argument) -- confirm.   */\
/* Note that _no_rnd_pixels16_c is built on the plain _pixels8_c.             */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1256
/* OP for the "avg" family in the 32 bit variant: merge the new value into
 * the destination with rnd_avg32(). */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* OP for the "put" family: plain store. Defined after the #endif, so it is
 * used whichever PIXOP2 variant was selected. */
#define op_put(a, b) a = b

/* Generate the avg_* and put_* pixel functions, then drop the helper macros. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
1265
/* Rounded-up mean of two values. Arguments are parenthesized so that
 * expressions using lower-precedence operators (e.g. x|y) expand correctly. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded mean of four values ((sum + 2) >> 2). Arguments are parenthesized
 * so that expressions using lower-precedence operators expand correctly. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1268
/**
 * 16-pixel-wide "no_rnd" merge of two sources into dst, with one stride
 * shared by the destination and both sources.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1272
/**
 * 8-pixel-wide "no_rnd" merge of two sources into dst, with one stride
 * shared by the destination and both sources.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
1276
/**
 * Bilinear interpolation of an 8-pixel-wide, h-row block at a fixed
 * sub-pixel offset.
 * The four weights are built from x16 and y16 (in sixteenths) and sum to
 * 256; rounder is added before the final >>8. Each output row reads 9
 * pixels from the current source row and 9 from the row below it.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl= (16-x16)*(16-y16); /* top-left     */
    const int w_tr= (   x16)*(16-y16); /* top-right    */
    const int w_bl= (16-x16)*(   y16); /* bottom-left  */
    const int w_br= (   x16)*(   y16); /* bottom-right */
    int row, col;

    for(row=0; row<h; row++){
        const uint8_t *below= src + stride;

        for(col=0; col<8; col++){
            dst[col]= (  w_tl*src  [col] + w_tr*src  [col+1]
                       + w_bl*below[col] + w_br*below[col+1]
                       + rounder)>>8;
        }
        dst+= stride;
        src+= stride;
    }
}
1299
/**
 * Generic motion compensation of an 8-pixel-wide, h-row block.
 * For every output pixel a source position is taken from (vx>>16, vy>>16);
 * its low `shift` bits are the interpolation fraction and the remaining
 * bits the integer sample position. The position starts at (ox, oy), moves
 * by (dxx, dyx) per output column and by (dxy, dyy) per output row.
 * Positions inside the picture are bilinearly interpolated; outside it the
 * offending coordinate is clamped with av_clip() and interpolation is done
 * only along the axis that is still inside. r is the rounding term added
 * before the final >>(2*shift).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s= 1<<shift;
    const int frac_mask= s-1;
    int row, col;

    /* from here on width/height are the largest valid coordinates */
    width--;
    height--;

    for(row=0; row<h; row++){
        uint8_t *out= dst + row*stride;
        int vx= ox;
        int vy= oy;

        for(col=0; col<8; col++){ //XXX FIXME optimize
            int src_x= vx>>16;
            int src_y= vy>>16;
            const int frac_x= src_x&frac_mask;
            const int frac_y= src_y&frac_mask;
            int x_inside, y_inside, index;

            src_x>>=shift;
            src_y>>=shift;
            /* the unsigned compare rejects negative coordinates as well */
            x_inside= (unsigned)src_x < width;
            y_inside= (unsigned)src_y < height;

            if(x_inside && y_inside){
                index= src_x + src_y*stride;
                out[col]= (  (  src[index         ]*(s-frac_x)
                              + src[index       +1]*   frac_x )*(s-frac_y)
                           + (  src[index+stride  ]*(s-frac_x)
                              + src[index+stride+1]*   frac_x )*   frac_y
                           + r)>>(shift*2);
            }else if(x_inside){
                /* row clamped: interpolate horizontally only */
                index= src_x + av_clip(src_y, 0, height)*stride;
                out[col]= ( (  src[index         ]*(s-frac_x)
                             + src[index       +1]*   frac_x )*s
                           + r)>>(shift*2);
            }else if(y_inside){
                /* column clamped: interpolate vertically only */
                index= av_clip(src_x, 0, width) + src_y*stride;
                out[col]= (  (  src[index         ]*(s-frac_y)
                              + src[index+stride  ]*   frac_y )*s
                           + r)>>(shift*2);
            }else{
                /* both clamped: plain copy of the nearest picture sample */
                index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                out[col]=    src[index         ];
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1357
/**
 * Thirdpel position (0,0): a plain copy, dispatched on the block width.
 * Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if     (width ==  2) put_pixels2_c (dst, src, stride, height);
    else if(width ==  4) put_pixels4_c (dst, src, stride, height);
    else if(width ==  8) put_pixels8_c (dst, src, stride, height);
    else if(width == 16) put_pixels16_c(dst, src, stride, height);
}
1366
/**
 * Thirdpel interpolation with weights 2:1 between a pixel and its right
 * neighbour. The division by 3 is done as a multiply by 683 and >>11.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        for (x = 0; x < width; x++) {
            const int weighted = 2*src[x] + src[x+1] + 1;
            dst[x] = (683*weighted) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1377
/**
 * Thirdpel interpolation with weights 1:2 between a pixel and its right
 * neighbour. The division by 3 is done as a multiply by 683 and >>11.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        for (x = 0; x < width; x++) {
            const int weighted = src[x] + 2*src[x+1] + 1;
            dst[x] = (683*weighted) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1388
/**
 * Thirdpel interpolation with weights 2:1 between a pixel and the one in
 * the next row. The division by 3 is done as a multiply by 683 and >>11.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        for (x = 0; x < width; x++) {
            const int weighted = 2*src[x] + src[x+stride] + 1;
            dst[x] = (683*weighted) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1399
/**
 * Thirdpel interpolation over a 2x2 neighbourhood with weights
 * 4 (pixel), 3 (right), 3 (below), 2 (below-right). The weights sum to 12;
 * the division by 12 is done as a multiply by 2731 and >>15.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        const uint8_t *below = src + stride;
        for (x = 0; x < width; x++) {
            const int weighted = 4*src[x] + 3*src[x+1] + 3*below[x] + 2*below[x+1] + 6;
            dst[x] = (2731*weighted) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1410
/**
 * Thirdpel interpolation over a 2x2 neighbourhood with weights
 * 3 (pixel), 2 (right), 4 (below), 3 (below-right). The weights sum to 12;
 * the division by 12 is done as a multiply by 2731 and >>15.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        const uint8_t *below = src + stride;
        for (x = 0; x < width; x++) {
            const int weighted = 3*src[x] + 2*src[x+1] + 4*below[x] + 3*below[x+1] + 6;
            dst[x] = (2731*weighted) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1421
/**
 * Thirdpel interpolation with weights 1:2 between a pixel and the one in
 * the next row. The division by 3 is done as a multiply by 683 and >>11.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        for (x = 0; x < width; x++) {
            const int weighted = src[x] + 2*src[x+stride] + 1;
            dst[x] = (683*weighted) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1432
/**
 * Thirdpel interpolation over a 2x2 neighbourhood with weights
 * 3 (pixel), 4 (right), 2 (below), 3 (below-right). The weights sum to 12;
 * the division by 12 is done as a multiply by 2731 and >>15.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        const uint8_t *below = src + stride;
        for (x = 0; x < width; x++) {
            const int weighted = 3*src[x] + 4*src[x+1] + 2*below[x] + 3*below[x+1] + 6;
            dst[x] = (2731*weighted) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1443
/**
 * Thirdpel interpolation over a 2x2 neighbourhood with weights
 * 2 (pixel), 3 (right), 3 (below), 4 (below-right). The weights sum to 12;
 * the division by 12 is done as a multiply by 2731 and >>15.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        const uint8_t *below = src + stride;
        for (x = 0; x < width; x++) {
            const int weighted = 2*src[x] + 3*src[x+1] + 3*below[x] + 4*below[x+1] + 6;
            dst[x] = (2731*weighted) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1454
/**
 * Thirdpel position (0,0), averaging flavour: dispatched on the block
 * width to the matching avg_pixelsN_c routine.
 * Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if     (width ==  2) avg_pixels2_c (dst, src, stride, height);
    else if(width ==  4) avg_pixels4_c (dst, src, stride, height);
    else if(width ==  8) avg_pixels8_c (dst, src, stride, height);
    else if(width == 16) avg_pixels16_c(dst, src, stride, height);
}
1463
/**
 * Thirdpel interpolation with weights 2:1 between a pixel and its right
 * neighbour (division by 3 as *683 >> 11), then a rounded-up average with
 * the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        for (x = 0; x < width; x++) {
            const int interp = (683*(2*src[x] + src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1474
/**
 * Thirdpel interpolation with weights 1:2 between a pixel and its right
 * neighbour (division by 3 as *683 >> 11), then a rounded-up average with
 * the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        for (x = 0; x < width; x++) {
            const int interp = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1485
/**
 * Thirdpel interpolation with weights 2:1 between a pixel and the one in
 * the next row (division by 3 as *683 >> 11), then a rounded-up average
 * with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        for (x = 0; x < width; x++) {
            const int interp = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1496
/**
 * Thirdpel interpolation over a 2x2 neighbourhood with weights
 * 4 (pixel), 3 (right), 3 (below), 2 (below-right), divided by 12 as
 * *2731 >> 15, then a rounded-up average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        const uint8_t *below = src + stride;
        for (x = 0; x < width; x++) {
            const int interp = (2731*(4*src[x] + 3*src[x+1] + 3*below[x] + 2*below[x+1] + 6)) >> 15;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1507
/**
 * Thirdpel interpolation over a 2x2 neighbourhood with weights
 * 3 (pixel), 2 (right), 4 (below), 3 (below-right), divided by 12 as
 * *2731 >> 15, then a rounded-up average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        const uint8_t *below = src + stride;
        for (x = 0; x < width; x++) {
            const int interp = (2731*(3*src[x] + 2*src[x+1] + 4*below[x] + 3*below[x+1] + 6)) >> 15;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1518
/**
 * Thirdpel interpolation with weights 1:2 between a pixel and the one in
 * the next row (division by 3 as *683 >> 11), then a rounded-up average
 * with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        for (x = 0; x < width; x++) {
            const int interp = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1529
/**
 * Thirdpel interpolation over a 2x2 neighbourhood with weights
 * 3 (pixel), 4 (right), 2 (below), 3 (below-right), divided by 12 as
 * *2731 >> 15, then a rounded-up average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        const uint8_t *below = src + stride;
        for (x = 0; x < width; x++) {
            const int interp = (2731*(3*src[x] + 4*src[x+1] + 2*below[x] + 3*below[x+1] + 6)) >> 15;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1540
/**
 * Thirdpel interpolation over a 2x2 neighbourhood with weights
 * 2 (pixel), 3 (right), 3 (below), 4 (below-right), divided by 12 as
 * *2731 >> 15, then a rounded-up average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = height; y > 0; y--) {
        const uint8_t *below = src + stride;
        for (x = 0; x < width; x++) {
            const int interp = (2731*(2*src[x] + 3*src[x+1] + 3*below[x] + 4*below[x+1] + 6)) >> 15;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
/* Disabled. TPEL_WIDTH(width) would generate fixed-width thirdpel wrappers
 * that forward to the generic put_tpel_pixels_mcXY_c() routines with the
 * width baked in. The bodies are plain calls; the stray "void" that used to
 * precede each call made them invalid C had the block been enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1572
/**
 * Helper for H264_CHROMA_MC: emits one bilinear chroma motion-compensation
 * routine named OPNAME##h264_chroma_mc##W##_c that produces W pixels per row
 * for h rows.
 *
 * x and y are the fractional offsets (asserted to lie in 0..7).  The four
 * neighbour weights always add up to 64; OP(dst_pixel, sum) receives the
 * unnormalised weighted sum and is responsible for scaling and storing it.
 *
 * When x*y == 0 at most two taps can be non-zero, so a two-tap loop is used:
 * the second tap is the pixel one row down when the bottom-left weight is
 * non-zero, otherwise the pixel to the right.
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL = (8 - x) * (8 - y);\
    const int wTR =      x  * (8 - y);\
    const int wBL = (8 - x) *      y;\
    const int wBR =      x  *      y;\
    int row, col;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if (!wBR) {\
        /* one of the offsets is zero: only two taps are needed */\
        const int wSecond = wTR + wBL;\
        const int offset  = wBL ? stride : 1;\
        for (row = 0; row < h; row++) {\
            for (col = 0; col < W; col++)\
                OP(dst[col], (wTL*src[col] + wSecond*src[offset + col]));\
            dst += stride;\
            src += stride;\
        }\
        return;\
    }\
\
    /* general case: full four-tap bilinear blend */\
    for (row = 0; row < h; row++) {\
        for (col = 0; col < W; col++)\
            OP(dst[col], (wTL*src[col] + wTR*src[col + 1] + wBL*src[stride + col] + wBR*src[stride + col + 1]));\
        dst += stride;\
        src += stride;\
    }\
}

/**
 * Emits the 2-, 4- and 8-pixel-wide chroma MC routines for one store mode.
 * OPNAME is the function-name prefix, OP the store operator.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* b is the weighted sum (weights total 64): (b + 32) >> 6 rounds to nearest.
 * op_put stores that value; op_avg averages it, rounding up, with the value
 * already in the destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1681
/**
 * Bilinear chroma motion compensation for an 8-pixel-wide block, storing
 * the result directly ("put") with a reduced rounding bias.
 *
 * @param dst    destination block, advanced by stride per row
 * @param src    top-left source pixel; each row reads 9 pixels from the
 *               current source row and 9 from the row below
 * @param stride byte distance between rows, used for both src and dst
 * @param h      number of rows to produce
 * @param x      horizontal fractional offset, asserted to be in 0..7
 * @param y      vertical fractional offset, asserted to be in 0..7
 *
 * The four weights sum to 64; the sum is biased by 32 - 4 = 28 (instead of
 * the round-to-nearest value 32) before the shift by 6.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w00 = (8 - x) * (8 - y);
    const int w01 =      x  * (8 - y);
    const int w10 = (8 - x) *      y;
    const int w11 =      x  *      y;
    const int bias = 32 - 4;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++)
            dst[col] = (w00*src[col] + w01*src[col + 1] + w10*below[col] + w11*below[col + 1] + bias) >> 6;

        dst += stride;
        src += stride;
    }
}
1705
/**
 * Bilinear chroma motion compensation for an 8-pixel-wide block, combining
 * the prediction with the existing destination pixels through avg2().
 *
 * @param dst    destination block (read and written), advanced by stride per row
 * @param src    top-left source pixel; each row reads 9 pixels from the
 *               current source row and 9 from the row below
 * @param stride byte distance between rows, used for both src and dst
 * @param h      number of rows to produce
 * @param x      horizontal fractional offset, asserted to be in 0..7
 * @param y      vertical fractional offset, asserted to be in 0..7
 *
 * The prediction is the 64-weight bilinear sum biased by 32 - 4 = 28 and
 * shifted right by 6, exactly as in the put variant.
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w00 = (8 - x) * (8 - y);
    const int w01 =      x  * (8 - y);
    const int w10 = (8 - x) *      y;
    const int w11 =      x  *      y;
    const int bias = 32 - 4;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            const int pred = (w00*src[col] + w01*src[col + 1] + w10*below[col] + w11*below[col + 1] + bias) >> 6;
            dst[col] = avg2(dst[col], pred);
        }

        dst += stride;
        src += stride;
    }
}
1729
/**
 * QPEL_MC(r, OPNAME, RND, OP): generates the C MPEG-4 quarter-pel motion
 * compensation routines for one store mode.
 *
 *   r      - not referenced anywhere in the macro body.
 *   OPNAME - prefix pasted onto every generated function name and onto the
 *            pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers that do the final
 *            store into dst.
 *   RND    - infix pasted after "put" to select the helpers that fill the
 *            intermediate scratch buffers (put RND mpeg4_qpel*_lowpass and
 *            put RND pixels*_l2).
 *   OP     - store operator, used as OP(dst_pixel, raw_filter_sum).  It is
 *            expanded inside the lowpass functions, where the local pointer
 *            cm (ff_cropTbl + MAX_NEG_CROP) is in scope.
 *
 * Generated functions:
 *   - OPNAME mpeg4_qpel{8,16}_{h,v}_lowpass: 8-tap symmetric filter with
 *     weights (-1, 3, -6, 20, 20, -6, 3, -1), which sum to 32.  For N = 8 or
 *     16 outputs only samples 0..N are read; taps that would fall outside
 *     that range are mirrored back inside it (see the first and last three
 *     output lines of each filter).
 *   - OPNAME qpel{8,16}_mcXY_c, static: X selects the horizontal handling
 *     and Y the vertical handling: 0 = unfiltered, 2 = lowpass only,
 *     1 / 3 = lowpass averaged with the source at that position / at the
 *     next position (src+1 horizontally, the next row vertically).
 *   - ff_ OPNAME qpel{8,16}_mc{11,31,13,33,12,32}_old_c, non-static: earlier
 *     formulations of the same positions that also build a vertical
 *     half-pel plane (halfV) and, for the corner positions, combine four
 *     planes with pixelsN_l4.
 *
 * Scratch buffers: full is a copy of the source made by copy_block9 /
 * copy_block17 with row stride 16 / 24 (presumably 9x9 / 17x17 pixels; the
 * copy helpers are defined elsewhere - confirm).  halfH holds N+1 rows of N
 * horizontally filtered pixels (72 = 8*9, 272 = 16*17); halfV and halfHV
 * hold NxN pixels.
 */
1730 #define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal lowpass, 8 outputs per row for h rows; reads src[0..8] of each row. */\
1731 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1732     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1733     int i;\
1734     for(i=0; i<h; i++)\
1735     {\
1736         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1737         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1738         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1739         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1740         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1741         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1742         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1743         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1744         dst+=dstStride;\
1745         src+=srcStride;\
1746     }\
1747 }\
1748 \
/* Vertical lowpass over a fixed 8 columns; reads 9 source rows, writes 8 rows. */\
1749 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1750     const int w=8;\
1751     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1752     int i;\
1753     for(i=0; i<w; i++)\
1754     {\
1755         const int src0= src[0*srcStride];\
1756         const int src1= src[1*srcStride];\
1757         const int src2= src[2*srcStride];\
1758         const int src3= src[3*srcStride];\
1759         const int src4= src[4*srcStride];\
1760         const int src5= src[5*srcStride];\
1761         const int src6= src[6*srcStride];\
1762         const int src7= src[7*srcStride];\
1763         const int src8= src[8*srcStride];\
1764         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1765         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1766         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1767         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1768         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1769         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1770         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1771         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1772         dst++;\
1773         src++;\
1774     }\
1775 }\
1776 \
/* Horizontal lowpass, 16 outputs per row for h rows; reads src[0..16] of each row. */\
1777 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1778     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1779     int i;\
1780     \
1781     for(i=0; i<h; i++)\
1782     {\
1783         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1784         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1785         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1786         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1787         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1788         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1789         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1790         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1791         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1792         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1793         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1794         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1795         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1796         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1797         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1798         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1799         dst+=dstStride;\
1800         src+=srcStride;\
1801     }\
1802 }\
1803 \
/* Vertical lowpass over a fixed 16 columns; reads 17 source rows, writes 16 rows. */\
1804 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1805     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1806     int i;\
1807     const int w=16;\
1808     for(i=0; i<w; i++)\
1809     {\
1810         const int src0= src[0*srcStride];\
1811         const int src1= src[1*srcStride];\
1812         const int src2= src[2*srcStride];\
1813         const int src3= src[3*srcStride];\
1814         const int src4= src[4*srcStride];\
1815         const int src5= src[5*srcStride];\
1816         const int src6= src[6*srcStride];\
1817         const int src7= src[7*srcStride];\
1818         const int src8= src[8*srcStride];\
1819         const int src9= src[9*srcStride];\
1820         const int src10= src[10*srcStride];\
1821         const int src11= src[11*srcStride];\
1822         const int src12= src[12*srcStride];\
1823         const int src13= src[13*srcStride];\
1824         const int src14= src[14*srcStride];\
1825         const int src15= src[15*srcStride];\
1826         const int src16= src[16*srcStride];\
1827         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1828         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1829         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1830         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1831         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1832         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1833         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1834         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1835         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1836         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1837         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1838         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1839         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1840         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1841         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1842         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1843         dst++;\
1844         src++;\
1845     }\
1846 }\
1847 \
/* 8x8 entry points, one per quarter-pel position mcXY (X horizontal, Y vertical). */\
1848 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1849     OPNAME ## pixels8_c(dst, src, stride, 8);\
1850 }\
1851 \
1852 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1853     uint8_t half[64];\
1854     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1855     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1856 }\
1857 \
1858 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1859     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1860 }\
1861 \
1862 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1863     uint8_t half[64];\
1864     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1865     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1866 }\
1867 \
1868 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1869     uint8_t full[16*9];\
1870     uint8_t half[64];\
1871     copy_block9(full, src, 16, stride, 9);\
1872     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1873     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1874 }\
1875 \
1876 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1877     uint8_t full[16*9];\
1878     copy_block9(full, src, 16, stride, 9);\
1879     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1880 }\
1881 \
1882 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1883     uint8_t full[16*9];\
1884     uint8_t half[64];\
1885     copy_block9(full, src, 16, stride, 9);\
1886     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1887     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1888 }\
1889 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1890     uint8_t full[16*9];\
1891     uint8_t halfH[72];\
1892     uint8_t halfV[64];\
1893     uint8_t halfHV[64];\
1894     copy_block9(full, src, 16, stride, 9);\
1895     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1896     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1897     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1898     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1899 }\
1900 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1901     uint8_t full[16*9];\
1902     uint8_t halfH[72];\
1903     uint8_t halfHV[64];\
1904     copy_block9(full, src, 16, stride, 9);\
1905     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1906     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1907     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1908     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1909 }\
1910 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1911     uint8_t full[16*9];\
1912     uint8_t halfH[72];\
1913     uint8_t halfV[64];\
1914     uint8_t halfHV[64];\
1915     copy_block9(full, src, 16, stride, 9);\
1916     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1917     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1918     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1919     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1920 }\
1921 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1922     uint8_t full[16*9];\
1923     uint8_t halfH[72];\
1924     uint8_t halfHV[64];\
1925     copy_block9(full, src, 16, stride, 9);\
1926     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1927     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1928     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1929     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1930 }\
1931 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1932     uint8_t full[16*9];\
1933     uint8_t halfH[72];\
1934     uint8_t halfV[64];\
1935     uint8_t halfHV[64];\
1936     copy_block9(full, src, 16, stride, 9);\
1937     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1938     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1939     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1940     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1941 }\
1942 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1943     uint8_t full[16*9];\
1944     uint8_t halfH[72];\
1945     uint8_t halfHV[64];\
1946     copy_block9(full, src, 16, stride, 9);\
1947     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1948     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1949     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1950     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1951 }\
1952 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1953     uint8_t full[16*9];\
1954     uint8_t halfH[72];\
1955     uint8_t halfV[64];\
1956     uint8_t halfHV[64];\
1957     copy_block9(full, src, 16, stride, 9);\
1958     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1959     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1960     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1961     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1962 }\
1963 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1964     uint8_t full[16*9];\
1965     uint8_t halfH[72];\
1966     uint8_t halfHV[64];\
1967     copy_block9(full, src, 16, stride, 9);\
1968     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1969     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1970     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1971     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1972 }\
1973 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1974     uint8_t halfH[72];\
1975     uint8_t halfHV[64];\
1976     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1977     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1978     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1979 }\
1980 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1981     uint8_t halfH[72];\
1982     uint8_t halfHV[64];\
1983     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1984     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1985     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1986 }\
1987 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1988     uint8_t full[16*9];\
1989     uint8_t halfH[72];\
1990     uint8_t halfV[64];\
1991     uint8_t halfHV[64];\
1992     copy_block9(full, src, 16, stride, 9);\
1993     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1994     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1995     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1996     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1997 }\
1998 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1999     uint8_t full[16*9];\
2000     uint8_t halfH[72];\
2001     copy_block9(full, src, 16, stride, 9);\
2002     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2003     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2004     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2005 }\
2006 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007     uint8_t full[16*9];\
2008     uint8_t halfH[72];\
2009     uint8_t halfV[64];\
2010     uint8_t halfHV[64];\
2011     copy_block9(full, src, 16, stride, 9);\
2012     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2013     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2014     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2015     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2016 }\
2017 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2018     uint8_t full[16*9];\
2019     uint8_t halfH[72];\
2020     copy_block9(full, src, 16, stride, 9);\
2021     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2022     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2023     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2024 }\
2025 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2026     uint8_t halfH[72];\
2027     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2028     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2029 }\
/* 16x16 entry points; same structure as the 8x8 ones with 24-stride, 17-row scratch copies. */\
2030 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2031     OPNAME ## pixels16_c(dst, src, stride, 16);\
2032 }\
2033 \
2034 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2035     uint8_t half[256];\
2036     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2037     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2038 }\
2039 \
2040 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2041     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2042 }\
2043 \
2044 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2045     uint8_t half[256];\
2046     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2047     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2048 }\
2049 \
2050 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2051     uint8_t full[24*17];\
2052     uint8_t half[256];\
2053     copy_block17(full, src, 24, stride, 17);\
2054     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2055     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2056 }\
2057 \
2058 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2059     uint8_t full[24*17];\
2060     copy_block17(full, src, 24, stride, 17);\
2061     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2062 }\
2063 \
2064 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2065     uint8_t full[24*17];\
2066     uint8_t half[256];\
2067     copy_block17(full, src, 24, stride, 17);\
2068     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2069     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2070 }\
2071 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2072     uint8_t full[24*17];\
2073     uint8_t halfH[272];\
2074     uint8_t halfV[256];\
2075     uint8_t halfHV[256];\
2076     copy_block17(full, src, 24, stride, 17);\
2077     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2078     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2079     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2080     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2081 }\
2082 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2083     uint8_t full[24*17];\
2084     uint8_t halfH[272];\
2085     uint8_t halfHV[256];\
2086     copy_block17(full, src, 24, stride, 17);\
2087     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2088     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2089     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2090     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2091 }\
2092 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2093     uint8_t full[24*17];\
2094     uint8_t halfH[272];\
2095     uint8_t halfV[256];\
2096     uint8_t halfHV[256];\
2097     copy_block17(full, src, 24, stride, 17);\
2098     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2099     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2100     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2101     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2102 }\
2103 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2104     uint8_t full[24*17];\
2105     uint8_t halfH[272];\
2106     uint8_t halfHV[256];\
2107     copy_block17(full, src, 24, stride, 17);\
2108     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2109     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2110     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2111     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2112 }\
2113 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2114     uint8_t full[24*17];\
2115     uint8_t halfH[272];\
2116     uint8_t halfV[256];\
2117     uint8_t halfHV[256];\
2118     copy_block17(full, src, 24, stride, 17);\
2119     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2120     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2121     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2122     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2123 }\
2124 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2125     uint8_t full[24*17];\
2126     uint8_t halfH[272];\
2127     uint8_t halfHV[256];\
2128     copy_block17(full, src, 24, stride, 17);\
2129     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2130     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2131     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2132     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2133 }\
2134 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2135     uint8_t full[24*17];\
2136     uint8_t halfH[272];\
2137     uint8_t halfV[256];\
2138     uint8_t halfHV[256];\
2139     copy_block17(full, src, 24, stride, 17);\
2140     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2141     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2142     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2143     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2144 }\
2145 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2146     uint8_t full[24*17];\
2147     uint8_t halfH[272];\
2148     uint8_t halfHV[256];\
2149     copy_block17(full, src, 24, stride, 17);\
2150     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2151     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2152     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2153     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2154 }\
2155 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2156     uint8_t halfH[272];\
2157     uint8_t halfHV[256];\
2158     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2159     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2160     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2161 }\
2162 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2163     uint8_t halfH[272];\
2164     uint8_t halfHV[256];\
2165     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2166     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2167     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2168 }\
2169 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2170     uint8_t full[24*17];\
2171     uint8_t halfH[272];\
2172     uint8_t halfV[256];\
2173     uint8_t halfHV[256];\
2174     copy_block17(full, src, 24, stride, 17);\
2175     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2176     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2177     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2178     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2179 }\
2180 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2181     uint8_t full[24*17];\
2182     uint8_t halfH[272];\
2183     copy_block17(full, src, 24, stride, 17);\
2184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2185     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2186     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2187 }\
2188 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2189     uint8_t full[24*17];\
2190     uint8_t halfH[272];\
2191     uint8_t halfV[256];\
2192     uint8_t halfHV[256];\
2193     copy_block17(full, src, 24, stride, 17);\
2194     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2195     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2196     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2197     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2198 }\
2199 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2200     uint8_t full[24*17];\
2201     uint8_t halfH[272];\
2202     copy_block17(full, src, 24, stride, 17);\
2203     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2204     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2205     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2206 }\
2207 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2208     uint8_t halfH[272];\
2209     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2210     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2211 }
2212
/*
 * Store operators handed to QPEL_MC as OP.  a is the destination pixel and
 * b the raw filter sum.  The filter weights used by QPEL_MC sum to 32, so
 * (b + 16) >> 5 scales back with round-to-nearest, while the no_rnd forms
 * add 15 instead of 16.  The scaled value indexes cm, the local pointer
 * (ff_cropTbl + MAX_NEG_CROP) declared in each generated lowpass function;
 * presumably this is a clip-to-0..255 lookup, the table is defined
 * elsewhere.  The avg forms additionally average the looked-up value with
 * the existing destination pixel (op_avg adds 1 before halving,
 * op_avg_no_rnd does not).
 */
2213 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2214 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2215 #define op_put(a, b) a = cm[((b) + 16)>>5]
2216 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2217
/* Instantiate the put_, put_no_rnd_ and avg_ routine families.  The
 * avg_no_rnd instantiation below is disabled, so op_avg_no_rnd is defined
 * but not used by any instantiation here. */
2218 QPEL_MC(0, put_       , _       , op_put)
2219 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2220 QPEL_MC(0, avg_       , _       , op_avg)
2221 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The operators are only meaningful inside QPEL_MC; drop them again. */
2222 #undef op_avg
2223 #undef op_avg_no_rnd
2224 #undef op_put
2225 #undef op_put_no_rnd
2226
2227 #if 1
/**
 * Generates the six-tap (1, -5, 20, 20, -5, 1) interpolation kernels used by
 * the H.264 quarter-pel functions, for blocks 2, 4, 8 and 16 pixels wide.
 *
 * For every size three static functions are emitted, named
 * OPNAME ## h264_qpel<size>_{h,v,hv}_lowpass:
 *  - h_lowpass filters along each row; per row it reads src[-2] up to
 *    src[size+2] and writes dst[0] .. dst[size-1];
 *  - v_lowpass filters along each column; per column it reads rows -2 up to
 *    size+2 of src and writes rows 0 .. size-1 of dst;
 *  - hv_lowpass first runs the horizontal filter over size+5 rows, starting
 *    two rows above src, storing the unnormalised sums in the int16_t scratch
 *    buffer tmp, and then filters tmp vertically.
 * The 16 pixel versions are assembled from four calls to the 8 pixel ones;
 * the hv variant hands the same tmp rows to the upper and the lower half.
 * The 2 pixel functions are tagged av_unused.
 *
 * @param OPNAME prefix pasted onto every generated function name
 * @param OP     store operator applied to the sums of a single (h or v) pass
 * @param OP2    store operator applied to the sums of the two-pass (hv) filter
 *
 * Every 2/4/8 pixel kernel declares a local table pointer "cm"
 * (ff_cropTbl + MAX_NEG_CROP) for OP / OP2 to use.
 */
2228 #define H264_LOWPASS(OPNAME, OP, OP2) \
2229 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2230     const int h=2;\
2231     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2232     int i;\
2233     for(i=0; i<h; i++)\
2234     {\
2235         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2236         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2237         dst+=dstStride;\
2238         src+=srcStride;\
2239     }\
2240 }\
2241 \
2242 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2243     const int w=2;\
2244     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2245     int i;\
2246     for(i=0; i<w; i++)\
2247     {\
2248         const int srcB= src[-2*srcStride];\
2249         const int srcA= src[-1*srcStride];\
2250         const int src0= src[0 *srcStride];\
2251         const int src1= src[1 *srcStride];\
2252         const int src2= src[2 *srcStride];\
2253         const int src3= src[3 *srcStride];\
2254         const int src4= src[4 *srcStride];\
2255         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2256         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2257         dst++;\
2258         src++;\
2259     }\
2260 }\
2261 \
2262 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2263     const int h=2;\
2264     const int w=2;\
2265     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2266     int i;\
2267     src -= 2*srcStride;\
2268     for(i=0; i<h+5; i++)\
2269     {\
2270         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2271         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2272         tmp+=tmpStride;\
2273         src+=srcStride;\
2274     }\
2275     tmp -= tmpStride*(h+5-2);\
2276     for(i=0; i<w; i++)\
2277     {\
2278         const int tmpB= tmp[-2*tmpStride];\
2279         const int tmpA= tmp[-1*tmpStride];\
2280         const int tmp0= tmp[0 *tmpStride];\
2281         const int tmp1= tmp[1 *tmpStride];\
2282         const int tmp2= tmp[2 *tmpStride];\
2283         const int tmp3= tmp[3 *tmpStride];\
2284         const int tmp4= tmp[4 *tmpStride];\
2285         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2286         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2287         dst++;\
2288         tmp++;\
2289     }\
2290 }\
2291 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2292     const int h=4;\
2293     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2294     int i;\
2295     for(i=0; i<h; i++)\
2296     {\
2297         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2298         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2299         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2300         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2301         dst+=dstStride;\
2302         src+=srcStride;\
2303     }\
2304 }\
2305 \
2306 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2307     const int w=4;\
2308     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2309     int i;\
2310     for(i=0; i<w; i++)\
2311     {\
2312         const int srcB= src[-2*srcStride];\
2313         const int srcA= src[-1*srcStride];\
2314         const int src0= src[0 *srcStride];\
2315         const int src1= src[1 *srcStride];\
2316         const int src2= src[2 *srcStride];\
2317         const int src3= src[3 *srcStride];\
2318         const int src4= src[4 *srcStride];\
2319         const int src5= src[5 *srcStride];\
2320         const int src6= src[6 *srcStride];\
2321         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2322         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2323         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2324         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2325         dst++;\
2326         src++;\
2327     }\
2328 }\
2329 \
2330 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2331     const int h=4;\
2332     const int w=4;\
2333     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2334     int i;\
2335     src -= 2*srcStride;\
2336     for(i=0; i<h+5; i++)\
2337     {\
2338         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2339         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2340         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2341         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2342         tmp+=tmpStride;\
2343         src+=srcStride;\
2344     }\
2345     tmp -= tmpStride*(h+5-2);\
2346     for(i=0; i<w; i++)\
2347     {\
2348         const int tmpB= tmp[-2*tmpStride];\
2349         const int tmpA= tmp[-1*tmpStride];\
2350         const int tmp0= tmp[0 *tmpStride];\
2351         const int tmp1= tmp[1 *tmpStride];\
2352         const int tmp2= tmp[2 *tmpStride];\
2353         const int tmp3= tmp[3 *tmpStride];\
2354         const int tmp4= tmp[4 *tmpStride];\
2355         const int tmp5= tmp[5 *tmpStride];\
2356         const int tmp6= tmp[6 *tmpStride];\
2357         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2358         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2359         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2360         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2361         dst++;\
2362         tmp++;\
2363     }\
2364 }\
2365 \
2366 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2367     const int h=8;\
2368     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2369     int i;\
2370     for(i=0; i<h; i++)\
2371     {\
2372         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2373         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2374         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2375         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2376         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2377         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2378         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2379         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2380         dst+=dstStride;\
2381         src+=srcStride;\
2382     }\
2383 }\
2384 \
2385 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2386     const int w=8;\
2387     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2388     int i;\
2389     for(i=0; i<w; i++)\
2390     {\
2391         const int srcB= src[-2*srcStride];\
2392         const int srcA= src[-1*srcStride];\
2393         const int src0= src[0 *srcStride];\
2394         const int src1= src[1 *srcStride];\
2395         const int src2= src[2 *srcStride];\
2396         const int src3= src[3 *srcStride];\
2397         const int src4= src[4 *srcStride];\
2398         const int src5= src[5 *srcStride];\
2399         const int src6= src[6 *srcStride];\
2400         const int src7= src[7 *srcStride];\
2401         const int src8= src[8 *srcStride];\
2402         const int src9= src[9 *srcStride];\
2403         const int src10=src[10*srcStride];\
2404         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2405         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2406         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2407         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2408         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2409         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2410         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2411         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2412         dst++;\
2413         src++;\
2414     }\
2415 }\
2416 \
2417 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2418     const int h=8;\
2419     const int w=8;\
2420     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2421     int i;\
2422     src -= 2*srcStride;\
2423     for(i=0; i<h+5; i++)\
2424     {\
2425         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2426         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2427         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2428         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2429         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2430         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2431         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2432         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2433         tmp+=tmpStride;\
2434         src+=srcStride;\
2435     }\
2436     tmp -= tmpStride*(h+5-2);\
2437     for(i=0; i<w; i++)\
2438     {\
2439         const int tmpB= tmp[-2*tmpStride];\
2440         const int tmpA= tmp[-1*tmpStride];\
2441         const int tmp0= tmp[0 *tmpStride];\
2442         const int tmp1= tmp[1 *tmpStride];\
2443         const int tmp2= tmp[2 *tmpStride];\
2444         const int tmp3= tmp[3 *tmpStride];\
2445         const int tmp4= tmp[4 *tmpStride];\
2446         const int tmp5= tmp[5 *tmpStride];\
2447         const int tmp6= tmp[6 *tmpStride];\
2448         const int tmp7= tmp[7 *tmpStride];\
2449         const int tmp8= tmp[8 *tmpStride];\
2450         const int tmp9= tmp[9 *tmpStride];\
2451         const int tmp10=tmp[10*tmpStride];\
2452         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2453         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2454         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2455         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2456         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2457         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2458         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2459         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2460         dst++;\
2461         tmp++;\
2462     }\
2463 }\
2464 \
2465 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2466     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2467     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2468     src += 8*srcStride;\
2469     dst += 8*dstStride;\
2470     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2471     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2472 }\
2473 \
2474 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2475     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2476     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2477     src += 8*srcStride;\
2478     dst += 8*dstStride;\
2479     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2480     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2481 }\
2482 \
2483 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2484     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2485     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2486     src += 8*srcStride;\
2487     dst += 8*dstStride;\
2488     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2489     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2490 }\
2491
/**
 * Generates the sixteen functions OPNAME ## h264_qpel ## SIZE ## _mcXY_c
 * (X and Y each 0..3) for one block size.
 *
 * As the bodies below show, X selects the horizontal and Y the vertical
 * treatment of the SIZE x SIZE block at src:
 *  - mc00 forwards to OPNAME ## pixels ## SIZE ## _c;
 *  - mc20 / mc02 / mc22 write the h / v / hv low-pass result directly;
 *  - every other position builds two intermediate SIZE x SIZE planes (the
 *    source itself or a low-pass result) and combines them with
 *    OPNAME ## pixels ## SIZE ## _l2.
 * Intermediate planes are always produced with the put_ kernels; OPNAME only
 * decides how the final result is stored into dst.
 *
 * Vertical filtering never reads the picture directly: SIZE columns by
 * SIZE+5 rows, starting two rows above src, are first copied into "full",
 * and full_mid points at the row that corresponds to src.
 *
 * @param OPNAME prefix of the generated functions (and of the final store)
 * @param SIZE   block width and height; pasted into helper names, so matching
 *               copy_block, pixels and lowpass helpers must exist for it
 */
2492 #define H264_MC(OPNAME, SIZE) \
2493 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2494     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2495 }\
2496 \
2497 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2498     uint8_t half[SIZE*SIZE];\
2499     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2500     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2501 }\
2502 \
2503 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2504     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2505 }\
2506 \
2507 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2508     uint8_t half[SIZE*SIZE];\
2509     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2510     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2511 }\
2512 \
2513 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2514     uint8_t full[SIZE*(SIZE+5)];\
2515     uint8_t * const full_mid= full + SIZE*2;\
2516     uint8_t half[SIZE*SIZE];\
2517     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2518     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2519     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2520 }\
2521 \
2522 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2523     uint8_t full[SIZE*(SIZE+5)];\
2524     uint8_t * const full_mid= full + SIZE*2;\
2525     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2526     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2527 }\
2528 \
2529 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2530     uint8_t full[SIZE*(SIZE+5)];\
2531     uint8_t * const full_mid= full + SIZE*2;\
2532     uint8_t half[SIZE*SIZE];\
2533     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2534     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2535     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2536 }\
2537 \
2538 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2539     uint8_t full[SIZE*(SIZE+5)];\
2540     uint8_t * const full_mid= full + SIZE*2;\
2541     uint8_t halfH[SIZE*SIZE];\
2542     uint8_t halfV[SIZE*SIZE];\
2543     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2544     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2545     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2546     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2547 }\
2548 \
2549 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2550     uint8_t full[SIZE*(SIZE+5)];\
2551     uint8_t * const full_mid= full + SIZE*2;\
2552     uint8_t halfH[SIZE*SIZE];\
2553     uint8_t halfV[SIZE*SIZE];\
2554     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2555     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2556     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2557     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2558 }\
2559 \
2560 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2561     uint8_t full[SIZE*(SIZE+5)];\
2562     uint8_t * const full_mid= full + SIZE*2;\
2563     uint8_t halfH[SIZE*SIZE];\
2564     uint8_t halfV[SIZE*SIZE];\
2565     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2566     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2567     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2568     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2569 }\
2570 \
2571 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2572     uint8_t full[SIZE*(SIZE+5)];\
2573     uint8_t * const full_mid= full + SIZE*2;\
2574     uint8_t halfH[SIZE*SIZE];\
2575     uint8_t halfV[SIZE*SIZE];\
2576     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2577     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2578     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2579     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2580 }\
2581 \
2582 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2583     int16_t tmp[SIZE*(SIZE+5)];\
2584     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2585 }\
2586 \
2587 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2588     int16_t tmp[SIZE*(SIZE+5)];\
2589     uint8_t halfH[SIZE*SIZE];\
2590     uint8_t halfHV[SIZE*SIZE];\
2591     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2592     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2593     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2594 }\
2595 \
2596 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2597     int16_t tmp[SIZE*(SIZE+5)];\
2598     uint8_t halfH[SIZE*SIZE];\
2599     uint8_t halfHV[SIZE*SIZE];\
2600     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2601     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2602     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2603 }\
2604 \
2605 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2606     uint8_t full[SIZE*(SIZE+5)];\
2607     uint8_t * const full_mid= full + SIZE*2;\
2608     int16_t tmp[SIZE*(SIZE+5)];\
2609     uint8_t halfV[SIZE*SIZE];\
2610     uint8_t halfHV[SIZE*SIZE];\
2611     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2612     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2613     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2614     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2615 }\
2616 \
2617 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2618     uint8_t full[SIZE*(SIZE+5)];\
2619     uint8_t * const full_mid= full + SIZE*2;\
2620     int16_t tmp[SIZE*(SIZE+5)];\
2621     uint8_t halfV[SIZE*SIZE];\
2622     uint8_t halfHV[SIZE*SIZE];\
2623     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2624     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2625     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2626     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2627 }\
2628
/*
 * Store operators for the H.264 kernels. op_* normalise the sum of a single
 * filter pass ((b + 16) >> 5), op2_* the sum of the horizontal+vertical pass
 * ((b + 512) >> 10). The normalised value indexes the table "cm", which the
 * generated functions declare locally; the avg forms then store the
 * rounded-up mean of that value and the previous contents of "a".
 */
2629 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2630 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2631 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2632 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2633 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2634
/*
 * Emit the low-pass kernels for both families, then the mcXY functions:
 * put_ for sizes 2, 4, 8 and 16, avg_ for sizes 4, 8 and 16 only.
 */
2635 H264_LOWPASS(put_       , op_put, op2_put)
2636 H264_LOWPASS(avg_       , op_avg, op2_avg)
2637 H264_MC(put_, 2)
2638 H264_MC(put_, 4)
2639 H264_MC(put_, 8)
2640 H264_MC(put_, 16)
2641 H264_MC(avg_, 4)
2642 H264_MC(avg_, 8)
2643 H264_MC(avg_, 16)
2644
2645 #undef op_avg
2646 #undef op_put
2647 #undef op2_avg
2648 #undef op2_put
2649 #endif
2650
/**
 * H.264 weighted prediction, plain C versions.
 *
 * H264_WEIGHT(W,H) emits two static functions for a W x H pixel block:
 *  - weight_h264_pixels<W>x<H>_c rescales block[] in place:
 *        block[x] = clip_uint8((block[x]*weight + offset') >> log2_denom)
 *    where offset' is offset << log2_denom, plus 1 << (log2_denom-1) as a
 *    rounding term when log2_denom is non-zero;
 *  - biweight_h264_pixels<W>x<H>_c blends src into dst:
 *        dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset')
 *                            >> (log2_denom+1))
 *    where offset' is ((offset + 1) | 1) << log2_denom.
 *
 * offset can be negative, and left-shifting a negative int is undefined
 * behaviour in C. Both pre-scaling shifts are therefore performed on the
 * unsigned representation and the result is converted back to int, which
 * yields the same bit pattern on two's complement targets.
 *
 * Widths of 2, 4, 8 and 16 are handled: the "if(W==n) continue" tests compare
 * against the macro argument and end the unrolled row early.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* shift as unsigned: offset may be negative */ \
    offset = (unsigned)offset << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* shift as unsigned: (offset + 1) | 1 may be negative */ \
    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2720
2721 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2722     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2723     int i;
2724
2725     for(i=0; i<h; i++){
2726         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2727         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2728         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2729         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2730         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2731         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2732         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2733         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2734         dst+=dstStride;
2735         src+=srcStride;
2736     }
2737 }
2738
2739 #if CONFIG_CAVS_DECODER
2740 /* AVS specific */
2741 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2742
/*
 * "mc00" entry points for 8x8 and 16x16 blocks. No filtering is done here:
 * each one forwards to the put_ or avg_ pixels routine of the same width,
 * passing the block height as the last argument.
 */
2743 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2744     put_pixels8_c(dst, src, stride, 8);
2745 }
2746 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2747     avg_pixels8_c(dst, src, stride, 8);
2748 }
2749 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2750     put_pixels16_c(dst, src, stride, 16);
2751 }
2752 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2753     avg_pixels16_c(dst, src, stride, 16);
2754 }
2755 #endif /* CONFIG_CAVS_DECODER */
2756
/* MLP / TrueHD: prototype only, declared when either decoder is enabled. */
2757 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2758     void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
2759 #endif
2760
2761 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
2762 /* VC-1 specific */
2763 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2764
/*
 * "mc00" entry points for 8x8 blocks: they forward to put_pixels8_c /
 * avg_pixels8_c with a height of 8. The rnd argument is accepted but not
 * used by these two functions.
 */
2765 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2766     put_pixels8_c(dst, src, stride, 8);
2767 }
2768 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2769     avg_pixels8_c(dst, src, stride, 8);
2770 }
2771 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2772
/*
 * Prototypes of further per-codec DSP init functions; none of them is
 * defined in this part of the file. The RV30 one is only declared when that
 * decoder is enabled.
 */
2773 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2774
2775 /* H264 specific */
2776 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2777
2778 #if CONFIG_RV30_DECODER
2779 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2780 #endif /* CONFIG_RV30_DECODER */
2781
2782 #if CONFIG_RV40_DECODER
/*
 * RV40 "mc33" position for 16x16 and 8x8 blocks: instead of a dedicated
 * filter these forward to the put_/avg_ pixels*_xy2_c routines of the same
 * width, with the block height as the last argument.
 */
2783 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2784     put_pixels16_xy2_c(dst, src, stride, 16);
2785 }
2786 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2787     avg_pixels16_xy2_c(dst, src, stride, 16);
2788 }
2789 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2790     put_pixels8_xy2_c(dst, src, stride, 8);
2791 }
2792 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2793     avg_pixels8_xy2_c(dst, src, stride, 8);
2794 }
2795
2796 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2797 #endif /* CONFIG_RV40_DECODER */
2798
2799 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2800     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2801     int i;
2802
2803     for(i=0; i<w; i++){
2804         const int src_1= src[ -srcStride];
2805         const int src0 = src[0          ];
2806         const int src1 = src[  srcStride];
2807         const int src2 = src[2*srcStride];
2808         const int src3 = src[3*srcStride];
2809         const int src4 = src[4*srcStride];
2810         const int src5 = src[5*srcStride];
2811         const int src6 = src[6*srcStride];
2812         const int src7 = src[7*srcStride];
2813         const int src8 = src[8*srcStride];
2814         const int src9 = src[9*srcStride];
2815         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2816         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2817         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2818         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2819         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2820         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2821         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2822         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2823         src++;
2824         dst++;
2825     }
2826 }
2827
/** mspel position 00 for an 8x8 block: forwards to put_pixels8_c with a height of 8. */
2828 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2829     put_pixels8_c(dst, src, stride, 8);
2830 }
2831
/**
 * mspel position 10 for an 8x8 block: the horizontally filtered block is
 * built in a local buffer and combined with the unfiltered block at src
 * through put_pixels8_l2.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpass[8*8];

    wmv2_mspel8_h_lowpass(hpass, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hpass, stride, stride, 8, 8);
}
2837
/** mspel position 20 for an 8x8 block: horizontal filter written straight to dst. */
2838 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2839     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2840 }
2841
/**
 * mspel position 30 for an 8x8 block: like position 10, but the filtered
 * block is combined with the block one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpass[8*8];

    wmv2_mspel8_h_lowpass(hpass, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hpass, stride, stride, 8, 8);
}
2847
/** mspel position 02 for an 8x8 block: vertical filter written straight to dst. */
2848 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2849     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2850 }
2851
/**
 * mspel position 12 for an 8x8 block.
 *
 * Three intermediate planes are built: the horizontal filter over 11 rows
 * starting one row above src, the vertical filter of the source block, and
 * the vertical filter of the horizontal result (skipping its first row).
 * The last two are combined into dst with put_pixels8_l2.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpass[8*11];
    uint8_t vpass[8*8];
    uint8_t hvpass[8*8];

    wmv2_mspel8_h_lowpass(hpass, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass+8, 8, 8, 8);
    put_pixels8_l2(dst, vpass, hvpass, stride, 8, 8, 8);
}
/**
 * mspel position 32 for an 8x8 block: same as position 12 except that the
 * purely vertical plane is taken from the block one pixel to the right
 * (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpass[8*11];
    uint8_t vpass[8*8];
    uint8_t hvpass[8*8];

    wmv2_mspel8_h_lowpass(hpass, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vpass, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvpass, hpass+8, 8, 8, 8);
    put_pixels8_l2(dst, vpass, hvpass, stride, 8, 8, 8);
}
/**
 * mspel position 22 for an 8x8 block: horizontal filter over 11 rows
 * (starting one row above src) into a local buffer, followed by the vertical
 * filter of that buffer written directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpass[8*11];

    wmv2_mspel8_h_lowpass(hpass, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hpass+8, stride, 8, 8);
}
2875
2876 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2877     if(CONFIG_ANY_H263) {
2878     int x;
2879     const int strength= ff_h263_loop_filter_strength[qscale];
2880
2881     for(x=0; x<8; x++){
2882         int d1, d2, ad1;
2883         int p0= src[x-2*stride];
2884         int p1= src[x-1*stride];
2885         int p2= src[x+0*stride];
2886         int p3= src[x+1*stride];
2887         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2888
2889         if     (d<-2*strength) d1= 0;
2890         else if(d<-  strength) d1=-2*strength - d;
2891         else if(d<   strength) d1= d;
2892         else if(d< 2*strength) d1= 2*strength - d;
2893         else                   d1= 0;
2894
2895         p1 += d1;
2896         p2 -= d1;
2897         if(p1&256) p1= ~(p1>>31);
2898         if(p2&256) p2= ~(p2>>31);
2899
2900         src[x-1*stride] = p1;
2901         src[x+0*stride] = p2;
2902
2903         ad1= FFABS(d1)>>1;
2904
2905         d2= av_clip((p0-p3)/4, -ad1, ad1);
2906
2907         src[x-2*stride] = p0 - d2;
2908         src[x+  stride] = p3 + d2;
2909     }
2910     }
2911 }
2912
2913 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2914     if(CONFIG_ANY_H263) {
2915     int y;
2916     const int strength= ff_h263_loop_filter_strength[qscale];
2917
2918     for(y=0; y<8; y++){
2919         int d1, d2, ad1;
2920         int p0= src[y*stride-2];
2921         int p1= src[y*stride-1];
2922         int p2= src[y*stride+0];
2923         int p3= src[y*stride+1];
2924         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2925
2926         if     (d<-2*strength) d1= 0;
2927         else if(d<-  strength) d1=-2*strength - d;
2928         else if(d<   strength) d1= d;
2929         else if(d< 2*strength) d1= 2*strength - d;
2930         else                   d1= 0;
2931
2932         p1 += d1;
2933         p2 -= d1;
2934         if(p1&256) p1= ~(p1>>31);
2935         if(p2&256) p2= ~(p2>>31);
2936
2937         src[y*stride-1] = p1;
2938         src[y*stride+0] = p2;
2939
2940         ad1= FFABS(d1)>>1;
2941
2942         d2= av_clip((p0-p3)/4, -ad1, ad1);
2943
2944         src[y*stride-2] = p0 - d2;
2945         src[y*stride+1] = p3 + d2;
2946     }
2947     }
2948 }
2949
/**
 * H.261 loop filter: separable (1, 2, 1)/4 smoothing of an 8x8 block in place.
 *
 * The vertical pass stores unnormalised sums in a scratch array; rows 0 and 7
 * are not filtered vertically and are simply scaled by 4 so that all rows
 * share one scale. The horizontal pass then leaves columns 0 and 7 unfiltered
 * horizontally (divide by 4 with rounding) and filters the six inner columns
 * (divide by 16 with rounding).
 *
 * @param src    top-left sample of the 8x8 block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];
    int row, col;

    /* vertical pass: only reads src */
    for(row=0; row<8; row++){
        const uint8_t *line= src + row*stride;

        for(col=0; col<8; col++){
            if(row==0 || row==7)
                vert[row][col]= 4*line[col];
            else
                vert[row][col]= line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass: reads the scratch array, writes src */
    for(row=0; row<8; row++){
        uint8_t *line= src + row*stride;
        const int *v= vert[row];

        line[0]= (v[0] + 2)>>2;
        line[7]= (v[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (v[col-1] + 2*v[col] + v[col+1] + 8)>>4;
    }
}
2976
/**
 * H.264 luma loop filter for non-intra edges.
 * Processes 16 positions along the edge in 4 groups of 4; group i uses
 * tc0[i] as its clipping base and is skipped entirely when tc0[i] < 0.
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between samples across the edge
 * @param ystride distance between consecutive filtered positions
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg;
    for( seg = 0; seg < 4; seg++, pix += 4*ystride ) {
        const int tc_base = tc0[seg];
        uint8_t *px = pix;
        int n;

        if( tc_base < 0 )
            continue;

        for( n = 0; n < 4; n++, px += ystride ) {
            const int p2 = px[-3*xstride];
            const int p1 = px[-2*xstride];
            const int p0 = px[-1*xstride];
            const int q0 = px[0];
            const int q1 = px[1*xstride];
            const int q2 = px[2*xstride];
            int tc, delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            /* each additionally filtered side widens the p0/q0 clip by one */
            tc = tc_base;
            if( FFABS( p2 - p0 ) < beta ) {
                px[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc_base, tc_base );
                tc++;
            }
            if( FFABS( q2 - q0 ) < beta ) {
                px[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc_base, tc_base );
                tc++;
            }

            delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            px[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            px[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Luma loop filter with samples across the edge spaced stride bytes apart
 * and consecutive filtered positions 1 byte apart.
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
/**
 * Luma loop filter with samples across the edge spaced 1 byte apart and
 * consecutive filtered positions stride bytes apart.
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_luma_c(pix, across, along, alpha, beta, tc0);
}
3025
/**
 * H.264 luma loop filter for intra edges, 16 positions along the edge.
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between samples across the edge
 * @param ystride distance between consecutive filtered positions
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int n;
    for( n = 0; n < 16; n++, pix += ystride ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        const int step = FFABS( p0 - q0 );

        if( step >= alpha ||
            FFABS( p1 - p0 ) >= beta ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        if( step >= (( alpha >> 2 ) + 2 ) ) {
            /* large step across the edge: only p0 and q0 are modified */
            pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
            pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            continue;
        }

        /* p side */
        if( FFABS( p2 - p0 ) < beta ) {
            const int p3 = pix[-4*xstride];
            /* p0', p1', p2' */
            pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
            pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
            pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
        } else {
            /* p0' */
            pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
        }

        /* q side */
        if( FFABS( q2 - q0 ) < beta ) {
            const int q3 = pix[3*xstride];
            /* q0', q1', q2' */
            pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
            pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
            pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
        } else {
            /* q0' */
            pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
        }
    }
}
/**
 * Intra luma loop filter with samples across the edge spaced stride bytes
 * apart and consecutive filtered positions 1 byte apart.
 */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_luma_intra_c(pix, across, along, alpha, beta);
}
/**
 * Intra luma loop filter with samples across the edge spaced 1 byte apart
 * and consecutive filtered positions stride bytes apart.
 */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_luma_intra_c(pix, across, along, alpha, beta);
}
3082
/**
 * H.264 chroma loop filter for non-intra edges.
 * Processes 8 positions along the edge in 4 groups of 2; group i is clipped
 * with tc0[i] and skipped entirely when tc0[i] <= 0.
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between samples across the edge
 * @param ystride distance between consecutive filtered positions
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg;
    for( seg = 0; seg < 4; seg++, pix += 2*ystride ) {
        const int tc = tc0[seg];
        uint8_t *px = pix;
        int n;

        if( tc <= 0 )
            continue;

        for( n = 0; n < 2; n++, px += ystride ) {
            const int p1 = px[-2*xstride];
            const int p0 = px[-1*xstride];
            const int q0 = px[0];
            const int q1 = px[1*xstride];
            int delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

            px[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            px[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/**
 * Chroma loop filter with samples across the edge spaced stride bytes apart
 * and consecutive filtered positions 1 byte apart.
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
/**
 * Chroma loop filter with samples across the edge spaced 1 byte apart and
 * consecutive filtered positions stride bytes apart.
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_c(pix, across, along, alpha, beta, tc0);
}
3119
/**
 * H.264 chroma loop filter for intra edges, 8 positions along the edge.
 * Only p0 and q0 are modified, and only where all three thresholds hold.
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between samples across the edge
 * @param ystride distance between consecutive filtered positions
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int n;
    for( n = 0; n < 8; n++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/**
 * Intra chroma loop filter with samples across the edge spaced stride bytes
 * apart and consecutive filtered positions 1 byte apart.
 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = stride;
    const int along  = 1;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
/**
 * Intra chroma loop filter with samples across the edge spaced 1 byte apart
 * and consecutive filtered positions stride bytes apart.
 */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across = 1;
    const int along  = stride;

    h264_loop_filter_chroma_intra_c(pix, across, along, alpha, beta);
}
3147
/**
 * Sum of absolute differences between two blocks 16 samples wide.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated SAD
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3175
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and the avg2()
 * of each pix2 sample with its right neighbour. Reads 17 samples per row of
 * pix2.
 * @param v         unused
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3203
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and the avg2()
 * of each pix2 sample with the sample one row below it. Reads h+1 rows of
 * pix2.
 * @param v         unused
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2  = pix2 + line_size;
    }
    return sad;
}
3233
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and the avg4()
 * of each 2x2 neighbourhood of pix2 (sample, right, below, below-right).
 * Reads 17 samples per row and h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2  = pix2 + line_size;
    }
    return sad;
}
3263
/**
 * Sum of absolute differences between two blocks 8 samples wide.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 * @return the accumulated SAD
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3283
/**
 * Sum of absolute differences, 8 samples wide, between pix1 and the avg2()
 * of each pix2 sample with its right neighbour. Reads 9 samples per row of
 * pix2.
 * @param v         unused
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3303
/**
 * Sum of absolute differences, 8 samples wide, between pix1 and the avg2()
 * of each pix2 sample with the sample one row below it. Reads h+1 rows of
 * pix2.
 * @param v         unused
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1 += line_size;
        pix2  = pix2 + line_size;
    }
    return sad;
}
3325
/**
 * Sum of absolute differences, 8 samples wide, between pix1 and the avg4()
 * of each 2x2 neighbourhood of pix2 (sample, right, below, below-right).
 * Reads 9 samples per row and h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between two rows (same for both blocks)
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1 += line_size;
        pix2  = pix2 + line_size;
    }
    return sad;
}
3347
3348 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3349     MpegEncContext *c = v;
3350     int score1=0;
3351     int score2=0;
3352     int x,y;
3353
3354     for(y=0; y<h; y++){
3355         for(x=0; x<16; x++){
3356             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3357         }
3358         if(y+1<h){
3359             for(x=0; x<15; x++){
3360                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3361                              - s1[x+1] + s1[x+1+stride])
3362                         -FFABS(  s2[x  ] - s2[x  +stride]
3363                              - s2[x+1] + s2[x+1+stride]);
3364             }
3365         }
3366         s1+= stride;
3367         s2+= stride;
3368     }
3369
3370     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3371     else  return score1 + FFABS(score2)*8;
3372 }
3373
3374 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3375     MpegEncContext *c = v;
3376     int score1=0;
3377     int score2=0;
3378     int x,y;
3379
3380     for(y=0; y<h; y++){
3381         for(x=0; x<8; x++){
3382             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3383         }
3384         if(y+1<h){
3385             for(x=0; x<7; x++){
3386                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3387                              - s1[x+1] + s1[x+1+stride])
3388                         -FFABS(  s2[x  ] - s2[x  +stride]
3389                              - s2[x+1] + s2[x+1+stride]);
3390             }
3391         }
3392         s1+= stride;
3393         s2+= stride;
3394     }
3395
3396     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3397     else  return score1 + FFABS(score2)*8;
3398 }
3399
3400 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3401     int i;
3402     unsigned int sum=0;
3403
3404     for(i=0; i<8*8; i++){
3405         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3406         int w= weight[i];
3407         b>>= RECON_SHIFT;
3408         assert(-512<b && b<512);
3409
3410         sum += (w*b)*(w*b)>>4;
3411     }
3412     return sum>>2;
3413 }
3414
3415 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3416     int i;
3417
3418     for(i=0; i<8*8; i++){
3419         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3420     }
3421 }
3422
3423 /**
3424  * permutes an 8x8 block.
3425  * @param block the block which will be permuted according to the given permutation vector
3426  * @param permutation the permutation vector
3427  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3428  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3429  *                  (inverse) permutated to scantable order!
3430  */
3431 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3432 {
3433     int i;
3434     DCTELEM temp[64];
3435
3436     if(last<=0) return;
3437     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3438
3439     for(i=0; i<=last; i++){
3440         const int j= scantable[i];
3441         temp[j]= block[j];
3442         block[j]=0;
3443     }
3444
3445     for(i=0; i<=last; i++){
3446         const int j= scantable[i];
3447         const int perm_j= permutation[j];
3448         block[perm_j]= temp[j];
3449     }
3450 }
3451
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3455
3456 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3457     int i;
3458
3459     memset(cmp, 0, sizeof(void*)*6);
3460
3461     for(i=0; i<6; i++){
3462         switch(type&0xFF){
3463         case FF_CMP_SAD:
3464             cmp[i]= c->sad[i];
3465             break;
3466         case FF_CMP_SATD:
3467             cmp[i]= c->hadamard8_diff[i];
3468             break;
3469         case FF_CMP_SSE:
3470             cmp[i]= c->sse[i];
3471             break;
3472         case FF_CMP_DCT:
3473             cmp[i]= c->dct_sad[i];
3474             break;
3475         case FF_CMP_DCT264:
3476             cmp[i]= c->dct264_sad[i];
3477             break;
3478         case FF_CMP_DCTMAX:
3479             cmp[i]= c->dct_max[i];
3480             break;
3481         case FF_CMP_PSNR:
3482             cmp[i]= c->quant_psnr[i];
3483             break;
3484         case FF_CMP_BIT:
3485             cmp[i]= c->bit[i];
3486             break;
3487         case FF_CMP_RD:
3488             cmp[i]= c->rd[i];
3489             break;
3490         case FF_CMP_VSAD:
3491             cmp[i]= c->vsad[i];
3492             break;
3493         case FF_CMP_VSSE:
3494             cmp[i]= c->vsse[i];
3495             break;
3496         case FF_CMP_ZERO:
3497             cmp[i]= zero_cmp;
3498             break;
3499         case FF_CMP_NSSE:
3500             cmp[i]= c->nsse[i];
3501             break;
3502 #if CONFIG_SNOW_ENCODER
3503         case FF_CMP_W53:
3504             cmp[i]= c->w53[i];
3505             break;
3506         case FF_CMP_W97:
3507             cmp[i]= c->w97[i];
3508             break;
3509 #endif
3510         default:
3511             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3512         }
3513     }
3514 }
3515
3516 static void clear_block_c(DCTELEM *block)
3517 {
3518     memset(block, 0, sizeof(DCTELEM)*64);
3519 }
3520
3521 /**
3522  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3523  */
3524 static void clear_blocks_c(DCTELEM *blocks)
3525 {
3526     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3527 }
3528
/**
 * Byte-wise addition modulo 256: dst[i] += src[i] for 0 <= i < w.
 * Whole machine words are processed first, with carries between bytes
 * suppressed; the remaining bytes are handled one at a time.
 * @param dst destination, also the first operand
 * @param src second operand
 * @param w   number of bytes; values <= 0 do nothing
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    const unsigned long lo7 = ~0UL/255 * 0x7f; /* 0x7f in every byte */
    const unsigned long hi1 = ~0UL/255 * 0x80; /* 0x80 in every byte */
    long i;

    /* The bound is computed in signed arithmetic: with an unsigned
       w-sizeof(long) a width below one word would wrap around and the loop
       would run far past the buffers. */
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){
        unsigned long a, b, sum;
        /* memcpy keeps the word access valid for any alignment */
        memcpy(&a, src+i, sizeof(a));
        memcpy(&b, dst+i, sizeof(b));
        /* add the low 7 bits of each byte, then patch in the top bit */
        sum = ((a&lo7) + (b&lo7)) ^ ((a^b)&hi1);
        memcpy(dst+i, &sum, sizeof(sum));
    }
    for(; i<w; i++)
        dst[i] += src[i];
}
3539
/**
 * Byte-wise addition modulo 256: dst[i] = src1[i] + src2[i] for 0 <= i < w.
 * Whole machine words are processed first, with carries between bytes
 * suppressed; the remaining bytes are handled one at a time.
 * @param dst  destination
 * @param src1 first operand
 * @param src2 second operand
 * @param w    number of bytes; values <= 0 do nothing
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    const unsigned long lo7 = ~0UL/255 * 0x7f; /* 0x7f in every byte */
    const unsigned long hi1 = ~0UL/255 * 0x80; /* 0x80 in every byte */
    long i;

    /* The bound is computed in signed arithmetic: with an unsigned
       w-sizeof(long) a width below one word would wrap around and the loop
       would run far past the buffers. */
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){
        unsigned long a, b, sum;
        /* memcpy keeps the word access valid for any alignment */
        memcpy(&a, src1+i, sizeof(a));
        memcpy(&b, src2+i, sizeof(b));
        /* add the low 7 bits of each byte, then patch in the top bit */
        sum = ((a&lo7) + (b&lo7)) ^ ((a^b)&hi1);
        memcpy(dst+i, &sum, sizeof(sum));
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
3550
/**
 * Byte-wise subtraction modulo 256: dst[i] = src1[i] - src2[i] for
 * 0 <= i < w.
 * Whole machine words are processed first, with borrows between bytes
 * suppressed; the remaining bytes are handled one at a time. Without
 * HAVE_FAST_UNALIGNED a src2 that is not word aligned is processed in
 * unrolled groups of 8 bytes instead of whole words.
 * @param dst  destination
 * @param src1 minuend
 * @param src2 subtrahend
 * @param w    number of bytes; values <= 0 do nothing
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    const unsigned long lo7 = ~0UL/255 * 0x7f; /* 0x7f in every byte */
    const unsigned long hi1 = ~0UL/255 * 0x80; /* 0x80 in every byte */
    long i;
#if !HAVE_FAST_UNALIGNED
    if((uintptr_t)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    /* The bound is computed in signed arithmetic: with an unsigned
       w-sizeof(long) a width below one word would wrap around and the loop
       would run far past the buffers. */
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){
        unsigned long a, b, diff;
        /* memcpy keeps the word access valid for any alignment */
        memcpy(&a, src1+i, sizeof(a));
        memcpy(&b, src2+i, sizeof(b));
        /* forcing the top bit of each minuend byte stops borrows from
           crossing bytes; the xor restores the true top bit */
        diff = ((a|hi1) - (b&lo7)) ^ ((a^b^hi1)&hi1);
        memcpy(dst+i, &diff, sizeof(diff));
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
3575
/**
 * Reconstruct a row from median-predicted residuals.
 * Each output byte is mid_pred(left, top, (left + top - topleft) & 0xFF)
 * plus the residual, truncated to 8 bits.
 * @param dst      reconstructed row
 * @param src1     row above
 * @param diff     residuals
 * @param w        number of bytes
 * @param left     in: sample left of the first byte; out: last reconstructed
 *                 sample
 * @param left_top in: sample above-left of the first byte; out: last sample
 *                 of src1 that was read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
    uint8_t cur_left = *left;
    uint8_t cur_topleft = *left_top;
    int k;

    for(k=0; k<w; k++){
        const int top = src1[k];
        const int grad = (cur_left + top - cur_topleft)&0xFF;

        cur_left = mid_pred(cur_left, top, grad) + diff[k];
        cur_topleft = top;
        dst[k] = cur_left;
    }

    *left = cur_left;
    *left_top = cur_topleft;
}
3592
/**
 * Compute median-prediction residuals for a row.
 * Each output byte is the source sample minus
 * mid_pred(left, top, (left + top - topleft) & 0xFF), truncated to 8 bits.
 * @param dst      residuals
 * @param src1     row above
 * @param src2     row being predicted
 * @param w        number of bytes
 * @param left     in: sample left of the first byte; out: last sample of
 *                 src2 that was read
 * @param left_top in: sample above-left of the first byte; out: last sample
 *                 of src1 that was read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left = *left;
    uint8_t cur_topleft = *left_top;
    int k;

    for(k=0; k<w; k++){
        const int top = src1[k];
        const int grad = (cur_left + top - cur_topleft)&0xFF;
        const int pred = mid_pred(cur_left, top, grad);

        cur_topleft = top;
        cur_left = src2[k];
        dst[k] = cur_left - pred;
    }

    *left = cur_left;
    *left_top = cur_topleft;
}
3610
/* o1 = i1+i2, o2 = i1-i2. Expands to two complete statements, so it must not
   be used as the unbraced body of an if/for. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x becomes x+y and y becomes x-y (old values). */
#define BUTTERFLY1(x,y) \
{\
    int bf_sum,bf_dif;\
    bf_sum= x;\
    bf_dif= y;\
    x= bf_sum+bf_dif;\
    y= bf_sum-bf_dif;\
}

/* |x+y| + |x-y|: final butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3625
/**
 * Sum of absolute values of the 8x8 Hadamard transform of (src - dst).
 * @param s      unused
 * @param stride distance in bytes between two rows (same for both blocks)
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* horizontal transform of each row of the difference */
    for(i=0; i<8; i++){
        const uint8_t *cur= src + stride*i;
        const uint8_t *ref= dst + stride*i;
        int *row= temp + 8*i;

        BUTTERFLY2(row[0], row[1], cur[0]-ref[0],cur[1]-ref[1]);
        BUTTERFLY2(row[2], row[3], cur[2]-ref[2],cur[3]-ref[3]);
        BUTTERFLY2(row[4], row[5], cur[4]-ref[4],cur[5]-ref[5]);
        BUTTERFLY2(row[6], row[7], cur[6]-ref[6],cur[7]-ref[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical transform of each column; the last stage is folded into the
       absolute sum */
    for(i=0; i<8; i++){
        int *col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
3677
/**
 * Sum of absolute values of the 8x8 Hadamard transform of src, with the
 * magnitude of the DC term subtracted from the total.
 * @param s      unused
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* horizontal transform of each row */
    for(i=0; i<8; i++){
        const uint8_t *cur= src + stride*i;
        int *row= temp + 8*i;

        BUTTERFLY2(row[0], row[1], cur[0],cur[1]);
        BUTTERFLY2(row[2], row[3], cur[2],cur[3]);
        BUTTERFLY2(row[4], row[5], cur[4],cur[5]);
        BUTTERFLY2(row[6], row[7], cur[6],cur[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical transform of each column; the last stage is folded into the
       absolute sum */
    for(i=0; i<8; i++){
        int *col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* remove the mean (DC) contribution */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
3725
3726 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3727     MpegEncContext * const s= (MpegEncContext *)c;
3728     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3729     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3730
3731     assert(h==8);
3732
3733     s->dsp.diff_pixels(temp, src1, src2, stride);
3734     s->dsp.fdct(temp);
3735     return s->dsp.sum_abs_dctelem(temp);
3736 }
3737
#if CONFIG_GPL
/* One 8-point integer transform pass. The user supplies SRC(x) to read input
   x and DST(x,v) to consume output x with value v. All inputs are combined
   into locals before the first DST, so DST may overwrite what SRC reads. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D transform applied along both
 * dimensions of the 8x8 difference src1 - src2.
 * @param c      MpegEncContext whose dsp.diff_pixels is used
 * @param stride distance in bytes between two rows (same for both blocks)
 * @param h      unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int sum=0;
    int i;

    enc->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform each row in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ ) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform each column, accumulating magnitudes only */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ ) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sum;
}
#endif
3790
3791 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3792     MpegEncContext * const s= (MpegEncContext *)c;
3793     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3794     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3795     int sum=0, i;
3796
3797     assert(h==8);
3798
3799     s->dsp.diff_pixels(temp, src1, src2, stride);
3800     s->dsp.fdct(temp);
3801
3802     for(i=0; i<64; i++)
3803         sum= FFMAX(sum, FFABS(temp[i]));
3804
3805     return sum;
3806 }
3807
3808 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3809     MpegEncContext * const s= (MpegEncContext *)c;
3810     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3811     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3812     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3813     int sum=0, i;
3814
3815     assert(h==8);
3816     s->mb_intra=0;
3817
3818     s->dsp.diff_pixels(temp, src1, src2, stride);
3819
3820     memcpy(bak, temp, 64*sizeof(DCTELEM));
3821
3822     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3823     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3824     ff_simple_idct(temp); //FIXME
3825
3826     for(i=0; i<64; i++)
3827         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3828
3829     return sum;
3830 }
3831
3832 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3833     MpegEncContext * const s= (MpegEncContext *)c;
3834     const uint8_t *scantable= s->intra_scantable.permutated;
3835     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3836     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3837     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3838     uint8_t * const bak= (uint8_t*)aligned_bak;
3839     int i, last, run, bits, level, distortion, start_i;
3840     const int esc_length= s->ac_esc_length;
3841     uint8_t * length;
3842     uint8_t * last_length;
3843
3844     assert(h==8);
3845
3846     for(i=0; i<8; i++){
3847         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3848         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3849     }
3850
3851     s->dsp.diff_pixels(temp, src1, src2, stride);
3852
3853     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3854
3855     bits=0;
3856
3857     if (s->mb_intra) {
3858         start_i = 1;
3859         length     = s->intra_ac_vlc_length;
3860         last_length= s->intra_ac_vlc_last_length;
3861         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3862     } else {
3863         start_i = 0;
3864         length     = s->inter_ac_vlc_length;
3865         last_length= s->inter_ac_vlc_last_length;
3866     }
3867
3868     if(last>=start_i){
3869         run=0;
3870         for(i=start_i; i<last; i++){
3871             int j= scantable[i];
3872             level= temp[j];
3873
3874             if(level){
3875                 level+=64;
3876                 if((level&(~127)) == 0){
3877                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3878                 }else
3879                     bits+= esc_length;
3880                 run=0;
3881             }else
3882                 run++;
3883         }
3884         i= scantable[last];
3885
3886         level= temp[i] + 64;
3887
3888         assert(level - 64);
3889
3890         if((level&(~127)) == 0){
3891             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3892         }else
3893             bits+= esc_length;
3894
3895     }
3896
3897     if(last>=0){
3898         if(s->mb_intra)
3899             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3900         else
3901             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3902     }
3903
3904     s->dsp.idct_add(bak, stride, temp);
3905
3906     distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3907
3908     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3909 }
3910
3911 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3912     MpegEncContext * const s= (MpegEncContext *)c;
3913     const uint8_t *scantable= s->intra_scantable.permutated;
3914     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3915     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3916     int i, last, run, bits, level, start_i;
3917     const int esc_length= s->ac_esc_length;
3918     uint8_t * length;
3919     uint8_t * last_length;
3920
3921     assert(h==8);
3922
3923     s->dsp.diff_pixels(temp, src1, src2, stride);
3924
3925     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3926
3927     bits=0;
3928
3929     if (s->mb_intra) {
3930         start_i = 1;
3931         length     = s->intra_ac_vlc_length;
3932         last_length= s->intra_ac_vlc_last_length;
3933         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3934     } else {
3935         start_i = 0;
3936         length     = s->inter_ac_vlc_length;
3937         last_length= s->inter_ac_vlc_last_length;
3938     }
3939
3940     if(last>=start_i){
3941         run=0;
3942         for(i=start_i; i<last; i++){
3943             int j= scantable[i];
3944             level= temp[j];
3945
3946             if(level){
3947                 level+=64;
3948                 if((level&(~127)) == 0){
3949                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3950                 }else
3951                     bits+= esc_length;
3952                 run=0;
3953             }else
3954                 run++;
3955         }
3956         i= scantable[last];
3957
3958         level= temp[i] + 64;
3959
3960         assert(level - 64);
3961
3962         if((level&(~127)) == 0){
3963             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3964         }else
3965             bits+= esc_length;
3966     }
3967
3968     return bits;
3969 }
3970
/**
 * Generates vsad_intra<size>_c(): the sum of absolute differences between
 * each pixel and the pixel one stride below it, over rows 0..h-2 and the
 * first <size> columns of s. The c and dummy arguments are not used.
 * size is consumed four columns at a time.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *upper = s;                                                                               \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for(row=1; row<h; row++){                                                                               \
        const uint8_t *lower = upper + stride;                                                              \
        for(col=0; col<size; col+=4){                                                                       \
            total += FFABS(upper[col  ] - lower[col  ]) + FFABS(upper[col+1] - lower[col+1])                \
                    +FFABS(upper[col+2] - lower[col+2]) + FFABS(upper[col+3] - lower[col+3]);               \
        }                                                                                                   \
        upper = lower;                                                                                      \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
3988
/**
 * Sum of absolute vertical-gradient differences between two 16-wide blocks.
 *
 * For every row pair (y, y+1) with y in 0..h-2 and every column x in 0..15,
 * adds |(s1[y][x] - s2[y][x]) - (s1[y+1][x] - s2[y+1][x])|.
 * The context argument c is not used.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t *below1= s1 + stride;
        const uint8_t *below2= s2 + stride;

        for(col=0; col<16; col++){
            total+= FFABS(s1[col] - s2[col] - below1[col] + below2[col]);
        }
        s1= (uint8_t*)below1;
        s2= (uint8_t*)below2;
    }

    return total;
}
4003
/** Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Generates vsse_intra<size>_c(): the sum of squared differences between
 * each pixel and the pixel one stride below it, over rows 0..h-2 and the
 * first <size> columns of s. The c and dummy arguments are not used.
 * size is consumed four columns at a time.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    const uint8_t *upper = s;                                                                               \
    int total = 0;                                                                                          \
    int row, col;                                                                                           \
                                                                                                            \
    for(row=1; row<h; row++){                                                                               \
        const uint8_t *lower = upper + stride;                                                              \
        for(col=0; col<size; col+=4){                                                                       \
            total += SQ(upper[col  ] - lower[col  ]) + SQ(upper[col+1] - lower[col+1])                      \
                    +SQ(upper[col+2] - lower[col+2]) + SQ(upper[col+3] - lower[col+3]);                     \
        }                                                                                                   \
        upper = lower;                                                                                      \
    }                                                                                                       \
                                                                                                            \
    return total;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

/**
 * Sum of squared vertical-gradient differences between two 16-wide blocks.
 *
 * For every row pair (y, y+1) with y in 0..h-2 and every column x in 0..15,
 * adds ((s1[y][x] - s2[y][x]) - (s1[y+1][x] - s2[y+1][x]))^2.
 * The context argument c is not used.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t *below1= s1 + stride;
        const uint8_t *below2= s2 + stride;

        for(col=0; col<16; col++){
            total+= SQ(s1[col] - s2[col] - below1[col] + below2[col]);
        }
        s1= (uint8_t*)below1;
        s2= (uint8_t*)below2;
    }

    return total;
}
4037
/**
 * Sum of squared differences between an int8_t and an int16_t vector.
 *
 * @param pix1 first vector, size elements
 * @param pix2 second vector, size elements
 * @param size number of elements to compare
 * @return sum over i of (pix1[i] - pix2[i])^2, accumulated in int
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int total= 0;
    int k;

    for(k=0; k<size; k++){
        const int delta= pix1[k] - pix2[k];

        total+= delta*delta;
    }
    return total;
}
4046
/* Instantiate the 16-pixel variants of the 8x8 comparison functions.
 * The first argument names the existing 8x8 function, the second the
 * function being generated; the generated *16_c names are the ones
 * dsputil_init() installs in slot [0] of the comparison tables.
 * WRAPPER8_16_SQ is defined outside this section.
 * NOTE(review): presumably it calls the 8x8 function on each 8x8 quadrant
 * and sums the results -- confirm against the macro definition. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4057
/**
 * In-place element-wise product: dst[i] = dst[i] * src[i] for i in 0..len-1.
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k= 0;

    while(k < len){
        dst[k]= dst[k] * src[k];
        k++;
    }
}
4063
/**
 * Element-wise product of src0 with src1 read back to front:
 * dst[i] = src0[i] * src1[len-1-i] for i in 0..len-1.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int fwd, rev;

    for(fwd=0, rev=len-1; fwd<len; fwd++, rev--)
        dst[fwd]= src0[fwd] * src1[rev];
}
4070
/**
 * Multiply-add with strided output:
 * dst[i*step] = src0[i] * src1[i] + src2[i] + src3 for i in 0..len-1.
 *
 * @param src3 integer constant added to every result
 * @param step distance, in floats, between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int k;
    int out= 0; /* always k*step */

    for(k=0; k<len; k++){
        const float product= src0[k] * src1[k];

        dst[out]= product + src2[k] + src3;
        out+= step;
    }
}
4076
/**
 * Windowed combination of two len-sample inputs into 2*len outputs.
 *
 * With k running 0..len-1 and j = len-1-k:
 *   dst[k]       = src0[k]*win[len+j] - src1[j]*win[k]     + add_bias
 *   dst[len + j] = src0[k]*win[k]     + src1[j]*win[len+j] + add_bias
 *
 * @param dst  output, 2*len floats
 * @param src0 first input, len floats (read front to back)
 * @param src1 second input, len floats (read back to front)
 * @param win  window, 2*len floats
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    float *dst_hi= dst + len;
    const float *win_hi= win + len;
    int k, j;

    for(k=0, j=len-1; k<len; k++, j--) {
        const float s0 = src0[k];
        const float s1 = src1[j];
        const float wi = win[k];
        const float wj = win_hi[j];

        dst[k]    = s0*wj - s1*wi + add_bias;
        dst_hi[j] = s0*wi + s1*wj + add_bias;
    }
}
4091
/**
 * Converts integers to float while scaling: dst[i] = src[i] * mul
 * for i in 0..len-1.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k= 0;

    while(k < len){
        dst[k]= src[k] * mul;
        k++;
    }
}
4097
4098 static av_always_inline int float_to_int16_one(const float *src){
4099     int_fast32_t tmp = *(const int32_t*)src;
4100     if(tmp & 0xf0000){
4101         tmp = (0x43c0ffff - tmp)>>31;
4102         // is this faster on some gcc/cpu combinations?
4103 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4104 //      else                 tmp = 0;
4105     }
4106     return tmp - 0x8000;
4107 }
4108
/**
 * Converts len floats to int16_t, one at a time, with
 * float_to_int16_one(): dst[i] receives the conversion of src[i].
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int k= 0;

    while(k < len){
        dst[k]= float_to_int16_one(&src[k]);
        k++;
    }
}
4114
/**
 * Converts planar float channels to interleaved int16_t samples.
 *
 * Sample i of channel ch is converted with float_to_int16_one() and stored
 * at dst[i*channels + ch]. Two channels are handled by a dedicated loop
 * that writes both samples of a frame together.
 *
 * @param src      array of channels pointers, each to len floats
 * @param len      samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, ch;

    if(channels==2){
        for(i=0; i<len; i++){
            int16_t *frame= dst + 2*i;

            frame[0]= float_to_int16_one(&src[0][i]);
            frame[1]= float_to_int16_one(&src[1][i]);
        }
        return;
    }

    for(ch=0; ch<channels; ch++){
        int out= ch; /* always i*channels + ch */

        for(i=0; i<len; i++){
            dst[out]= float_to_int16_one(&src[ch][i]);
            out+= channels;
        }
    }
}
4128
/**
 * Adds v2 to v1 element-wise, in place: v1[i] += v2[i] for the first
 * order elements.
 */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k = 0;

    while (order--) {
        v1[k] += v2[k];
        k++;
    }
}
4134
/**
 * Subtracts v2 from v1 element-wise, in place: v1[i] -= v2[i] for the
 * first order elements.
 */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int k = 0;

    while (order--) {
        v1[k] -= v2[k];
        k++;
    }
}
4140
/**
 * Scalar product of two int16_t vectors with per-term scaling.
 *
 * @param order number of elements
 * @param shift right shift applied to every product before accumulation
 * @return sum over i of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;
    int k = 0;

    while (order--) {
        const int product = v1[k] * v2[k];

        acc += product >> shift;
        k++;
    }

    return acc;
}
4150
/* Fixed-point cosine factors used by the WMV2 IDCT, scaled by 2048. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * One-dimensional pass over 8 consecutive coefficients, in place.
 * Results are rounded (+128) and scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    /* read the inputs once; all arithmetic below is done in int */
    const int x0 = b[0], x1 = b[1], x2 = b[2], x3 = b[3];
    const int x4 = b[4], x5 = b[5], x6 = b[6], x7 = b[7];
    const int round = 1 << 7;

    /* step 1 */
    const int a1 = W1*x1 + W7*x7;
    const int a7 = W7*x1 - W1*x7;
    const int a5 = W5*x5 + W3*x3;
    const int a3 = W3*x5 - W5*x3;
    const int a2 = W2*x2 + W6*x6;
    const int a6 = W6*x2 - W2*x6;
    const int a0 = W0*x0 + W0*x4;
    const int a4 = W0*x0 - W0*x4;

    /* step 2: scale the odd-part combinations by 181/256 */
    const int s1 = (181*(a1-a5+a7-a3) + 128) >> 8; /* 1,3,5,7 */
    const int s2 = (181*(a1-a5-a7+a3) + 128) >> 8;

    /* step 3 */
    b[0] = (a0+a2+a1+a5 + round) >> 8;
    b[1] = (a4+a6 +s1   + round) >> 8;
    b[2] = (a4-a6 +s2   + round) >> 8;
    b[3] = (a0-a2+a7+a3 + round) >> 8;
    b[4] = (a0-a2-a7-a3 + round) >> 8;
    b[5] = (a4-a6 -s2   + round) >> 8;
    b[6] = (a4+a6 -s1   + round) >> 8;
    b[7] = (a0+a2-a1-a5 + round) >> 8;
}

/**
 * One-dimensional pass over 8 coefficients spaced 8 apart (one column of
 * an 8x8 block), in place. Step 1 drops 3 bits, the final step rounds
 * (+8192) and drops 14 more.
 */
static void wmv2_idct_col(short * b)
{
    const int x0 = b[8*0], x1 = b[8*1], x2 = b[8*2], x3 = b[8*3];
    const int x4 = b[8*4], x5 = b[8*5], x6 = b[8*6], x7 = b[8*7];
    const int round = 1 << 13;

    /* step 1, with extended precision; the W0 terms are not rounded */
    const int a1 = (W1*x1 + W7*x7 + 4) >> 3;
    const int a7 = (W7*x1 - W1*x7 + 4) >> 3;
    const int a5 = (W5*x5 + W3*x3 + 4) >> 3;
    const int a3 = (W3*x5 - W5*x3 + 4) >> 3;
    const int a2 = (W2*x2 + W6*x6 + 4) >> 3;
    const int a6 = (W6*x2 - W2*x6 + 4) >> 3;
    const int a0 = (W0*x0 + W0*x4    ) >> 3;
    const int a4 = (W0*x0 - W0*x4    ) >> 3;

    /* step 2 */
    const int s1 = (181*(a1-a5+a7-a3) + 128) >> 8;
    const int s2 = (181*(a1-a5-a7+a3) + 128) >> 8;

    /* step 3 */
    b[8*0] = (a0+a2+a1+a5 + round) >> 14;
    b[8*1] = (a4+a6 +s1   + round) >> 14;
    b[8*2] = (a4-a6 +s2   + round) >> 14;
    b[8*3] = (a0-a2+a7+a3 + round) >> 14;

    b[8*4] = (a0-a2-a7-a3 + round) >> 14;
    b[8*5] = (a4-a6 -s2   + round) >> 14;
    b[8*6] = (a4+a6 -s1   + round) >> 14;
    b[8*7] = (a0+a2-a1-a5 + round) >> 14;
}

/**
 * In-place 8x8 inverse transform: wmv2_idct_row() on each of the 8 rows,
 * then wmv2_idct_col() on each of the 8 columns.
 *
 * @param block 64 coefficients, row-major
 */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for(row=0; row<8; row++)
        wmv2_idct_row(block + 8*row);

    for(col=0; col<8; col++)
        wmv2_idct_col(block + col);
}
/* XXX: these wrapper functions should be removed as soon as all IDCTs
 are converted */
/**
 * Runs ff_wmv2_idct_c() on block in place, then passes the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Runs ff_wmv2_idct_c() on block in place, then passes the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/**
 * Runs j_rev_dct() on block in place, then passes the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Runs j_rev_dct() on block in place, then passes the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
4245
/**
 * Runs j_rev_dct4() on block in place, then passes the result to
 * put_pixels_clamped4_c() together with dest and line_size.
 * dsputil_init() installs this as idct_put for lowres==1.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Runs j_rev_dct4() on block in place, then passes the result to
 * add_pixels_clamped4_c() together with dest and line_size.
 * dsputil_init() installs this as idct_add for lowres==1.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
4256
/**
 * Runs j_rev_dct2() on block in place, then passes the result to
 * put_pixels_clamped2_c() together with dest and line_size.
 * dsputil_init() installs this as idct_put for lowres==2.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Runs j_rev_dct2() on block in place, then passes the result to
 * add_pixels_clamped2_c() together with dest and line_size.
 * dsputil_init() installs this as idct_add for lowres==2.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
4267
4268 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4269 {
4270     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4271
4272     dest[0] = cm[(block[0] + 4)>>3];
4273 }
4274 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4275 {
4276     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4277
4278     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4279 }
4280
/* No-op; dsputil_init() installs it as the default c->prefetch. All three
 * arguments are ignored. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4282
4283 /* init static data */
4284 void dsputil_static_init(void)
4285 {
4286     int i;
4287
4288     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4289     for(i=0;i<MAX_NEG_CROP;i++) {
4290         ff_cropTbl[i] = 0;
4291         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4292     }
4293
4294     for(i=0;i<512;i++) {
4295         ff_squareTbl[i] = (i - 256) * (i - 256);
4296     }
4297
4298     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4299 }
4300
4301 int ff_check_alignment(void){
4302     static int did_fail=0;
4303     DECLARE_ALIGNED_16(int, aligned);
4304
4305     if((intptr_t)&aligned & 15){
4306         if(!did_fail){
4307 #if HAVE_MMX || HAVE_ALTIVEC
4308             av_log(NULL, AV_LOG_ERROR,
4309                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4310                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4311                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4312                 "Do not report crashes to FFmpeg developers.\n");
4313 #endif
4314             did_fail=1;
4315         }
4316         return -1;
4317     }
4318     return 0;
4319 }
4320
4321 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4322 {
4323     int i;
4324
4325     ff_check_alignment();
4326
4327 #if CONFIG_ENCODERS
4328     if(avctx->dct_algo==FF_DCT_FASTINT) {
4329         c->fdct = fdct_ifast;
4330         c->fdct248 = fdct_ifast248;
4331     }
4332     else if(avctx->dct_algo==FF_DCT_FAAN) {
4333         c->fdct = ff_faandct;
4334         c->fdct248 = ff_faandct248;
4335     }
4336     else {
4337         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4338         c->fdct248 = ff_fdct248_islow;
4339     }
4340 #endif //CONFIG_ENCODERS
4341
4342     if(avctx->lowres==1){
4343         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4344             c->idct_put= ff_jref_idct4_put;
4345             c->idct_add= ff_jref_idct4_add;
4346         }else{
4347             c->idct_put= ff_h264_lowres_idct_put_c;
4348             c->idct_add= ff_h264_lowres_idct_add_c;
4349         }
4350         c->idct    = j_rev_dct4;
4351         c->idct_permutation_type= FF_NO_IDCT_PERM;
4352     }else if(avctx->lowres==2){
4353         c->idct_put= ff_jref_idct2_put;
4354         c->idct_add= ff_jref_idct2_add;
4355         c->idct    = j_rev_dct2;
4356         c->idct_permutation_type= FF_NO_IDCT_PERM;
4357     }else if(avctx->lowres==3){
4358         c->idct_put= ff_jref_idct1_put;
4359         c->idct_add= ff_jref_idct1_add;
4360         c->idct    = j_rev_dct1;
4361         c->idct_permutation_type= FF_NO_IDCT_PERM;
4362     }else{
4363         if(avctx->idct_algo==FF_IDCT_INT){
4364             c->idct_put= ff_jref_idct_put;
4365             c->idct_add= ff_jref_idct_add;
4366             c->idct    = j_rev_dct;
4367             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4368         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER ) &&
4369                 avctx->idct_algo==FF_IDCT_VP3){
4370             c->idct_put= ff_vp3_idct_put_c;
4371             c->idct_add= ff_vp3_idct_add_c;
4372             c->idct    = ff_vp3_idct_c;
4373             c->idct_permutation_type= FF_NO_IDCT_PERM;
4374         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4375             c->idct_put= ff_wmv2_idct_put_c;
4376             c->idct_add= ff_wmv2_idct_add_c;
4377             c->idct    = ff_wmv2_idct_c;
4378             c->idct_permutation_type= FF_NO_IDCT_PERM;
4379         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4380             c->idct_put= ff_faanidct_put;
4381             c->idct_add= ff_faanidct_add;
4382             c->idct    = ff_faanidct;
4383             c->idct_permutation_type= FF_NO_IDCT_PERM;
4384         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4385             c->idct_put= ff_ea_idct_put_c;
4386             c->idct_permutation_type= FF_NO_IDCT_PERM;
4387         }else{ //accurate/default
4388             c->idct_put= ff_simple_idct_put;
4389             c->idct_add= ff_simple_idct_add;
4390             c->idct    = ff_simple_idct;
4391             c->idct_permutation_type= FF_NO_IDCT_PERM;
4392         }
4393     }
4394
4395     if (CONFIG_H264_DECODER) {
4396         c->h264_idct_add= ff_h264_idct_add_c;
4397         c->h264_idct8_add= ff_h264_idct8_add_c;
4398         c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4399         c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4400         c->h264_idct_add16     = ff_h264_idct_add16_c;
4401         c->h264_idct8_add4     = ff_h264_idct8_add4_c;
4402         c->h264_idct_add8      = ff_h264_idct_add8_c;
4403         c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4404     }
4405
4406     c->get_pixels = get_pixels_c;
4407     c->diff_pixels = diff_pixels_c;
4408     c->put_pixels_clamped = put_pixels_clamped_c;
4409     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4410     c->add_pixels_clamped = add_pixels_clamped_c;
4411     c->add_pixels8 = add_pixels8_c;
4412     c->add_pixels4 = add_pixels4_c;
4413     c->sum_abs_dctelem = sum_abs_dctelem_c;
4414     c->gmc1 = gmc1_c;
4415     c->gmc = ff_gmc_c;
4416     c->clear_block = clear_block_c;
4417     c->clear_blocks = clear_blocks_c;
4418     c->pix_sum = pix_sum_c;
4419     c->pix_norm1 = pix_norm1_c;
4420
4421     /* TODO [0] 16  [1] 8 */
4422     c->pix_abs[0][0] = pix_abs16_c;
4423     c->pix_abs[0][1] = pix_abs16_x2_c;
4424     c->pix_abs[0][2] = pix_abs16_y2_c;
4425     c->pix_abs[0][3] = pix_abs16_xy2_c;
4426     c->pix_abs[1][0] = pix_abs8_c;
4427     c->pix_abs[1][1] = pix_abs8_x2_c;
4428     c->pix_abs[1][2] = pix_abs8_y2_c;
4429     c->pix_abs[1][3] = pix_abs8_xy2_c;
4430
4431 #define dspfunc(PFX, IDX, NUM) \
4432     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4433     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4434     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4435     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4436
4437     dspfunc(put, 0, 16);
4438     dspfunc(put_no_rnd, 0, 16);
4439     dspfunc(put, 1, 8);
4440     dspfunc(put_no_rnd, 1, 8);
4441     dspfunc(put, 2, 4);
4442     dspfunc(put, 3, 2);
4443
4444     dspfunc(avg, 0, 16);
4445     dspfunc(avg_no_rnd, 0, 16);
4446     dspfunc(avg, 1, 8);
4447     dspfunc(avg_no_rnd, 1, 8);
4448     dspfunc(avg, 2, 4);
4449     dspfunc(avg, 3, 2);
4450 #undef dspfunc
4451
4452     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4453     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4454
4455     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4456     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4457     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4458     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4459     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4460     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4461     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4462     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4463     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4464
4465     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4466     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4467     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4468     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4469     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4470     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4471     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4472     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4473     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4474
4475 #define dspfunc(PFX, IDX, NUM) \
4476     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4477     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4478     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4479     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4480     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4481     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4482     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4483     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4484     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4485     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4486     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4487     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4488     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4489     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4490     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4491     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4492
4493     dspfunc(put_qpel, 0, 16);
4494     dspfunc(put_no_rnd_qpel, 0, 16);
4495
4496     dspfunc(avg_qpel, 0, 16);
4497     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4498
4499     dspfunc(put_qpel, 1, 8);
4500     dspfunc(put_no_rnd_qpel, 1, 8);
4501
4502     dspfunc(avg_qpel, 1, 8);
4503     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4504
4505     dspfunc(put_h264_qpel, 0, 16);
4506     dspfunc(put_h264_qpel, 1, 8);
4507     dspfunc(put_h264_qpel, 2, 4);
4508     dspfunc(put_h264_qpel, 3, 2);
4509     dspfunc(avg_h264_qpel, 0, 16);
4510     dspfunc(avg_h264_qpel, 1, 8);
4511     dspfunc(avg_h264_qpel, 2, 4);
4512
4513 #undef dspfunc
4514     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4515     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4516     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4517     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4518     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4519     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4520     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4521     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4522
4523     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4524     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4525     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4526     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4527     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4528     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4529     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4530     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4531     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4532     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4533     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4534     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4535     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4536     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4537     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4538     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4539     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4540     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4541     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4542     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4543
4544     c->draw_edges = draw_edges_c;
4545
4546 #if CONFIG_CAVS_DECODER
4547     ff_cavsdsp_init(c,avctx);
4548 #endif
4549
4550 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4551     ff_mlp_init(c, avctx);
4552 #endif
4553 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
4554     ff_vc1dsp_init(c,avctx);
4555 #endif
4556 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
4557     ff_intrax8dsp_init(c,avctx);
4558 #endif
4559 #if CONFIG_RV30_DECODER
4560     ff_rv30dsp_init(c,avctx);
4561 #endif
4562 #if CONFIG_RV40_DECODER
4563     ff_rv40dsp_init(c,avctx);
4564     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4565     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4566     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4567     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4568 #endif
4569
4570     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4571     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4572     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4573     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4574     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4575     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4576     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4577     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4578
4579 #define SET_CMP_FUNC(name) \
4580     c->name[0]= name ## 16_c;\
4581     c->name[1]= name ## 8x8_c;
4582
4583     SET_CMP_FUNC(hadamard8_diff)
4584     c->hadamard8_diff[4]= hadamard8_intra16_c;
4585     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4586     SET_CMP_FUNC(dct_sad)
4587     SET_CMP_FUNC(dct_max)
4588 #if CONFIG_GPL
4589     SET_CMP_FUNC(dct264_sad)
4590 #endif
4591     c->sad[0]= pix_abs16_c;
4592     c->sad[1]= pix_abs8_c;
4593     c->sse[0]= sse16_c;
4594     c->sse[1]= sse8_c;
4595     c->sse[2]= sse4_c;
4596     SET_CMP_FUNC(quant_psnr)
4597     SET_CMP_FUNC(rd)
4598     SET_CMP_FUNC(bit)
4599     c->vsad[0]= vsad16_c;
4600     c->vsad[4]= vsad_intra16_c;
4601     c->vsad[5]= vsad_intra8_c;
4602     c->vsse[0]= vsse16_c;
4603     c->vsse[4]= vsse_intra16_c;
4604     c->vsse[5]= vsse_intra8_c;
4605     c->nsse[0]= nsse16_c;
4606     c->nsse[1]= nsse8_c;
4607 #if CONFIG_SNOW_ENCODER
4608     c->w53[0]= w53_16_c;
4609     c->w53[1]= w53_8_c;
4610     c->w97[0]= w97_16_c;
4611     c->w97[1]= w97_8_c;
4612 #endif
4613
4614     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4615
4616     c->add_bytes= add_bytes_c;
4617     c->add_bytes_l2= add_bytes_l2_c;
4618     c->diff_bytes= diff_bytes_c;
4619     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4620     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4621     c->bswap_buf= bswap_buf;
4622 #if CONFIG_PNG_DECODER
4623     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4624 #endif
4625
4626     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4627     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4628     c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4629     c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4630     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4631     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4632     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4633     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4634     c->h264_loop_filter_strength= NULL;
4635
4636     if (CONFIG_ANY_H263) {
4637         c->h263_h_loop_filter= h263_h_loop_filter_c;
4638         c->h263_v_loop_filter= h263_v_loop_filter_c;
4639     }
4640
4641     if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
4642         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4643         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4644     }
4645     if (CONFIG_VP6_DECODER) {
4646         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4647     }
4648
4649     c->h261_loop_filter= h261_loop_filter_c;
4650
4651     c->try_8x8basis= try_8x8basis_c;
4652     c->add_8x8basis= add_8x8basis_c;
4653
4654 #if CONFIG_SNOW_DECODER
4655     c->vertical_compose97i = ff_snow_vertical_compose97i;
4656     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4657     c->inner_add_yblock = ff_snow_inner_add_yblock;
4658 #endif
4659
4660 #if CONFIG_VORBIS_DECODER
4661     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4662 #endif
4663 #if CONFIG_AC3_DECODER
4664     c->ac3_downmix = ff_ac3_downmix_c;
4665 #endif
4666 #if CONFIG_FLAC_ENCODER
4667     c->flac_compute_autocorr = ff_flac_compute_autocorr;
4668 #endif
4669     c->vector_fmul = vector_fmul_c;
4670     c->vector_fmul_reverse = vector_fmul_reverse_c;
4671     c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4672     c->vector_fmul_window = ff_vector_fmul_window_c;
4673     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4674     c->float_to_int16 = ff_float_to_int16_c;
4675     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4676     c->add_int16 = add_int16_c;
4677     c->sub_int16 = sub_int16_c;
4678     c->scalarproduct_int16 = scalarproduct_int16_c;
4679
4680     c->shrink[0]= ff_img_copy_plane;
4681     c->shrink[1]= ff_shrink22;
4682     c->shrink[2]= ff_shrink44;
4683     c->shrink[3]= ff_shrink88;
4684
4685     c->prefetch= just_return;
4686
4687     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4688     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4689
4690     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4691     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4692     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4693     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4694     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4695     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4696     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4697     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4698     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4699
4700     for(i=0; i<64; i++){
4701         if(!c->put_2tap_qpel_pixels_tab[0][i])
4702             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4703         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4704             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4705     }
4706
4707     switch(c->idct_permutation_type){
4708     case FF_NO_IDCT_PERM:
4709         for(i=0; i<64; i++)
4710             c->idct_permutation[i]= i;
4711         break;
4712     case FF_LIBMPEG2_IDCT_PERM:
4713         for(i=0; i<64; i++)
4714             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4715         break;
4716     case FF_SIMPLE_IDCT_PERM:
4717         for(i=0; i<64; i++)
4718             c->idct_permutation[i]= simple_mmx_permutation[i];
4719         break;
4720     case FF_TRANSPOSE_IDCT_PERM:
4721         for(i=0; i<64; i++)
4722             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4723         break;
4724     case FF_PARTTRANS_IDCT_PERM:
4725         for(i=0; i<64; i++)
4726             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4727         break;
4728     case FF_SSE2_IDCT_PERM:
4729         for(i=0; i<64; i++)
4730             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4731         break;
4732     default:
4733         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4734     }
4735 }
4736