libpostproc/postprocess.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
   3  *
   4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /**
  24  * @file postprocess.c
  25  * postprocessing.
  26  */
  27
  28 /*
  29                         C       MMX     MMX2    3DNow   AltiVec
  30 isVertDC                Ec      Ec                      Ec
  31 isVertMinMaxOk          Ec      Ec                      Ec
  32 doVertLowPass           E               e       e       Ec
  33 doVertDefFilter         Ec      Ec      e       e       Ec
  34 isHorizDC               Ec      Ec                      Ec
  35 isHorizMinMaxOk         a       E                       Ec
  36 doHorizLowPass          E               e       e       Ec
  37 doHorizDefFilter        Ec      Ec      e       e       Ec
  38 do_a_deblock            Ec      E       Ec      E
  39 deRing                  E               e       e*      Ecp
  40 Vertical RKAlgo1        E               a       a
  41 Horizontal RKAlgo1                      a       a
  42 Vertical X1#            a               E       E
  43 Horizontal X1#          a               E       E
  44 LinIpolDeinterlace      e               E       E*
  45 CubicIpolDeinterlace    a               e       e*
  46 LinBlendDeinterlace     e               E       E*
  47 MedianDeinterlace#      E       Ec      Ec
  48 TempDeNoiser#           E               e       e       Ec
  49
  50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
  51 # more or less selfinvented filters so the exactness is not too meaningful
  52 E = Exact implementation
  53 e = almost exact implementation (slightly different rounding, ...)
  54 a = alternative / approximate impl
  55 c = checked against the other implementations (-vo md5)
  56 p = partially optimized, still some work to do
  57 */
  58
  59 /*
  60 TODO:
  61 reduce the time wasted on the mem transfer
  62 unroll stuff if instructions depend too much on the prior one
  63 move YScale thing to the end instead of fixing QP
  64 write a faster and higher quality deblocking filter :)
  65 make the mainloop more flexible (variable number of blocks at once
  66         (the if/else stuff per block is slowing things down)
  67 compare the quality & speed of all filters
  68 split this huge file
  69 optimize c versions
  70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
  71 ...
  72 */
  73
  74 //Changelog: use the Subversion log
  75
  76 #include "config.h"
  77 #include "avutil.h"
  78 #include <inttypes.h>
  79 #include <stdio.h>
  80 #include <stdlib.h>
  81 #include <string.h>
  82 #ifdef HAVE_MALLOC_H
  83 #include <malloc.h>
  84 #endif
  85 //#undef HAVE_MMX2
  86 //#define HAVE_3DNOW
  87 //#undef HAVE_MMX
  88 //#undef ARCH_X86
  89 //#define DEBUG_BRIGHTNESS
  90 #include "postprocess.h"
  91 #include "postprocess_internal.h"
  92
  93 #ifdef HAVE_ALTIVEC_H
  94 #include <altivec.h>
  95 #endif
  96
  97 #define GET_MODE_BUFFER_SIZE 500
  98 #define OPTIONS_ARRAY_SIZE 10
  99 #define BLOCK_SIZE 8
 100 #define TEMP_STRIDE 8
 101 //#define NUM_BLOCKS_AT_ONCE 16 //not used yet
 102
 103 #if defined(ARCH_X86)
     /* 64-bit constants, only declared for x86 builds. The names mirror the
      * literal values: wNN holds 0x00NN in each of the four 16-bit words and
      * bNN holds 0xNN in each of the eight bytes.
      * NOTE(review): DECLARE_ASM_CONST is defined outside this file; its first
      * argument is presumably the alignment in bytes -- confirm. */
 104 DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
 105 DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
 106 DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
 107 DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
 108 DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
 109 DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
 110 DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
 111 DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
 112 #endif
 113
     /* Declared for every architecture (outside the ARCH_X86 guard).
      * NOTE(review): presumably the threshold of the deRing filter implemented
      * in postprocess_template.c; it is not referenced in the visible part of
      * this file -- confirm. */
 114 DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
 115
 116
 117 static struct PPFilter filters[]=
 118 {
     /* One entry per filter that can be named in a postprocessing mode string.
      * The first two fields are the short and the long filter name, the last
      * field is the filter's mode flag. The list ends with an all-NULL/zero
      * entry.
      * NOTE(review): the three integer fields are defined by struct PPFilter
      * in postprocess_internal.h (not visible here) -- presumably the chroma
      * default and the minimum luma / chroma quality; confirm. */
 119         {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
 120         {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
 121 /*      {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
 122         {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
 123         {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
 124         {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
 125         {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
 126         {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
 127         {"dr", "dering",                1, 5, 6, DERING},
 128         {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
 129         {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
 130         {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
 131         {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
 132         {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
 133         {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
 134         {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
 135         {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
 136         {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
 137         {NULL, NULL,0,0,0,0} // end marker: terminates the table
 138 };
 139
 140 static const char *replaceTable[]=
 141 {
     /* Flat list of (alias, expansion) string pairs: the name on the left
      * stands for the filter string on its right. pp_help documents the same
      * expansions for default/de, fast/fa and ac. A single NULL ends the list. */
 142         "default",      "hb:a,vb:a,dr:a",
 143         "de",           "hb:a,vb:a,dr:a",
 144         "fast",         "h1:a,v1:a,dr:a",
 145         "fa",           "h1:a,v1:a,dr:a",
 146         "ac",           "ha:a:128:7,va:a,dr:a",
 147         NULL // end marker: terminates the list
 148 };
 149
 150
 151 #if defined(ARCH_X86)
 152 static inline void prefetchnta(void *p)
 153 {
 154         asm volatile(   "prefetchnta (%0)\n\t"
 155                 : : "r" (p)
 156         );
 157 }
 158
 159 static inline void prefetcht0(void *p)
 160 {
 161         asm volatile(   "prefetcht0 (%0)\n\t"
 162                 : : "r" (p)
 163         );
 164 }
 165
 166 static inline void prefetcht1(void *p)
 167 {
 168         asm volatile(   "prefetcht1 (%0)\n\t"
 169                 : : "r" (p)
 170         );
 171 }
 172
 173 static inline void prefetcht2(void *p)
 174 {
 175         asm volatile(   "prefetcht2 (%0)\n\t"
 176                 : : "r" (p)
 177         );
 178 }
 179 #endif
 180
 181 // The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing.
 182
 183 /**
 184  * Check if the given 8x8 Block is mostly "flat"
 185  */
 186 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
 187 {
 188         int numEq= 0;
 189         int y;
 190         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 191         const int dcThreshold= dcOffset*2 + 1;
 192
 193         for(y=0; y<BLOCK_SIZE; y++)
 194         {
 195                 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
 196                 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
 197                 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
 198                 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
 199                 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
 200                 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
 201                 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
 202                 src+= stride;
 203         }
 204         return numEq > c->ppMode.flatnessThreshold;
 205 }
 206
 207 /**
 208  * Check if the middle 8x8 Block in the given 8x16 block is flat
 209  */
 210 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
 211         int numEq= 0;
 212         int y;
 213         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 214         const int dcThreshold= dcOffset*2 + 1;
 215
 216         src+= stride*4; // src points to begin of the 8x8 Block
 217         for(y=0; y<BLOCK_SIZE-1; y++)
 218         {
 219                 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
 220                 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
 221                 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
 222                 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
 223                 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
 224                 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
 225                 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
 226                 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
 227                 src+= stride;
 228         }
 229         return numEq > c->ppMode.flatnessThreshold;
 230 }
 231
 232 static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
 233 {
 234         int i;
 235 #if 1
 236         for(i=0; i<2; i++){
 237                 if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
 238                 src += stride;
 239                 if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
 240                 src += stride;
 241                 if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
 242                 src += stride;
 243                 if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
 244                 src += stride;
 245         }
 246 #else
 247         for(i=0; i<8; i++){
 248                 if((unsigned)(src[0] - src[7] + 2*QP) > 4*QP) return 0;
 249                 src += stride;
 250         }
 251 #endif
 252         return 1;
 253 }
 254
 255 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
 256 {
 257 #if 1
 258 #if 1
 259         int x;
 260         src+= stride*4;
 261         for(x=0; x<BLOCK_SIZE; x+=4)
 262         {
 263                 if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
 264                 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
 265                 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
 266                 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
 267         }
 268 #else
 269         int x;
 270         src+= stride*3;
 271         for(x=0; x<BLOCK_SIZE; x++)
 272         {
 273                 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
 274         }
 275 #endif
 276         return 1;
 277 #else
 278         int x;
 279         src+= stride*4;
 280         for(x=0; x<BLOCK_SIZE; x++)
 281         {
 282                 int min=255;
 283                 int max=0;
 284                 int y;
 285                 for(y=0; y<8; y++){
 286                         int v= src[x + y*stride];
 287                         if(v>max) max=v;
 288                         if(v<min) min=v;
 289                 }
 290                 if(max-min > 2*QP) return 0;
 291         }
 292         return 1;
 293 #endif
 294 }
 295
 296 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
 297         if( isHorizDC_C(src, stride, c) ){
 298                 if( isHorizMinMaxOk_C(src, stride, c->QP) )
 299                         return 1;
 300                 else
 301                         return 0;
 302         }else{
 303                 return 2;
 304         }
 305 }
 306
 307 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
 308         if( isVertDC_C(src, stride, c) ){
 309                 if( isVertMinMaxOk_C(src, stride, c->QP) )
 310                         return 1;
 311                 else
 312                         return 0;
 313         }else{
 314                 return 2;
 315         }
 316 }
 317
 318 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
 319 {
 320         int y;
 321         for(y=0; y<BLOCK_SIZE; y++)
 322         {
 323                 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
 324
 325                 if(FFABS(middleEnergy) < 8*c->QP)
 326                 {
 327                         const int q=(dst[3] - dst[4])/2;
 328                         const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
 329                         const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
 330
 331                         int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 332                         d= FFMAX(d, 0);
 333
 334                         d= (5*d + 32) >> 6;
 335                         d*= FFSIGN(-middleEnergy);
 336
 337                         if(q>0)
 338                         {
 339                                 d= d<0 ? 0 : d;
 340                                 d= d>q ? q : d;
 341                         }
 342                         else
 343                         {
 344                                 d= d>0 ? 0 : d;
 345                                 d= d<q ? q : d;
 346                         }
 347
 348                         dst[3]-= d;
 349                         dst[4]+= d;
 350                 }
 351                 dst+= stride;
 352         }
 353 }
 354
 355 /**
 356  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
 357  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
 358  */
 359 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
 360 {
 361         int y;
 362         for(y=0; y<BLOCK_SIZE; y++)
 363         {
 364                 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
 365                 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
 366
 367                 int sums[10];
 368                 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
 369                 sums[1] = sums[0] - first  + dst[3];
 370                 sums[2] = sums[1] - first  + dst[4];
 371                 sums[3] = sums[2] - first  + dst[5];
 372                 sums[4] = sums[3] - first  + dst[6];
 373                 sums[5] = sums[4] - dst[0] + dst[7];
 374                 sums[6] = sums[5] - dst[1] + last;
 375                 sums[7] = sums[6] - dst[2] + last;
 376                 sums[8] = sums[7] - dst[3] + last;
 377                 sums[9] = sums[8] - dst[4] + last;
 378
 379                 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
 380                 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
 381                 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
 382                 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
 383                 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
 384                 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
 385                 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
 386                 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
 387
 388                 dst+= stride;
 389         }
 390 }
 391
 392 /**
 393  * Experimental Filter 1 (Horizontal)
 394  * will not damage linear gradients
 395  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
 396  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
 397  * MMX2 version does correct clipping C version does not
 398  * not identical with the vertical one
 399  */
 400 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
 401 {
 402         int y;
 403         static uint64_t *lut= NULL;
 404         if(lut==NULL)
 405         {
 406                 int i;
 407                 lut = av_malloc(256*8);
 408                 for(i=0; i<256; i++)
 409                 {
 410                         int v= i < 128 ? 2*i : 2*(i-256);
 411 /*
 412 //Simulate 112242211 9-Tap filter
 413                         uint64_t a= (v/16) & 0xFF;
 414                         uint64_t b= (v/8) & 0xFF;
 415                         uint64_t c= (v/4) & 0xFF;
 416                         uint64_t d= (3*v/8) & 0xFF;
 417 */
 418 //Simulate piecewise linear interpolation
 419                         uint64_t a= (v/16) & 0xFF;
 420                         uint64_t b= (v*3/16) & 0xFF;
 421                         uint64_t c= (v*5/16) & 0xFF;
 422                         uint64_t d= (7*v/16) & 0xFF;
 423                         uint64_t A= (0x100 - a)&0xFF;
 424                         uint64_t B= (0x100 - b)&0xFF;
 425                         uint64_t C= (0x100 - c)&0xFF;
 426                         uint64_t D= (0x100 - c)&0xFF;
 427
 428                         lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
 429                                 (D<<24) | (C<<16) | (B<<8) | (A);
 430                         //lut[i] = (v<<32) | (v<<24);
 431                 }
 432         }
 433
 434         for(y=0; y<BLOCK_SIZE; y++)
 435         {
 436                 int a= src[1] - src[2];
 437                 int b= src[3] - src[4];
 438                 int c= src[5] - src[6];
 439
 440                 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
 441
 442                 if(d < QP)
 443                 {
 444                         int v = d * FFSIGN(-b);
 445
 446                         src[1] +=v/8;
 447                         src[2] +=v/4;
 448                         src[3] +=3*v/8;
 449                         src[4] -=3*v/8;
 450                         src[5] -=v/4;
 451                         src[6] -=v/8;
 452
 453                 }
 454                 src+=stride;
 455         }
 456 }
 457
 458 /**
 459  * accurate deblock filter
 460  */
 461 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
 462         int y;
 463         const int QP= c->QP;
 464         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
 465         const int dcThreshold= dcOffset*2 + 1;
 466 //START_TIMER
 467         src+= step*4; // src points to begin of the 8x8 Block
 468         for(y=0; y<8; y++){
 469                 int numEq= 0;
 470
 471                 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
 472                 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
 473                 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
 474                 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
 475                 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
 476                 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
 477                 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
 478                 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
 479                 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
 480                 if(numEq > c->ppMode.flatnessThreshold){
 481                         int min, max, x;
 482
 483                         if(src[0] > src[step]){
 484                             max= src[0];
 485                             min= src[step];
 486                         }else{
 487                             max= src[step];
 488                             min= src[0];
 489                         }
 490                         for(x=2; x<8; x+=2){
 491                                 if(src[x*step] > src[(x+1)*step]){
 492                                         if(src[x    *step] > max) max= src[ x   *step];
 493                                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
 494                                 }else{
 495                                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
 496                                         if(src[ x   *step] < min) min= src[ x   *step];
 497                                 }
 498                         }
 499                         if(max-min < 2*QP){
 500                                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
 501                                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
 502
 503                                 int sums[10];
 504                                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
 505                                 sums[1] = sums[0] - first       + src[3*step];
 506                                 sums[2] = sums[1] - first       + src[4*step];
 507                                 sums[3] = sums[2] - first       + src[5*step];
 508                                 sums[4] = sums[3] - first       + src[6*step];
 509                                 sums[5] = sums[4] - src[0*step] + src[7*step];
 510                                 sums[6] = sums[5] - src[1*step] + last;
 511                                 sums[7] = sums[6] - src[2*step] + last;
 512                                 sums[8] = sums[7] - src[3*step] + last;
 513                                 sums[9] = sums[8] - src[4*step] + last;
 514
 515                                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
 516                                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
 517                                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
 518                                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
 519                                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
 520                                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
 521                                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
 522                                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
 523                         }
 524                 }else{
 525                         const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
 526
 527                         if(FFABS(middleEnergy) < 8*QP)
 528                         {
 529                                 const int q=(src[3*step] - src[4*step])/2;
 530                                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
 531                                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
 532
 533                                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
 534                                 d= FFMAX(d, 0);
 535
 536                                 d= (5*d + 32) >> 6;
 537                                 d*= FFSIGN(-middleEnergy);
 538
 539                                 if(q>0)
 540                                 {
 541                                         d= d<0 ? 0 : d;
 542                                         d= d>q ? q : d;
 543                                 }
 544                                 else
 545                                 {
 546                                         d= d>0 ? 0 : d;
 547                                         d= d<q ? q : d;
 548                                 }
 549
 550                                 src[3*step]-= d;
 551                                 src[4*step]+= d;
 552                         }
 553                 }
 554
 555                 src += stride;
 556         }
 557 /*if(step==16){
 558     STOP_TIMER("step16")
 559 }else{
 560     STOP_TIMER("stepX")
 561 }*/
 562 }
 563
 564 // Note: there are C, MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one.
     // The COMPILE_xxx macros below select which variants of the template get built.
     // With RUNTIME_CPUDETECT the C variant is selected and, on ARCH_X86, all x86 variants as well.
 565 // Plain C version: needed unless MMX is configured without runtime detection
 566 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
 567 #define COMPILE_C
 568 #endif
 569
     // AltiVec version: only on PowerPC builds configured with HAVE_ALTIVEC
 570 #ifdef ARCH_POWERPC
 571 #ifdef HAVE_ALTIVEC
 572 #define COMPILE_ALTIVEC
 573 #endif //HAVE_ALTIVEC
 574 #endif //ARCH_POWERPC
 575
 576 #if defined(ARCH_X86)
 577
     // plain MMX only when neither 3DNOW nor MMX2 is configured
 578 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 579 #define COMPILE_MMX
 580 #endif
 581
 582 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
 583 #define COMPILE_MMX2
 584 #endif
 585
     // MMX2 takes precedence over 3DNOW when both are configured
 586 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
 587 #define COMPILE_3DNOW
 588 #endif
 589 #endif /* defined(ARCH_X86) */
 590
     // From here on the HAVE_xxx macros no longer describe the configuration:
     // they are cleared and then set again per variant before each inclusion.
 591 #undef HAVE_MMX
 592 #undef HAVE_MMX2
 593 #undef HAVE_3DNOW
 594 #undef HAVE_ALTIVEC
 595
     // Each section below defines RENAME() so that every identifier wrapped in it
     // gets a variant-specific suffix, then includes the template.
 596 #ifdef COMPILE_C
 597 #undef HAVE_MMX
 598 #undef HAVE_MMX2
 599 #undef HAVE_3DNOW
 600 #define RENAME(a) a ## _C
 601 #include "postprocess_template.c"
 602 #endif
 603
 604 #ifdef ARCH_POWERPC
 605 #ifdef COMPILE_ALTIVEC
 606 #undef RENAME
 607 #define HAVE_ALTIVEC
 608 #define RENAME(a) a ## _altivec
 609 #include "postprocess_altivec_template.c"
 610 #include "postprocess_template.c"
 611 #endif
 612 #endif //ARCH_POWERPC
 613
 614 // MMX versions
 615 #ifdef COMPILE_MMX
 616 #undef RENAME
 617 #define HAVE_MMX
 618 #undef HAVE_MMX2
 619 #undef HAVE_3DNOW
 620 #define RENAME(a) a ## _MMX
 621 #include "postprocess_template.c"
 622 #endif
 623
 624 // MMX2 versions (HAVE_MMX is defined as well)
 625 #ifdef COMPILE_MMX2
 626 #undef RENAME
 627 #define HAVE_MMX
 628 #define HAVE_MMX2
 629 #undef HAVE_3DNOW
 630 #define RENAME(a) a ## _MMX2
 631 #include "postprocess_template.c"
 632 #endif
 633
 634 // 3DNOW versions (HAVE_MMX is defined as well)
 635 #ifdef COMPILE_3DNOW
 636 #undef RENAME
 637 #define HAVE_MMX
 638 #undef HAVE_MMX2
 639 #define HAVE_3DNOW
 640 #define RENAME(a) a ## _3DNow
 641 #include "postprocess_template.c"
 642 #endif
 643
 644 // Minor note: after this line the HAVE_xyz macros reflect the sections above that were compiled, not the configured CPU features, so do not rely on them.
 645
 646 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 647         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
 648 {
 649         PPContext *c= (PPContext *)vc;
 650         PPMode *ppMode= (PPMode *)vm;
 651         c->ppMode= *ppMode; //FIXME
 652
 653         // Using ifs here as they are faster than function pointers although the
 654         // difference would not be measurable here but it is much better because
 655         // someone might exchange the CPU whithout restarting MPlayer ;)
 656 #ifdef RUNTIME_CPUDETECT
 657 #if defined(ARCH_X86)
 658         // ordered per speed fastest first
 659         if(c->cpuCaps & PP_CPU_CAPS_MMX2)
 660                 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 661         else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
 662                 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 663         else if(c->cpuCaps & PP_CPU_CAPS_MMX)
 664                 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 665         else
 666                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 667 #else
 668 #ifdef ARCH_POWERPC
 669 #ifdef HAVE_ALTIVEC
 670         if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
 671                 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 672         else
 673 #endif
 674 #endif
 675                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 676 #endif
 677 #else //RUNTIME_CPUDETECT
 678 #ifdef HAVE_MMX2
 679                 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 680 #elif defined (HAVE_3DNOW)
 681                 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 682 #elif defined (HAVE_MMX)
 683                 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 684 #elif defined (HAVE_ALTIVEC)
 685                 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 686 #else
 687                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
 688 #endif
 689 #endif //!RUNTIME_CPUDETECT
 690 }
 691
 692 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
 693 //        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
 694
 695 /* -pp Command line Help
 696 */
 697 #if LIBPOSTPROC_VERSION_INT < (52<<16)
 698 const char *const pp_help=
 699 #else
 700 const char pp_help[] =
 701 #endif
 702 "Available postprocessing filters:\n"
 703 "Filters                        Options\n"
 704 "short  long name       short   long option     Description\n"
 705 "*      *               a       autoq           CPU power dependent enabler\n"
 706 "                       c       chrom           chrominance filtering enabled\n"
 707 "                       y       nochrom         chrominance filtering disabled\n"
 708 "                       n       noluma          luma filtering disabled\n"
 709 "hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
 710 "       1. difference factor: default=32, higher -> more deblocking\n"
 711 "       2. flatness threshold: default=39, lower -> more deblocking\n"
 712 "                       the h & v deblocking filters share these\n"
 713 "                       so you can't set different thresholds for h / v\n"
 714 "vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
 715 "ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
 716 "va     vadeblock       (2 threshold)           vertical deblocking filter\n"
 717 "h1     x1hdeblock                              experimental h deblock filter 1\n"
 718 "v1     x1vdeblock                              experimental v deblock filter 1\n"
 719 "dr     dering                                  deringing filter\n"
 720 "al     autolevels                              automatic brightness / contrast\n"
 721 "                       f        fullyrange     stretch luminance to (0..255)\n"
 722 "lb     linblenddeint                           linear blend deinterlacer\n"
 723 "li     linipoldeint                            linear interpolating deinterlace\n"
 724 "ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
 725 "md     mediandeint                             median deinterlacer\n"
 726 "fd     ffmpegdeint                             ffmpeg deinterlacer\n"
 727 "l5     lowpass5                                FIR lowpass deinterlacer\n"
 728 "de     default                                 hb:a,vb:a,dr:a\n"
 729 "fa     fast                                    h1:a,v1:a,dr:a\n"
 730 "ac                                             ha:a:128:7,va:a,dr:a\n"
 731 "tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
 732 "                     1. <= 2. <= 3.            larger -> stronger filtering\n"
 733 "fq     forceQuant      <quantizer>             force quantizer\n"
 734 "Usage:\n"
 735 "<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
 736 "long form example:\n"
 737 "vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
 738 "short form example:\n"
 739 "vb:a/hb:a/lb                                   de,-vb\n"
 740 "more examples:\n"
 741 "tn:64:128:256\n"
 742 "\n"
 743 ;
 744
 745 pp_mode_t *pp_get_mode_by_name_and_quality(const char *name, int quality)
 746 {
 747         char temp[GET_MODE_BUFFER_SIZE];
 748         char *p= temp;
 749         static const char filterDelimiters[] = ",/";
 750         static const char optionDelimiters[] = ":";
 751         struct PPMode *ppMode;
 752         char *filterToken;
 753
 754         ppMode= av_malloc(sizeof(PPMode));
 755
 756         ppMode->lumMode= 0;
 757         ppMode->chromMode= 0;
 758         ppMode->maxTmpNoise[0]= 700;
 759         ppMode->maxTmpNoise[1]= 1500;
 760         ppMode->maxTmpNoise[2]= 3000;
 761         ppMode->maxAllowedY= 234;
 762         ppMode->minAllowedY= 16;
 763         ppMode->baseDcDiff= 256/8;
 764         ppMode->flatnessThreshold= 56-16-1;
 765         ppMode->maxClippedThreshold= 0.01;
 766         ppMode->error=0;
 767
 768         strncpy(temp, name, GET_MODE_BUFFER_SIZE);
 769
 770         av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
 771
 772         for(;;){
 773                 char *filterName;
 774                 int q= 1000000; //PP_QUALITY_MAX;
 775                 int chrom=-1;
 776                 int luma=-1;
 777                 char *option;
 778                 char *options[OPTIONS_ARRAY_SIZE];
 779                 int i;
 780                 int filterNameOk=0;
 781                 int numOfUnknownOptions=0;
 782                 int enable=1; //does the user want us to enabled or disabled the filter
 783
 784                 filterToken= strtok(p, filterDelimiters);
 785                 if(filterToken == NULL) break;
 786                 p+= strlen(filterToken) + 1; // p points to next filterToken
 787                 filterName= strtok(filterToken, optionDelimiters);
 788                 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
 789
 790                 if(*filterName == '-')
 791                 {
 792                         enable=0;
 793                         filterName++;
 794                 }
 795
 796                 for(;;){ //for all options
 797                         option= strtok(NULL, optionDelimiters);
 798                         if(option == NULL) break;
 799
 800                         av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
 801                         if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
 802                         else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
 803                         else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
 804                         else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
 805                         else
 806                         {
 807                                 options[numOfUnknownOptions] = option;
 808                                 numOfUnknownOptions++;
 809                         }
 810                         if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
 811                 }
 812                 options[numOfUnknownOptions] = NULL;
 813
 814                 /* replace stuff from the replace Table */
 815                 for(i=0; replaceTable[2*i]!=NULL; i++)
 816                 {
 817                         if(!strcmp(replaceTable[2*i], filterName))
 818                         {
 819                                 int newlen= strlen(replaceTable[2*i + 1]);
 820                                 int plen;
 821                                 int spaceLeft;
 822
 823                                 if(p==NULL) p= temp, *p=0;      //last filter
 824                                 else p--, *p=',';               //not last filter
 825
 826                                 plen= strlen(p);
 827                                 spaceLeft= p - temp + plen;
 828                                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
 829                                 {
 830                                         ppMode->error++;
 831                                         break;
 832                                 }
 833                                 memmove(p + newlen, p, plen+1);
 834                                 memcpy(p, replaceTable[2*i + 1], newlen);
 835                                 filterNameOk=1;
 836                         }
 837                 }
 838
 839                 for(i=0; filters[i].shortName!=NULL; i++)
 840                 {
 841                         if(   !strcmp(filters[i].longName, filterName)
 842                            || !strcmp(filters[i].shortName, filterName))
 843                         {
 844                                 ppMode->lumMode &= ~filters[i].mask;
 845                                 ppMode->chromMode &= ~filters[i].mask;
 846
 847                                 filterNameOk=1;
 848                                 if(!enable) break; // user wants to disable it
 849
 850                                 if(q >= filters[i].minLumQuality && luma)
 851                                         ppMode->lumMode|= filters[i].mask;
 852                                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
 853                                         if(q >= filters[i].minChromQuality)
 854                                                 ppMode->chromMode|= filters[i].mask;
 855
 856                                 if(filters[i].mask == LEVEL_FIX)
 857                                 {
 858                                         int o;
 859                                         ppMode->minAllowedY= 16;
 860                                         ppMode->maxAllowedY= 234;
 861                                         for(o=0; options[o]!=NULL; o++)
 862                                         {
 863                                                 if(  !strcmp(options[o],"fullyrange")
 864                                                    ||!strcmp(options[o],"f"))
 865                                                 {
 866                                                         ppMode->minAllowedY= 0;
 867                                                         ppMode->maxAllowedY= 255;
 868                                                         numOfUnknownOptions--;
 869                                                 }
 870                                         }
 871                                 }
 872                                 else if(filters[i].mask == TEMP_NOISE_FILTER)
 873                                 {
 874                                         int o;
 875                                         int numOfNoises=0;
 876
 877                                         for(o=0; options[o]!=NULL; o++)
 878                                         {
 879                                                 char *tail;
 880                                                 ppMode->maxTmpNoise[numOfNoises]=
 881                                                         strtol(options[o], &tail, 0);
 882                                                 if(tail!=options[o])
 883                                                 {
 884                                                         numOfNoises++;
 885                                                         numOfUnknownOptions--;
 886                                                         if(numOfNoises >= 3) break;
 887                                                 }
 888                                         }
 889                                 }
 890                                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
 891                                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
 892                                 {
 893                                         int o;
 894
 895                                         for(o=0; options[o]!=NULL && o<2; o++)
 896                                         {
 897                                                 char *tail;
 898                                                 int val= strtol(options[o], &tail, 0);
 899                                                 if(tail==options[o]) break;
 900
 901                                                 numOfUnknownOptions--;
 902                                                 if(o==0) ppMode->baseDcDiff= val;
 903                                                 else ppMode->flatnessThreshold= val;
 904                                         }
 905                                 }
 906                                 else if(filters[i].mask == FORCE_QUANT)
 907                                 {
 908                                         int o;
 909                                         ppMode->forcedQuant= 15;
 910
 911                                         for(o=0; options[o]!=NULL && o<1; o++)
 912                                         {
 913                                                 char *tail;
 914                                                 int val= strtol(options[o], &tail, 0);
 915                                                 if(tail==options[o]) break;
 916
 917                                                 numOfUnknownOptions--;
 918                                                 ppMode->forcedQuant= val;
 919                                         }
 920                                 }
 921                         }
 922                 }
 923                 if(!filterNameOk) ppMode->error++;
 924                 ppMode->error += numOfUnknownOptions;
 925         }
 926
 927         av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
 928         if(ppMode->error)
 929         {
 930                 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
 931                 av_free(ppMode);
 932                 return NULL;
 933         }
 934         return ppMode;
 935 }
 936
 937 void pp_free_mode(pp_mode_t *mode){
 938     av_free(mode);
 939 }
 940
 941 static void reallocAlign(void **p, int alignment, int size){
 942         av_free(*p);
 943         *p= av_mallocz(size);
 944 }
 945
 946 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
 947         int mbWidth = (width+15)>>4;
 948         int mbHeight= (height+15)>>4;
 949         int i;
 950
 951         c->stride= stride;
 952         c->qpStride= qpStride;
 953
 954         reallocAlign((void **)&c->tempDst, 8, stride*24);
 955         reallocAlign((void **)&c->tempSrc, 8, stride*24);
 956         reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
 957         reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
 958         for(i=0; i<256; i++)
 959                 c->yHistogram[i]= width*height/64*15/256;
 960
 961         for(i=0; i<3; i++)
 962         {
 963                 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
 964                 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
 965                 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
 966         }
 967
 968         reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
 969         reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
 970         reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
 971         reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
 972 }
 973
 974 static const char * context_to_name(void * ptr) {
 975     return "postproc";
 976 }
 977
 978 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
 979
 980 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
 981         PPContext *c= av_malloc(sizeof(PPContext));
 982         int stride= (width+15)&(~15);    //assumed / will realloc if needed
 983         int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
 984
 985         memset(c, 0, sizeof(PPContext));
 986         c->av_class = &av_codec_context_class;
 987         c->cpuCaps= cpuCaps;
 988         if(cpuCaps&PP_FORMAT){
 989                 c->hChromaSubSample= cpuCaps&0x3;
 990                 c->vChromaSubSample= (cpuCaps>>4)&0x3;
 991         }else{
 992                 c->hChromaSubSample= 1;
 993                 c->vChromaSubSample= 1;
 994         }
 995
 996         reallocBuffers(c, width, height, stride, qpStride);
 997
 998         c->frameNum=-1;
 999
1000         return c;
1001 }
1002
1003 void pp_free_context(void *vc){
1004         PPContext *c = (PPContext*)vc;
1005         int i;
1006
1007         for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1008         for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1009
1010         av_free(c->tempBlocks);
1011         av_free(c->yHistogram);
1012         av_free(c->tempDst);
1013         av_free(c->tempSrc);
1014         av_free(c->deintTemp);
1015         av_free(c->stdQPTable);
1016         av_free(c->nonBQPTable);
1017         av_free(c->forcedQPTable);
1018
1019         memset(c, 0, sizeof(PPContext));
1020
1021         av_free(c);
1022 }
1023
1024 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
1025                  uint8_t * dst[3], const int dstStride[3],
1026                  int width, int height,
1027                  const QP_STORE_T *QP_store,  int QPStride,
1028                  pp_mode_t *vm,  void *vc, int pict_type)
1029 {
1030         int mbWidth = (width+15)>>4;
1031         int mbHeight= (height+15)>>4;
1032         PPMode *mode = (PPMode*)vm;
1033         PPContext *c = (PPContext*)vc;
1034         int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
1035         int absQPStride = FFABS(QPStride);
1036
1037         // c->stride and c->QPStride are always positive
1038         if(c->stride < minStride || c->qpStride < absQPStride)
1039                 reallocBuffers(c, width, height,
1040                                 FFMAX(minStride, c->stride),
1041                                 FFMAX(c->qpStride, absQPStride));
1042
1043         if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1044         {
1045                 int i;
1046                 QP_store= c->forcedQPTable;
1047                 absQPStride = QPStride = 0;
1048                 if(mode->lumMode & FORCE_QUANT)
1049                         for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
1050                 else
1051                         for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
1052         }
1053
1054         if(pict_type & PP_PICT_TYPE_QP2){
1055                 int i;
1056                 const int count= mbHeight * absQPStride;
1057                 for(i=0; i<(count>>2); i++){
1058                         ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1059                 }
1060                 for(i<<=2; i<count; i++){
1061                         c->stdQPTable[i] = QP_store[i]>>1;
1062                 }
1063                 QP_store= c->stdQPTable;
1064                 QPStride= absQPStride;
1065         }
1066
1067 if(0){
1068 int x,y;
1069 for(y=0; y<mbHeight; y++){
1070         for(x=0; x<mbWidth; x++){
1071                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1072         }
1073         av_log(c, AV_LOG_INFO, "\n");
1074 }
1075         av_log(c, AV_LOG_INFO, "\n");
1076 }
1077
1078         if((pict_type&7)!=3)
1079         {
1080                 if (QPStride >= 0) {
1081                         int i;
1082                         const int count= mbHeight * QPStride;
1083                         for(i=0; i<(count>>2); i++){
1084                                 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1085                         }
1086                         for(i<<=2; i<count; i++){
1087                                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1088                         }
1089                 } else {
1090                         int i,j;
1091                         for(i=0; i<mbHeight; i++) {
1092                                     for(j=0; j<absQPStride; j++) {
1093                                         c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1094                                 }
1095                         }
1096                 }
1097         }
1098
1099         av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1100                mode->lumMode, mode->chromMode);
1101
1102         postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1103                 width, height, QP_store, QPStride, 0, mode, c);
1104
1105         width  = (width )>>c->hChromaSubSample;
1106         height = (height)>>c->vChromaSubSample;
1107
1108         if(mode->chromMode)
1109         {
1110                 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1111                         width, height, QP_store, QPStride, 1, mode, c);
1112                 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1113                         width, height, QP_store, QPStride, 2, mode, c);
1114         }
1115         else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1116         {
1117                 linecpy(dst[1], src[1], height, srcStride[1]);
1118                 linecpy(dst[2], src[2], height, srcStride[2]);
1119         }
1120         else
1121         {
1122                 int y;
1123                 for(y=0; y<height; y++)
1124                 {
1125                         memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1126                         memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1127                 }
1128         }
1129 }
1130