]> rtime.felk.cvut.cz Git - frescor/ffmpeg.git/blob - libpostproc/postprocess.c
typos
[frescor/ffmpeg.git] / libpostproc / postprocess.c
1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 /**
24  * @file postprocess.c
25  * postprocessing.
26  */
27
28 /*
29                         C       MMX     MMX2    3DNow   AltiVec
30 isVertDC                Ec      Ec                      Ec
31 isVertMinMaxOk          Ec      Ec                      Ec
32 doVertLowPass           E               e       e       Ec
33 doVertDefFilter         Ec      Ec      e       e       Ec
34 isHorizDC               Ec      Ec                      Ec
35 isHorizMinMaxOk         a       E                       Ec
36 doHorizLowPass          E               e       e       Ec
37 doHorizDefFilter        Ec      Ec      e       e       Ec
38 do_a_deblock            Ec      E       Ec      E
39 deRing                  E               e       e*      Ecp
40 Vertical RKAlgo1        E               a       a
41 Horizontal RKAlgo1                      a       a
42 Vertical X1#            a               E       E
43 Horizontal X1#          a               E       E
44 LinIpolDeinterlace      e               E       E*
45 CubicIpolDeinterlace    a               e       e*
46 LinBlendDeinterlace     e               E       E*
47 MedianDeinterlace#      E       Ec      Ec
48 TempDeNoiser#           E               e       e       Ec
49
* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
# more or less self-invented filters, so the exactness is not too meaningful
E = Exact implementation
e = almost exact implementation (slightly different rounding,...)
a = alternative / approximate impl
c = checked against the other implementations (-vo md5)
p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
make the main loop more flexible (variable number of blocks at once;
        the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use the Subversion log
75
76 #include "config.h"
77 #include "avutil.h"
78 #include <inttypes.h>
79 #include <stdio.h>
80 #include <stdlib.h>
81 #include <string.h>
82 #ifdef HAVE_MALLOC_H
83 #include <malloc.h>
84 #endif
85 //#undef HAVE_MMX2
86 //#define HAVE_3DNOW
87 //#undef HAVE_MMX
88 //#undef ARCH_X86
89 //#define DEBUG_BRIGHTNESS
90 #include "postprocess.h"
91 #include "postprocess_internal.h"
92
93 #ifdef HAVE_ALTIVEC_H
94 #include <altivec.h>
95 #endif
96
/* Size of the local buffer pp_get_mode_by_name_and_quality() copies the mode string into. */
#define GET_MODE_BUFFER_SIZE 500
/* Number of entries in the options[] pointer array declared while parsing one filter token. */
#define OPTIONS_ARRAY_SIZE 10
/* Number of rows / columns the C filters in this file iterate over per block. */
#define BLOCK_SIZE 8
/* NOTE(review): not referenced in the part of the file reviewed here; presumably used by the template code. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

#if defined(ARCH_X86)
/*
 * 64-bit packed constants, declared with 8-byte alignment.
 * wNN: the 16-bit value 0x00NN repeated four times.
 * bNN: the byte value 0xNN repeated eight times.
 * Presumably operands for the MMX code in postprocess_template.c -- TODO confirm.
 */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* Threshold constant for the deringing filter (judging by the name; its use is not in this part of the file). */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
115
116
117 static struct PPFilter filters[]=
118 {
119         {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
120         {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
121 /*      {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
122         {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
123         {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
124         {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
125         {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
126         {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
127         {"dr", "dering",                1, 5, 6, DERING},
128         {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
129         {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
130         {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
131         {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
132         {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
133         {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
134         {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
135         {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
136         {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
137         {NULL, NULL,0,0,0,0} //End Marker
138 };
139
/*
 * Alias table: entries come in pairs, an alias name followed by the filter
 * string it stands for. A single NULL terminates the list.
 */
static const char *replaceTable[]=
{
        /* alias */    /* expansion */
        "default",     "hb:a,vb:a,dr:a",
        "de",          "hb:a,vb:a,dr:a",

        "fast",        "h1:a,v1:a,dr:a",
        "fa",          "h1:a,v1:a,dr:a",

        "ac",          "ha:a:128:7,va:a,dr:a",

        NULL /* end marker */
};
149
150
#if defined(ARCH_X86)
/*
 * Thin wrappers that each emit one x86 prefetch instruction for the address p.
 * They return nothing and take the address in a general purpose register.
 */

/** Emit "prefetchnta" for the address p. */
static inline void prefetchnta(void *p)
{
        asm volatile("prefetchnta (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht0" for the address p. */
static inline void prefetcht0(void *p)
{
        asm volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht1" for the address p. */
static inline void prefetcht1(void *p)
{
        asm volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}

/** Emit "prefetcht2" for the address p. */
static inline void prefetcht2(void *p)
{
        asm volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
#endif
180
// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
182
183 /**
184  * Check if the given 8x8 Block is mostly "flat"
185  */
186 static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
187 {
188         int numEq= 0;
189         int y;
190         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
191         const int dcThreshold= dcOffset*2 + 1;
192
193         for(y=0; y<BLOCK_SIZE; y++)
194         {
195                 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
196                 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
197                 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
198                 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
199                 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
200                 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
201                 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
202                 src+= stride;
203         }
204         return numEq > c->ppMode.flatnessThreshold;
205 }
206
207 /**
208  * Check if the middle 8x8 Block in the given 8x16 block is flat
209  */
210 static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
211         int numEq= 0;
212         int y;
213         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
214         const int dcThreshold= dcOffset*2 + 1;
215
216         src+= stride*4; // src points to begin of the 8x8 Block
217         for(y=0; y<BLOCK_SIZE-1; y++)
218         {
219                 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
220                 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
221                 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
222                 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
223                 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
224                 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
225                 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
226                 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
227                 src+= stride;
228         }
229         return numEq > c->ppMode.flatnessThreshold;
230 }
231
/**
 * Sampled range check on 8 rows of a block.
 *
 * Each row contributes one pair of pixels; the pair used cycles with the row
 * number through (0,5), (2,7), (4,1), (6,3). The check fails as soon as the
 * difference of a pair falls outside [-2*QP, 2*QP].
 *
 * @param src    first pixel of the first row
 * @param stride distance in bytes between two rows
 * @param QP     quantizer the limits are derived from
 * @return 1 if every sampled pair is within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
{
        static const unsigned char pairs[4][2]= { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
        int row;

        for(row=0; row<8; row++, src+= stride)
        {
                const int a= src[ pairs[row & 3][0] ];
                const int b= src[ pairs[row & 3][1] ];

                // unsigned compare: anything outside [-2*QP, 2*QP] ends up above 4*QP
                if((unsigned)(a - b + 2*QP) > 4*QP) return 0;
        }
        return 1;
}
254
255 static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
256 {
257 #if 1
258 #if 1
259         int x;
260         src+= stride*4;
261         for(x=0; x<BLOCK_SIZE; x+=4)
262         {
263                 if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
264                 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
265                 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
266                 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
267         }
268 #else
269         int x;
270         src+= stride*3;
271         for(x=0; x<BLOCK_SIZE; x++)
272         {
273                 if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
274         }
275 #endif
276         return 1;
277 #else
278         int x;
279         src+= stride*4;
280         for(x=0; x<BLOCK_SIZE; x++)
281         {
282                 int min=255;
283                 int max=0;
284                 int y;
285                 for(y=0; y<8; y++){
286                         int v= src[x + y*stride];
287                         if(v>max) max=v;
288                         if(v<min) min=v;
289                 }
290                 if(max-min > 2*QP) return 0;
291         }
292         return 1;
293 #endif
294 }
295
296 static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c){
297         if( isHorizDC_C(src, stride, c) ){
298                 if( isHorizMinMaxOk_C(src, stride, c->QP) )
299                         return 1;
300                 else
301                         return 0;
302         }else{
303                 return 2;
304         }
305 }
306
307 static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c){
308         if( isVertDC_C(src, stride, c) ){
309                 if( isVertMinMaxOk_C(src, stride, c->QP) )
310                         return 1;
311                 else
312                         return 0;
313         }else{
314                 return 2;
315         }
316 }
317
318 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
319 {
320         int y;
321         for(y=0; y<BLOCK_SIZE; y++)
322         {
323                 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
324
325                 if(FFABS(middleEnergy) < 8*c->QP)
326                 {
327                         const int q=(dst[3] - dst[4])/2;
328                         const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
329                         const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
330
331                         int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
332                         d= FFMAX(d, 0);
333
334                         d= (5*d + 32) >> 6;
335                         d*= FFSIGN(-middleEnergy);
336
337                         if(q>0)
338                         {
339                                 d= d<0 ? 0 : d;
340                                 d= d>q ? q : d;
341                         }
342                         else
343                         {
344                                 d= d>0 ? 0 : d;
345                                 d= d<q ? q : d;
346                         }
347
348                         dst[3]-= d;
349                         dst[4]+= d;
350                 }
351                 dst+= stride;
352         }
353 }
354
355 /**
356  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
357  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
358  */
359 static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
360 {
361         int y;
362         for(y=0; y<BLOCK_SIZE; y++)
363         {
364                 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
365                 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
366
367                 int sums[10];
368                 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
369                 sums[1] = sums[0] - first  + dst[3];
370                 sums[2] = sums[1] - first  + dst[4];
371                 sums[3] = sums[2] - first  + dst[5];
372                 sums[4] = sums[3] - first  + dst[6];
373                 sums[5] = sums[4] - dst[0] + dst[7];
374                 sums[6] = sums[5] - dst[1] + last;
375                 sums[7] = sums[6] - dst[2] + last;
376                 sums[8] = sums[7] - dst[3] + last;
377                 sums[9] = sums[8] - dst[4] + last;
378
379                 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
380                 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
381                 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
382                 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
383                 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
384                 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
385                 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
386                 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
387
388                 dst+= stride;
389         }
390 }
391
392 /**
393  * Experimental Filter 1 (Horizontal)
394  * will not damage linear gradients
395  * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
396  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
397  * MMX2 version does correct clipping C version does not
398  * not identical with the vertical one
399  */
400 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
401 {
402         int y;
403         static uint64_t *lut= NULL;
404         if(lut==NULL)
405         {
406                 int i;
407                 lut = av_malloc(256*8);
408                 for(i=0; i<256; i++)
409                 {
410                         int v= i < 128 ? 2*i : 2*(i-256);
411 /*
412 //Simulate 112242211 9-Tap filter
413                         uint64_t a= (v/16) & 0xFF;
414                         uint64_t b= (v/8) & 0xFF;
415                         uint64_t c= (v/4) & 0xFF;
416                         uint64_t d= (3*v/8) & 0xFF;
417 */
418 //Simulate piecewise linear interpolation
419                         uint64_t a= (v/16) & 0xFF;
420                         uint64_t b= (v*3/16) & 0xFF;
421                         uint64_t c= (v*5/16) & 0xFF;
422                         uint64_t d= (7*v/16) & 0xFF;
423                         uint64_t A= (0x100 - a)&0xFF;
424                         uint64_t B= (0x100 - b)&0xFF;
425                         uint64_t C= (0x100 - c)&0xFF;
426                         uint64_t D= (0x100 - c)&0xFF;
427
428                         lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
429                                 (D<<24) | (C<<16) | (B<<8) | (A);
430                         //lut[i] = (v<<32) | (v<<24);
431                 }
432         }
433
434         for(y=0; y<BLOCK_SIZE; y++)
435         {
436                 int a= src[1] - src[2];
437                 int b= src[3] - src[4];
438                 int c= src[5] - src[6];
439
440                 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
441
442                 if(d < QP)
443                 {
444                         int v = d * FFSIGN(-b);
445
446                         src[1] +=v/8;
447                         src[2] +=v/4;
448                         src[3] +=3*v/8;
449                         src[4] -=3*v/8;
450                         src[5] -=v/4;
451                         src[6] -=v/8;
452
453                 }
454                 src+=stride;
455         }
456 }
457
458 /**
459  * accurate deblock filter
460  */
461 static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
462         int y;
463         const int QP= c->QP;
464         const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
465         const int dcThreshold= dcOffset*2 + 1;
466 //START_TIMER
467         src+= step*4; // src points to begin of the 8x8 Block
468         for(y=0; y<8; y++){
469                 int numEq= 0;
470
471                 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
472                 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
473                 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
474                 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
475                 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
476                 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
477                 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
478                 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
479                 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
480                 if(numEq > c->ppMode.flatnessThreshold){
481                         int min, max, x;
482
483                         if(src[0] > src[step]){
484                             max= src[0];
485                             min= src[step];
486                         }else{
487                             max= src[step];
488                             min= src[0];
489                         }
490                         for(x=2; x<8; x+=2){
491                                 if(src[x*step] > src[(x+1)*step]){
492                                         if(src[x    *step] > max) max= src[ x   *step];
493                                         if(src[(x+1)*step] < min) min= src[(x+1)*step];
494                                 }else{
495                                         if(src[(x+1)*step] > max) max= src[(x+1)*step];
496                                         if(src[ x   *step] < min) min= src[ x   *step];
497                                 }
498                         }
499                         if(max-min < 2*QP){
500                                 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
501                                 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
502
503                                 int sums[10];
504                                 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
505                                 sums[1] = sums[0] - first       + src[3*step];
506                                 sums[2] = sums[1] - first       + src[4*step];
507                                 sums[3] = sums[2] - first       + src[5*step];
508                                 sums[4] = sums[3] - first       + src[6*step];
509                                 sums[5] = sums[4] - src[0*step] + src[7*step];
510                                 sums[6] = sums[5] - src[1*step] + last;
511                                 sums[7] = sums[6] - src[2*step] + last;
512                                 sums[8] = sums[7] - src[3*step] + last;
513                                 sums[9] = sums[8] - src[4*step] + last;
514
515                                 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
516                                 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
517                                 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
518                                 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
519                                 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
520                                 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
521                                 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
522                                 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
523                         }
524                 }else{
525                         const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
526
527                         if(FFABS(middleEnergy) < 8*QP)
528                         {
529                                 const int q=(src[3*step] - src[4*step])/2;
530                                 const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
531                                 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
532
533                                 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
534                                 d= FFMAX(d, 0);
535
536                                 d= (5*d + 32) >> 6;
537                                 d*= FFSIGN(-middleEnergy);
538
539                                 if(q>0)
540                                 {
541                                         d= d<0 ? 0 : d;
542                                         d= d>q ? q : d;
543                                 }
544                                 else
545                                 {
546                                         d= d>0 ? 0 : d;
547                                         d= d<q ? q : d;
548                                 }
549
550                                 src[3*step]-= d;
551                                 src[4*step]+= d;
552                         }
553                 }
554
555                 src += stride;
556         }
557 /*if(step==16){
558     STOP_TIMER("step16")
559 }else{
560     STOP_TIMER("stepX")
561 }*/
562 }
563
//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one

/*
 * Step 1: decide which variants of postprocess_template.c get compiled.
 * The COMPILE_* macros are derived from the HAVE_* / ARCH_* / RUNTIME_CPUDETECT
 * configuration macros while those still have their configured values.
 */

//Plain C versions
// needed when there is no MMX at all, or when the variant is chosen at run time
#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
#define COMPILE_C
#endif

#ifdef ARCH_POWERPC
#ifdef HAVE_ALTIVEC
#define COMPILE_ALTIVEC
#endif //HAVE_ALTIVEC
#endif //ARCH_POWERPC

#if defined(ARCH_X86)

// without RUNTIME_CPUDETECT exactly one x86 variant is selected: MMX2 wins
// over 3DNow, and plain MMX is only used if neither of them is available
#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX
#endif

#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
#define COMPILE_MMX2
#endif

#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
#define COMPILE_3DNOW
#endif
#endif /* defined(ARCH_X86) */

/*
 * Step 2: clear the HAVE_* macros. Each section below redefines the ones
 * matching the variant it instantiates before including the template.
 */
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#undef HAVE_ALTIVEC

/*
 * Step 3: include the template once per selected variant. RENAME() appends a
 * variant specific suffix (_C, _altivec, _MMX, _MMX2, _3DNow) to the names the
 * template defines.
 */
#ifdef COMPILE_C
// these three are already undefined at this point; the #undefs are redundant but harmless
#undef HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _C
#include "postprocess_template.c"
#endif

#ifdef ARCH_POWERPC
#ifdef COMPILE_ALTIVEC
#undef RENAME
#define HAVE_ALTIVEC
#define RENAME(a) a ## _altivec
#include "postprocess_altivec_template.c"
#include "postprocess_template.c"
#endif
#endif //ARCH_POWERPC

//MMX versions
#ifdef COMPILE_MMX
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX
#include "postprocess_template.c"
#endif

//MMX2 versions
#ifdef COMPILE_MMX2
#undef RENAME
#define HAVE_MMX
#define HAVE_MMX2
#undef HAVE_3DNOW
#define RENAME(a) a ## _MMX2
#include "postprocess_template.c"
#endif

//3DNOW versions
#ifdef COMPILE_3DNOW
#undef RENAME
#define HAVE_MMX
#undef HAVE_MMX2
#define HAVE_3DNOW
#define RENAME(a) a ## _3DNow
#include "postprocess_template.c"
#endif

// minor note: the HAVE_xyz is messed up after that line so do not use it.
// (from here on the HAVE_* macros reflect the last variant instantiated above,
// not the configured values)
645
646 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
647         const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
648 {
649         PPContext *c= (PPContext *)vc;
650         PPMode *ppMode= (PPMode *)vm;
651         c->ppMode= *ppMode; //FIXME
652
653         // Using ifs here as they are faster than function pointers although the
654         // difference would not be measurable here but it is much better because
655         // someone might exchange the CPU whithout restarting MPlayer ;)
656 #ifdef RUNTIME_CPUDETECT
657 #if defined(ARCH_X86)
658         // ordered per speed fastest first
659         if(c->cpuCaps & PP_CPU_CAPS_MMX2)
660                 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
661         else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
662                 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
663         else if(c->cpuCaps & PP_CPU_CAPS_MMX)
664                 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
665         else
666                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
667 #else
668 #ifdef ARCH_POWERPC
669 #ifdef HAVE_ALTIVEC
670         if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
671                 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
672         else
673 #endif
674 #endif
675                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
676 #endif
677 #else //RUNTIME_CPUDETECT
678 #ifdef HAVE_MMX2
679                 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
680 #elif defined (HAVE_3DNOW)
681                 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
682 #elif defined (HAVE_MMX)
683                 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
684 #elif defined (HAVE_ALTIVEC)
685                 postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
686 #else
687                 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
688 #endif
689 #endif //!RUNTIME_CPUDETECT
690 }
691
692 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
693 //        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
694
/**
 * -pp command line help.
 *
 * Human readable list of the filters, their options and some usage examples.
 * The short and long filter names printed here are spelled exactly as they are
 * registered in the filters[] table of this file.
 * Declared as a pointer for library versions below 52 and as an array from
 * version 52 on.
 */
#if LIBPOSTPROC_VERSION_INT < (52<<16)
const char *const pp_help=
#else
const char pp_help[] =
#endif
"Available postprocessing filters:\n"
"Filters                        Options\n"
"short  long name       short   long option     Description\n"
"*      *               a       autoq           CPU power dependent enabler\n"
"                       c       chrom           chrominance filtering enabled\n"
"                       y       nochrom         chrominance filtering disabled\n"
"                       n       noluma          luma filtering disabled\n"
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
"       1. difference factor: default=32, higher -> more deblocking\n"
"       2. flatness threshold: default=39, lower -> more deblocking\n"
"                       the h & v deblocking filters share these\n"
"                       so you can't set different thresholds for h / v\n"
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
"ha     ahdeblock       (2 threshold)           horizontal deblocking filter\n"
"va     avdeblock       (2 threshold)           vertical deblocking filter\n"
"h1     x1hdeblock                              experimental h deblock filter 1\n"
"v1     x1vdeblock                              experimental v deblock filter 1\n"
"dr     dering                                  deringing filter\n"
"al     autolevels                              automatic brightness / contrast\n"
"                       f        fullyrange     stretch luminance to (0..255)\n"
"lb     linblenddeint                           linear blend deinterlacer\n"
"li     linipoldeint                            linear interpolating deinterlace\n"
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
"md     mediandeint                             median deinterlacer\n"
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
"l5     lowpass5                                FIR lowpass deinterlacer\n"
"de     default                                 hb:a,vb:a,dr:a\n"
"fa     fast                                    h1:a,v1:a,dr:a\n"
"ac                                             ha:a:128:7,va:a,dr:a\n"
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
"fq     forcequant      <quantizer>             force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb                                   de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
744
745 pp_mode_t *pp_get_mode_by_name_and_quality(const char *name, int quality)
746 {
747         char temp[GET_MODE_BUFFER_SIZE];
748         char *p= temp;
749         static const char filterDelimiters[] = ",/";
750         static const char optionDelimiters[] = ":";
751         struct PPMode *ppMode;
752         char *filterToken;
753
754         ppMode= av_malloc(sizeof(PPMode));
755
756         ppMode->lumMode= 0;
757         ppMode->chromMode= 0;
758         ppMode->maxTmpNoise[0]= 700;
759         ppMode->maxTmpNoise[1]= 1500;
760         ppMode->maxTmpNoise[2]= 3000;
761         ppMode->maxAllowedY= 234;
762         ppMode->minAllowedY= 16;
763         ppMode->baseDcDiff= 256/8;
764         ppMode->flatnessThreshold= 56-16-1;
765         ppMode->maxClippedThreshold= 0.01;
766         ppMode->error=0;
767
768         strncpy(temp, name, GET_MODE_BUFFER_SIZE);
769
770         av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
771
772         for(;;){
773                 char *filterName;
774                 int q= 1000000; //PP_QUALITY_MAX;
775                 int chrom=-1;
776                 int luma=-1;
777                 char *option;
778                 char *options[OPTIONS_ARRAY_SIZE];
779                 int i;
780                 int filterNameOk=0;
781                 int numOfUnknownOptions=0;
782                 int enable=1; //does the user want us to enabled or disabled the filter
783
784                 filterToken= strtok(p, filterDelimiters);
785                 if(filterToken == NULL) break;
786                 p+= strlen(filterToken) + 1; // p points to next filterToken
787                 filterName= strtok(filterToken, optionDelimiters);
788                 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
789
790                 if(*filterName == '-')
791                 {
792                         enable=0;
793                         filterName++;
794                 }
795
796                 for(;;){ //for all options
797                         option= strtok(NULL, optionDelimiters);
798                         if(option == NULL) break;
799
800                         av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
801                         if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
802                         else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
803                         else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
804                         else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
805                         else
806                         {
807                                 options[numOfUnknownOptions] = option;
808                                 numOfUnknownOptions++;
809                         }
810                         if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
811                 }
812                 options[numOfUnknownOptions] = NULL;
813
814                 /* replace stuff from the replace Table */
815                 for(i=0; replaceTable[2*i]!=NULL; i++)
816                 {
817                         if(!strcmp(replaceTable[2*i], filterName))
818                         {
819                                 int newlen= strlen(replaceTable[2*i + 1]);
820                                 int plen;
821                                 int spaceLeft;
822
823                                 if(p==NULL) p= temp, *p=0;      //last filter
824                                 else p--, *p=',';               //not last filter
825
826                                 plen= strlen(p);
827                                 spaceLeft= p - temp + plen;
828                                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
829                                 {
830                                         ppMode->error++;
831                                         break;
832                                 }
833                                 memmove(p + newlen, p, plen+1);
834                                 memcpy(p, replaceTable[2*i + 1], newlen);
835                                 filterNameOk=1;
836                         }
837                 }
838
839                 for(i=0; filters[i].shortName!=NULL; i++)
840                 {
841                         if(   !strcmp(filters[i].longName, filterName)
842                            || !strcmp(filters[i].shortName, filterName))
843                         {
844                                 ppMode->lumMode &= ~filters[i].mask;
845                                 ppMode->chromMode &= ~filters[i].mask;
846
847                                 filterNameOk=1;
848                                 if(!enable) break; // user wants to disable it
849
850                                 if(q >= filters[i].minLumQuality && luma)
851                                         ppMode->lumMode|= filters[i].mask;
852                                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
853                                         if(q >= filters[i].minChromQuality)
854                                                 ppMode->chromMode|= filters[i].mask;
855
856                                 if(filters[i].mask == LEVEL_FIX)
857                                 {
858                                         int o;
859                                         ppMode->minAllowedY= 16;
860                                         ppMode->maxAllowedY= 234;
861                                         for(o=0; options[o]!=NULL; o++)
862                                         {
863                                                 if(  !strcmp(options[o],"fullyrange")
864                                                    ||!strcmp(options[o],"f"))
865                                                 {
866                                                         ppMode->minAllowedY= 0;
867                                                         ppMode->maxAllowedY= 255;
868                                                         numOfUnknownOptions--;
869                                                 }
870                                         }
871                                 }
872                                 else if(filters[i].mask == TEMP_NOISE_FILTER)
873                                 {
874                                         int o;
875                                         int numOfNoises=0;
876
877                                         for(o=0; options[o]!=NULL; o++)
878                                         {
879                                                 char *tail;
880                                                 ppMode->maxTmpNoise[numOfNoises]=
881                                                         strtol(options[o], &tail, 0);
882                                                 if(tail!=options[o])
883                                                 {
884                                                         numOfNoises++;
885                                                         numOfUnknownOptions--;
886                                                         if(numOfNoises >= 3) break;
887                                                 }
888                                         }
889                                 }
890                                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
891                                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK)
892                                 {
893                                         int o;
894
895                                         for(o=0; options[o]!=NULL && o<2; o++)
896                                         {
897                                                 char *tail;
898                                                 int val= strtol(options[o], &tail, 0);
899                                                 if(tail==options[o]) break;
900
901                                                 numOfUnknownOptions--;
902                                                 if(o==0) ppMode->baseDcDiff= val;
903                                                 else ppMode->flatnessThreshold= val;
904                                         }
905                                 }
906                                 else if(filters[i].mask == FORCE_QUANT)
907                                 {
908                                         int o;
909                                         ppMode->forcedQuant= 15;
910
911                                         for(o=0; options[o]!=NULL && o<1; o++)
912                                         {
913                                                 char *tail;
914                                                 int val= strtol(options[o], &tail, 0);
915                                                 if(tail==options[o]) break;
916
917                                                 numOfUnknownOptions--;
918                                                 ppMode->forcedQuant= val;
919                                         }
920                                 }
921                         }
922                 }
923                 if(!filterNameOk) ppMode->error++;
924                 ppMode->error += numOfUnknownOptions;
925         }
926
927         av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
928         if(ppMode->error)
929         {
930                 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
931                 av_free(ppMode);
932                 return NULL;
933         }
934         return ppMode;
935 }
936
937 void pp_free_mode(pp_mode_t *mode){
938     av_free(mode);
939 }
940
/**
 * Frees the buffer *p and replaces it with a new one obtained from
 * av_mallocz(size). The old contents are not carried over.
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment not used by this implementation
 * @param size      size in bytes of the new buffer
 */
static void reallocAlign(void **p, int alignment, int size)
{
        void *fresh;

        av_free(*p);
        fresh = av_mallocz(size);
        *p = fresh;
}
945
946 static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
947         int mbWidth = (width+15)>>4;
948         int mbHeight= (height+15)>>4;
949         int i;
950
951         c->stride= stride;
952         c->qpStride= qpStride;
953
954         reallocAlign((void **)&c->tempDst, 8, stride*24);
955         reallocAlign((void **)&c->tempSrc, 8, stride*24);
956         reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
957         reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
958         for(i=0; i<256; i++)
959                 c->yHistogram[i]= width*height/64*15/256;
960
961         for(i=0; i<3; i++)
962         {
963                 //Note: The +17*1024 is just there so i do not have to worry about r/w over the end.
964                 reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
965                 reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
966         }
967
968         reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
969         reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
970         reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
971         reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
972 }
973
/**
 * Name callback of the logging class below: returns the fixed string
 * "postproc" regardless of the context passed in.
 *
 * @param ptr context pointer, ignored
 */
static const char *context_to_name(void *ptr)
{
        const char *label = "postproc";

        (void)ptr;
        return label;
}
977
978 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
979
980 pp_context_t *pp_get_context(int width, int height, int cpuCaps){
981         PPContext *c= av_malloc(sizeof(PPContext));
982         int stride= (width+15)&(~15);    //assumed / will realloc if needed
983         int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
984
985         memset(c, 0, sizeof(PPContext));
986         c->av_class = &av_codec_context_class;
987         c->cpuCaps= cpuCaps;
988         if(cpuCaps&PP_FORMAT){
989                 c->hChromaSubSample= cpuCaps&0x3;
990                 c->vChromaSubSample= (cpuCaps>>4)&0x3;
991         }else{
992                 c->hChromaSubSample= 1;
993                 c->vChromaSubSample= 1;
994         }
995
996         reallocBuffers(c, width, height, stride, qpStride);
997
998         c->frameNum=-1;
999
1000         return c;
1001 }
1002
1003 void pp_free_context(void *vc){
1004         PPContext *c = (PPContext*)vc;
1005         int i;
1006
1007         for(i=0; i<3; i++) av_free(c->tempBlured[i]);
1008         for(i=0; i<3; i++) av_free(c->tempBluredPast[i]);
1009
1010         av_free(c->tempBlocks);
1011         av_free(c->yHistogram);
1012         av_free(c->tempDst);
1013         av_free(c->tempSrc);
1014         av_free(c->deintTemp);
1015         av_free(c->stdQPTable);
1016         av_free(c->nonBQPTable);
1017         av_free(c->forcedQPTable);
1018
1019         memset(c, 0, sizeof(PPContext));
1020
1021         av_free(c);
1022 }
1023
1024 void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
1025                  uint8_t * dst[3], const int dstStride[3],
1026                  int width, int height,
1027                  const QP_STORE_T *QP_store,  int QPStride,
1028                  pp_mode_t *vm,  void *vc, int pict_type)
1029 {
1030         int mbWidth = (width+15)>>4;
1031         int mbHeight= (height+15)>>4;
1032         PPMode *mode = (PPMode*)vm;
1033         PPContext *c = (PPContext*)vc;
1034         int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
1035         int absQPStride = FFABS(QPStride);
1036
1037         // c->stride and c->QPStride are always positive
1038         if(c->stride < minStride || c->qpStride < absQPStride)
1039                 reallocBuffers(c, width, height,
1040                                 FFMAX(minStride, c->stride),
1041                                 FFMAX(c->qpStride, absQPStride));
1042
1043         if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
1044         {
1045                 int i;
1046                 QP_store= c->forcedQPTable;
1047                 absQPStride = QPStride = 0;
1048                 if(mode->lumMode & FORCE_QUANT)
1049                         for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
1050                 else
1051                         for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
1052         }
1053
1054         if(pict_type & PP_PICT_TYPE_QP2){
1055                 int i;
1056                 const int count= mbHeight * absQPStride;
1057                 for(i=0; i<(count>>2); i++){
1058                         ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
1059                 }
1060                 for(i<<=2; i<count; i++){
1061                         c->stdQPTable[i] = QP_store[i]>>1;
1062                 }
1063                 QP_store= c->stdQPTable;
1064                 QPStride= absQPStride;
1065         }
1066
1067 if(0){
1068 int x,y;
1069 for(y=0; y<mbHeight; y++){
1070         for(x=0; x<mbWidth; x++){
1071                 av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
1072         }
1073         av_log(c, AV_LOG_INFO, "\n");
1074 }
1075         av_log(c, AV_LOG_INFO, "\n");
1076 }
1077
1078         if((pict_type&7)!=3)
1079         {
1080                 if (QPStride >= 0) {
1081                         int i;
1082                         const int count= mbHeight * QPStride;
1083                         for(i=0; i<(count>>2); i++){
1084                                 ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
1085                         }
1086                         for(i<<=2; i<count; i++){
1087                                 c->nonBQPTable[i] = QP_store[i] & 0x3F;
1088                         }
1089                 } else {
1090                         int i,j;
1091                         for(i=0; i<mbHeight; i++) {
1092                                     for(j=0; j<absQPStride; j++) {
1093                                         c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
1094                                 }
1095                         }
1096                 }
1097         }
1098
1099         av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
1100                mode->lumMode, mode->chromMode);
1101
1102         postProcess(src[0], srcStride[0], dst[0], dstStride[0],
1103                 width, height, QP_store, QPStride, 0, mode, c);
1104
1105         width  = (width )>>c->hChromaSubSample;
1106         height = (height)>>c->vChromaSubSample;
1107
1108         if(mode->chromMode)
1109         {
1110                 postProcess(src[1], srcStride[1], dst[1], dstStride[1],
1111                         width, height, QP_store, QPStride, 1, mode, c);
1112                 postProcess(src[2], srcStride[2], dst[2], dstStride[2],
1113                         width, height, QP_store, QPStride, 2, mode, c);
1114         }
1115         else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
1116         {
1117                 linecpy(dst[1], src[1], height, srcStride[1]);
1118                 linecpy(dst[2], src[2], height, srcStride[2]);
1119         }
1120         else
1121         {
1122                 int y;
1123                 for(y=0; y<height; y++)
1124                 {
1125                         memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
1126                         memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
1127                 }
1128         }
1129 }
1130