]> rtime.felk.cvut.cz Git - frescor/ffmpeg.git/blob - libpostproc/postprocess_template.c
get register names from x86_cpu.h
[frescor/ffmpeg.git] / libpostproc / postprocess_template.c
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 /**
22  * @file postprocess_template.c
23  * mmx/mmx2/3dnow postprocess code.
24  */
25
26 #include "x86_cpu.h"
27
28 #ifdef ARCH_X86_64
29 #  define ALIGN_MASK "$0xFFFFFFFFFFFFFFF8"
30 #else
31 #  define ALIGN_MASK "$0xFFFFFFF8"
32 #endif
33
34
35 #undef PAVGB
36 #undef PMINUB
37 #undef PMAXUB
38
39 #ifdef HAVE_MMX2
40 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
41 #elif defined (HAVE_3DNOW)
42 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
43 #endif
44 #define PAVGB(a,b)  REAL_PAVGB(a,b)
45
46 #ifdef HAVE_MMX2
47 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
48 #elif defined (HAVE_MMX)
49 #define PMINUB(b,a,t) \
50         "movq " #a ", " #t " \n\t"\
51         "psubusb " #b ", " #t " \n\t"\
52         "psubb " #t ", " #a " \n\t"
53 #endif
54
55 #ifdef HAVE_MMX2
56 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
57 #elif defined (HAVE_MMX)
58 #define PMAXUB(a,b) \
59         "psubusb " #a ", " #b " \n\t"\
60         "paddb " #a ", " #b " \n\t"
61 #endif
62
63 //FIXME? |255-0| = 1 (should not be a problem ...)
64 #ifdef HAVE_MMX
65 /**
66  * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * src is advanced by 4*stride, then 8 consecutive rows of 8 bytes are read.
 * Two things are measured over those rows:
 *  - numEq: for each of the 7 vertically adjacent row pairs and each of the
 *    8 columns, (upper - lower + c->mmxDcOffset[c->nonBQP]) is compared
 *    (signed, per byte) against c->mmxDcThreshold[c->nonBQP]; numEq is the
 *    number of positions where the comparison succeeds.
 *  - dcOk: the per-column (max - min) over the 8 rows, minus 2*QP (QP bytes
 *    taken from c->pQPb), with unsigned saturation. Note that, despite its
 *    name, dcOk is NONZERO when some column's range exceeds 2*QP.
 *
 * @param src    pointer to the top of the 8x16 block
 * @param stride distance in bytes between vertically adjacent rows
 * @param c      context supplying mmxDcOffset, mmxDcThreshold, nonBQP, pQPb
 *               and ppMode.flatnessThreshold
 * @return 2 if numEq <= c->ppMode.flatnessThreshold;
 *         otherwise 0 if dcOk is nonzero, 1 if dcOk is zero
67  */
68 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
69         int numEq= 0, dcOk;
70         src+= stride*4; // src points to begin of the 8x8 Block
// Load the DC offset into mm7 and the DC threshold into mm6.
// NOTE(review): the following asm statement reads mm7/mm6 and relies on them
// surviving between the two statements; nothing in the constraints expresses
// that dependency -- confirm the compiler cannot clobber MMX registers here.
71 asm volatile(
72                 "movq %0, %%mm7                         \n\t"
73                 "movq %1, %%mm6                         \n\t"
74                 : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
75                 );
76
// Register roles below: mm0 accumulates the per-column comparison results
// (each success adds -1), mm4 tracks the running per-column maximum,
// mm3 the running per-column minimum, mm5 is scratch for PMINUB.
77 asm volatile(
78                 "lea (%2, %3), %%"REG_a"                \n\t"
79 //      0       1       2       3       4       5       6       7       8       9
80 //      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2
81
82                 "movq (%2), %%mm0                       \n\t"
83                 "movq (%%"REG_a"), %%mm1                \n\t"
84                 "movq %%mm0, %%mm3                      \n\t"
85                 "movq %%mm0, %%mm4                      \n\t"
86                 PMAXUB(%%mm1, %%mm4)
87                 PMINUB(%%mm1, %%mm3, %%mm5)
88                 "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
89                 "paddb %%mm7, %%mm0                     \n\t"
90                 "pcmpgtb %%mm6, %%mm0                   \n\t"
91
92                 "movq (%%"REG_a",%3), %%mm2             \n\t"
93                 PMAXUB(%%mm2, %%mm4)
94                 PMINUB(%%mm2, %%mm3, %%mm5)
95                 "psubb %%mm2, %%mm1                     \n\t"
96                 "paddb %%mm7, %%mm1                     \n\t"
97                 "pcmpgtb %%mm6, %%mm1                   \n\t"
98                 "paddb %%mm1, %%mm0                     \n\t"
99
100                 "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
101                 PMAXUB(%%mm1, %%mm4)
102                 PMINUB(%%mm1, %%mm3, %%mm5)
103                 "psubb %%mm1, %%mm2                     \n\t"
104                 "paddb %%mm7, %%mm2                     \n\t"
105                 "pcmpgtb %%mm6, %%mm2                   \n\t"
106                 "paddb %%mm2, %%mm0                     \n\t"
107
// REG_a now moves from row 1 to row 5 so rows 5..7 can be addressed.
108                 "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
109
110                 "movq (%2, %3, 4), %%mm2                \n\t"
111                 PMAXUB(%%mm2, %%mm4)
112                 PMINUB(%%mm2, %%mm3, %%mm5)
113                 "psubb %%mm2, %%mm1                     \n\t"
114                 "paddb %%mm7, %%mm1                     \n\t"
115                 "pcmpgtb %%mm6, %%mm1                   \n\t"
116                 "paddb %%mm1, %%mm0                     \n\t"
117
118                 "movq (%%"REG_a"), %%mm1                \n\t"
119                 PMAXUB(%%mm1, %%mm4)
120                 PMINUB(%%mm1, %%mm3, %%mm5)
121                 "psubb %%mm1, %%mm2                     \n\t"
122                 "paddb %%mm7, %%mm2                     \n\t"
123                 "pcmpgtb %%mm6, %%mm2                   \n\t"
124                 "paddb %%mm2, %%mm0                     \n\t"
125
126                 "movq (%%"REG_a", %3), %%mm2            \n\t"
127                 PMAXUB(%%mm2, %%mm4)
128                 PMINUB(%%mm2, %%mm3, %%mm5)
129                 "psubb %%mm2, %%mm1                     \n\t"
130                 "paddb %%mm7, %%mm1                     \n\t"
131                 "pcmpgtb %%mm6, %%mm1                   \n\t"
132                 "paddb %%mm1, %%mm0                     \n\t"
133
134                 "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
135                 PMAXUB(%%mm1, %%mm4)
136                 PMINUB(%%mm1, %%mm3, %%mm5)
137                 "psubb %%mm1, %%mm2                     \n\t"
138                 "paddb %%mm7, %%mm2                     \n\t"
139                 "pcmpgtb %%mm6, %%mm2                   \n\t"
140                 "paddb %%mm2, %%mm0                     \n\t"
// mm4 = per-column (max - min) over the 8 rows
141                 "psubusb %%mm3, %%mm4                   \n\t"
142
143                 "                                       \n\t"
// Horizontal sum of the 8 per-column counters in mm0; only the low byte of
// the result is used afterwards (see the "& 0xFF" below).
144 #ifdef HAVE_MMX2
145                 "pxor %%mm7, %%mm7                      \n\t"
146                 "psadbw %%mm7, %%mm0                    \n\t"
147 #else
148                 "movq %%mm0, %%mm1                      \n\t"
149                 "psrlw $8, %%mm0                        \n\t"
150                 "paddb %%mm1, %%mm0                     \n\t"
151                 "movq %%mm0, %%mm1                      \n\t"
152                 "psrlq $16, %%mm0                       \n\t"
153                 "paddb %%mm1, %%mm0                     \n\t"
154                 "movq %%mm0, %%mm1                      \n\t"
155                 "psrlq $32, %%mm0                       \n\t"
156                 "paddb %%mm1, %%mm0                     \n\t"
157 #endif
158                 "movq %4, %%mm7                         \n\t" // QP,..., QP
159                 "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
160                 "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
// Squeeze the 64-bit result into 32 bits so a single movd can test all columns.
161                 "packssdw %%mm4, %%mm4                  \n\t"
162                 "movd %%mm0, %0                         \n\t"
163                 "movd %%mm4, %1                         \n\t"
164
165                 : "=r" (numEq), "=r" (dcOk)
166                 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
167                 : "%"REG_a
168                 );
169
// The counters were accumulated as negative values (pcmpgtb yields -1 per hit),
// so negate and keep the low byte to obtain the positive count.
170         numEq= (-numEq) &0xFF;
171         if(numEq > c->ppMode.flatnessThreshold){
// dcOk != 0 means some column's range exceeded 2*QP
172             if(dcOk) return 0;
173             else     return 1;
174         }else{
175             return 2;
176         }
177 }
178 #endif //HAVE_MMX
179
180 /**
181  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
182  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * src is first advanced by 3*stride. Relative to that position, rows 0..9
 * are read and rows 1..8 are overwritten. Rows 0/1 and rows 8/9 are only
 * used to pick the edge values ("first" / "last") that stand in for the
 * taps falling outside rows 1..8.
 *
 * @param src    pointer to the top of the 8x16 block
 * @param stride distance in bytes between vertically adjacent rows
 * @param c      context; the asm path reads c->pQPb, the C path reads c->QP
183  */
184 #ifndef HAVE_ALTIVEC
185 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
186 {
187 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
188         src+= stride*3;
// NOTE(review): this asm stores to memory through %0/REG_a/REG_c and
// temporarily modifies %0 ("add %1, %0" ... "sub %1, %0"), yet %0 is declared
// as an input-only operand and no "memory" clobber is listed -- confirm this
// is acceptable for the compilers in use.
189         asm volatile(        //"movv %0 %1 %2\n\t"
190                 "movq %2, %%mm0                         \n\t"  // QP,..., QP
191                 "pxor %%mm4, %%mm4                      \n\t"
192
// Edge value for the top: per byte, row 0 if |row0 - row1| <= QP, else row 1.
193                 "movq (%0), %%mm6                       \n\t"
194                 "movq (%0, %1), %%mm5                   \n\t"
195                 "movq %%mm5, %%mm1                      \n\t"
196                 "movq %%mm6, %%mm2                      \n\t"
197                 "psubusb %%mm6, %%mm5                   \n\t"
198                 "psubusb %%mm1, %%mm2                   \n\t"
199                 "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
200                 "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
201                 "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
202
203                 "pand %%mm2, %%mm6                      \n\t"
204                 "pandn %%mm1, %%mm2                     \n\t"
205                 "por %%mm2, %%mm6                       \n\t"// First Line to Filter
206
// Edge value for the bottom: per byte, row 9 if |row8 - row9| <= QP, else row 8.
// REG_a is set to row 4 and REG_c to row 7 (computed before %0 is advanced).
207                 "movq (%0, %1, 8), %%mm5                \n\t"
208                 "lea (%0, %1, 4), %%"REG_a"             \n\t"
209                 "lea (%0, %1, 8), %%"REG_c"             \n\t"
210                 "sub %1, %%"REG_c"                      \n\t"
211                 "add %1, %0                             \n\t" // %0 points to line 1 not 0
212                 "movq (%0, %1, 8), %%mm7                \n\t"
213                 "movq %%mm5, %%mm1                      \n\t"
214                 "movq %%mm7, %%mm2                      \n\t"
215                 "psubusb %%mm7, %%mm5                   \n\t"
216                 "psubusb %%mm1, %%mm2                   \n\t"
217                 "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
218                 "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
219                 "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
220
221                 "pand %%mm2, %%mm7                      \n\t"
222                 "pandn %%mm1, %%mm2                     \n\t"
223                 "por %%mm2, %%mm7                       \n\t" // Last Line to Filter
224
225
226                 //      1       2       3       4       5       6       7       8
227                 //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
228                 // 6 4 2 2 1 1
229                 // 6 4 4 2
230                 // 6 8 2
231
// From here on each output row is built as a tree of PAVGB (byte average)
// operations; the trailing comments show the resulting tap weights and divisor.
232                 "movq (%0, %1), %%mm0                   \n\t" //  1
233                 "movq %%mm0, %%mm1                      \n\t" //  1
234                 PAVGB(%%mm6, %%mm0)                           //1 1        /2
235                 PAVGB(%%mm6, %%mm0)                           //3 1        /4
236
237                 "movq (%0, %1, 4), %%mm2                \n\t" //     1
238                 "movq %%mm2, %%mm5                      \n\t" //     1
239                 PAVGB((%%REGa), %%mm2)                        //    11        /2
240                 PAVGB((%0, %1, 2), %%mm2)                     //   211        /4
241                 "movq %%mm2, %%mm3                      \n\t" //   211        /4
242                 "movq (%0), %%mm4                       \n\t" // 1
243                 PAVGB(%%mm4, %%mm3)                           // 4 211        /8
244                 PAVGB(%%mm0, %%mm3)                           //642211        /16
245                 "movq %%mm3, (%0)                       \n\t" // X
246                 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
247                 "movq %%mm1, %%mm0                      \n\t" //  1
248                 PAVGB(%%mm6, %%mm0)                           //1 1        /2
249                 "movq %%mm4, %%mm3                      \n\t" // 1
250                 PAVGB((%0,%1,2), %%mm3)                       // 1 1        /2
251                 PAVGB((%%REGa,%1,2), %%mm5)                   //     11        /2
252                 PAVGB((%%REGa), %%mm5)                        //    211 /4
253                 PAVGB(%%mm5, %%mm3)                           // 2 2211 /8
254                 PAVGB(%%mm0, %%mm3)                           //4242211 /16
255                 "movq %%mm3, (%0,%1)                    \n\t" //  X
256                 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
257                 PAVGB(%%mm4, %%mm6)                                   //11        /2
258                 "movq (%%"REG_c"), %%mm0                \n\t" //       1
259                 PAVGB((%%REGa, %1, 2), %%mm0)                 //      11/2
260                 "movq %%mm0, %%mm3                      \n\t" //      11/2
261                 PAVGB(%%mm1, %%mm0)                           //  2   11/4
262                 PAVGB(%%mm6, %%mm0)                           //222   11/8
263                 PAVGB(%%mm2, %%mm0)                           //22242211/16
264                 "movq (%0, %1, 2), %%mm2                \n\t" //   1
265                 "movq %%mm0, (%0, %1, 2)                \n\t" //   X
266                 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
267                 "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
268                 PAVGB((%%REGc), %%mm0)                        //       11        /2
269                 PAVGB(%%mm0, %%mm6)                           //11     11        /4
270                 PAVGB(%%mm1, %%mm4)                           // 11                /2
271                 PAVGB(%%mm2, %%mm1)                           //  11                /2
272                 PAVGB(%%mm1, %%mm6)                           //1122   11        /8
273                 PAVGB(%%mm5, %%mm6)                           //112242211        /16
274                 "movq (%%"REG_a"), %%mm5                \n\t" //    1
275                 "movq %%mm6, (%%"REG_a")                \n\t" //    X
276                 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
277                 "movq (%%"REG_a", %1, 4), %%mm6         \n\t" //        1
278                 PAVGB(%%mm7, %%mm6)                           //        11        /2
279                 PAVGB(%%mm4, %%mm6)                           // 11     11        /4
280                 PAVGB(%%mm3, %%mm6)                           // 11   2211        /8
281                 PAVGB(%%mm5, %%mm2)                           //   11                /2
282                 "movq (%0, %1, 4), %%mm4                \n\t" //     1
283                 PAVGB(%%mm4, %%mm2)                           //   112                /4
284                 PAVGB(%%mm2, %%mm6)                           // 112242211        /16
285                 "movq %%mm6, (%0, %1, 4)                \n\t" //     X
286                 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
287                 PAVGB(%%mm7, %%mm1)                           //  11     2        /4
288                 PAVGB(%%mm4, %%mm5)                           //    11                /2
289                 PAVGB(%%mm5, %%mm0)                           //    11 11        /4
290                 "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //      1
291                 PAVGB(%%mm6, %%mm1)                           //  11  4  2        /8
292                 PAVGB(%%mm0, %%mm1)                           //  11224222        /16
293                 "movq %%mm1, (%%"REG_a", %1, 2)         \n\t" //      X
294                 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
295                 PAVGB((%%REGc), %%mm2)                        //   112 4        /8
296                 "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
297                 PAVGB(%%mm0, %%mm6)                           //      1 1        /2
298                 PAVGB(%%mm7, %%mm6)                           //      1 12        /4
299                 PAVGB(%%mm2, %%mm6)                           //   1122424        /4
300                 "movq %%mm6, (%%"REG_c")                \n\t" //       X
301                 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
302                 PAVGB(%%mm7, %%mm5)                           //    11   2        /4
303                 PAVGB(%%mm7, %%mm5)                           //    11   6        /8
304
305                 PAVGB(%%mm3, %%mm0)                           //      112        /4
306                 PAVGB(%%mm0, %%mm5)                           //    112246        /16
307                 "movq %%mm5, (%%"REG_a", %1, 4)         \n\t" //        X
// Undo the earlier "add %1, %0" so the operand register holds its original value.
308                 "sub %1, %0                             \n\t"
309
310                 :
311                 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
312                 : "%"REG_a, "%"REG_c
313         );
314 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// Byte offsets of rows 1..9 relative to src (after src is advanced below).
315         const int l1= stride;
316         const int l2= stride + l1;
317         const int l3= stride + l2;
318         const int l4= stride + l3;
319         const int l5= stride + l4;
320         const int l6= stride + l5;
321         const int l7= stride + l6;
322         const int l8= stride + l7;
323         const int l9= stride + l8;
324         int x;
325         src+= stride*3;
// One iteration per column; src is incremented at the end of each iteration.
326         for(x=0; x<BLOCK_SIZE; x++)
327         {
// Edge values used in place of taps outside rows 1..8. Note the strict "<"
// here, whereas the asm path above selects on "diff <= QP".
328                 const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
329                 const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
330
// sums[] is a sliding sum: each entry drops one value from the top and adds
// one at the bottom. The "+ 4" is carried through every entry, so each output
// below (sum of two entries) is biased by 8 before the ">>4".
331                 int sums[10];
332                 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
333                 sums[1] = sums[0] - first  + src[l4];
334                 sums[2] = sums[1] - first  + src[l5];
335                 sums[3] = sums[2] - first  + src[l6];
336                 sums[4] = sums[3] - first  + src[l7];
337                 sums[5] = sums[4] - src[l1] + src[l8];
338                 sums[6] = sums[5] - src[l2] + last;
339                 sums[7] = sums[6] - src[l3] + last;
340                 sums[8] = sums[7] - src[l4] + last;
341                 sums[9] = sums[8] - src[l5] + last;
342
// All sums[] were computed above from the unmodified pixels; each row written
// here additionally reads only its own still-unmodified value.
343                 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
344                 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
345                 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
346                 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
347                 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
348                 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
349                 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
350                 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
351
352                 src++;
353         }
354 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
355 }
356 #endif //HAVE_ALTIVEC
357
// This whole function is compiled out by the "#if 0" below.
358 #if 0
359 /**
360  * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
361  * values are correctly clipped (MMX2)
362  * values are wraparound (C)
363  * Conclusion: It is fast, but introduces ugly horizontal patterns
364  * if there is a continuous gradient.
365         0 8 16 24
366         x = 8
367         x/2 = 4
368         x/8 = 1
369         1 12 12 23
 *
 * src is advanced by 3*stride; relative to that, rows 3, 4, 5 and 6 are
 * modified wherever |row5 - row4| is below roughly 1.25*QP.
 *
 * @param src    pointer to the top of the block
 * @param stride distance in bytes between vertically adjacent rows
 * @param QP     quantiser; used by the C path only -- the asm path reads
 *               MANGLE(pQPb) instead of this parameter
370  */
371 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
372 {
373 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
374         src+= stride*3;
375 // FIXME rounding
// NOTE(review): this disabled asm uses "leal" together with REG_a/REG_c and
// references MANGLE(pQPb)/MANGLE(b80)/... as globals, unlike the enabled
// functions in this file which use "lea" and a c->pQPb memory operand --
// it would need checking before being re-enabled.
376         asm volatile(
377                 "pxor %%mm7, %%mm7                      \n\t" // 0
378                 "movq "MANGLE(b80)", %%mm6              \n\t" // MIN_SIGNED_BYTE
379                 "leal (%0, %1), %%"REG_a"               \n\t"
380                 "leal (%%"REG_a", %1, 4), %%"REG_c"     \n\t"
381 //      0       1       2       3       4       5       6       7       8       9
382 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
383                 "movq "MANGLE(pQPb)", %%mm0             \n\t" // QP,..., QP
384                 "movq %%mm0, %%mm1                      \n\t" // QP,..., QP
385                 "paddusb "MANGLE(b02)", %%mm0           \n\t"
386                 "psrlw $2, %%mm0                        \n\t"
387                 "pand "MANGLE(b3F)", %%mm0              \n\t" // QP/4,..., QP/4
388                 "paddusb %%mm1, %%mm0                   \n\t" // QP*1.25 ...
389                 "movq (%0, %1, 4), %%mm2                \n\t" // line 4
390                 "movq (%%"REG_c"), %%mm3                \n\t" // line 5
391                 "movq %%mm2, %%mm4                      \n\t" // line 4
392                 "pcmpeqb %%mm5, %%mm5                   \n\t" // -1
393                 "pxor %%mm2, %%mm5                      \n\t" // -line 4 - 1
394                 PAVGB(%%mm3, %%mm5)
395                 "paddb %%mm6, %%mm5                     \n\t" // (l5-l4)/2
396                 "psubusb %%mm3, %%mm4                   \n\t"
397                 "psubusb %%mm2, %%mm3                   \n\t"
398                 "por %%mm3, %%mm4                       \n\t" // |l4 - l5|
399                 "psubusb %%mm0, %%mm4                   \n\t"
400                 "pcmpeqb %%mm7, %%mm4                   \n\t"
401                 "pand %%mm4, %%mm5                      \n\t" // d/2
402
403 //                "paddb %%mm6, %%mm2                     \n\t" // line 4 + 0x80
404                 "paddb %%mm5, %%mm2                     \n\t"
405 //                "psubb %%mm6, %%mm2                     \n\t"
406                 "movq %%mm2, (%0,%1, 4)                 \n\t"
407
408                 "movq (%%"REG_c"), %%mm2                \n\t"
409 //                "paddb %%mm6, %%mm2                     \n\t" // line 5 + 0x80
410                 "psubb %%mm5, %%mm2                     \n\t"
411 //                "psubb %%mm6, %%mm2                     \n\t"
412                 "movq %%mm2, (%%"REG_c")                \n\t"
413
414                 "paddb %%mm6, %%mm5                     \n\t"
415                 "psrlw $2, %%mm5                        \n\t"
416                 "pand "MANGLE(b3F)", %%mm5              \n\t"
417                 "psubb "MANGLE(b20)", %%mm5             \n\t" // (l5-l4)/8
418
419                 "movq (%%"REG_a", %1, 2), %%mm2         \n\t"
420                 "paddb %%mm6, %%mm2                     \n\t" // line 3 + 0x80
421                 "paddsb %%mm5, %%mm2                    \n\t"
422                 "psubb %%mm6, %%mm2                     \n\t"
423                 "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
424
425                 "movq (%%"REG_c", %1), %%mm2            \n\t"
426                 "paddb %%mm6, %%mm2                     \n\t" // line 6 + 0x80
427                 "psubsb %%mm5, %%mm2                    \n\t"
428                 "psubb %%mm6, %%mm2                     \n\t"
429                 "movq %%mm2, (%%"REG_c", %1)            \n\t"
430
431                 :
432                 : "r" (src), "r" ((long)stride)
433                 : "%"REG_a, "%"REG_c
434         );
435 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// Byte offsets of rows 1..6 relative to src (after src is advanced below).
436          const int l1= stride;
437         const int l2= stride + l1;
438         const int l3= stride + l2;
439         const int l4= stride + l3;
440         const int l5= stride + l4;
441         const int l6= stride + l5;
442 //        const int l7= stride + l6;
443 //        const int l8= stride + l7;
444 //        const int l9= stride + l8;
445         int x;
// Despite the name, QP15 is QP + QP/4, i.e. about 1.25*QP.
446         const int QP15= QP + (QP>>2);
447         src+= stride*3;
// Columns are addressed via src[x + ...]; src itself is not incremented here.
448         for(x=0; x<BLOCK_SIZE; x++)
449         {
450                 const int v = (src[x+l5] - src[x+l4]);
451                 if(FFABS(v) < QP15)
452                 {
// Move rows 4/5 towards each other by v/2 and rows 3/6 by v/8; results are
// stored back into uint8_t without clamping.
453                         src[x+l3] +=v>>3;
454                         src[x+l4] +=v>>1;
455                         src[x+l5] -=v>>1;
456                         src[x+l6] -=v>>3;
457
458                 }
459         }
460
461 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
462 }
463 #endif //0
464
465 /**
466  * Experimental Filter 1
467  * will not damage linear gradients
468  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
469  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
470  * MMX2 version does correct clipping C version does not
 *
 * src is advanced by 3*stride. Relative to that position, rows 3..6 are read
 * to compute d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2); where d is within
 * the 2*QP limit, rows 2..7 are adjusted by d/8, d/4 and 3d/8 (rows 2/7,
 * 3/6 and 4/5 respectively), with opposite signs above and below the
 * row 4 / row 5 boundary.
 *
 * @param src    pointer to the top of the block
 * @param stride distance in bytes between vertically adjacent rows
 * @param co     context; the asm path reads co->pQPb, the C path reads co->QP
471  */
472 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
473 {
474 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
475         src+= stride*3;
476
// NOTE(review): like the other filters here, this asm stores to memory but
// lists no "memory" clobber -- confirm this is acceptable.
477         asm volatile(
478                 "pxor %%mm7, %%mm7                      \n\t" // 0
479                 "lea (%0, %1), %%"REG_a"                \n\t"
480                 "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
481 //      0       1       2       3       4       5       6       7       8       9
482 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
483                 "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
484                 "movq (%0, %1, 4), %%mm1                \n\t" // line 4
485                 "movq %%mm1, %%mm2                      \n\t" // line 4
486                 "psubusb %%mm0, %%mm1                   \n\t"
487                 "psubusb %%mm2, %%mm0                   \n\t"
488                 "por %%mm1, %%mm0                       \n\t" // |l3 - l4|
489                 "movq (%%"REG_c"), %%mm3                \n\t" // line 5
490                 "movq (%%"REG_c", %1), %%mm4            \n\t" // line 6
491                 "movq %%mm3, %%mm5                      \n\t" // line 5
492                 "psubusb %%mm4, %%mm3                   \n\t"
493                 "psubusb %%mm5, %%mm4                   \n\t"
494                 "por %%mm4, %%mm3                       \n\t" // |l5 - l6|
495                 PAVGB(%%mm3, %%mm0)                           // (|l3 - l4| + |l5 - l6|)/2
496                 "movq %%mm2, %%mm1                      \n\t" // line 4
497                 "psubusb %%mm5, %%mm2                   \n\t"
498                 "movq %%mm2, %%mm4                      \n\t"
499                 "pcmpeqb %%mm7, %%mm2                   \n\t" // (l4 - l5) <= 0 ? -1 : 0
500                 "psubusb %%mm1, %%mm5                   \n\t"
501                 "por %%mm5, %%mm4                       \n\t" // |l4 - l5|
502                 "psubusb %%mm0, %%mm4                   \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
503                 "movq %%mm4, %%mm3                      \n\t" // d
504                 "movq %2, %%mm0                         \n\t"
505                 "paddusb %%mm0, %%mm0                   \n\t"
506                 "psubusb %%mm0, %%mm4                   \n\t"
507                 "pcmpeqb %%mm7, %%mm4                   \n\t" // d <= 2QP ? -1 : 0
508                 "psubusb "MANGLE(b01)", %%mm3           \n\t"
509                 "pand %%mm4, %%mm3                      \n\t" // d <= 2QP ? d-1 (saturated) : 0
510
511                 PAVGB(%%mm7, %%mm3)                           // d/2
512                 "movq %%mm3, %%mm1                      \n\t" // d/2
513                 PAVGB(%%mm7, %%mm3)                           // d/4
514                 PAVGB(%%mm1, %%mm3)                           // 3*d/8
515
// mm2 is an all-ones mask where l4 <= l5. XOR-ing a row with it before and
// after a saturating subtract/add flips the direction of the adjustment for
// those bytes while keeping the result clipped to the byte range.
516                 "movq (%0, %1, 4), %%mm0                \n\t" // line 4
517                 "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
518                 "psubusb %%mm3, %%mm0                   \n\t"
519                 "pxor %%mm2, %%mm0                      \n\t"
520                 "movq %%mm0, (%0, %1, 4)                \n\t" // line 4
521
522                 "movq (%%"REG_c"), %%mm0                \n\t" // line 5
523                 "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
524                 "paddusb %%mm3, %%mm0                   \n\t"
525                 "pxor %%mm2, %%mm0                      \n\t"
526                 "movq %%mm0, (%%"REG_c")                \n\t" // line 5
527
528                 PAVGB(%%mm7, %%mm1)                           // d/4
529
530                 "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
531                 "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
532                 "psubusb %%mm1, %%mm0                   \n\t"
533                 "pxor %%mm2, %%mm0                      \n\t"
534                 "movq %%mm0, (%%"REG_a", %1, 2)         \n\t" // line 3
535
536                 "movq (%%"REG_c", %1), %%mm0            \n\t" // line 6
537                 "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
538                 "paddusb %%mm1, %%mm0                   \n\t"
539                 "pxor %%mm2, %%mm0                      \n\t"
540                 "movq %%mm0, (%%"REG_c", %1)            \n\t" // line 6
541
542                 PAVGB(%%mm7, %%mm1)                           // d/8
543
544                 "movq (%%"REG_a", %1), %%mm0            \n\t" // line 2
545                 "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
546                 "psubusb %%mm1, %%mm0                   \n\t"
547                 "pxor %%mm2, %%mm0                      \n\t"
548                 "movq %%mm0, (%%"REG_a", %1)            \n\t" // line 2
549
550                 "movq (%%"REG_c", %1, 2), %%mm0         \n\t" // line 7
551                 "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
552                 "paddusb %%mm1, %%mm0                   \n\t"
553                 "pxor %%mm2, %%mm0                      \n\t"
554                 "movq %%mm0, (%%"REG_c", %1, 2)         \n\t" // line 7
555
556                 :
557                 : "r" (src), "r" ((long)stride), "m" (co->pQPb)
558                 : "%"REG_a, "%"REG_c
559         );
560 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
561
// Byte offsets of rows 1..7 relative to src (after src is advanced below).
562         const int l1= stride;
563         const int l2= stride + l1;
564         const int l3= stride + l2;
565         const int l4= stride + l3;
566         const int l5= stride + l4;
567         const int l6= stride + l5;
568         const int l7= stride + l6;
569 //        const int l8= stride + l7;
570 //        const int l9= stride + l8;
571         int x;
572
573         src+= stride*3;
// One iteration per column; src is incremented at the end of each iteration.
574         for(x=0; x<BLOCK_SIZE; x++)
575         {
// a, b, c: differences between rows 3/4, 4/5 and 5/6.
576                 int a= src[l3] - src[l4];
577                 int b= src[l4] - src[l5];
578                 int c= src[l5] - src[l6];
579
// d: how much the step across the row 4/5 boundary exceeds the average of the
// neighbouring steps, floored at 0.
580                 int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
581                 d= FFMAX(d, 0);
582
// Strict "<" here, whereas the asm path above tests "d <= 2QP".
583                 if(d < co->QP*2)
584                 {
585                         int v = d * FFSIGN(-b);
586
// Results are stored back into uint8_t without clamping (see the function
// header: the C version does not clip).
587                         src[l2] +=v>>3;
588                         src[l3] +=v>>2;
589                         src[l4] +=(3*v)>>3;
590                         src[l5] -=(3*v)>>3;
591                         src[l6] -=v>>2;
592                         src[l7] -=v>>3;
593
594                 }
595                 src++;
596         }
597 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
598 }
599
600 #ifndef HAVE_ALTIVEC
/**
 * "Default" vertical deblocking filter for one group of columns.
 *
 * Written in terms of the C fallback at the bottom of this function, with
 * p = src + 3*stride and ln = n*stride, each column is processed as:
 *
 *   middleEnergy = 5*(p[l5] - p[l4]) + 2*(p[l3] - p[l6])
 *   leftEnergy   = 5*(p[l3] - p[l2]) + 2*(p[l1] - p[l4])
 *   rightEnergy  = 5*(p[l7] - p[l6]) + 2*(p[l5] - p[l8])
 *
 * Only when |middleEnergy| < 8*QP a correction d is derived from
 * |middleEnergy| - MIN(|leftEnergy|, |rightEnergy|), scaled by 5/64, given the
 * sign of -middleEnergy and limited to the range between 0 and
 * q = (p[l4] - p[l5])/2; it is then subtracted from p[l4] and added to p[l5].
 * Rows p[l1]..p[l8] (src + 4*stride .. src + 11*stride) are read, but only
 * p[l4] and p[l5] (src + 7*stride and src + 8*stride) are written.
 *
 * One of three implementations is selected at compile time:
 *  - HAVE_MMX2 or HAVE_3DNOW: byte-wise version built from PAVGB; its own
 *    comments mark several intermediate values as approximate ("~").
 *  - HAVE_MMX: 16-bit word arithmetic on the unpacked low/high halves.
 *  - otherwise: plain C loop over BLOCK_SIZE columns.
 * Both asm versions load and store 8 bytes per row with movq.
 * NOTE(review): the asm versions are presumably meant to match the C loop;
 * bit-exactness between the three is not established by this file section.
 *
 * @param src    pixel data; vertically adjacent pixels are stride bytes apart
 * @param stride distance in bytes between two rows
 * @param c      context; the C version reads c->QP, the asm versions read
 *               c->pQPb as a memory operand (presumably QP replicated per
 *               byte - TODO confirm against the code that fills it)
 */
601 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
602 {
603 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// Disabled debugging scaffold: keeps a copy of rows 7 and 8 in tmp[], apparently
// for the (also disabled) statistics code that follows the asm statement below.
604 /*
605         uint8_t tmp[16];
606         const int l1= stride;
607         const int l2= stride + l1;
608         const int l3= stride + l2;
609         const int l4= (int)tmp - (int)src - stride*3;
610         const int l5= (int)tmp - (int)src - stride*3 + 8;
611         const int l6= stride*3 + l3;
612         const int l7= stride + l6;
613         const int l8= stride + l7;
614
615         memcpy(tmp, src+stride*7, 8);
616         memcpy(tmp+8, src+stride*8, 8);
617 */
        // From here on "line n" in the asm comments means src + n*stride after this
        // adjustment, i.e. row n+4 of the caller's pointer.
618         src+= stride*4;
619         asm volatile(
620
621 #if 0 //slightly more accurate and slightly slower (variant compiled out)
622                 "pxor %%mm7, %%mm7                      \n\t" // 0
623                 "lea (%0, %1), %%"REG_a"                \n\t"
624                 "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
// line number, followed by two equivalent ways of addressing each line
625 //      0       1       2       3       4       5       6       7
626 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
627 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
628
629
630                 "movq (%0, %1, 2), %%mm0                \n\t" // l2
631                 "movq (%0), %%mm1                       \n\t" // l0
632                 "movq %%mm0, %%mm2                      \n\t" // l2
633                 PAVGB(%%mm7, %%mm0)                           // ~l2/2
634                 PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
635                 PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8
636
637                 "movq (%%"REG_a"), %%mm1                \n\t" // l1
638                 "movq (%%"REG_a", %1, 2), %%mm3         \n\t" // l3
639                 "movq %%mm1, %%mm4                      \n\t" // l1
640                 PAVGB(%%mm7, %%mm1)                           // ~l1/2
641                 PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
642                 PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8
643
644                 "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
645                 "psubusb %%mm1, %%mm0                   \n\t"
646                 "psubusb %%mm4, %%mm1                   \n\t"
647                 "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
648 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
649
650                 "movq (%0, %1, 4), %%mm0                \n\t" // l4
651                 "movq %%mm0, %%mm4                      \n\t" // l4
652                 PAVGB(%%mm7, %%mm0)                           // ~l4/2
653                 PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
654                 PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8
655
656                 "movq (%%"REG_c"), %%mm2                \n\t" // l5
657                 "movq %%mm3, %%mm5                      \n\t" // l3
658                 PAVGB(%%mm7, %%mm3)                           // ~l3/2
659                 PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
660                 PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8
661
662                 "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
663                 "psubusb %%mm3, %%mm0                   \n\t"
664                 "psubusb %%mm6, %%mm3                   \n\t"
665                 "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
666                 "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
667 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
668
669                 "movq (%%"REG_c", %1), %%mm6            \n\t" // l6
670                 "movq %%mm6, %%mm5                      \n\t" // l6
671                 PAVGB(%%mm7, %%mm6)                           // ~l6/2
672                 PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
673                 PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8
674
675                 "movq (%%"REG_c", %1, 2), %%mm5         \n\t" // l7
676                 "movq %%mm2, %%mm4                      \n\t" // l5
677                 PAVGB(%%mm7, %%mm2)                           // ~l5/2
678                 PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
679                 PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8
680
681                 "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
682                 "psubusb %%mm2, %%mm6                   \n\t"
683                 "psubusb %%mm4, %%mm2                   \n\t"
684                 "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
685 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
686
687
688                 PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
689                 "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
690                 "paddusb "MANGLE(b01)", %%mm4           \n\t"
691                 "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
692                 "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
693                 "pand %%mm4, %%mm3                      \n\t"
694
695                 "movq %%mm3, %%mm1                      \n\t"
696 //                "psubusb "MANGLE(b01)", %%mm3           \n\t"
697                 PAVGB(%%mm7, %%mm3)
698                 PAVGB(%%mm7, %%mm3)
699                 "paddusb %%mm1, %%mm3                   \n\t"
700 //                "paddusb "MANGLE(b01)", %%mm3           \n\t"
701
702                 "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //l3
703                 "movq (%0, %1, 4), %%mm5                \n\t" //l4
704                 "movq (%0, %1, 4), %%mm4                \n\t" //l4
705                 "psubusb %%mm6, %%mm5                   \n\t"
706                 "psubusb %%mm4, %%mm6                   \n\t"
707                 "por %%mm6, %%mm5                       \n\t" // |l3-l4|
708                 "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
709                 "pxor %%mm6, %%mm0                      \n\t"
710                 "pand %%mm0, %%mm3                      \n\t"
711                 PMINUB(%%mm5, %%mm3, %%mm0)
712
713                 "psubusb "MANGLE(b01)", %%mm3           \n\t"
714                 PAVGB(%%mm7, %%mm3)
715
716                 "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
717                 "movq (%0, %1, 4), %%mm2                \n\t"
718                 "pxor %%mm6, %%mm0                      \n\t"
719                 "pxor %%mm6, %%mm2                      \n\t"
720                 "psubb %%mm3, %%mm0                     \n\t"
721                 "paddb %%mm3, %%mm2                     \n\t"
722                 "pxor %%mm6, %%mm0                      \n\t"
723                 "pxor %%mm6, %%mm2                      \n\t"
724                 "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
725                 "movq %%mm2, (%0, %1, 4)                \n\t"
726 #endif //0
727
                // Active variant. All differences are kept as unsigned bytes biased by 128.
                // mm6 is set to all ones; XORing a byte with it yields 255-x, so a following
                // PAVGB(a, 255-b) gives (a - b + 256)/2 as noted in the comments below.
728                 "lea (%0, %1), %%"REG_a"                \n\t"
729                 "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
// line number, followed by two equivalent ways of addressing each line
// (REG_c is only loaded further down, by the second lea)
730 //      0       1       2       3       4       5       6       7
731 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
732 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
733
734
735                 "movq (%%"REG_a", %1, 2), %%mm1         \n\t" // l3
736                 "movq (%0, %1, 4), %%mm0                \n\t" // l4
737                 "pxor %%mm6, %%mm1                      \n\t" // -l3-1
738                 PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
739 // mm1=-l3-1, mm0=128-q
740
741                 "movq (%%"REG_a", %1, 4), %%mm2         \n\t" // l5
742                 "movq (%%"REG_a", %1), %%mm3            \n\t" // l2
743                 "pxor %%mm6, %%mm2                      \n\t" // -l5-1
744                 "movq %%mm2, %%mm5                      \n\t" // -l5-1
745                 "movq "MANGLE(b80)", %%mm4              \n\t" // 128
746                 "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
747                 PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
748                 PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
749                 PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
750                 PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
751 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
752
753                 "movq (%%"REG_a"), %%mm2                \n\t" // l1
754                 "pxor %%mm6, %%mm2                      \n\t" // -l1-1
755                 PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
756                 PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
757                 "movq "MANGLE(b80)", %%mm3              \n\t" // 128
758                 PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
759                 PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
760                 PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
761 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
762
763                 PAVGB((%%REGc, %1), %%mm5)                    // (l6-l5+256)/2
764                 "movq (%%"REG_c", %1, 2), %%mm1         \n\t" // l7
765                 "pxor %%mm6, %%mm1                      \n\t" // -l7-1
766                 PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
767                 "movq "MANGLE(b80)", %%mm2              \n\t" // 128
768                 PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
769                 PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
770                 PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
771 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
772
                // Absolute value of the biased energies: take the unsigned maximum of the
                // value and its byte-wise negation (0 - x).
773                 "movq "MANGLE(b00)", %%mm1              \n\t" // 0
774                 "movq "MANGLE(b00)", %%mm5              \n\t" // 0
775                 "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
776                 "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
777                 PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
778                  PMAXUB(%%mm5, %%mm3)                         // 128 + |lenergy/16|
779                 PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16
780
781 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
782
783                 "movq "MANGLE(b00)", %%mm7              \n\t" // 0
784                 "movq %2, %%mm2                         \n\t" // QP
785                 PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
786                 "psubb %%mm6, %%mm2                     \n\t" // mm6 is still -1, so this adds 1 per byte
787
788                 "movq %%mm4, %%mm1                      \n\t"
789                 "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
790                 "pxor %%mm1, %%mm4                      \n\t"
791                 "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
792                 "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
793                 "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
794 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
795
796                 "movq %%mm4, %%mm3                      \n\t" // d
797                 "psubusb "MANGLE(b01)", %%mm4           \n\t"
798                 PAVGB(%%mm7, %%mm4)                           // d/32
799                 PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
800                 "paddb %%mm3, %%mm4                     \n\t" // 5d/64
801                 "pand %%mm2, %%mm4                      \n\t" // zero the correction where the QP test failed
802
803                 "movq "MANGLE(b80)", %%mm5              \n\t" // 128
804                 "psubb %%mm0, %%mm5                     \n\t" // q
805                 "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
806                 "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
807                 "pxor %%mm7, %%mm5                      \n\t"
808
809                 PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
810                 "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)
811
                // Apply the correction to lines 3 and 4. XORing with the sign mask mm1
                // before and after the paddb/psubb swaps add and subtract in the lanes
                // where the mask is set (~(~x + d) == x - d).
812                 "pand %%mm7, %%mm4                      \n\t"
813                 "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
814                 "movq (%0, %1, 4), %%mm2                \n\t"
815                 "pxor %%mm1, %%mm0                      \n\t"
816                 "pxor %%mm1, %%mm2                      \n\t"
817                 "paddb %%mm4, %%mm0                     \n\t"
818                 "psubb %%mm4, %%mm2                     \n\t"
819                 "pxor %%mm1, %%mm0                      \n\t"
820                 "pxor %%mm1, %%mm2                      \n\t"
821                 "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
822                 "movq %%mm2, (%0, %1, 4)                \n\t"
823
                // %0 = src, %1 = stride, %2 = c->pQPb.
                // NOTE(review): the two movq stores above write through src, but neither an
                // output memory operand nor a "memory" clobber is declared - confirm that
                // the compiler cannot cache pixel data across this statement.
824                 :
825                 : "r" (src), "r" ((long)stride), "m" (c->pQPb)
826                 : "%"REG_a, "%"REG_c
827         );
828
// Disabled reference/diagnostic code: a C version of the filter followed by
// statistics comparing the asm output against it (uses the tmp[] scaffold above).
829 /*
830         {
831         int x;
832         src-= stride;
833         for(x=0; x<BLOCK_SIZE; x++)
834         {
835                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
836                 if(FFABS(middleEnergy)< 8*QP)
837                 {
838                         const int q=(src[l4] - src[l5])/2;
839                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
840                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
841
842                         int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
843                         d= FFMAX(d, 0);
844
845                         d= (5*d + 32) >> 6;
846                         d*= FFSIGN(-middleEnergy);
847
848                         if(q>0)
849                         {
850                                 d= d<0 ? 0 : d;
851                                 d= d>q ? q : d;
852                         }
853                         else
854                         {
855                                 d= d>0 ? 0 : d;
856                                 d= d<q ? q : d;
857                         }
858
859                         src[l4]-= d;
860                         src[l5]+= d;
861                 }
862                 src++;
863         }
864 src-=8;
865         for(x=0; x<8; x++)
866         {
867                 int y;
868                 for(y=4; y<6; y++)
869                 {
870                         int d= src[x+y*stride] - tmp[x+(y-4)*8];
871                         int ad= FFABS(d);
872                         static int max=0;
873                         static int sum=0;
874                         static int num=0;
875                         static int bias=0;
876
877                         if(max<ad) max=ad;
878                         sum+= ad>3 ? 1 : 0;
879                         if(ad>3)
880                         {
881                                 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
882                         }
883                         if(y==4) bias+=d;
884                         num++;
885                         if(num%1000000 == 0)
886                         {
887                                 av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
888                         }
889                 }
890         }
891 }
892 */
// Plain MMX version: every line is unpacked into 16-bit words (L = low 4 columns,
// H = high 4 columns) so the energies are computed with word arithmetic.
// NOTE(review): this arm is only compiled when HAVE_MMX2 is NOT defined (see the
// outer #if), so the "#ifdef HAVE_MMX2" sub-branches inside it look unreachable here.
893 #elif defined (HAVE_MMX)
894         src+= stride*4;
895         asm volatile(
896                 "pxor %%mm7, %%mm7                      \n\t"
                // REG_c = address 40..47 bytes below the stack pointer, rounded down to a
                // multiple of 8 by ALIGN_MASK; REG_c+0..31 is used as scratch:
                //   0/8   = left energy (L/H), 16/24 = L3-L4 / H3-H4.
                // NOTE(review): the stack pointer itself is not moved, so this scratch area
                // is memory the compiler does not know about; whether that is safe depends
                // on the ABI and compiler (red zone use, signal handlers). A C local passed
                // in as an operand would avoid the question.
897                 "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
898                 "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
// NOTE(review): the table below does not match this code path: here REG_a is
// loaded with %0+2*%1, %0 itself is later advanced to line 3, and REG_c holds the
// scratch address (edx is not used at all).
899 //      0       1       2       3       4       5       6       7
900 //      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 edx+%1  edx+2%1
901 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1
902
903                 "movq (%0), %%mm0                       \n\t"
904                 "movq %%mm0, %%mm1                      \n\t"
905                 "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
906                 "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
907
908                 "movq (%0, %1), %%mm2                   \n\t"
909                 "lea (%0, %1, 2), %%"REG_a"             \n\t" // REG_a = line 2
910                 "movq %%mm2, %%mm3                      \n\t"
911                 "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
912                 "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
913
914                 "movq (%%"REG_a"), %%mm4                \n\t"
915                 "movq %%mm4, %%mm5                      \n\t"
916                 "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
917                 "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
918
919                 "paddw %%mm0, %%mm0                     \n\t" // 2L0
920                 "paddw %%mm1, %%mm1                     \n\t" // 2H0
921                 "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
922                 "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
923                 "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
924                 "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
925
926                 "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
927                 "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
928                 "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
929                 "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
930
931                 "movq (%%"REG_a", %1), %%mm2            \n\t"
932                 "movq %%mm2, %%mm3                      \n\t"
933                 "punpcklbw %%mm7, %%mm2                 \n\t" // L3
934                 "punpckhbw %%mm7, %%mm3                 \n\t" // H3
935
936                 "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
937                 "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
938                 "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
939                 "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
                // spill the left energy to scratch; it is reloaded once the right energy is done
940                 "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
941                 "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
942
943                 "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
944                 "movq %%mm0, %%mm1                      \n\t"
945                 "punpcklbw %%mm7, %%mm0                 \n\t" // L4
946                 "punpckhbw %%mm7, %%mm1                 \n\t" // H4
947
948                 "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
949                 "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
950                 "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
951                 "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
952                 "paddw %%mm4, %%mm4                     \n\t" // 2L2
953                 "paddw %%mm5, %%mm5                     \n\t" // 2H2
954                 "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
955                 "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
956
                // %0 is overwritten here and points at line 3 from now on
                // (this is why src is declared as a "+r" operand below)
957                 "lea (%%"REG_a", %1), %0                \n\t"
958                 "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
959                 "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
960                 "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
961                 "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
962 //50 opcodes so far
963                 "movq (%0, %1, 2), %%mm2                \n\t"
964                 "movq %%mm2, %%mm3                      \n\t"
965                 "punpcklbw %%mm7, %%mm2                 \n\t" // L5
966                 "punpckhbw %%mm7, %%mm3                 \n\t" // H5
967                 "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
968                 "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
969                 "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
970                 "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
971
                // line 6 is loaded twice because mm6 is the only free register at this point
972                 "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
973                 "punpcklbw %%mm7, %%mm6                 \n\t" // L6
974                 "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
975                 "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
976                 "punpckhbw %%mm7, %%mm6                 \n\t" // H6
977                 "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
978
979                 "paddw %%mm0, %%mm0                     \n\t" // 2L4
980                 "paddw %%mm1, %%mm1                     \n\t" // 2H4
981                 "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
982                 "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
983
984                 "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
985                 "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
986                 "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
987                 "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
988
989                 "movq (%0, %1, 4), %%mm2                \n\t"
990                 "movq %%mm2, %%mm3                      \n\t"
991                 "punpcklbw %%mm7, %%mm2                 \n\t" // L7
992                 "punpckhbw %%mm7, %%mm3                 \n\t" // H7
993
994                 "paddw %%mm2, %%mm2                     \n\t" // 2L7
995                 "paddw %%mm3, %%mm3                     \n\t" // 2H7
996                 "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
997                 "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
998
999                 "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
1000                 "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
1001
                // absolute values of the right (mm0/mm1) and left (mm2/mm3) energies
1002 #ifdef HAVE_MMX2
1003                 "movq %%mm7, %%mm6                      \n\t" // 0
1004                 "psubw %%mm0, %%mm6                     \n\t"
1005                 "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1006                 "movq %%mm7, %%mm6                      \n\t" // 0
1007                 "psubw %%mm1, %%mm6                     \n\t"
1008                 "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1009                 "movq %%mm7, %%mm6                      \n\t" // 0
1010                 "psubw %%mm2, %%mm6                     \n\t"
1011                 "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1012                 "movq %%mm7, %%mm6                      \n\t" // 0
1013                 "psubw %%mm3, %%mm6                     \n\t"
1014                 "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1015 #else
                // abs(x) without pmaxsw: mask = (0 > x); x = (x ^ mask) - mask
1016                 "movq %%mm7, %%mm6                      \n\t" // 0
1017                 "pcmpgtw %%mm0, %%mm6                   \n\t"
1018                 "pxor %%mm6, %%mm0                      \n\t"
1019                 "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1020                 "movq %%mm7, %%mm6                      \n\t" // 0
1021                 "pcmpgtw %%mm1, %%mm6                   \n\t"
1022                 "pxor %%mm6, %%mm1                      \n\t"
1023                 "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1024                 "movq %%mm7, %%mm6                      \n\t" // 0
1025                 "pcmpgtw %%mm2, %%mm6                   \n\t"
1026                 "pxor %%mm6, %%mm2                      \n\t"
1027                 "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1028                 "movq %%mm7, %%mm6                      \n\t" // 0
1029                 "pcmpgtw %%mm3, %%mm6                   \n\t"
1030                 "pxor %%mm6, %%mm3                      \n\t"
1031                 "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1032 #endif
1033
                // mm0/mm1 = MIN(|left energy|, |right energy|) for the low/high half
1034 #ifdef HAVE_MMX2
1035                 "pminsw %%mm2, %%mm0                    \n\t"
1036                 "pminsw %%mm3, %%mm1                    \n\t"
1037 #else
                // min(a, b) = a - saturating_sub(a, b)
1038                 "movq %%mm0, %%mm6                      \n\t"
1039                 "psubusw %%mm2, %%mm6                   \n\t"
1040                 "psubw %%mm6, %%mm0                     \n\t"
1041                 "movq %%mm1, %%mm6                      \n\t"
1042                 "psubusw %%mm3, %%mm6                   \n\t"
1043                 "psubw %%mm6, %%mm1                     \n\t"
1044 #endif
1045
                // only the first 4 bytes of the QP operand are loaded and widened to words;
                // the same 4 words are then used for both the low and the high half
1046                 "movd %2, %%mm2                         \n\t" // QP
1047                 "punpcklbw %%mm7, %%mm2                 \n\t"
1048
                // mm6/mm7 keep the sign masks of the middle energy (low/high) until the end
1049                 "movq %%mm7, %%mm6                      \n\t" // 0
1050                 "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1051                 "pxor %%mm6, %%mm4                      \n\t"
1052                 "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1053                 "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1054                 "pxor %%mm7, %%mm5                      \n\t"
1055                 "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1056 // 100 opcodes
1057                 "psllw $3, %%mm2                        \n\t" // 8QP
1058                 "movq %%mm2, %%mm3                      \n\t" // 8QP
1059                 "pcmpgtw %%mm4, %%mm2                   \n\t"
1060                 "pcmpgtw %%mm5, %%mm3                   \n\t"
1061                 "pand %%mm2, %%mm4                      \n\t" // keep |middle energy| only where it is < 8QP
1062                 "pand %%mm3, %%mm5                      \n\t"
1063
1064
1065                 "psubusw %%mm0, %%mm4                   \n\t" // d for the low 4 columns (saturates at 0)
1066                 "psubusw %%mm1, %%mm5                   \n\t" // d for the high 4 columns (saturates at 0)
1067
1068
                // d = (5*d + 32) >> 6
1069                 "movq "MANGLE(w05)", %%mm2              \n\t" // 5
1070                 "pmullw %%mm2, %%mm4                    \n\t"
1071                 "pmullw %%mm2, %%mm5                    \n\t"
1072                 "movq "MANGLE(w20)", %%mm2              \n\t" // 32
1073                 "paddw %%mm2, %%mm4                     \n\t"
1074                 "paddw %%mm2, %%mm5                     \n\t"
1075                 "psrlw $6, %%mm4                        \n\t"
1076                 "psrlw $6, %%mm5                        \n\t"
1077
1078                 "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
1079                 "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
1080
1081                 "pxor %%mm2, %%mm2                      \n\t"
1082                 "pxor %%mm3, %%mm3                      \n\t"
1083
1084                 "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
1085                 "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
1086                 "pxor %%mm2, %%mm0                      \n\t"
1087                 "pxor %%mm3, %%mm1                      \n\t"
1088                 "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
1089                 "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
1090                 "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
1091                 "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
1092
                // mm2/mm3 = sign(L3-L4) ^ sign(middle energy); d is zeroed where that mask is clear
1093                 "pxor %%mm6, %%mm2                      \n\t"
1094                 "pxor %%mm7, %%mm3                      \n\t"
1095                 "pand %%mm2, %%mm4                      \n\t"
1096                 "pand %%mm3, %%mm5                      \n\t"
1097
                // limit d to |L3 - L4|/2
1098 #ifdef HAVE_MMX2
1099                 "pminsw %%mm0, %%mm4                    \n\t"
1100                 "pminsw %%mm1, %%mm5                    \n\t"
1101 #else
1102                 "movq %%mm4, %%mm2                      \n\t"
1103                 "psubusw %%mm0, %%mm2                   \n\t"
1104                 "psubw %%mm2, %%mm4                     \n\t"
1105                 "movq %%mm5, %%mm2                      \n\t"
1106                 "psubusw %%mm1, %%mm2                   \n\t"
1107                 "psubw %%mm2, %%mm5                     \n\t"
1108 #endif
                // negate d where the middle-energy sign mask is set, pack the 8 words to
                // signed bytes, then add to line 3 and subtract from line 4
1109                 "pxor %%mm6, %%mm4                      \n\t"
1110                 "pxor %%mm7, %%mm5                      \n\t"
1111                 "psubw %%mm6, %%mm4                     \n\t"
1112                 "psubw %%mm7, %%mm5                     \n\t"
1113                 "packsswb %%mm5, %%mm4                  \n\t"
1114                 "movq (%0), %%mm0                       \n\t"
1115                 "paddb   %%mm4, %%mm0                   \n\t"
1116                 "movq %%mm0, (%0)                       \n\t"
1117                 "movq (%0, %1), %%mm0                   \n\t"
1118                 "psubb %%mm4, %%mm0                     \n\t"
1119                 "movq %%mm0, (%0, %1)                   \n\t"
1120
                // %0 = src (read-write, advanced by the asm), %1 = stride, %2 = c->pQPb.
                // NOTE(review): as in the MMX2/3DNow arm, the stores through %0 and into the
                // scratch area are not covered by a "memory" clobber - confirm this is safe.
1121                 : "+r" (src)
1122                 : "r" ((long)stride), "m" (c->pQPb)
1123                 : "%"REG_a, "%"REG_c
1124         );
1125 #else // neither MMX2/3DNow nor plain MMX: portable C reference implementation
        // byte offsets of rows 1..8 relative to the adjusted src
1126         const int l1= stride;
1127         const int l2= stride + l1;
1128         const int l3= stride + l2;
1129         const int l4= stride + l3;
1130         const int l5= stride + l4;
1131         const int l6= stride + l5;
1132         const int l7= stride + l6;
1133         const int l8= stride + l7;
1134 //        const int l9= stride + l8;
1135         int x;
        // src[l4] / src[l5] below are rows 7 and 8 of the caller's pointer
1136         src+= stride*3;
1137         for(x=0; x<BLOCK_SIZE; x++)
1138         {
1139                 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
                // leave the column untouched unless the step across l4/l5 is small relative to QP
1140                 if(FFABS(middleEnergy) < 8*c->QP)
1141                 {
                        // q: half the l4-l5 difference; it bounds the correction below
1142                         const int q=(src[l4] - src[l5])/2;
1143                         const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1144                         const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1145
                        // how much the middle energy exceeds the smaller neighbouring energy, never negative
1146                         int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1147                         d= FFMAX(d, 0);
1148
                        // scale by 5/64 with rounding, then give it the sign opposite to middleEnergy
1149                         d= (5*d + 32) >> 6;
1150                         d*= FFSIGN(-middleEnergy);
1151
                        // clamp d to the interval between 0 and q (whichever side of 0 q is on)
1152                         if(q>0)
1153                         {
1154                                 d= d<0 ? 0 : d;
1155                                 d= d>q ? q : d;
1156                         }
1157                         else
1158                         {
1159                                 d= d>0 ? 0 : d;
1160                                 d= d<q ? q : d;
1161                         }
1162
                        // move the two rows next to the edge by d in opposite directions
1163                         src[l4]-= d;
1164                         src[l5]+= d;
1165                 }
                // next column
1166                 src++;
1167         }
1168 #endif // MMX2/3DNow asm, plain MMX asm, or C fallback
1169 }
1170 #endif //HAVE_ALTIVEC
1171
1172 #ifndef HAVE_ALTIVEC
1173 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1174 {
     /*
      * Deringing filter, applied in place to the pixels at src.
      *
      * src    - pixel at row 0, column 0 of the area described below
      * stride - distance in bytes between vertically adjacent pixels
      * c      - context; the fields QP, pQPb and pQPb2 are used here
      *
      * Portable path (the #else branch):
      *  1. find min and max over rows 1..8, columns 1..8;
      *  2. return without changing anything if max - min < deringThreshold;
      *  3. mark every pixel whose 3x3 neighbourhood lies entirely above, or
      *     entirely not above, avg = (min + max + 1) >> 1;
      *  4. replace each marked pixel of rows/columns 1..8 by a 3x3 weighted
      *     average (weights 1 2 1 / 2 4 2 / 1 2 1, rounded), moved at most
      *     c->QP/2 + 1 away from the value the pixel had before.
      * That path reads rows/columns 0..9 and writes only rows/columns 1..8.
      *
      * NOTE(review): the MMX2/3DNow path is presumably meant to produce the same
      * result; that equivalence has not been verified here.
      */
1175 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
             /*
              * NOTE(review) on the asm statement below:
              *  - it stores to operand %3 (c->pQPb2) and to the pixels at src, yet
              *    declares no output operands and no "memory" clobber;
              *  - it uses 16 bytes of scratch at an 8-byte aligned address just
              *    below the stack pointer (lea -24(REG_SP) / and ALIGN_MASK)
              *    without reserving that space.
              * Confirm both are safe for every supported compiler and ABI.
              */
1176         asm volatile(
1177                 "pxor %%mm6, %%mm6                      \n\t"
1178                 "pcmpeqb %%mm7, %%mm7                   \n\t"
1179                 "movq %2, %%mm0                         \n\t"
1180                 "punpcklbw %%mm6, %%mm0                 \n\t"
1181                 "psrlw $1, %%mm0                        \n\t"
1182                 "psubw %%mm7, %%mm0                     \n\t"
1183                 "packuswb %%mm0, %%mm0                  \n\t"
1184                 "movq %%mm0, %3                         \n\t"
1185
1186                 "lea (%0, %1), %%"REG_a"                \n\t"
1187                 "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1188
1189 //        0        1        2        3        4        5        6        7        8        9
1190 //        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
1191
1192 #undef FIND_MIN_MAX
1193 #ifdef HAVE_MMX2
1194 #define REAL_FIND_MIN_MAX(addr)\
1195                 "movq " #addr ", %%mm0                  \n\t"\
1196                 "pminub %%mm0, %%mm7                    \n\t"\
1197                 "pmaxub %%mm0, %%mm6                    \n\t"
1198 #else
1199 #define REAL_FIND_MIN_MAX(addr)\
1200                 "movq " #addr ", %%mm0                  \n\t"\
1201                 "movq %%mm7, %%mm1                      \n\t"\
1202                 "psubusb %%mm0, %%mm6                   \n\t"\
1203                 "paddb %%mm0, %%mm6                     \n\t"\
1204                 "psubusb %%mm0, %%mm1                   \n\t"\
1205                 "psubb %%mm1, %%mm7                     \n\t"
1206 #endif
1207 #define FIND_MIN_MAX(addr)  REAL_FIND_MIN_MAX(addr)
1208
1209 FIND_MIN_MAX((%%REGa))
1210 FIND_MIN_MAX((%%REGa, %1))
1211 FIND_MIN_MAX((%%REGa, %1, 2))
1212 FIND_MIN_MAX((%0, %1, 4))
1213 FIND_MIN_MAX((%%REGd))
1214 FIND_MIN_MAX((%%REGd, %1))
1215 FIND_MIN_MAX((%%REGd, %1, 2))
1216 FIND_MIN_MAX((%0, %1, 8))
1217
1218                 "movq %%mm7, %%mm4                      \n\t"
1219                 "psrlq $8, %%mm7                        \n\t"
1220 #ifdef HAVE_MMX2
1221                 "pminub %%mm4, %%mm7                    \n\t" // min of pixels
1222                 "pshufw $0xF9, %%mm7, %%mm4             \n\t"
1223                 "pminub %%mm4, %%mm7                    \n\t" // min of pixels
1224                 "pshufw $0xFE, %%mm7, %%mm4             \n\t"
1225                 "pminub %%mm4, %%mm7                    \n\t"
1226 #else
1227                 "movq %%mm7, %%mm1                      \n\t"
1228                 "psubusb %%mm4, %%mm1                   \n\t"
1229                 "psubb %%mm1, %%mm7                     \n\t"
1230                 "movq %%mm7, %%mm4                      \n\t"
1231                 "psrlq $16, %%mm7                       \n\t"
1232                 "movq %%mm7, %%mm1                      \n\t"
1233                 "psubusb %%mm4, %%mm1                   \n\t"
1234                 "psubb %%mm1, %%mm7                     \n\t"
1235                 "movq %%mm7, %%mm4                      \n\t"
1236                 "psrlq $32, %%mm7                       \n\t"
1237                 "movq %%mm7, %%mm1                      \n\t"
1238                 "psubusb %%mm4, %%mm1                   \n\t"
1239                 "psubb %%mm1, %%mm7                     \n\t"
1240 #endif
1241
1242
1243                 "movq %%mm6, %%mm4                      \n\t"
1244                 "psrlq $8, %%mm6                        \n\t"
1245 #ifdef HAVE_MMX2
1246                 "pmaxub %%mm4, %%mm6                    \n\t" // max of pixels
1247                 "pshufw $0xF9, %%mm6, %%mm4             \n\t"
1248                 "pmaxub %%mm4, %%mm6                    \n\t"
1249                 "pshufw $0xFE, %%mm6, %%mm4             \n\t"
1250                 "pmaxub %%mm4, %%mm6                    \n\t"
1251 #else
1252                 "psubusb %%mm4, %%mm6                   \n\t"
1253                 "paddb %%mm4, %%mm6                     \n\t"
1254                 "movq %%mm6, %%mm4                      \n\t"
1255                 "psrlq $16, %%mm6                       \n\t"
1256                 "psubusb %%mm4, %%mm6                   \n\t"
1257                 "paddb %%mm4, %%mm6                     \n\t"
1258                 "movq %%mm6, %%mm4                      \n\t"
1259                 "psrlq $32, %%mm6                       \n\t"
1260                 "psubusb %%mm4, %%mm6                   \n\t"
1261                 "paddb %%mm4, %%mm6                     \n\t"
1262 #endif
1263                 "movq %%mm6, %%mm0                      \n\t" // max
1264                 "psubb %%mm7, %%mm6                     \n\t" // max - min
1265                 "movd %%mm6, %%ecx                      \n\t"
1266                 "cmpb "MANGLE(deringThreshold)", %%cl   \n\t"
1267                 " jb 1f                                 \n\t"
1268                 "lea -24(%%"REG_SP"), %%"REG_c"         \n\t"
1269                 "and "ALIGN_MASK", %%"REG_c"            \n\t"
1270                 PAVGB(%%mm0, %%mm7)                           // a=(max + min)/2
1271                 "punpcklbw %%mm7, %%mm7                 \n\t"
1272                 "punpcklbw %%mm7, %%mm7                 \n\t"
1273                 "punpcklbw %%mm7, %%mm7                 \n\t"
1274                 "movq %%mm7, (%%"REG_c")                \n\t"
1275
1276                 "movq (%0), %%mm0                       \n\t" // L10
1277                 "movq %%mm0, %%mm1                      \n\t" // L10
1278                 "movq %%mm0, %%mm2                      \n\t" // L10
1279                 "psllq $8, %%mm1                        \n\t"
1280                 "psrlq $8, %%mm2                        \n\t"
1281                 "movd -4(%0), %%mm3                     \n\t"
1282                 "movd 8(%0), %%mm4                      \n\t"
1283                 "psrlq $24, %%mm3                       \n\t"
1284                 "psllq $56, %%mm4                       \n\t"
1285                 "por %%mm3, %%mm1                       \n\t" // L00
1286                 "por %%mm4, %%mm2                       \n\t" // L20
1287                 "movq %%mm1, %%mm3                      \n\t" // L00
1288                 PAVGB(%%mm2, %%mm1)                           // (L20 + L00)/2
1289                 PAVGB(%%mm0, %%mm1)                           // (L20 + L00 + 2L10)/4
1290                 "psubusb %%mm7, %%mm0                   \n\t"
1291                 "psubusb %%mm7, %%mm2                   \n\t"
1292                 "psubusb %%mm7, %%mm3                   \n\t"
1293                 "pcmpeqb "MANGLE(b00)", %%mm0           \n\t" // L10 > a ? 0 : -1
1294                 "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L20 > a ? 0 : -1
1295                 "pcmpeqb "MANGLE(b00)", %%mm3           \n\t" // L00 > a ? 0 : -1
1296                 "paddb %%mm2, %%mm0                     \n\t"
1297                 "paddb %%mm3, %%mm0                     \n\t"
1298
1299                 "movq (%%"REG_a"), %%mm2                \n\t" // L11
1300                 "movq %%mm2, %%mm3                      \n\t" // L11
1301                 "movq %%mm2, %%mm4                      \n\t" // L11
1302                 "psllq $8, %%mm3                        \n\t"
1303                 "psrlq $8, %%mm4                        \n\t"
1304                 "movd -4(%%"REG_a"), %%mm5              \n\t"
1305                 "movd 8(%%"REG_a"), %%mm6               \n\t"
1306                 "psrlq $24, %%mm5                       \n\t"
1307                 "psllq $56, %%mm6                       \n\t"
1308                 "por %%mm5, %%mm3                       \n\t" // L01
1309                 "por %%mm6, %%mm4                       \n\t" // L21
1310                 "movq %%mm3, %%mm5                      \n\t" // L01
1311                 PAVGB(%%mm4, %%mm3)                           // (L21 + L01)/2
1312                 PAVGB(%%mm2, %%mm3)                           // (L21 + L01 + 2L11)/4
1313                 "psubusb %%mm7, %%mm2                   \n\t"
1314                 "psubusb %%mm7, %%mm4                   \n\t"
1315                 "psubusb %%mm7, %%mm5                   \n\t"
1316                 "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L11 > a ? 0 : -1
1317                 "pcmpeqb "MANGLE(b00)", %%mm4           \n\t" // L21 > a ? 0 : -1
1318                 "pcmpeqb "MANGLE(b00)", %%mm5           \n\t" // L01 > a ? 0 : -1
1319                 "paddb %%mm4, %%mm2                     \n\t"
1320                 "paddb %%mm5, %%mm2                     \n\t"
1321 // 0, 2, 3, 1
1322 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1323                 "movq " #src ", " #sx "                 \n\t" /* src[0] */\
1324                 "movq " #sx ", " #lx "                  \n\t" /* src[0] */\
1325                 "movq " #sx ", " #t0 "                  \n\t" /* src[0] */\
1326                 "psllq $8, " #lx "                      \n\t"\
1327                 "psrlq $8, " #t0 "                      \n\t"\
1328                 "movd -4" #src ", " #t1 "               \n\t"\
1329                 "psrlq $24, " #t1 "                     \n\t"\
1330                 "por " #t1 ", " #lx "                   \n\t" /* src[-1] */\
1331                 "movd 8" #src ", " #t1 "                \n\t"\
1332                 "psllq $56, " #t1 "                     \n\t"\
1333                 "por " #t1 ", " #t0 "                   \n\t" /* src[+1] */\
1334                 "movq " #lx ", " #t1 "                  \n\t" /* src[-1] */\
1335                 PAVGB(t0, lx)                                 /* (src[-1] + src[+1])/2 */\
1336                 PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
1337                 PAVGB(lx, pplx)                                     \
1338                 "movq " #lx ", 8(%%"REG_c")             \n\t"\
1339                 "movq (%%"REG_c"), " #lx "              \n\t"\
1340                 "psubusb " #lx ", " #t1 "               \n\t"\
1341                 "psubusb " #lx ", " #t0 "               \n\t"\
1342                 "psubusb " #lx ", " #sx "               \n\t"\
1343                 "movq "MANGLE(b00)", " #lx "            \n\t"\
1344                 "pcmpeqb " #lx ", " #t1 "               \n\t" /* src[-1] > a ? 0 : -1*/\
1345                 "pcmpeqb " #lx ", " #t0 "               \n\t" /* src[+1] > a ? 0 : -1*/\
1346                 "pcmpeqb " #lx ", " #sx "               \n\t" /* src[0]  > a ? 0 : -1*/\
1347                 "paddb " #t1 ", " #t0 "                 \n\t"\
1348                 "paddb " #t0 ", " #sx "                 \n\t"\
1349 \
1350                 PAVGB(plx, pplx)                              /* filtered */\
1351                 "movq " #dst ", " #t0 "                 \n\t" /* dst */\
1352                 "movq " #t0 ", " #t1 "                  \n\t" /* dst */\
1353                 "psubusb %3, " #t0 "                    \n\t"\
1354                 "paddusb %3, " #t1 "                    \n\t"\
1355                 PMAXUB(t0, pplx)\
1356                 PMINUB(t1, pplx, t0)\
1357                 "paddb " #sx ", " #ppsx "               \n\t"\
1358                 "paddb " #psx ", " #ppsx "              \n\t"\
1359                 "#paddb "MANGLE(b02)", " #ppsx "        \n\t"\
1360                 "pand "MANGLE(b08)", " #ppsx "          \n\t"\
1361                 "pcmpeqb " #lx ", " #ppsx "             \n\t"\
1362                 "pand " #ppsx ", " #pplx "              \n\t"\
1363                 "pandn " #dst ", " #ppsx "              \n\t"\
1364                 "por " #pplx ", " #ppsx "               \n\t"\
1365                 "movq " #ppsx ", " #dst "               \n\t"\
1366                 "movq 8(%%"REG_c"), " #lx "             \n\t"
1367
1368 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1369    REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1370 /*
1371 0000000
1372 1111111
1373
1374 1111110
1375 1111101
1376 1111100
1377 1111011
1378 1111010
1379 1111001
1380
1381 1111000
1382 1110111
1383
1384 */
1385 //DERING_CORE(dst          ,src            ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1386 DERING_CORE((%%REGa)       ,(%%REGa, %1)   ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1387 DERING_CORE((%%REGa, %1)   ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1388 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4)    ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1389 DERING_CORE((%0, %1, 4)    ,(%%REGd)       ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1390 DERING_CORE((%%REGd)       ,(%%REGd, %1)   ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1391 DERING_CORE((%%REGd, %1)   ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1392 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8)    ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1393 DERING_CORE((%0, %1, 8)    ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1394
1395                 "1:                        \n\t"
1396                 : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2)
1397                 : "%"REG_a, "%"REG_d, "%"REG_c
1398         );
1399 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1400         int y;
1401         int min=255;
1402         int max=0;
1403         int avg;
1404         uint8_t *p;
1405         int s[10]; // one neighbourhood bit mask per row 0..9
1406         const int QP2= c->QP/2 + 1; // largest change allowed per pixel
1407
             // Step 1: min / max over rows 1..8, columns 1..8.
1408         for(y=1; y<9; y++)
1409         {
1410                 int x;
1411                 p= src + stride*y;
1412                 for(x=1; x<9; x++)
1413                 {
1414                         p++;
1415                         if(*p > max) max= *p;
1416                         if(*p < min) min= *p;
1417                 }
1418         }
1419         avg= (min + max + 1)>>1;
1420
1421         if(max - min <deringThreshold) return; // Step 2: too little contrast, leave the block untouched
1422
             // Step 3a: per row, bit x of t says whether column x is above avg.
1423         for(y=0; y<10; y++)
1424         {
1425                 int t = 0;
1426
1427                 if(src[stride*y + 0] > avg) t+= 1;
1428                 if(src[stride*y + 1] > avg) t+= 2;
1429                 if(src[stride*y + 2] > avg) t+= 4;
1430                 if(src[stride*y + 3] > avg) t+= 8;
1431                 if(src[stride*y + 4] > avg) t+= 16;
1432                 if(src[stride*y + 5] > avg) t+= 32;
1433                 if(src[stride*y + 6] > avg) t+= 64;
1434                 if(src[stride*y + 7] > avg) t+= 128;
1435                 if(src[stride*y + 8] > avg) t+= 256;
1436                 if(src[stride*y + 9] > avg) t+= 512;
1437
1438                 t |= (~t)<<16; // bit x+16: column x is NOT above avg
1439                 t &= (t<<1) & (t>>1); // for x in 1..8: keep bit x (x+16) only if columns x-1, x, x+1 all are (are not) above avg
1440                 s[y] = t;
1441         }
1442
             // Step 3b: combine three rows; afterwards bit x of s[y-1] (x in 1..8) is set
             // when the 3x3 neighbourhood of (row y, column x) is entirely above avg or
             // entirely not above avg. s[y-1] is no longer needed by later iterations.
1443         for(y=1; y<9; y++)
1444         {
1445                 int t = s[y-1] & s[y] & s[y+1];
1446                 t|= t>>16;
1447                 s[y-1]= t;
1448         }
1449
             // Step 4: smooth the marked pixels in place (neighbours that were already
             // filtered are read back in their new form).
1450         for(y=1; y<9; y++)
1451         {
1452                 int x;
1453                 int t = s[y-1];
1454
1455                 p= src + stride*y;
1456                 for(x=1; x<9; x++)
1457                 {
1458                         p++;
1459                         if(t & (1<<x))
1460                         {
                                     // 3x3 kernel 1 2 1 / 2 4 2 / 1 2 1, sum of weights 16
1461                                 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1462                                       +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1463                                       +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1464                                 f= (f + 8)>>4; // divide by 16 with rounding
1465
1466 #ifdef DEBUG_DERING_THRESHOLD
1467                                 asm volatile("emms\n\t":);
1468                                 {
1469                                 static long long numPixels=0;
1470                                 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1471 //                                if((max-min)<20 || (max-min)*QP<200)
1472 //                                if((max-min)*QP < 500)
1473 //                                if(max-min<QP/2)
1474                                 if(max-min < 20)
1475                                 {
1476                                         static int numSkiped=0;
1477                                         static int errorSum=0;
1478                                         static int worstQP=0;
1479                                         static int worstRange=0;
1480                                         static int worstDiff=0;
1481                                         int diff= (f - *p);
1482                                         int absDiff= FFABS(diff);
1483                                         int error= diff*diff;
1484
1485                                         if(x==1 || x==8 || y==1 || y==8) continue;
1486
1487                                         numSkiped++;
1488                                         if(absDiff > worstDiff)
1489                                         {
1490                                                 worstDiff= absDiff;
1491                                                 worstQP= QP; // NOTE(review): no QP is declared in this function; presumably c->QP is meant - confirm
1492                                                 worstRange= max-min;
1493                                         }
1494                                         errorSum+= error;
1495
1496                                         if(1024LL*1024LL*1024LL % numSkiped == 0)
1497                                         {
1498                                                 av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1499                                                         "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1500                                                         (float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1501                                                         worstDiff, (float)numSkiped/numPixels);
1502                                         }
1503                                 }
1504                                 }
1505 #endif
                                     // move *p towards f, but by no more than QP2
1506                                 if     (*p + QP2 < f) *p= *p + QP2;
1507                                 else if(*p - QP2 > f) *p= *p - QP2;
1508                                 else *p=f;
1509                         }
1510                 }
1511         }
1512 #ifdef DEBUG_DERING_THRESHOLD
             // Debug aid: brighten rows/columns 1..8 by 20 (saturating at 255) when max-min < 20.
1513         if(max-min < 20)
1514         {
1515                 for(y=1; y<9; y++)
1516                 {
1517                         int x;
1518                         int t = 0;
1519                         p= src + stride*y;
1520                         for(x=1; x<9; x++)
1521                         {
1522                                 p++;
1523                                 *p = FFMIN(*p + 20, 255);
1524                         }
1525                 }
1526 //                src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1527         }
1528 #endif
1529 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1530 }
1531 #endif //HAVE_ALTIVEC
1532
1533 /**
1534  * Deinterlaces the given block by linearly interpolating every second line.
1535  * Will be called for every 8x8 block and can read & write lines 4-15.
1536  * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1537  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
      * This filter reads lines 4, 6, 8, 10 and 12 and overwrites lines 5, 7, 9 and 11,
      * 8 pixels per line, each with the average of the lines directly above and
      * below it (rounded up in the C fallback).
      *
      * @param src    pixel at line 0, column 0 of the block
      * @param stride distance in bytes between vertically adjacent pixels
1538  */
1539 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1540 {
1541 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1542         src+= 4*stride;
1543         asm volatile(
1544                 "lea (%0, %1), %%"REG_a"                \n\t"
1545                 "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
1546 //      0       1       2       3       4       5       6       7       8       9
1547 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
1548
1549                 "movq (%0), %%mm0                       \n\t"
1550                 "movq (%%"REG_a", %1), %%mm1            \n\t"
1551                 PAVGB(%%mm1, %%mm0)
1552                 "movq %%mm0, (%%"REG_a")                \n\t"
1553                 "movq (%0, %1, 4), %%mm0                \n\t"
1554                 PAVGB(%%mm0, %%mm1)
1555                 "movq %%mm1, (%%"REG_a", %1, 2)         \n\t"
1556                 "movq (%%"REG_c", %1), %%mm1            \n\t"
1557                 PAVGB(%%mm1, %%mm0)
1558                 "movq %%mm0, (%%"REG_c")                \n\t"
1559                 "movq (%0, %1, 8), %%mm0                \n\t"
1560                 PAVGB(%%mm0, %%mm1)
1561                 "movq %%mm1, (%%"REG_c", %1, 2)         \n\t"
1562
1563                 : : "r" (src), "r" ((long)stride)
1564                 : "%"REG_a, "%"REG_c
1565         );
1566 #else
1567         int a, b, x;
1568         src+= 4*stride;
1569
             // NOTE(review): the uint32_t accesses below assume every src row is suitably
             // aligned for 32-bit loads/stores - confirm against the callers.
1570         for(x=0; x<2; x++){ // two groups of 4 pixels = 8 columns
1571                 a= *(uint32_t*)&src[stride*0];
1572                 b= *(uint32_t*)&src[stride*2];
1573                 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); // 4 byte-wise averages at once, rounded up; the mask stops bits crossing byte borders
1574                 a= *(uint32_t*)&src[stride*4];
1575                 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1576                 b= *(uint32_t*)&src[stride*6];
1577                 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1578                 a= *(uint32_t*)&src[stride*8];
1579                 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1580                 src += 4;
1581         }
1582 #endif
1583 }
1584
1585 /**
1586  * Deinterlaces the given block by cubic interpolating every second line.
1587  * Will be called for every 8x8 block and can read & write lines 4-15.
1588  * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1589  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
1590  * This filter reads every second line from 3 to 15 and writes lines 6, 8, 10 and 12.
      * With a, b, d, e being the lines 3 above, 1 above, 1 below and 3 below the
      * written line, the C fallback stores CLIP((-a + 9*b + 9*d - e) >> 4) for each
      * of the 8 pixels of a line.
      *
      * @param src    pixel at line 0, column 0 of the block
      * @param stride distance in bytes between vertically adjacent pixels
1591  */
1592 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1593 {
1594 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1595         src+= stride*3;
1596         asm volatile(
1597                 "lea (%0, %1), %%"REG_a"                \n\t"
1598                 "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1599                 "lea (%%"REG_d", %1, 4), %%"REG_c"      \n\t"
1600                 "add %1, %%"REG_c"                      \n\t"
1601                 "pxor %%mm7, %%mm7                      \n\t"
1602 //      0       1       2       3       4       5       6       7       8       9       10
1603 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1604
1605 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1606                 "movq " #a ", %%mm0                     \n\t"\
1607                 "movq " #b ", %%mm1                     \n\t"\
1608                 "movq " #d ", %%mm2                     \n\t"\
1609                 "movq " #e ", %%mm3                     \n\t"\
1610                 PAVGB(%%mm2, %%mm1)                             /* (b+d) /2 */\
1611                 PAVGB(%%mm3, %%mm0)                             /* (a+e) /2 */\
1612                 "movq %%mm0, %%mm2                      \n\t"\
1613                 "punpcklbw %%mm7, %%mm0                 \n\t"\
1614                 "punpckhbw %%mm7, %%mm2                 \n\t"\
1615                 "movq %%mm1, %%mm3                      \n\t"\
1616                 "punpcklbw %%mm7, %%mm1                 \n\t"\
1617                 "punpckhbw %%mm7, %%mm3                 \n\t"\
1618                 "psubw %%mm1, %%mm0                     \n\t"   /* L(a+e - (b+d))/2 */\
1619                 "psubw %%mm3, %%mm2                     \n\t"   /* H(a+e - (b+d))/2 */\
1620                 "psraw $3, %%mm0                        \n\t"   /* L(a+e - (b+d))/16 */\
1621                 "psraw $3, %%mm2                        \n\t"   /* H(a+e - (b+d))/16 */\
1622                 "psubw %%mm0, %%mm1                     \n\t"   /* L(9b + 9d - a - e)/16 */\
1623                 "psubw %%mm2, %%mm3                     \n\t"   /* H(9b + 9d - a - e)/16 */\
1624                 "packuswb %%mm3, %%mm1                  \n\t"\
1625                 "movq %%mm1, " #c "                     \n\t"
1626 #define DEINT_CUBIC(a,b,c,d,e)  REAL_DEINT_CUBIC(a,b,c,d,e)
1627
     // Each DEINT_CUBIC(a,b,c,d,e) reads lines a, b, d, e and stores the result to line c.
1628 DEINT_CUBIC((%0)        , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
1629 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%0, %1, 8))
1630 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
1631 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc)    , (%%REGc, %1, 2))
1632
1633                 : : "r" (src), "r" ((long)stride)
1634                 : "%"REG_a, "%"REG_d, "%"REG_c
1635         );
1636 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1637         int x;
1638         src+= stride*3;
             // After the offset, lines 0, 2, ..., 12 (relative to src) are only read and
             // lines 3, 5, 7, 9 are only written, so the in-place update never reads a
             // line it has already replaced.
1639         for(x=0; x<8; x++)
1640         {
1641                 src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1642                 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1643                 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1644                 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1645                 src++;
1646         }
1647 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1648 }
1649
1650 /**
1651  * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1652  * Will be called for every 8x8 block and can read & write lines 4-15.
1653  * Lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1654  * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
1655  * This filter reads lines 4-13 and writes lines 5, 7, 9 and 11.
      * For a written line n the taps are the unfiltered lines n-2, n-1, n, n+1, n+2.
      *
      * @param src    pixel at line 0, column 0 of the block
      * @param stride distance in bytes between vertically adjacent pixels
      * @param tmp    8 bytes of state: on entry the n-2 tap for line 5; on return the
      *               unfiltered content of line 11 (presumably handed to the call for
      *               the block below - confirm against the caller)
1656  */
1657 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1658 {
1659 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1660         src+= stride*4;
1661         asm volatile(
1662                 "lea (%0, %1), %%"REG_a"                \n\t"
1663                 "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1664                 "pxor %%mm7, %%mm7                      \n\t"
1665                 "movq (%2), %%mm0                       \n\t"
1666 //      0       1       2       3       4       5       6       7       8       9       10
1667 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1668
1669 #define REAL_DEINT_FF(a,b,c,d)\
1670                 "movq " #a ", %%mm1                     \n\t"\
1671                 "movq " #b ", %%mm2                     \n\t"\
1672                 "movq " #c ", %%mm3                     \n\t"\
1673                 "movq " #d ", %%mm4                     \n\t"\
1674                 PAVGB(%%mm3, %%mm1)                          \
1675                 PAVGB(%%mm4, %%mm0)                          \
1676                 "movq %%mm0, %%mm3                      \n\t"\
1677                 "punpcklbw %%mm7, %%mm0                 \n\t"\
1678                 "punpckhbw %%mm7, %%mm3                 \n\t"\
1679                 "movq %%mm1, %%mm4                      \n\t"\
1680                 "punpcklbw %%mm7, %%mm1                 \n\t"\
1681                 "punpckhbw %%mm7, %%mm4                 \n\t"\
1682                 "psllw $2, %%mm1                        \n\t"\
1683                 "psllw $2, %%mm4                        \n\t"\
1684                 "psubw %%mm0, %%mm1                     \n\t"\
1685                 "psubw %%mm3, %%mm4                     \n\t"\
1686                 "movq %%mm2, %%mm5                      \n\t"\
1687                 "movq %%mm2, %%mm0                      \n\t"\
1688                 "punpcklbw %%mm7, %%mm2                 \n\t"\
1689                 "punpckhbw %%mm7, %%mm5                 \n\t"\
1690                 "paddw %%mm2, %%mm1                     \n\t"\
1691                 "paddw %%mm5, %%mm4                     \n\t"\
1692                 "psraw $2, %%mm1                        \n\t"\
1693                 "psraw $2, %%mm4                        \n\t"\
1694                 "packuswb %%mm4, %%mm1                  \n\t"\
1695                 "movq %%mm1, " #b "                     \n\t"\
1696
1697 #define DEINT_FF(a,b,c,d)  REAL_DEINT_FF(a,b,c,d)
1698
     // Each DEINT_FF(a,b,c,d) filters line b in place from lines a, b, c, d and mm0;
     // mm0 is then loaded with the unfiltered line b for the next invocation.
1699 DEINT_FF((%0)        , (%%REGa)       , (%%REGa, %1), (%%REGa, %1, 2))
1700 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd)       )
1701 DEINT_FF((%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%%REGd, %1, 2))
1702 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
1703
1704                 "movq %%mm0, (%2)                       \n\t"
1705                 : : "r" (src), "r" ((long)stride), "r"(tmp)
1706                 : "%"REG_a, "%"REG_d
1707         );
1708 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1709         int x;
1710         src+= stride*4;
1711         for(x=0; x<8; x++)
1712         {
                     // t1 / t2 alternate as "unfiltered value of the line written two rows
                     // earlier" (the -1 tap) and "unfiltered value of the line being written"
                     // (the centre tap). Each must be saved from the line that is about to be
                     // overwritten, i.e. src[stride*3], [stride*5], [stride*7]; the previous
                     // code saved src[stride*4], [stride*6], [stride*8] instead, which used the
                     // wrong centre tap and disagreed with the MMX path above.
1713                 int t1= tmp[x];
1714                 int t2= src[stride*1];
1715
1716                 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1717                 t1= src[stride*3];
1718                 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1719                 t2= src[stride*5];
1720                 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1721                 t1= src[stride*7];
1722                 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
1723                 tmp[x]= t1; // unfiltered line 7 (relative to the offset src), as the MMX path stores
1724
1725                 src++;
1726         }
1727 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1728 }
1729
1730 /**
1731  * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter.
1732  * will be called for every 8x8 block and can read & write from line 4-15
1733  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1734  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1735  * this filter will read lines 4-13 and write 4-11
 *
 * The five taps are applied vertically to lines n-2 .. n+2 and the sum is divided by 8
 * (see the C fallback). The MMX2/3DNow path builds the same weights out of byte
 * averages and a shift by 2 and has no "+4" rounding term, so the two paths are not
 * guaranteed to produce bit-identical output.
 *
 * @param src    start of the block; both paths advance it by 4*stride before filtering
 * @param stride offset between vertically adjacent samples, in bytes
 * @param tmp    8 bytes; on entry used as the line two above the first filtered line,
 *               on exit holds the unfiltered value of the 7th filtered line (src[stride*6])
 * @param tmp2   8 bytes; on entry used as the line directly above the first filtered line,
 *               on exit holds the unfiltered value of the 8th filtered line (src[stride*7])
 *
 * tmp/tmp2 are presumably carried over to the call for the block below -- confirm against caller.
1736  */
1737 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1738 {
1739 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1740         src+= stride*4;
        // asm operands: %0 = src, %1 = stride, %2 = tmp, %3 = tmp2
        // REG_a = src + stride, REG_d = src + 5*stride
1741         asm volatile(
1742                 "lea (%0, %1), %%"REG_a"                \n\t"
1743                 "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1744                 "pxor %%mm7, %%mm7                      \n\t" // mm7 = 0, used to widen bytes to words
1745                 "movq (%2), %%mm0                       \n\t" // mm0 = tmp  (line -2)
1746                 "movq (%3), %%mm1                       \n\t" // mm1 = tmp2 (line -1)
1747 //      0       1       2       3       4       5       6       7       8       9       10
1748 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
1749
// REAL_DEINT_L5(t1, t2, a, b, c):
//   t1, t2 : MMX registers holding the unfiltered lines two / one above a
//   a      : memory operand of the line to filter (read, then overwritten with the result)
//   b, c   : memory operands of the lines one / two below a
// Per pixel, in 16 bit lanes: ( 3*a + 2*PAVGB(b,t2) - PAVGB(c,t1) ) >> 2,
// packed back to bytes with unsigned saturation (packuswb).
// The unfiltered a is copied into t1 before a is overwritten, so after the macro
// (t2, t1) are the "two above / one above" registers for the next line; this is why
// the invocations below alternate %%mm0 and %%mm1.
// Uses mm2-mm6 as scratch and expects mm7 == 0.
1750 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1751                 "movq " #a ", %%mm2                     \n\t"\
1752                 "movq " #b ", %%mm3                     \n\t"\
1753                 "movq " #c ", %%mm4                     \n\t"\
1754                 PAVGB(t2, %%mm3)                             \
1755                 PAVGB(t1, %%mm4)                             \
1756                 "movq %%mm2, %%mm5                      \n\t"\
1757                 "movq %%mm2, " #t1 "                    \n\t"\
1758                 "punpcklbw %%mm7, %%mm2                 \n\t"\
1759                 "punpckhbw %%mm7, %%mm5                 \n\t"\
1760                 "movq %%mm2, %%mm6                      \n\t"\
1761                 "paddw %%mm2, %%mm2                     \n\t"\
1762                 "paddw %%mm6, %%mm2                     \n\t"\
1763                 "movq %%mm5, %%mm6                      \n\t"\
1764                 "paddw %%mm5, %%mm5                     \n\t"\
1765                 "paddw %%mm6, %%mm5                     \n\t"\
1766                 "movq %%mm3, %%mm6                      \n\t"\
1767                 "punpcklbw %%mm7, %%mm3                 \n\t"\
1768                 "punpckhbw %%mm7, %%mm6                 \n\t"\
1769                 "paddw %%mm3, %%mm3                     \n\t"\
1770                 "paddw %%mm6, %%mm6                     \n\t"\
1771                 "paddw %%mm3, %%mm2                     \n\t"\
1772                 "paddw %%mm6, %%mm5                     \n\t"\
1773                 "movq %%mm4, %%mm6                      \n\t"\
1774                 "punpcklbw %%mm7, %%mm4                 \n\t"\
1775                 "punpckhbw %%mm7, %%mm6                 \n\t"\
1776                 "psubw %%mm4, %%mm2                     \n\t"\
1777                 "psubw %%mm6, %%mm5                     \n\t"\
1778                 "psraw $2, %%mm2                        \n\t"\
1779                 "psraw $2, %%mm5                        \n\t"\
1780                 "packuswb %%mm5, %%mm2                  \n\t"\
1781                 "movq %%mm2, " #a "                     \n\t"\
1782
// extra macro level so that arguments which are themselves macros get expanded before stringification
1783 #define DEINT_L5(t1,t2,a,b,c)  REAL_DEINT_L5(t1,t2,a,b,c)
1784
// one invocation per output line 0..7 (relative to the advanced src); c reaches line 9 in the last one
1785 DEINT_L5(%%mm0, %%mm1, (%0)           , (%%REGa)       , (%%REGa, %1)   )
1786 DEINT_L5(%%mm1, %%mm0, (%%REGa)       , (%%REGa, %1)   , (%%REGa, %1, 2))
1787 DEINT_L5(%%mm0, %%mm1, (%%REGa, %1)   , (%%REGa, %1, 2), (%0, %1, 4)   )
1788 DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4)    , (%%REGd)       )
1789 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)    , (%%REGd)       , (%%REGd, %1)   )
1790 DEINT_L5(%%mm1, %%mm0, (%%REGd)       , (%%REGd, %1)   , (%%REGd, %1, 2))
1791 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1)   , (%%REGd, %1, 2), (%0, %1, 8)   )
1792 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8)    , (%%REGd, %1, 4))
1793
1794                 "movq %%mm0, (%2)                       \n\t" // tmp  = unfiltered line 6
1795                 "movq %%mm1, (%3)                       \n\t" // tmp2 = unfiltered line 7
1796                 : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2)
                // NOTE(review): the asm stores through src, tmp and tmp2 but only the two
                // scratch registers are listed as clobbers (no "memory") -- confirm this is intended.
1797                 : "%"REG_a, "%"REG_d
1798         );
1799 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1800         int x;
1801         src+= stride*4;
        // one column per iteration; t1/t2/t3 rotate through the roles
        // "two above" / "one above" / "current", always holding unfiltered values
1802         for(x=0; x<8; x++)
1803         {
1804                 int t1= tmp[x];
1805                 int t2= tmp2[x];
1806                 int t3= src[0];
1807
                // each line is saved into a temporary before it is overwritten because the
                // following lines still need its unfiltered value as an upper neighbour
1808                 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1809                 t1= src[stride*1];
1810                 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1811                 t2= src[stride*2];
1812                 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1813                 t3= src[stride*3];
1814                 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1815                 t1= src[stride*4];
1816                 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1817                 t2= src[stride*5];
1818                 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1819                 t3= src[stride*6];
1820                 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1821                 t1= src[stride*7];
1822                 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1823
                // hand the unfiltered lines 6 and 7 back to the caller
1824                 tmp[x]= t3;
1825                 tmp2[x]= t1;
1826
1827                 src++;
1828         }
1829 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1830 }
1831
1832 /**
1833  * Deinterlaces the given block by filtering all lines with a (1 2 1) filter.
1834  * will be called for every 8x8 block and can read & write from line 4-15
1835  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1836  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1837  * this filter will read lines 4-13 and write 4-11
 *
 * Every output line is built from two byte averages: first the average of the line
 * above and the line below, then the average of that with the current line. All three
 * inputs are the unfiltered values (they are loaded before the line is overwritten).
 *
 * @param src    start of the block; both paths advance it by 4*stride before filtering
 * @param stride offset between vertically adjacent samples, in bytes
 * @param tmp    8 bytes; on entry used as the line above the first filtered line,
 *               on exit holds the unfiltered value of the 8th filtered line (src[stride*7])
 *
 * tmp is presumably carried over to the call for the block below -- confirm against caller.
1838  */
1839 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1840 {
1841 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1842         src+= 4*stride;
        // asm operands: %0 = src, %1 = stride, %2 = tmp
        // REG_a = src + stride, REG_d = src + 5*stride
        // In the Ln comments below L0 is the line stored in tmp and L1..L9 are src lines 0..8.
        // mm0/mm1/mm2 rotate through the roles above / current / below.
1843         asm volatile(
1844                 "lea (%0, %1), %%"REG_a"                \n\t"
1845                 "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1846 //      0       1       2       3       4       5       6       7       8       9
1847 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1848
1849                 "movq (%2), %%mm0                       \n\t" // L0
1850                 "movq (%%"REG_a"), %%mm1                \n\t" // L2
1851                 PAVGB(%%mm1, %%mm0)                           // L0+L2
1852                 "movq (%0), %%mm2                       \n\t" // L1
1853                 PAVGB(%%mm2, %%mm0)                           // 2L1 + L0 + L2
1854                 "movq %%mm0, (%0)                       \n\t"
1855                 "movq (%%"REG_a", %1), %%mm0            \n\t" // L3
1856                 PAVGB(%%mm0, %%mm2)                           // L1+L3
1857                 PAVGB(%%mm1, %%mm2)                           // 2L2 + L1 + L3
1858                 "movq %%mm2, (%%"REG_a")                \n\t"
1859                 "movq (%%"REG_a", %1, 2), %%mm2         \n\t" // L4
1860                 PAVGB(%%mm2, %%mm1)                           // L2+L4
1861                 PAVGB(%%mm0, %%mm1)                           // 2L3 + L2 + L4
1862                 "movq %%mm1, (%%"REG_a", %1)            \n\t"
1863                 "movq (%0, %1, 4), %%mm1                \n\t" // L5
1864                 PAVGB(%%mm1, %%mm0)                           // L3+L5
1865                 PAVGB(%%mm2, %%mm0)                           // 2L4 + L3 + L5
1866                 "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
1867                 "movq (%%"REG_d"), %%mm0                \n\t" // L6
1868                 PAVGB(%%mm0, %%mm2)                           // L4+L6
1869                 PAVGB(%%mm1, %%mm2)                           // 2L5 + L4 + L6
1870                 "movq %%mm2, (%0, %1, 4)                \n\t"
1871                 "movq (%%"REG_d", %1), %%mm2            \n\t" // L7
1872                 PAVGB(%%mm2, %%mm1)                           // L5+L7
1873                 PAVGB(%%mm0, %%mm1)                           // 2L6 + L5 + L7
1874                 "movq %%mm1, (%%"REG_d")                \n\t"
1875                 "movq (%%"REG_d", %1, 2), %%mm1         \n\t" // L8
1876                 PAVGB(%%mm1, %%mm0)                           // L6+L8
1877                 PAVGB(%%mm2, %%mm0)                           // 2L7 + L6 + L8
1878                 "movq %%mm0, (%%"REG_d", %1)            \n\t"
1879                 "movq (%0, %1, 8), %%mm0                \n\t" // L9
1880                 PAVGB(%%mm0, %%mm2)                           // L7+L9
1881                 PAVGB(%%mm1, %%mm2)                           // 2L8 + L7 + L9
1882                 "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
1883                 "movq %%mm1, (%2)                       \n\t" // tmp = unfiltered L8 (src line 7)
1884
1885                 : : "r" (src), "r" ((long)stride), "r" (tmp)
                // NOTE(review): the asm stores through src and tmp but only the two scratch
                // registers are listed as clobbers (no "memory") -- confirm this is intended.
1886                 : "%"REG_a, "%"REG_d
1887         );
1888 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1889         int a, b, c, x;
1890         src+= 4*stride;
1891
        // Handles 4 horizontally adjacent pixels at once, packed in one 32 bit word;
        // two iterations cover the 8 columns of the block.
        //   (p&q) + (((p^q)&0xFEFEFEFE)>>1)  is the per-byte average rounded down,
        //   (p|q) - (((p^q)&0xFEFEFEFE)>>1)  is the per-byte average rounded up;
        // the mask keeps the shifted bits from leaking into the neighbouring byte.
        // a, b, c rotate through the roles above / current / below.
        // NOTE(review): the uint32_t casts assume that src, tmp and stride permit 4 byte
        // wide accesses through a uint8_t pointer on the target -- confirm.
1892         for(x=0; x<2; x++){
1893                 a= *(uint32_t*)&tmp[stride*0];
1894                 b= *(uint32_t*)&src[stride*0];
1895                 c= *(uint32_t*)&src[stride*1];
1896                 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1897                 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1898
1899                 a= *(uint32_t*)&src[stride*2];
1900                 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1901                 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1902
1903                 b= *(uint32_t*)&src[stride*3];
1904                 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1905                 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1906
1907                 c= *(uint32_t*)&src[stride*4];
1908                 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1909                 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1910
1911                 a= *(uint32_t*)&src[stride*5];
1912                 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1913                 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1914
1915                 b= *(uint32_t*)&src[stride*6];
1916                 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1917                 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1918
1919                 c= *(uint32_t*)&src[stride*7];
1920                 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1921                 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1922
1923                 a= *(uint32_t*)&src[stride*8];
1924                 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1925                 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1926
                // c still holds the unfiltered line 7; hand it back through tmp
1927                 *(uint32_t*)&tmp[stride*0]= c;
1928                 src += 4;
1929                 tmp += 4;
1930         }
1931 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1932 }
1933
1934 /**
1935  * Deinterlaces the given block by applying a median filter to every second line.
1936  * will be called for every 8x8 block and can read & write from line 4-15,
1937  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1938  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * After src has been advanced by 4*stride, lines 1, 3, 5 and 7 are each replaced by
 * the per-pixel median of the line itself and its two vertical neighbours; the even
 * lines 0, 2, 4, 6 and 8 are only read.
 *
 * @param src    start of the block; all paths advance it by 4*stride before filtering
 * @param stride offset between vertically adjacent samples, in bytes
1939  */
1940 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1941 {
1942 #ifdef HAVE_MMX
1943         src+= 4*stride;
        // asm operands (both MMX variants): %0 = src, %1 = stride
        // REG_a = src + stride, REG_d = src + 5*stride
1944 #ifdef HAVE_MMX2
        // median(x, y, z) is computed as min( max(x, y), max( z, min(x, y) ) )
1945         asm volatile(
1946                 "lea (%0, %1), %%"REG_a"                \n\t"
1947                 "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1948 //      0       1       2       3       4       5       6       7       8       9
1949 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1950
1951                 "movq (%0), %%mm0                       \n\t" // line 0
1952                 "movq (%%"REG_a", %1), %%mm2            \n\t" // line 2
1953                 "movq (%%"REG_a"), %%mm1                \n\t" // line 1
1954                 "movq %%mm0, %%mm3                      \n\t"
1955                 "pmaxub %%mm1, %%mm0                    \n\t" // max(l0, l1)
1956                 "pminub %%mm3, %%mm1                    \n\t" // min(l0, l1)
1957                 "pmaxub %%mm2, %%mm1                    \n\t" // max(l2, min(l0, l1))
1958                 "pminub %%mm1, %%mm0                    \n\t" // median(l0, l1, l2)
1959                 "movq %%mm0, (%%"REG_a")                \n\t" // -> line 1
1960
1961                 "movq (%0, %1, 4), %%mm0                \n\t" // line 4
1962                 "movq (%%"REG_a", %1, 2), %%mm1         \n\t" // line 3 (mm2 still holds line 2)
1963                 "movq %%mm2, %%mm3                      \n\t"
1964                 "pmaxub %%mm1, %%mm2                    \n\t" // max(l2, l3)
1965                 "pminub %%mm3, %%mm1                    \n\t" // min(l2, l3)
1966                 "pmaxub %%mm0, %%mm1                    \n\t" // max(l4, min(l2, l3))
1967                 "pminub %%mm1, %%mm2                    \n\t" // median(l2, l3, l4)
1968                 "movq %%mm2, (%%"REG_a", %1, 2)         \n\t" // -> line 3
1969
1970                 "movq (%%"REG_d"), %%mm2                \n\t" // line 5 (mm0 still holds line 4)
1971                 "movq (%%"REG_d", %1), %%mm1            \n\t" // line 6
1972                 "movq %%mm2, %%mm3                      \n\t"
1973                 "pmaxub %%mm0, %%mm2                    \n\t" // max(l4, l5)
1974                 "pminub %%mm3, %%mm0                    \n\t" // min(l4, l5)
1975                 "pmaxub %%mm1, %%mm0                    \n\t" // max(l6, min(l4, l5))
1976                 "pminub %%mm0, %%mm2                    \n\t" // median(l4, l5, l6)
1977                 "movq %%mm2, (%%"REG_d")                \n\t" // -> line 5
1978
1979                 "movq (%%"REG_d", %1, 2), %%mm2         \n\t" // line 7 (mm1 still holds line 6)
1980                 "movq (%0, %1, 8), %%mm0                \n\t" // line 8
1981                 "movq %%mm2, %%mm3                      \n\t"
1982                 "pmaxub %%mm0, %%mm2                    \n\t" // max(l7, l8)
1983                 "pminub %%mm3, %%mm0                    \n\t" // min(l7, l8)
1984                 "pmaxub %%mm1, %%mm0                    \n\t" // max(l6, min(l7, l8))
1985                 "pminub %%mm0, %%mm2                    \n\t" // median(l6, l7, l8)
1986                 "movq %%mm2, (%%"REG_d", %1, 2)         \n\t" // -> line 7
1987
1988
1989                 : : "r" (src), "r" ((long)stride)
                // NOTE(review): the asm stores through src but no "memory" clobber is listed -- confirm.
1990                 : "%"REG_a, "%"REG_d
1991         );
1992
1993 #else // MMX without MMX2
1994         asm volatile(
1995                 "lea (%0, %1), %%"REG_a"                \n\t"
1996                 "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
1997 //      0       1       2       3       4       5       6       7       8       9
1998 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
1999                 "pxor %%mm7, %%mm7                      \n\t" // mm7 = 0 for the pcmpeqb tests
2000
// REAL_MEDIAN(a, b, c): a and c are the neighbouring lines, b is the line that is
// overwritten with the per-byte median of the three.
// psubusb + pcmpeqb against zero turn the three pairwise comparisons into byte masks
// (0xFF where the saturated difference is 0). The masks are XORed pairwise and ORed
// onto the inputs so that an input which is not the median is forced to 0xFF, then the
// three values are ANDed -- the same scheme as the (a|(d^f)) & (b|(d^e)) & (c|(e^f))
// expression in the C fallback. Uses mm0-mm6 as scratch and expects mm7 == 0.
2001 #define REAL_MEDIAN(a,b,c)\
2002                 "movq " #a ", %%mm0                     \n\t"\
2003                 "movq " #b ", %%mm2                     \n\t"\
2004                 "movq " #c ", %%mm1                     \n\t"\
2005                 "movq %%mm0, %%mm3                      \n\t"\
2006                 "movq %%mm1, %%mm4                      \n\t"\
2007                 "movq %%mm2, %%mm5                      \n\t"\
2008                 "psubusb %%mm1, %%mm3                   \n\t"\
2009                 "psubusb %%mm2, %%mm4                   \n\t"\
2010                 "psubusb %%mm0, %%mm5                   \n\t"\
2011                 "pcmpeqb %%mm7, %%mm3                   \n\t"\
2012                 "pcmpeqb %%mm7, %%mm4                   \n\t"\
2013                 "pcmpeqb %%mm7, %%mm5                   \n\t"\
2014                 "movq %%mm3, %%mm6                      \n\t"\
2015                 "pxor %%mm4, %%mm3                      \n\t"\
2016                 "pxor %%mm5, %%mm4                      \n\t"\
2017                 "pxor %%mm6, %%mm5                      \n\t"\
2018                 "por %%mm3, %%mm1                       \n\t"\
2019                 "por %%mm4, %%mm2                       \n\t"\
2020                 "por %%mm5, %%mm0                       \n\t"\
2021                 "pand %%mm2, %%mm0                      \n\t"\
2022                 "pand %%mm1, %%mm0                      \n\t"\
2023                 "movq %%mm0, " #b "                     \n\t"
// extra macro level so that arguments which are themselves macros get expanded before stringification
2024 #define MEDIAN(a,b,c)  REAL_MEDIAN(a,b,c)
2025
// lines (0,1,2), (2,3,4), (4,5,6), (6,7,8); the middle line of each triple is written
2026 MEDIAN((%0)        , (%%REGa)       , (%%REGa, %1))
2027 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
2028 MEDIAN((%0, %1, 4) , (%%REGd)       , (%%REGd, %1))
2029 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
2030
2031                 : : "r" (src), "r" ((long)stride)
                // NOTE(review): the asm stores through src but no "memory" clobber is listed -- confirm.
2032                 : "%"REG_a, "%"REG_d
2033         );
2034 #endif //HAVE_MMX2
2035 #else //HAVE_MMX
2036         int x, y;
2037         src+= 4*stride;
2038         // FIXME - there should be a way to do a few columns in parallel like w/mmx
        // outer loop: the 8 columns; inner loop: the 4 odd lines of one column
2039         for(x=0; x<8; x++)
2040         {
2041                 uint8_t *colsrc = src;
2042                 for (y=0; y<4; y++)
2043                 {
2044                         int a, b, c, d, e, f;
2045                         a = colsrc[0       ];
2046                         b = colsrc[stride  ];
2047                         c = colsrc[stride*2];
                        // d, e, f are meant to be all-ones masks for a<b, b<c and c<a (0 otherwise);
                        // NOTE(review): this relies on int being 32 bit and on >> of a negative int
                        // being an arithmetic shift -- confirm for the supported compilers.
2048                         d = (a-b)>>31;
2049                         e = (b-c)>>31;
2050                         f = (c-a)>>31;
                        // a value that is not the median gets ORed with all-ones, so the AND
                        // of the three terms leaves the median
2051                         colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
2052                         colsrc += stride*2;
2053                 }
2054                 src++;
2055         }
2056 #endif //HAVE_MMX
2057 }
2058
2059 #ifdef HAVE_MMX
2060 /**
2061  * transposes and shifts the given 8x8 Block into dst1 and dst2
 *
 * Reads 8 lines of 8 bytes from src (srcStride bytes apart). Source column c is
 * written out as 8 consecutive bytes (one per source line):
 *   - at dst1 + 128 + 16*c for c = 0..4
 *   - at dst2 +       16*c for c = 3..7
 * so columns 3 and 4 are stored in both destinations.
 * The 16 byte step between columns presumably matches the line pitch of the
 * caller's temporary buffers -- confirm against caller.
 *
 * @param dst1      destination for columns 0-4 (byte offsets 128..199 are written)
 * @param dst2      destination for columns 3-7 (byte offsets 48..119 are written)
 * @param src       top left byte of the 8x8 source block
 * @param srcStride offset between vertically adjacent source samples, in bytes
2062  */
2063 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2064 {
        // asm operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2
        // Only REG_a is used here (src + srcStride, later src + 5*srcStride); the edx
        // columns of the addressing table below do not apply to this function.
2065         asm(
2066                 "lea (%0, %1), %%"REG_a"                \n\t"
2067 //      0       1       2       3       4       5       6       7       8       9
2068 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
                // first half: source lines 0-3 -> bytes 0-3 of each transposed column
2069                 "movq (%0), %%mm0                       \n\t" // 12345678
2070                 "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
2071                 "movq %%mm0, %%mm2                      \n\t" // 12345678
2072                 "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2073                 "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2074
2075                 "movq (%%"REG_a", %1), %%mm1            \n\t"
2076                 "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
2077                 "movq %%mm1, %%mm4                      \n\t"
2078                 "punpcklbw %%mm3, %%mm1                 \n\t"
2079                 "punpckhbw %%mm3, %%mm4                 \n\t"
2080
                // combine the byte pairs of lines 0/1 with those of lines 2/3:
                // mm0 = columns 0,1  mm3 = columns 2,3  mm2 = columns 4,5  mm1 = columns 6,7
2081                 "movq %%mm0, %%mm3                      \n\t"
2082                 "punpcklwd %%mm1, %%mm0                 \n\t"
2083                 "punpckhwd %%mm1, %%mm3                 \n\t"
2084                 "movq %%mm2, %%mm1                      \n\t"
2085                 "punpcklwd %%mm4, %%mm2                 \n\t"
2086                 "punpckhwd %%mm4, %%mm1                 \n\t"
2087
2088                 "movd %%mm0, 128(%2)                    \n\t" // column 0 -> dst1
2089                 "psrlq $32, %%mm0                       \n\t"
2090                 "movd %%mm0, 144(%2)                    \n\t" // column 1 -> dst1
2091                 "movd %%mm3, 160(%2)                    \n\t" // column 2 -> dst1
2092                 "psrlq $32, %%mm3                       \n\t"
2093                 "movd %%mm3, 176(%2)                    \n\t" // column 3 -> dst1
2094                 "movd %%mm3, 48(%3)                     \n\t" // column 3 -> dst2
2095                 "movd %%mm2, 192(%2)                    \n\t" // column 4 -> dst1
2096                 "movd %%mm2, 64(%3)                     \n\t" // column 4 -> dst2
2097                 "psrlq $32, %%mm2                       \n\t"
2098                 "movd %%mm2, 80(%3)                     \n\t" // column 5 -> dst2
2099                 "movd %%mm1, 96(%3)                     \n\t" // column 6 -> dst2
2100                 "psrlq $32, %%mm1                       \n\t"
2101                 "movd %%mm1, 112(%3)                    \n\t" // column 7 -> dst2
2102
2103                 "lea (%%"REG_a", %1, 4), %%"REG_a"      \n\t" // REG_a = src + 5*srcStride
2104
                // second half: source lines 4-7 -> bytes 4-7 of each transposed column
                // (same sequence as above, all destination offsets +4)
2105                 "movq (%0, %1, 4), %%mm0                \n\t" // 12345678
2106                 "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
2107                 "movq %%mm0, %%mm2                      \n\t" // 12345678
2108                 "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2109                 "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2110
2111                 "movq (%%"REG_a", %1), %%mm1            \n\t"
2112                 "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
2113                 "movq %%mm1, %%mm4                      \n\t"
2114                 "punpcklbw %%mm3, %%mm1                 \n\t"
2115                 "punpckhbw %%mm3, %%mm4                 \n\t"
2116
2117                 "movq %%mm0, %%mm3                      \n\t"
2118                 "punpcklwd %%mm1, %%mm0                 \n\t"
2119                 "punpckhwd %%mm1, %%mm3                 \n\t"
2120                 "movq %%mm2, %%mm1                      \n\t"
2121                 "punpcklwd %%mm4, %%mm2                 \n\t"
2122                 "punpckhwd %%mm4, %%mm1                 \n\t"
2123
2124                 "movd %%mm0, 132(%2)                    \n\t"
2125                 "psrlq $32, %%mm0                       \n\t"
2126                 "movd %%mm0, 148(%2)                    \n\t"
2127                 "movd %%mm3, 164(%2)                    \n\t"
2128                 "psrlq $32, %%mm3                       \n\t"
2129                 "movd %%mm3, 180(%2)                    \n\t"
2130                 "movd %%mm3, 52(%3)                     \n\t"
2131                 "movd %%mm2, 196(%2)                    \n\t"
2132                 "movd %%mm2, 68(%3)                     \n\t"
2133                 "psrlq $32, %%mm2                       \n\t"
2134                 "movd %%mm2, 84(%3)                     \n\t"
2135                 "movd %%mm1, 100(%3)                    \n\t"
2136                 "psrlq $32, %%mm1                       \n\t"
2137                 "movd %%mm1, 116(%3)                    \n\t"
2138
2139
2140         :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2)
        // NOTE(review): the asm stores through dst1 and dst2 but no "memory" clobber is listed -- confirm.
2141         : "%"REG_a
2142         );
2143 }
2144
2145 /**
2146  * transposes the given 8x8 block
 *
 * Reads 8 lines of 8 bytes from src at a fixed pitch of 16 bytes (offsets 0, 16, ..., 112)
 * and writes 8 lines of 8 bytes to dst, dstStride bytes apart, such that byte j of
 * source line i becomes byte i of destination line j.
 *
 * @param dst       top left byte of the 8x8 destination block
 * @param dstStride offset between vertically adjacent destination samples, in bytes
 * @param src       source block, lines stored 16 bytes apart
2147  */
2148 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2149 {
        // asm operands: %0 = dst, %1 = dstStride, %2 = src
        // REG_a = dst + dstStride, REG_d = dst + 5*dstStride
2150         asm(
2151                 "lea (%0, %1), %%"REG_a"                \n\t"
2152                 "lea (%%"REG_a",%1,4), %%"REG_d"        \n\t"
2153 //      0       1       2       3       4       5       6       7       8       9
2154 //      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
                // first half: source lines 0-3 -> bytes 0-3 of destination lines 0-7
2155                 "movq (%2), %%mm0                       \n\t" // 12345678
2156                 "movq 16(%2), %%mm1                     \n\t" // abcdefgh
2157                 "movq %%mm0, %%mm2                      \n\t" // 12345678
2158                 "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2159                 "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2160
2161                 "movq 32(%2), %%mm1                     \n\t"
2162                 "movq 48(%2), %%mm3                     \n\t"
2163                 "movq %%mm1, %%mm4                      \n\t"
2164                 "punpcklbw %%mm3, %%mm1                 \n\t"
2165                 "punpckhbw %%mm3, %%mm4                 \n\t"
2166
                // combine the byte pairs of lines 0/1 with those of lines 2/3:
                // mm0 = columns 0,1  mm3 = columns 2,3  mm2 = columns 4,5  mm1 = columns 6,7
2167                 "movq %%mm0, %%mm3                      \n\t"
2168                 "punpcklwd %%mm1, %%mm0                 \n\t"
2169                 "punpckhwd %%mm1, %%mm3                 \n\t"
2170                 "movq %%mm2, %%mm1                      \n\t"
2171                 "punpcklwd %%mm4, %%mm2                 \n\t"
2172                 "punpckhwd %%mm4, %%mm1                 \n\t"
2173
2174                 "movd %%mm0, (%0)                       \n\t" // column 0 -> dst line 0
2175                 "psrlq $32, %%mm0                       \n\t"
2176                 "movd %%mm0, (%%"REG_a")                \n\t" // column 1 -> dst line 1
2177                 "movd %%mm3, (%%"REG_a", %1)            \n\t" // column 2 -> dst line 2
2178                 "psrlq $32, %%mm3                       \n\t"
2179                 "movd %%mm3, (%%"REG_a", %1, 2)         \n\t" // column 3 -> dst line 3
2180                 "movd %%mm2, (%0, %1, 4)                \n\t" // column 4 -> dst line 4
2181                 "psrlq $32, %%mm2                       \n\t"
2182                 "movd %%mm2, (%%"REG_d")                \n\t" // column 5 -> dst line 5
2183                 "movd %%mm1, (%%"REG_d", %1)            \n\t" // column 6 -> dst line 6
2184                 "psrlq $32, %%mm1                       \n\t"
2185                 "movd %%mm1, (%%"REG_d", %1, 2)         \n\t" // column 7 -> dst line 7
2186
2187
                // second half: source lines 4-7 -> bytes 4-7 of destination lines 0-7
                // (same sequence as above, source offsets +64, destination offsets +4)
2188                 "movq 64(%2), %%mm0                     \n\t" // 12345678
2189                 "movq 80(%2), %%mm1                     \n\t" // abcdefgh
2190                 "movq %%mm0, %%mm2                      \n\t" // 12345678
2191                 "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
2192                 "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
2193
2194                 "movq 96(%2), %%mm1                     \n\t"
2195                 "movq 112(%2), %%mm3                    \n\t"
2196                 "movq %%mm1, %%mm4                      \n\t"
2197                 "punpcklbw %%mm3, %%mm1                 \n\t"
2198                 "punpckhbw %%mm3, %%mm4                 \n\t"
2199
2200                 "movq %%mm0, %%mm3                      \n\t"
2201                 "punpcklwd %%mm1, %%mm0                 \n\t"
2202                 "punpckhwd %%mm1, %%mm3                 \n\t"
2203                 "movq %%mm2, %%mm1                      \n\t"
2204                 "punpcklwd %%mm4, %%mm2                 \n\t"
2205                 "punpckhwd %%mm4, %%mm1                 \n\t"
2206
2207                 "movd %%mm0, 4(%0)                      \n\t"
2208                 "psrlq $32, %%mm0                       \n\t"
2209                 "movd %%mm0, 4(%%"REG_a")               \n\t"
2210                 "movd %%mm3, 4(%%"REG_a", %1)           \n\t"
2211                 "psrlq $32, %%mm3                       \n\t"
2212                 "movd %%mm3, 4(%%"REG_a", %1, 2)        \n\t"
2213                 "movd %%mm2, 4(%0, %1, 4)               \n\t"
2214                 "psrlq $32, %%mm2                       \n\t"
2215                 "movd %%mm2, 4(%%"REG_d")               \n\t"
2216                 "movd %%mm1, 4(%%"REG_d", %1)           \n\t"
2217                 "psrlq $32, %%mm1                       \n\t"
2218                 "movd %%mm1, 4(%%"REG_d", %1, 2)        \n\t"
2219
2220         :: "r" (dst), "r" ((long)dstStride), "r" (src)
        // NOTE(review): the asm stores through dst but no "memory" clobber is listed -- confirm.
2221         : "%"REG_a, "%"REG_d
2222         );
2223 }
2224 #endif //HAVE_MMX
2225 //static long test=0;
2226
2227 #ifndef HAVE_ALTIVEC
2228 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2229                                     uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2230 {
2231         // to save a register (FIXME do this outside of the loops)
2232         tempBluredPast[127]= maxNoise[0];
2233         tempBluredPast[128]= maxNoise[1];
2234         tempBluredPast[129]= maxNoise[2];
2235
2236 #define FAST_L2_DIFF
2237 //#define L1_DIFF //u should change the thresholds too if u try that one
2238 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2239         asm volatile(
2240                 "lea (%2, %2, 2), %%"REG_a"             \n\t" // 3*stride
2241                 "lea (%2, %2, 4), %%"REG_d"             \n\t" // 5*stride
2242                 "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2243 //      0       1       2       3       4       5       6       7       8       9
2244 //      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+edx  %x+2eax %x+ecx  %x+8%2
2245 //FIXME reorder?
2246 #ifdef L1_DIFF //needs mmx2
2247                 "movq (%0), %%mm0                       \n\t" // L0
2248                 "psadbw (%1), %%mm0                     \n\t" // |L0-R0|
2249                 "movq (%0, %2), %%mm1                   \n\t" // L1
2250                 "psadbw (%1, %2), %%mm1                 \n\t" // |L1-R1|
2251                 "movq (%0, %2, 2), %%mm2                \n\t" // L2
2252                 "psadbw (%1, %2, 2), %%mm2              \n\t" // |L2-R2|
2253                 "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2254                 "psadbw (%1, %%"REG_a"), %%mm3          \n\t" // |L3-R3|
2255
2256                 "movq (%0, %2, 4), %%mm4                \n\t" // L4
2257                 "paddw %%mm1, %%mm0                     \n\t"
2258                 "psadbw (%1, %2, 4), %%mm4              \n\t" // |L4-R4|
2259                 "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2260                 "paddw %%mm2, %%mm0                     \n\t"
2261                 "psadbw (%1, %%"REG_d"), %%mm5          \n\t" // |L5-R5|
2262                 "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2263                 "paddw %%mm3, %%mm0                     \n\t"
2264                 "psadbw (%1, %%"REG_a", 2), %%mm6       \n\t" // |L6-R6|
2265                 "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2266                 "paddw %%mm4, %%mm0                     \n\t"
2267                 "psadbw (%1, %%"REG_c"), %%mm7          \n\t" // |L7-R7|
2268                 "paddw %%mm5, %%mm6                     \n\t"
2269                 "paddw %%mm7, %%mm6                     \n\t"
2270                 "paddw %%mm6, %%mm0                     \n\t"
2271 #else //L1_DIFF
2272 #if defined (FAST_L2_DIFF)
2273                 "pcmpeqb %%mm7, %%mm7                   \n\t"
2274                 "movq "MANGLE(b80)", %%mm6              \n\t"
2275                 "pxor %%mm0, %%mm0                      \n\t"
2276 #define REAL_L2_DIFF_CORE(a, b)\
2277                 "movq " #a ", %%mm5                     \n\t"\
2278                 "movq " #b ", %%mm2                     \n\t"\
2279                 "pxor %%mm7, %%mm2                      \n\t"\
2280                 PAVGB(%%mm2, %%mm5)\
2281                 "paddb %%mm6, %%mm5                     \n\t"\
2282                 "movq %%mm5, %%mm2                      \n\t"\
2283                 "psllw $8, %%mm5                        \n\t"\
2284                 "pmaddwd %%mm5, %%mm5                   \n\t"\
2285                 "pmaddwd %%mm2, %%mm2                   \n\t"\
2286                 "paddd %%mm2, %%mm5                     \n\t"\
2287                 "psrld $14, %%mm5                       \n\t"\
2288                 "paddd %%mm5, %%mm0                     \n\t"
2289
2290 #else //defined (FAST_L2_DIFF)
2291                 "pxor %%mm7, %%mm7                      \n\t"
2292                 "pxor %%mm0, %%mm0                      \n\t"
2293 #define REAL_L2_DIFF_CORE(a, b)\
2294                 "movq " #a ", %%mm5                     \n\t"\
2295                 "movq " #b ", %%mm2                     \n\t"\
2296                 "movq %%mm5, %%mm1                      \n\t"\
2297                 "movq %%mm2, %%mm3                      \n\t"\
2298                 "punpcklbw %%mm7, %%mm5                 \n\t"\
2299                 "punpckhbw %%mm7, %%mm1                 \n\t"\
2300                 "punpcklbw %%mm7, %%mm2                 \n\t"\
2301                 "punpckhbw %%mm7, %%mm3                 \n\t"\
2302                 "psubw %%mm2, %%mm5                     \n\t"\
2303                 "psubw %%mm3, %%mm1                     \n\t"\
2304                 "pmaddwd %%mm5, %%mm5                   \n\t"\
2305                 "pmaddwd %%mm1, %%mm1                   \n\t"\
2306                 "paddd %%mm1, %%mm5                     \n\t"\
2307                 "paddd %%mm5, %%mm0                     \n\t"
2308
2309 #endif //defined (FAST_L2_DIFF)
2310
2311 #define L2_DIFF_CORE(a, b)  REAL_L2_DIFF_CORE(a, b)
2312
2313 L2_DIFF_CORE((%0)          , (%1))
2314 L2_DIFF_CORE((%0, %2)      , (%1, %2))
2315 L2_DIFF_CORE((%0, %2, 2)   , (%1, %2, 2))
2316 L2_DIFF_CORE((%0, %%REGa)  , (%1, %%REGa))
2317 L2_DIFF_CORE((%0, %2, 4)   , (%1, %2, 4))
2318 L2_DIFF_CORE((%0, %%REGd)  , (%1, %%REGd))
2319 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
2320 L2_DIFF_CORE((%0, %%REGc)  , (%1, %%REGc))
2321
2322 #endif //L1_DIFF
2323
2324                 "movq %%mm0, %%mm4                      \n\t"
2325                 "psrlq $32, %%mm0                       \n\t"
2326                 "paddd %%mm0, %%mm4                     \n\t"
2327                 "movd %%mm4, %%ecx                      \n\t"
2328                 "shll $2, %%ecx                         \n\t"
2329                 "mov %3, %%"REG_d"                      \n\t"
2330                 "addl -4(%%"REG_d"), %%ecx              \n\t"
2331                 "addl 4(%%"REG_d"), %%ecx               \n\t"
2332                 "addl -1024(%%"REG_d"), %%ecx           \n\t"
2333                 "addl $4, %%ecx                         \n\t"
2334                 "addl 1024(%%"REG_d"), %%ecx            \n\t"
2335                 "shrl $3, %%ecx                         \n\t"
2336                 "movl %%ecx, (%%"REG_d")                \n\t"
2337
2338 //                "mov %3, %%"REG_c"                      \n\t"
2339 //                "mov %%"REG_c", test                    \n\t"
2340 //                "jmp 4f                                 \n\t"
2341                 "cmpl 512(%%"REG_d"), %%ecx             \n\t"
2342                 " jb 2f                                 \n\t"
2343                 "cmpl 516(%%"REG_d"), %%ecx             \n\t"
2344                 " jb 1f                                 \n\t"
2345
2346                 "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2347                 "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2348                 "movq (%0), %%mm0                       \n\t" // L0
2349                 "movq (%0, %2), %%mm1                   \n\t" // L1
2350                 "movq (%0, %2, 2), %%mm2                \n\t" // L2
2351                 "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2352                 "movq (%0, %2, 4), %%mm4                \n\t" // L4
2353                 "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2354                 "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2355                 "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2356                 "movq %%mm0, (%1)                       \n\t" // L0
2357                 "movq %%mm1, (%1, %2)                   \n\t" // L1
2358                 "movq %%mm2, (%1, %2, 2)                \n\t" // L2
2359                 "movq %%mm3, (%1, %%"REG_a")            \n\t" // L3
2360                 "movq %%mm4, (%1, %2, 4)                \n\t" // L4
2361                 "movq %%mm5, (%1, %%"REG_d")            \n\t" // L5
2362                 "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // L6
2363                 "movq %%mm7, (%1, %%"REG_c")            \n\t" // L7
2364                 "jmp 4f                                 \n\t"
2365
2366                 "1:                                     \n\t"
2367                 "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2368                 "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2369                 "movq (%0), %%mm0                       \n\t" // L0
2370                 PAVGB((%1), %%mm0)                            // L0
2371                 "movq (%0, %2), %%mm1                   \n\t" // L1
2372                 PAVGB((%1, %2), %%mm1)                        // L1
2373                 "movq (%0, %2, 2), %%mm2                \n\t" // L2
2374                 PAVGB((%1, %2, 2), %%mm2)                     // L2
2375                 "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2376                 PAVGB((%1, %%REGa), %%mm3)                    // L3
2377                 "movq (%0, %2, 4), %%mm4                \n\t" // L4
2378                 PAVGB((%1, %2, 4), %%mm4)                     // L4
2379                 "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
2380                 PAVGB((%1, %%REGd), %%mm5)                    // L5
2381                 "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
2382                 PAVGB((%1, %%REGa, 2), %%mm6)                 // L6
2383                 "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
2384                 PAVGB((%1, %%REGc), %%mm7)                    // L7
2385                 "movq %%mm0, (%1)                       \n\t" // R0
2386                 "movq %%mm1, (%1, %2)                   \n\t" // R1
2387                 "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2388                 "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2389                 "movq %%mm4, (%1, %2, 4)                \n\t" // R4
2390                 "movq %%mm5, (%1, %%"REG_d")            \n\t" // R5
2391                 "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // R6
2392                 "movq %%mm7, (%1, %%"REG_c")            \n\t" // R7
2393                 "movq %%mm0, (%0)                       \n\t" // L0
2394                 "movq %%mm1, (%0, %2)                   \n\t" // L1
2395                 "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2396                 "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2397                 "movq %%mm4, (%0, %2, 4)                \n\t" // L4
2398                 "movq %%mm5, (%0, %%"REG_d")            \n\t" // L5
2399                 "movq %%mm6, (%0, %%"REG_a", 2)         \n\t" // L6
2400                 "movq %%mm7, (%0, %%"REG_c")            \n\t" // L7
2401                 "jmp 4f                                 \n\t"
2402
2403                 "2:                                     \n\t"
2404                 "cmpl 508(%%"REG_d"), %%ecx             \n\t"
2405                 " jb 3f                                 \n\t"
2406
2407                 "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2408                 "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2409                 "movq (%0), %%mm0                       \n\t" // L0
2410                 "movq (%0, %2), %%mm1                   \n\t" // L1
2411                 "movq (%0, %2, 2), %%mm2                \n\t" // L2
2412                 "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2413                 "movq (%1), %%mm4                       \n\t" // R0
2414                 "movq (%1, %2), %%mm5                   \n\t" // R1
2415                 "movq (%1, %2, 2), %%mm6                \n\t" // R2
2416                 "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
2417                 PAVGB(%%mm4, %%mm0)
2418                 PAVGB(%%mm5, %%mm1)
2419                 PAVGB(%%mm6, %%mm2)
2420                 PAVGB(%%mm7, %%mm3)
2421                 PAVGB(%%mm4, %%mm0)
2422                 PAVGB(%%mm5, %%mm1)
2423                 PAVGB(%%mm6, %%mm2)
2424                 PAVGB(%%mm7, %%mm3)
2425                 "movq %%mm0, (%1)                       \n\t" // R0
2426                 "movq %%mm1, (%1, %2)                   \n\t" // R1
2427                 "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2428                 "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2429                 "movq %%mm0, (%0)                       \n\t" // L0
2430                 "movq %%mm1, (%0, %2)                   \n\t" // L1
2431                 "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2432                 "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2433
2434                 "movq (%0, %2, 4), %%mm0                \n\t" // L4
2435                 "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
2436                 "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
2437                 "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
2438                 "movq (%1, %2, 4), %%mm4                \n\t" // R4
2439                 "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
2440                 "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
2441                 "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
2442                 PAVGB(%%mm4, %%mm0)
2443                 PAVGB(%%mm5, %%mm1)
2444                 PAVGB(%%mm6, %%mm2)
2445                 PAVGB(%%mm7, %%mm3)
2446                 PAVGB(%%mm4, %%mm0)
2447                 PAVGB(%%mm5, %%mm1)
2448                 PAVGB(%%mm6, %%mm2)
2449                 PAVGB(%%mm7, %%mm3)
2450                 "movq %%mm0, (%1, %2, 4)                \n\t" // R4
2451                 "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
2452                 "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
2453                 "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
2454                 "movq %%mm0, (%0, %2, 4)                \n\t" // L4
2455                 "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
2456                 "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
2457                 "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
2458                 "jmp 4f                                 \n\t"
2459
2460                 "3:                                     \n\t"
2461                 "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
2462                 "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
2463                 "movq (%0), %%mm0                       \n\t" // L0
2464                 "movq (%0, %2), %%mm1                   \n\t" // L1
2465                 "movq (%0, %2, 2), %%mm2                \n\t" // L2
2466                 "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
2467                 "movq (%1), %%mm4                       \n\t" // R0
2468                 "movq (%1, %2), %%mm5                   \n\t" // R1
2469                 "movq (%1, %2, 2), %%mm6                \n\t" // R2
2470                 "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
2471                 PAVGB(%%mm4, %%mm0)
2472                 PAVGB(%%mm5, %%mm1)
2473                 PAVGB(%%mm6, %%mm2)
2474                 PAVGB(%%mm7, %%mm3)
2475                 PAVGB(%%mm4, %%mm0)
2476                 PAVGB(%%mm5, %%mm1)
2477                 PAVGB(%%mm6, %%mm2)
2478                 PAVGB(%%mm7, %%mm3)
2479                 PAVGB(%%mm4, %%mm0)
2480                 PAVGB(%%mm5, %%mm1)
2481                 PAVGB(%%mm6, %%mm2)
2482                 PAVGB(%%mm7, %%mm3)
2483                 "movq %%mm0, (%1)                       \n\t" // R0
2484                 "movq %%mm1, (%1, %2)                   \n\t" // R1
2485                 "movq %%mm2, (%1, %2, 2)                \n\t" // R2
2486                 "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
2487                 "movq %%mm0, (%0)                       \n\t" // L0
2488                 "movq %%mm1, (%0, %2)                   \n\t" // L1
2489                 "movq %%mm2, (%0, %2, 2)                \n\t" // L2
2490                 "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
2491
2492                 "movq (%0, %2, 4), %%mm0                \n\t" // L4
2493                 "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
2494                 "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
2495                 "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
2496                 "movq (%1, %2, 4), %%mm4                \n\t" // R4
2497                 "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
2498                 "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
2499                 "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
2500                 PAVGB(%%mm4, %%mm0)
2501                 PAVGB(%%mm5, %%mm1)
2502                 PAVGB(%%mm6, %%mm2)
2503                 PAVGB(%%mm7, %%mm3)
2504                 PAVGB(%%mm4, %%mm0)
2505                 PAVGB(%%mm5, %%mm1)
2506                 PAVGB(%%mm6, %%mm2)
2507                 PAVGB(%%mm7, %%mm3)
2508                 PAVGB(%%mm4, %%mm0)
2509                 PAVGB(%%mm5, %%mm1)
2510                 PAVGB(%%mm6, %%mm2)
2511                 PAVGB(%%mm7, %%mm3)
2512                 "movq %%mm0, (%1, %2, 4)                \n\t" // R4
2513                 "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
2514                 "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
2515                 "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
2516                 "movq %%mm0, (%0, %2, 4)                \n\t" // L4
2517                 "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
2518                 "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
2519                 "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
2520
2521                 "4:                                     \n\t"
2522
2523                 :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast)
2524                 : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
2525                 );
2526 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2527 {
2528         int y;
2529         int d=0;
2530 //        int sysd=0;
2531         int i;
2532
2533         for(y=0; y<8; y++)
2534         {
2535                 int x;
2536                 for(x=0; x<8; x++)
2537                 {
2538                         int ref= tempBlured[ x + y*stride ];
2539                         int cur= src[ x + y*stride ];
2540                         int d1=ref - cur;
2541 //                        if(x==0 || x==7) d1+= d1>>1;
2542 //                        if(y==0 || y==7) d1+= d1>>1;
2543 //                        d+= FFABS(d1);
2544                         d+= d1*d1;
2545 //                        sysd+= d1;
2546                 }
2547         }
2548         i=d;
2549         d=         (
2550                 4*d
2551                 +(*(tempBluredPast-256))
2552                 +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2553                 +(*(tempBluredPast+256))
2554                 +4)>>3;
2555         *tempBluredPast=i;
2556 //        ((*tempBluredPast)*3 + d + 2)>>2;
2557
2558 /*
2559 Switch between
2560  1  0  0  0  0  0  0  (0)
2561 64 32 16  8  4  2  1  (1)
2562 64 48 36 27 20 15 11 (33) (approx)
2563 64 56 49 43 37 33 29 (200) (approx)
2564 */
2565         if(d > maxNoise[1])
2566         {
2567                 if(d < maxNoise[2])
2568                 {
2569                         for(y=0; y<8; y++)
2570                         {
2571                                 int x;
2572                                 for(x=0; x<8; x++)
2573                                 {
2574                                         int ref= tempBlured[ x + y*stride ];
2575                                         int cur= src[ x + y*stride ];
2576                                         tempBlured[ x + y*stride ]=
2577                                         src[ x + y*stride ]=
2578                                                 (ref + cur + 1)>>1;
2579                                 }
2580                         }
2581                 }
2582                 else
2583                 {
2584                         for(y=0; y<8; y++)
2585                         {
2586                                 int x;
2587                                 for(x=0; x<8; x++)
2588                                 {
2589                                         tempBlured[ x + y*stride ]= src[ x + y*stride ];
2590                                 }
2591                         }
2592                 }
2593         }
2594         else
2595         {
2596                 if(d < maxNoise[0])
2597                 {
2598                         for(y=0; y<8; y++)
2599                         {
2600                                 int x;
2601                                 for(x=0; x<8; x++)
2602                                 {
2603                                         int ref= tempBlured[ x + y*stride ];
2604                                         int cur= src[ x + y*stride ];
2605                                         tempBlured[ x + y*stride ]=
2606                                         src[ x + y*stride ]=
2607                                                 (ref*7 + cur + 4)>>3;
2608                                 }
2609                         }
2610                 }
2611                 else
2612                 {
2613                         for(y=0; y<8; y++)
2614                         {
2615                                 int x;
2616                                 for(x=0; x<8; x++)
2617                                 {
2618                                         int ref= tempBlured[ x + y*stride ];
2619                                         int cur= src[ x + y*stride ];
2620                                         tempBlured[ x + y*stride ]=
2621                                         src[ x + y*stride ]=
2622                                                 (ref*3 + cur + 2)>>2;
2623                                 }
2624                         }
2625                 }
2626         }
2627 }
2628 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2629 }
2630 #endif //HAVE_ALTIVEC
2631
2632 #ifdef HAVE_MMX
2633 /**
2634  * accurate deblock filter
2635  */
2636 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2637         int64_t dc_mask, eq_mask, both_masks;
2638         int64_t sums[10*8*2];
2639         src+= step*3; // src points to begin of the 8x8 Block
2640 //START_TIMER
2641 asm volatile(
2642                 "movq %0, %%mm7                         \n\t"
2643                 "movq %1, %%mm6                         \n\t"
2644                 : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
2645                 );
2646
2647 asm volatile(
2648                 "lea (%2, %3), %%"REG_a"                \n\t"
2649 //      0         1         2         3         4         5         6         7         8         9
2650 //      %2        eax       eax+%3    eax+2%3   %2+4%3    eax'      eax'+%3   eax'+2%3  %2+8%3    eax'+4%3   (eax' = eax+4*%3, set by the second lea)
2651
2652                 "movq (%2), %%mm0                       \n\t"
2653                 "movq (%%"REG_a"), %%mm1                \n\t"
2654                 "movq %%mm1, %%mm3                      \n\t"
2655                 "movq %%mm1, %%mm4                      \n\t"
2656                 "psubb %%mm1, %%mm0                     \n\t" // mm0 = difference
2657                 "paddb %%mm7, %%mm0                     \n\t"
2658                 "pcmpgtb %%mm6, %%mm0                   \n\t"
2659
2660                 "movq (%%"REG_a",%3), %%mm2             \n\t"
2661                 PMAXUB(%%mm2, %%mm4)
2662                 PMINUB(%%mm2, %%mm3, %%mm5)
2663                 "psubb %%mm2, %%mm1                     \n\t"
2664                 "paddb %%mm7, %%mm1                     \n\t"
2665                 "pcmpgtb %%mm6, %%mm1                   \n\t"
2666                 "paddb %%mm1, %%mm0                     \n\t"
2667
2668                 "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
2669                 PMAXUB(%%mm1, %%mm4)
2670                 PMINUB(%%mm1, %%mm3, %%mm5)
2671                 "psubb %%mm1, %%mm2                     \n\t"
2672                 "paddb %%mm7, %%mm2                     \n\t"
2673                 "pcmpgtb %%mm6, %%mm2                   \n\t"
2674                 "paddb %%mm2, %%mm0                     \n\t"
2675
2676                 "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
2677
2678                 "movq (%2, %3, 4), %%mm2                \n\t"
2679                 PMAXUB(%%mm2, %%mm4)
2680                 PMINUB(%%mm2, %%mm3, %%mm5)
2681                 "psubb %%mm2, %%mm1                     \n\t"
2682                 "paddb %%mm7, %%mm1                     \n\t"
2683                 "pcmpgtb %%mm6, %%mm1                   \n\t"
2684                 "paddb %%mm1, %%mm0                     \n\t"
2685
2686                 "movq (%%"REG_a"), %%mm1                \n\t"
2687                 PMAXUB(%%mm1, %%mm4)
2688                 PMINUB(%%mm1, %%mm3, %%mm5)
2689                 "psubb %%mm1, %%mm2                     \n\t"
2690                 "paddb %%mm7, %%mm2                     \n\t"
2691                 "pcmpgtb %%mm6, %%mm2                   \n\t"
2692                 "paddb %%mm2, %%mm0                     \n\t"
2693
2694                 "movq (%%"REG_a", %3), %%mm2            \n\t"
2695                 PMAXUB(%%mm2, %%mm4)
2696                 PMINUB(%%mm2, %%mm3, %%mm5)
2697                 "psubb %%mm2, %%mm1                     \n\t"
2698                 "paddb %%mm7, %%mm1                     \n\t"
2699                 "pcmpgtb %%mm6, %%mm1                   \n\t"
2700                 "paddb %%mm1, %%mm0                     \n\t"
2701
2702                 "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
2703                 PMAXUB(%%mm1, %%mm4)
2704                 PMINUB(%%mm1, %%mm3, %%mm5)
2705                 "psubb %%mm1, %%mm2                     \n\t"
2706                 "paddb %%mm7, %%mm2                     \n\t"
2707                 "pcmpgtb %%mm6, %%mm2                   \n\t"
2708                 "paddb %%mm2, %%mm0                     \n\t"
2709
2710                 "movq (%2, %3, 8), %%mm2                \n\t"
2711                 PMAXUB(%%mm2, %%mm4)
2712                 PMINUB(%%mm2, %%mm3, %%mm5)
2713                 "psubb %%mm2, %%mm1                     \n\t"
2714                 "paddb %%mm7, %%mm1                     \n\t"
2715                 "pcmpgtb %%mm6, %%mm1                   \n\t"
2716                 "paddb %%mm1, %%mm0                     \n\t"
2717
2718                 "movq (%%"REG_a", %3, 4), %%mm1         \n\t"
2719                 "psubb %%mm1, %%mm2                     \n\t"
2720                 "paddb %%mm7, %%mm2                     \n\t"
2721                 "pcmpgtb %%mm6, %%mm2                   \n\t"
2722                 "paddb %%mm2, %%mm0                     \n\t"
2723                 "psubusb %%mm3, %%mm4                   \n\t"
2724
2725                 "pxor %%mm6, %%mm6                      \n\t"
2726                 "movq %4, %%mm7                         \n\t" // QP,..., QP
2727                 "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
2728                 "psubusb %%mm4, %%mm7                   \n\t" // Diff >=2QP -> 0
2729                 "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
2730                 "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> FF
2731                 "movq %%mm7, %1                         \n\t"
2732
2733                 "movq %5, %%mm7                         \n\t"
2734                 "punpcklbw %%mm7, %%mm7                 \n\t"
2735                 "punpcklbw %%mm7, %%mm7                 \n\t"
2736                 "punpcklbw %%mm7, %%mm7                 \n\t"
2737                 "psubb %%mm0, %%mm6                     \n\t"
2738                 "pcmpgtb %%mm7, %%mm6                   \n\t"
2739                 "movq %%mm6, %0                         \n\t"
2740
2741                 : "=m" (eq_mask), "=m" (dc_mask)
2742                 : "r" (src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2743                 : "%"REG_a
2744                 );
2745
2746         both_masks = dc_mask & eq_mask;
2747
2748         if(both_masks){
2749                 long offset= -8*step;
2750                 int64_t *temp_sums= sums;
2751
2752                 asm volatile(
2753                 "movq %2, %%mm0                         \n\t"  // QP,..., QP
2754                 "pxor %%mm4, %%mm4                      \n\t"
2755
2756                 "movq (%0), %%mm6                       \n\t"
2757                 "movq (%0, %1), %%mm5                   \n\t"
2758                 "movq %%mm5, %%mm1                      \n\t"
2759                 "movq %%mm6, %%mm2                      \n\t"
2760                 "psubusb %%mm6, %%mm5                   \n\t"
2761                 "psubusb %%mm1, %%mm2                   \n\t"
2762                 "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
2763                 "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
2764                 "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
2765
2766                 "pxor %%mm6, %%mm1                      \n\t"
2767                 "pand %%mm0, %%mm1                      \n\t"
2768                 "pxor %%mm1, %%mm6                      \n\t"
2769                 // 0:QP  6:First
2770
2771                 "movq (%0, %1, 8), %%mm5                \n\t"
2772                 "add %1, %0                             \n\t" // %0 points to line 1 not 0
2773                 "movq (%0, %1, 8), %%mm7                \n\t"
2774                 "movq %%mm5, %%mm1                      \n\t"
2775                 "movq %%mm7, %%mm2                      \n\t"
2776                 "psubusb %%mm7, %%mm5                   \n\t"
2777                 "psubusb %%mm1, %%mm2                   \n\t"
2778                 "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
2779                 "movq %2, %%mm0                         \n\t"  // QP,..., QP
2780                 "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
2781                 "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
2782
2783                 "pxor %%mm7, %%mm1                      \n\t"
2784                 "pand %%mm0, %%mm1                      \n\t"
2785                 "pxor %%mm1, %%mm7                      \n\t"
2786
2787                 "movq %%mm6, %%mm5                      \n\t"
2788                 "punpckhbw %%mm4, %%mm6                 \n\t"
2789                 "punpcklbw %%mm4, %%mm5                 \n\t"
2790                 // 4:0 5/6:First 7:Last
2791
2792                 "movq %%mm5, %%mm0                      \n\t"
2793                 "movq %%mm6, %%mm1                      \n\t"
2794                 "psllw $2, %%mm0                        \n\t"
2795                 "psllw $2, %%mm1                        \n\t"
2796                 "paddw "MANGLE(w04)", %%mm0             \n\t"
2797                 "paddw "MANGLE(w04)", %%mm1             \n\t"
2798
2799 #define NEXT\
2800                 "movq (%0), %%mm2                       \n\t"\
2801                 "movq (%0), %%mm3                       \n\t"\
2802                 "add %1, %0                             \n\t"\
2803                 "punpcklbw %%mm4, %%mm2                 \n\t"\
2804                 "punpckhbw %%mm4, %%mm3                 \n\t"\
2805                 "paddw %%mm2, %%mm0                     \n\t"\
2806                 "paddw %%mm3, %%mm1                     \n\t"
2807
2808 #define PREV\
2809                 "movq (%0), %%mm2                       \n\t"\
2810                 "movq (%0), %%mm3                       \n\t"\
2811                 "add %1, %0                             \n\t"\
2812                 "punpcklbw %%mm4, %%mm2                 \n\t"\
2813                 "punpckhbw %%mm4, %%mm3                 \n\t"\
2814                 "psubw %%mm2, %%mm0                     \n\t"\
2815                 "psubw %%mm3, %%mm1                     \n\t"
2816
2817
2818                 NEXT //0
2819                 NEXT //1
2820                 NEXT //2
2821                 "movq %%mm0, (%3)                       \n\t"
2822                 "movq %%mm1, 8(%3)                      \n\t"
2823
2824                 NEXT //3
2825                 "psubw %%mm5, %%mm0                     \n\t"
2826                 "psubw %%mm6, %%mm1                     \n\t"
2827                 "movq %%mm0, 16(%3)                     \n\t"
2828                 "movq %%mm1, 24(%3)                     \n\t"
2829
2830                 NEXT //4
2831                 "psubw %%mm5, %%mm0                     \n\t"
2832                 "psubw %%mm6, %%mm1                     \n\t"
2833                 "movq %%mm0, 32(%3)                     \n\t"
2834                 "movq %%mm1, 40(%3)                     \n\t"
2835
2836                 NEXT //5
2837                 "psubw %%mm5, %%mm0                     \n\t"
2838                 "psubw %%mm6, %%mm1                     \n\t"
2839                 "movq %%mm0, 48(%3)                     \n\t"
2840                 "movq %%mm1, 56(%3)                     \n\t"
2841
2842                 NEXT //6
2843                 "psubw %%mm5, %%mm0                     \n\t"
2844                 "psubw %%mm6, %%mm1                     \n\t"
2845                 "movq %%mm0, 64(%3)                     \n\t"
2846                 "movq %%mm1, 72(%3)                     \n\t"
2847
2848                 "movq %%mm7, %%mm6                      \n\t"
2849                 "punpckhbw %%mm4, %%mm7                 \n\t"
2850                 "punpcklbw %%mm4, %%mm6                 \n\t"
2851
2852                 NEXT //7
2853                 "mov %4, %0                             \n\t"
2854                 "add %1, %0                             \n\t"
2855                 PREV //0
2856                 "movq %%mm0, 80(%3)                     \n\t"
2857                 "movq %%mm1, 88(%3)                     \n\t"
2858
2859                 PREV //1
2860                 "paddw %%mm6, %%mm0                     \n\t"
2861                 "paddw %%mm7, %%mm1                     \n\t"
2862                 "movq %%mm0, 96(%3)                     \n\t"
2863                 "movq %%mm1, 104(%3)                    \n\t"
2864
2865                 PREV //2
2866                 "paddw %%mm6, %%mm0                     \n\t"
2867                 "paddw %%mm7, %%mm1                     \n\t"
2868                 "movq %%mm0, 112(%3)                    \n\t"
2869                 "movq %%mm1, 120(%3)                    \n\t"
2870
2871                 PREV //3
2872                 "paddw %%mm6, %%mm0                     \n\t"
2873                 "paddw %%mm7, %%mm1                     \n\t"
2874                 "movq %%mm0, 128(%3)                    \n\t"
2875                 "movq %%mm1, 136(%3)                    \n\t"
2876
2877                 PREV //4
2878                 "paddw %%mm6, %%mm0                     \n\t"
2879                 "paddw %%mm7, %%mm1                     \n\t"
2880                 "movq %%mm0, 144(%3)                    \n\t"
2881                 "movq %%mm1, 152(%3)                    \n\t"
2882
2883                 "mov %4, %0                             \n\t" //FIXME
2884
2885                 : "+&r"(src)
2886                 : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src)
2887                 );
2888
2889                 src+= step; // src points to begin of the 8x8 Block
2890
2891                 asm volatile(
2892                 "movq %4, %%mm6                         \n\t"
2893                 "pcmpeqb %%mm5, %%mm5                   \n\t"
2894                 "pxor %%mm6, %%mm5                      \n\t"
2895                 "pxor %%mm7, %%mm7                      \n\t"
2896
2897                 "1:                                     \n\t"
2898                 "movq (%1), %%mm0                       \n\t"
2899                 "movq 8(%1), %%mm1                      \n\t"
2900                 "paddw 32(%1), %%mm0                    \n\t"
2901                 "paddw 40(%1), %%mm1                    \n\t"
2902                 "movq (%0, %3), %%mm2                   \n\t"
2903                 "movq %%mm2, %%mm3                      \n\t"
2904                 "movq %%mm2, %%mm4                      \n\t"
2905                 "punpcklbw %%mm7, %%mm2                 \n\t"
2906                 "punpckhbw %%mm7, %%mm3                 \n\t"
2907                 "paddw %%mm2, %%mm0                     \n\t"
2908                 "paddw %%mm3, %%mm1                     \n\t"
2909                 "paddw %%mm2, %%mm0                     \n\t"
2910                 "paddw %%mm3, %%mm1                     \n\t"
2911                 "psrlw $4, %%mm0                        \n\t"
2912                 "psrlw $4, %%mm1                        \n\t"
2913                 "packuswb %%mm1, %%mm0                  \n\t"
2914                 "pand %%mm6, %%mm0                      \n\t"
2915                 "pand %%mm5, %%mm4                      \n\t"
2916                 "por %%mm4, %%mm0                       \n\t"
2917                 "movq %%mm0, (%0, %3)                   \n\t"
2918                 "add $16, %1                            \n\t"
2919                 "add %2, %0                             \n\t"
2920                 " js 1b                                 \n\t"
2921
2922                 : "+r"(offset), "+r"(temp_sums)
2923                 : "r" ((long)step), "r"(src - offset), "m"(both_masks)
2924                 );
2925         }else
2926                 src+= step; // src points to begin of the 8x8 Block
2927
2928         if(eq_mask != -1LL){
2929                 uint8_t *temp_src= src;
2930                 asm volatile(
2931                 "pxor %%mm7, %%mm7                      \n\t"
2932                 "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
2933                 "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
2934 //      0       1       2       3       4       5       6       7       8       9
2935 //      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %1+8%1  ecx+4%1
2936
2937                 "movq (%0), %%mm0                       \n\t"
2938                 "movq %%mm0, %%mm1                      \n\t"
2939                 "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
2940                 "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
2941
2942                 "movq (%0, %1), %%mm2                   \n\t"
2943                 "lea (%0, %1, 2), %%"REG_a"             \n\t"
2944                 "movq %%mm2, %%mm3                      \n\t"
2945                 "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
2946                 "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
2947
2948                 "movq (%%"REG_a"), %%mm4                \n\t"
2949                 "movq %%mm4, %%mm5                      \n\t"
2950                 "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
2951                 "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
2952
2953                 "paddw %%mm0, %%mm0                     \n\t" // 2L0
2954                 "paddw %%mm1, %%mm1                     \n\t" // 2H0
2955                 "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
2956                 "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
2957                 "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
2958                 "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
2959
2960                 "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
2961                 "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
2962                 "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
2963                 "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
2964
2965                 "movq (%%"REG_a", %1), %%mm2            \n\t"
2966                 "movq %%mm2, %%mm3                      \n\t"
2967                 "punpcklbw %%mm7, %%mm2                 \n\t" // L3
2968                 "punpckhbw %%mm7, %%mm3                 \n\t" // H3
2969
2970                 "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
2971                 "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
2972                 "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2973                 "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2974                 "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2975                 "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2976
2977                 "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
2978                 "movq %%mm0, %%mm1                      \n\t"
2979                 "punpcklbw %%mm7, %%mm0                 \n\t" // L4
2980                 "punpckhbw %%mm7, %%mm1                 \n\t" // H4
2981
2982                 "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
2983                 "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
2984                 "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
2985                 "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
2986                 "paddw %%mm4, %%mm4                     \n\t" // 2L2
2987                 "paddw %%mm5, %%mm5                     \n\t" // 2H2
2988                 "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
2989                 "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
2990
2991                 "lea (%%"REG_a", %1), %0                \n\t"
2992                 "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
2993                 "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
2994                 "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
2995                 "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
2996 //50 opcodes so far
2997                 "movq (%0, %1, 2), %%mm2                \n\t"
2998                 "movq %%mm2, %%mm3                      \n\t"
2999                 "punpcklbw %%mm7, %%mm2                 \n\t" // L5
3000                 "punpckhbw %%mm7, %%mm3                 \n\t" // H5
3001                 "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
3002                 "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
3003                 "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
3004                 "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
3005
3006                 "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
3007                 "punpcklbw %%mm7, %%mm6                 \n\t" // L6
3008                 "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
3009                 "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
3010                 "punpckhbw %%mm7, %%mm6                 \n\t" // H6
3011                 "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
3012
3013                 "paddw %%mm0, %%mm0                     \n\t" // 2L4
3014                 "paddw %%mm1, %%mm1                     \n\t" // 2H4
3015                 "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
3016                 "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
3017
3018                 "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
3019                 "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
3020                 "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
3021                 "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
3022
3023                 "movq (%0, %1, 4), %%mm2                \n\t"
3024                 "movq %%mm2, %%mm3                      \n\t"
3025                 "punpcklbw %%mm7, %%mm2                 \n\t" // L7
3026                 "punpckhbw %%mm7, %%mm3                 \n\t" // H7
3027
3028                 "paddw %%mm2, %%mm2                     \n\t" // 2L7
3029                 "paddw %%mm3, %%mm3                     \n\t" // 2H7
3030                 "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
3031                 "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
3032
3033                 "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
3034                 "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
3035
3036 #ifdef HAVE_MMX2
3037                 "movq %%mm7, %%mm6                      \n\t" // 0
3038                 "psubw %%mm0, %%mm6                     \n\t"
3039                 "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3040                 "movq %%mm7, %%mm6                      \n\t" // 0
3041                 "psubw %%mm1, %%mm6                     \n\t"
3042                 "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3043                 "movq %%mm7, %%mm6                      \n\t" // 0
3044                 "psubw %%mm2, %%mm6                     \n\t"
3045                 "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3046                 "movq %%mm7, %%mm6                      \n\t" // 0
3047                 "psubw %%mm3, %%mm6                     \n\t"
3048                 "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3049 #else
3050                 "movq %%mm7, %%mm6                      \n\t" // 0
3051                 "pcmpgtw %%mm0, %%mm6                   \n\t"
3052                 "pxor %%mm6, %%mm0                      \n\t"
3053                 "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
3054                 "movq %%mm7, %%mm6                      \n\t" // 0
3055                 "pcmpgtw %%mm1, %%mm6                   \n\t"
3056                 "pxor %%mm6, %%mm1                      \n\t"
3057                 "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
3058                 "movq %%mm7, %%mm6                      \n\t" // 0
3059                 "pcmpgtw %%mm2, %%mm6                   \n\t"
3060                 "pxor %%mm6, %%mm2                      \n\t"
3061                 "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
3062                 "movq %%mm7, %%mm6                      \n\t" // 0
3063                 "pcmpgtw %%mm3, %%mm6                   \n\t"
3064                 "pxor %%mm6, %%mm3                      \n\t"
3065                 "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
3066 #endif
3067
3068 #ifdef HAVE_MMX2
3069                 "pminsw %%mm2, %%mm0                    \n\t"
3070                 "pminsw %%mm3, %%mm1                    \n\t"
3071 #else
3072                 "movq %%mm0, %%mm6                      \n\t"
3073                 "psubusw %%mm2, %%mm6                   \n\t"
3074                 "psubw %%mm6, %%mm0                     \n\t"
3075                 "movq %%mm1, %%mm6                      \n\t"
3076                 "psubusw %%mm3, %%mm6                   \n\t"
3077                 "psubw %%mm6, %%mm1                     \n\t"
3078 #endif
3079
3080                 "movd %2, %%mm2                         \n\t" // QP
3081                 "punpcklbw %%mm7, %%mm2                 \n\t"
3082
3083                 "movq %%mm7, %%mm6                      \n\t" // 0
3084                 "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
3085                 "pxor %%mm6, %%mm4                      \n\t"
3086                 "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
3087                 "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3088                 "pxor %%mm7, %%mm5                      \n\t"
3089                 "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3090 // 100 opcodes
3091                 "psllw $3, %%mm2                        \n\t" // 8QP
3092                 "movq %%mm2, %%mm3                      \n\t" // 8QP
3093                 "pcmpgtw %%mm4, %%mm2                   \n\t"
3094                 "pcmpgtw %%mm5, %%mm3                   \n\t"
3095                 "pand %%mm2, %%mm4                      \n\t"
3096                 "pand %%mm3, %%mm5                      \n\t"
3097
3098
3099                 "psubusw %%mm0, %%mm4                   \n\t" // hd
3100                 "psubusw %%mm1, %%mm5                   \n\t" // ld
3101
3102
3103                 "movq "MANGLE(w05)", %%mm2              \n\t" // 5
3104                 "pmullw %%mm2, %%mm4                    \n\t"
3105                 "pmullw %%mm2, %%mm5                    \n\t"
3106                 "movq "MANGLE(w20)", %%mm2              \n\t" // 32
3107                 "paddw %%mm2, %%mm4                     \n\t"
3108                 "paddw %%mm2, %%mm5                     \n\t"
3109                 "psrlw $6, %%mm4                        \n\t"
3110                 "psrlw $6, %%mm5                        \n\t"
3111
3112                 "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
3113                 "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
3114
3115                 "pxor %%mm2, %%mm2                      \n\t"
3116                 "pxor %%mm3, %%mm3                      \n\t"
3117
3118                 "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
3119                 "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
3120                 "pxor %%mm2, %%mm0                      \n\t"
3121                 "pxor %%mm3, %%mm1                      \n\t"
3122                 "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
3123                 "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
3124                 "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
3125                 "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
3126
3127                 "pxor %%mm6, %%mm2                      \n\t"
3128                 "pxor %%mm7, %%mm3                      \n\t"
3129                 "pand %%mm2, %%mm4                      \n\t"
3130                 "pand %%mm3, %%mm5                      \n\t"
3131
3132 #ifdef HAVE_MMX2
3133                 "pminsw %%mm0, %%mm4                    \n\t"
3134                 "pminsw %%mm1, %%mm5                    \n\t"
3135 #else
3136                 "movq %%mm4, %%mm2                      \n\t"
3137                 "psubusw %%mm0, %%mm2                   \n\t"
3138                 "psubw %%mm2, %%mm4                     \n\t"
3139                 "movq %%mm5, %%mm2                      \n\t"
3140                 "psubusw %%mm1, %%mm2                   \n\t"
3141                 "psubw %%mm2, %%mm5                     \n\t"
3142 #endif
3143                 "pxor %%mm6, %%mm4                      \n\t"
3144                 "pxor %%mm7, %%mm5                      \n\t"
3145                 "psubw %%mm6, %%mm4                     \n\t"
3146                 "psubw %%mm7, %%mm5                     \n\t"
3147                 "packsswb %%mm5, %%mm4                  \n\t"
3148                 "movq %3, %%mm1                         \n\t"
3149                 "pandn %%mm4, %%mm1                     \n\t"
3150                 "movq (%0), %%mm0                       \n\t"
3151                 "paddb   %%mm1, %%mm0                   \n\t"
3152                 "movq %%mm0, (%0)                       \n\t"
3153                 "movq (%0, %1), %%mm0                   \n\t"
3154                 "psubb %%mm1, %%mm0                     \n\t"
3155                 "movq %%mm0, (%0, %1)                   \n\t"
3156
3157                 : "+r" (temp_src)
3158                 : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask)
3159                 : "%"REG_a, "%"REG_c
3160                 );
3161         }
3162 /*if(step==16){
3163     STOP_TIMER("step16")
3164 }else{
3165     STOP_TIMER("stepX")
3166 }*/
3167 }
3168 #endif //HAVE_MMX
3169
3170 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3171         const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); // forward declaration; the definition follows later in this file
3172
3173 /**
3174  * Copies a block of 8 rows from src to dst and optionally fixes the blacklevel.
      * With HAVE_MMX every row is moved as one 8-byte movq; the C fallback copies
      * BLOCK_SIZE bytes per row with memcpy.
      *
3175  * levelFix == 0 -> do not touch the brightness & contrast (plain copy).
      * levelFix != 0 -> with HAVE_MMX each pixel is rescaled with the two 64-bit
      *                  values read from packedOffsetAndScale (offset at byte 0,
      *                  scale at byte 8) and packed back to bytes with unsigned
      *                  saturation. Without HAVE_MMX the rows are still copied
      *                  unchanged.
      *
      * @param dst                  first byte of the destination block
      * @param dstStride            byte distance between consecutive dst rows
      * @param src                  first byte of the source block
      * @param srcStride            byte distance between consecutive src rows
      * @param levelFix             non-zero selects the offset/scale path
      * @param packedOffsetAndScale pointer to the packed offset; the packed scale is
      *                             read from the 8 bytes directly after it
3176  */
3177 #undef SCALED_CPY // cleared here; defined again inside blockCopy when HAVE_MMX is set
3178
3179 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3180         int levelFix, int64_t *packedOffsetAndScale)
3181 {
3182 #ifndef HAVE_MMX
3183         int i; // row counter, only used by the memcpy fallbacks
3184 #endif
3185         if(levelFix)
3186         {
3187 #ifdef HAVE_MMX
                     // Operands: %0/%1 = packedOffsetAndScale (constraint "a"), %2 = src, %3 = dst,
                     //           %4 = srcStride, %5 = dstStride.
                     // The asm text names REG_a/REG_d directly instead of %0; REG_a is presumably the
                     // register selected by the "a" constraint (names come from x86_cpu.h) - confirm there.
3188                 asm volatile(
3189                         "movq (%%"REG_a"), %%mm2        \n\t" // packedYOffset (read before REG_a is reused below)
3190                         "movq 8(%%"REG_a"), %%mm3       \n\t" // packedYScale, the next 8 bytes
3191                         "lea (%2,%4), %%"REG_a"         \n\t" // REG_a = src + srcStride (row 1)
3192                         "lea (%3,%5), %%"REG_d"         \n\t" // REG_d = dst + dstStride (row 1)
3193                         "pxor %%mm4, %%mm4              \n\t" // mm4 = 0, used as the zero for unpacking in the non-MMX2 macro
3194 #ifdef HAVE_MMX2
                     // MMX2 variant, two rows per expansion:
                     //  - punpck?bw of a register with itself turns each byte b into the word b*0x0101
                     //  - pmulhuw keeps the high 16 bits of the unsigned product with the scale (mm3)
                     //  - psubw subtracts the offset (mm2)
                     //  - packuswb packs the words back to bytes with unsigned saturation
3195 #define REAL_SCALED_CPY(src1, src2, dst1, dst2)                                                \
3196                         "movq " #src1 ", %%mm0          \n\t"\
3197                         "movq " #src1 ", %%mm5          \n\t"\
3198                         "movq " #src2 ", %%mm1          \n\t"\
3199                         "movq " #src2 ", %%mm6          \n\t"\
3200                         "punpcklbw %%mm0, %%mm0         \n\t"\
3201                         "punpckhbw %%mm5, %%mm5         \n\t"\
3202                         "punpcklbw %%mm1, %%mm1         \n\t"\
3203                         "punpckhbw %%mm6, %%mm6         \n\t"\
3204                         "pmulhuw %%mm3, %%mm0           \n\t"\
3205                         "pmulhuw %%mm3, %%mm5           \n\t"\
3206                         "pmulhuw %%mm3, %%mm1           \n\t"\
3207                         "pmulhuw %%mm3, %%mm6           \n\t"\
3208                         "psubw %%mm2, %%mm0             \n\t"\
3209                         "psubw %%mm2, %%mm5             \n\t"\
3210                         "psubw %%mm2, %%mm1             \n\t"\
3211                         "psubw %%mm2, %%mm6             \n\t"\
3212                         "packuswb %%mm5, %%mm0          \n\t"\
3213                         "packuswb %%mm6, %%mm1          \n\t"\
3214                         "movq %%mm0, " #dst1 "          \n\t"\
3215                         "movq %%mm1, " #dst2 "          \n\t"\
3216
3217 #else //HAVE_MMX2
                     // Plain MMX variant, two rows per expansion. Note the different order of operations:
                     //  - punpck?bw with mm4 (zero) widens each byte to a word
                     //  - psubw subtracts the offset (mm2) first
                     //  - psllw $6 followed by pmulhw keeps the high 16 bits of the signed product
                     //    of the shifted difference with the scale (mm3)
                     //  - packuswb packs the words back to bytes with unsigned saturation
3218 #define REAL_SCALED_CPY(src1, src2, dst1, dst2)                                        \
3219                         "movq " #src1 ", %%mm0          \n\t"\
3220                         "movq " #src1 ", %%mm5          \n\t"\
3221                         "punpcklbw %%mm4, %%mm0         \n\t"\
3222                         "punpckhbw %%mm4, %%mm5         \n\t"\
3223                         "psubw %%mm2, %%mm0             \n\t"\
3224                         "psubw %%mm2, %%mm5             \n\t"\
3225                         "movq " #src2 ", %%mm1          \n\t"\
3226                         "psllw $6, %%mm0                \n\t"\
3227                         "psllw $6, %%mm5                \n\t"\
3228                         "pmulhw %%mm3, %%mm0            \n\t"\
3229                         "movq " #src2 ", %%mm6          \n\t"\
3230                         "pmulhw %%mm3, %%mm5            \n\t"\
3231                         "punpcklbw %%mm4, %%mm1         \n\t"\
3232                         "punpckhbw %%mm4, %%mm6         \n\t"\
3233                         "psubw %%mm2, %%mm1             \n\t"\
3234                         "psubw %%mm2, %%mm6             \n\t"\
3235                         "psllw $6, %%mm1                \n\t"\
3236                         "psllw $6, %%mm6                \n\t"\
3237                         "pmulhw %%mm3, %%mm1            \n\t"\
3238                         "pmulhw %%mm3, %%mm6            \n\t"\
3239                         "packuswb %%mm5, %%mm0          \n\t"\
3240                         "packuswb %%mm6, %%mm1          \n\t"\
3241                         "movq %%mm0, " #dst1 "          \n\t"\
3242                         "movq %%mm1, " #dst2 "          \n\t"\
3243
3244 #endif //HAVE_MMX2
                     // The extra macro level lets the arguments be macro-expanded before the # operator
                     // stringifies them (REGa/REGd are presumably register-name macros from x86_cpu.h).
3245 #define SCALED_CPY(src1, src2, dst1, dst2)\
3246    REAL_SCALED_CPY(src1, src2, dst1, dst2)
3247
3248 SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5)) // rows 0 and 1
3249 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) // rows 2 and 3
3250 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) // rows 4 and 5
3251                         "lea (%%"REG_a",%4,4), %%"REG_a"        \n\t" // REG_a = src + 5*srcStride
3252                         "lea (%%"REG_d",%5,4), %%"REG_d"        \n\t" // REG_d = dst + 5*dstStride
3253 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) // rows 6 and 7
3254
3255
3256                         : "=&a" (packedOffsetAndScale) // %0: early-clobber output because the lea above overwrites the register
3257                         : "0" (packedOffsetAndScale), // %1: same register as %0, carries the pointer in
3258                         "r"(src), // %2
3259                         "r"(dst), // %3
3260                         "r" ((long)srcStride), // %4
3261                         "r" ((long)dstStride) // %5
3262                         : "%"REG_d // scratch row pointer for dst
3263                                         );
3264 #else //HAVE_MMX
             // C fallback: the 8 rows are copied unchanged, no offset/scale is applied here.
             // NOTE(review): levelFix has no effect on this path - confirm that this is intended.
3265         for(i=0; i<8; i++)
3266                 memcpy( &(dst[dstStride*i]),
3267                         &(src[srcStride*i]), BLOCK_SIZE);
3268 #endif //HAVE_MMX
3269         }
3270         else
3271         {
3272 #ifdef HAVE_MMX
             // Plain copy. Operands: %0 = src, %1 = dst, %2 = srcStride, %3 = dstStride.
             // Row addressing is the same scheme as in the levelFix branch above.
3273         asm volatile(
3274                 "lea (%0,%2), %%"REG_a"                 \n\t" // REG_a = src + srcStride (row 1)
3275                 "lea (%1,%3), %%"REG_d"                 \n\t" // REG_d = dst + dstStride (row 1)
3276
             // Moves two rows of 8 bytes each through mm0/mm1.
3277 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2)                              \
3278                 "movq " #src1 ", %%mm0          \n\t"\
3279                 "movq " #src2 ", %%mm1          \n\t"\
3280                 "movq %%mm0, " #dst1 "          \n\t"\
3281                 "movq %%mm1, " #dst2 "          \n\t"\
3282
3283 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
3284    REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
3285
3286 SIMPLE_CPY((%0)       , (%0, %2)       , (%1)       , (%1, %3)) // rows 0 and 1
3287 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) // rows 2 and 3
3288 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) // rows 4 and 5
3289                 "lea (%%"REG_a",%2,4), %%"REG_a"        \n\t" // REG_a = src + 5*srcStride
3290                 "lea (%%"REG_d",%3,4), %%"REG_d"        \n\t" // REG_d = dst + 5*dstStride
3291 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) // rows 6 and 7
3292
3293                 : : "r" (src),
3294                 "r" (dst),
3295                 "r" ((long)srcStride),
3296                 "r" ((long)dstStride)
3297                 : "%"REG_a, "%"REG_d
3298         );
3299 #else //HAVE_MMX
             // C fallback: copy 8 rows of BLOCK_SIZE bytes each.
3300         for(i=0; i<8; i++)
3301                 memcpy( &(dst[dstStride*i]),
3302                         &(src[srcStride*i]), BLOCK_SIZE);
3303 #endif //HAVE_MMX
3304         }
3305 }
3306
3307 /**
3308  * Duplicates the given 8 src pixels 3 times upward, i.e. copies the 8 bytes at
      * src to src - stride, src - 2*stride and src - 3*stride.
      *
      * @param src    row of 8 bytes to replicate; the bytes at src itself are not modified
      * @param stride byte distance between consecutive rows
3309  */
3310 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3311 {
3312 #ifdef HAVE_MMX
3313         asm volatile(
3314                 "movq (%0), %%mm0               \n\t" // mm0 = the 8 bytes at src
3315                 "add %1, %0                     \n\t" // %0 = src - stride (%1 holds -stride)
3316                 "movq %%mm0, (%0)               \n\t" // store to src - stride
3317                 "movq %%mm0, (%0, %1)           \n\t" // store to src - 2*stride
3318                 "movq %%mm0, (%0, %1, 2)        \n\t" // store to src - 3*stride
3319                 : "+r" (src) // read-write operand: the add above changes the pointer register
3320                 : "r" ((long)-stride) // negated stride, so adding it steps to the previous row
3321         );
3322 #else
3323         int i;
3324         uint8_t *p=src; // write pointer, stepped one row back per iteration
3325         for(i=0; i<3; i++)
3326         {
3327                 p-= stride;
3328                 memcpy(p, src, 8);
3329         }
3330 #endif
3331 }
3332
3333 /**
3334  * Filters array of bytes (Y or U or V values)
3335  */
3336 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3337         const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
3338 {
3339         DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3340         int x,y;
3341 #ifdef COMPILE_TIME_MODE
3342         const int mode= COMPILE_TIME_MODE;
3343 #else
3344         const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3345 #endif
3346         int black=0, white=255; // blackest black and whitest white in the picture
3347         int QPCorrecture= 256*256;
3348
3349         int copyAhead;
3350 #ifdef HAVE_MMX
3351         int i;
3352 #endif
3353
3354         const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3355         const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3356
3357         //FIXME remove
3358         uint64_t * const yHistogram= c.yHistogram;
3359         uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3360         uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
3361         //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3362
3363 #ifdef HAVE_MMX
3364         for(i=0; i<57; i++){
3365                 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3366                 int threshold= offset*2 + 1;
3367                 c.mmxDcOffset[i]= 0x7F - offset;
3368                 c.mmxDcThreshold[i]= 0x7F - threshold;
3369                 c.mmxDcOffset[i]*= 0x0101010101010101LL;
3370                 c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3371         }
3372 #endif
3373
3374         if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3375         else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
3376                 || (mode & FFMPEG_DEINT_FILTER)
3377                 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3378         else if(   (mode & V_DEBLOCK)
3379                 || (mode & LINEAR_IPOL_DEINT_FILTER)
3380                 || (mode & MEDIAN_DEINT_FILTER)
3381                 || (mode & V_A_DEBLOCK)) copyAhead=13;
3382         else if(mode & V_X1_FILTER) copyAhead=11;
3383 //        else if(mode & V_RK1_FILTER) copyAhead=10;
3384         else if(mode & DERING) copyAhead=9;
3385         else copyAhead=8;
3386
3387         copyAhead-= 8;
3388
3389         if(!isColor)
3390         {
3391                 uint64_t sum= 0;
3392                 int i;
3393                 uint64_t maxClipped;
3394                 uint64_t clipped;
3395                 double scale;
3396
3397                 c.frameNum++;
3398                 // first frame is fscked so we ignore it
3399                 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
3400
3401                 for(i=0; i<256; i++)
3402                 {
3403                         sum+= yHistogram[i];
3404                 }
3405
3406                 /* We always get a completely black picture first. */
3407                 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
3408
3409                 clipped= sum;
3410                 for(black=255; black>0; black--)
3411                 {
3412                         if(clipped < maxClipped) break;
3413                         clipped-= yHistogram[black];
3414                 }
3415
3416                 clipped= sum;
3417                 for(white=0; white<256; white++)
3418                 {
3419                         if(clipped < maxClipped) break;
3420                         clipped-= yHistogram[white];
3421                 }
3422
3423                 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
3424
3425 #ifdef HAVE_MMX2
3426                 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
3427                 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3428 #else
3429                 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
3430                 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3431 #endif
3432
3433                 c.packedYOffset|= c.packedYOffset<<32;
3434                 c.packedYOffset|= c.packedYOffset<<16;
3435
3436                 c.packedYScale|= c.packedYScale<<32;
3437                 c.packedYScale|= c.packedYScale<<16;
3438
3439                 if(mode & LEVEL_FIX)        QPCorrecture= (int)(scale*256*256 + 0.5);
3440                 else                        QPCorrecture= 256*256;
3441         }
3442         else
3443         {
3444                 c.packedYScale= 0x0100010001000100LL;
3445                 c.packedYOffset= 0;
3446                 QPCorrecture= 256*256;
3447         }
3448
3449         /* copy & deinterlace first row of blocks */
3450         y=-BLOCK_SIZE;
3451         {
3452                 const uint8_t *srcBlock= &(src[y*srcStride]);
3453                 uint8_t *dstBlock= tempDst + dstStride;
3454
3455                 // From this point on it is guaranteed that we can read and write 16 lines downward
3456                 // finish 1 block before the next otherwise we might have a problem
3457                 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
3458                 for(x=0; x<width; x+=BLOCK_SIZE)
3459                 {
3460
3461 #ifdef HAVE_MMX2
3462 /*
3463                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3464                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3465                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3466                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3467 */
3468
3469                         asm(
3470                                 "mov %4, %%"REG_a"              \n\t"
3471                                 "shr $2, %%"REG_a"              \n\t"
3472                                 "and $6, %%"REG_a"              \n\t"
3473                                 "add %5, %%"REG_a"              \n\t"
3474                                 "mov %%"REG_a", %%"REG_d"       \n\t"
3475                                 "imul %1, %%"REG_a"             \n\t"
3476                                 "imul %3, %%"REG_d"             \n\t"
3477                                 "prefetchnta 32(%%"REG_a", %0)  \n\t"
3478                                 "prefetcht0 32(%%"REG_d", %2)   \n\t"
3479                                 "add %1, %%"REG_a"              \n\t"
3480                                 "add %3, %%"REG_d"              \n\t"
3481                                 "prefetchnta 32(%%"REG_a", %0)  \n\t"
3482                                 "prefetcht0 32(%%"REG_d", %2)   \n\t"
3483                         :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3484                         "g" ((long)x), "g" ((long)copyAhead)
3485                         : "%"REG_a, "%"REG_d
3486                         );
3487
3488 #elif defined(HAVE_3DNOW)
3489 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3490 /*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3491                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3492                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3493                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3494 */
3495 #endif
3496
3497                         RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3498                                 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3499
3500                         RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3501
3502                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3503                                 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3504                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3505                                 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3506                         else if(mode & MEDIAN_DEINT_FILTER)
3507                                 RENAME(deInterlaceMedian)(dstBlock, dstStride);
3508                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3509                                 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3510                         else if(mode & FFMPEG_DEINT_FILTER)
3511                                 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3512                         else if(mode & LOWPASS5_DEINT_FILTER)
3513                                 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3514 /*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
3515                                 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3516 */
3517                         dstBlock+=8;
3518                         srcBlock+=8;
3519                 }
3520                 if(width==FFABS(dstStride))
3521                         linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3522                 else
3523                 {
3524                         int i;
3525                         for(i=0; i<copyAhead; i++)
3526                         {
3527                                 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3528                         }
3529                 }
3530         }
3531
3532         for(y=0; y<height; y+=BLOCK_SIZE)
3533         {
3534                 //1% speedup if these are here instead of the inner loop
3535                 const uint8_t *srcBlock= &(src[y*srcStride]);
3536                 uint8_t *dstBlock= &(dst[y*dstStride]);
3537 #ifdef HAVE_MMX
3538                 uint8_t *tempBlock1= c.tempBlocks;
3539                 uint8_t *tempBlock2= c.tempBlocks + 8;
3540 #endif
3541                 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3542                 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3543                 int QP=0;
3544                 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3545                    if not, then use a temporary buffer */
3546                 if(y+15 >= height)
3547                 {
3548                         int i;
3549                         /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3550                            blockcopy to dst later */
3551                         linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3552                                 FFMAX(height-y-copyAhead, 0), srcStride);
3553
3554                         /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3555                         for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3556                                 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3557
3558                         /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3559                         linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3560
3561                         /* duplicate last line of dst to fill the void up to line (copyAhead) */
3562                         for(i=height-y+1; i<=copyAhead; i++)
3563                                 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3564
3565                         dstBlock= tempDst + dstStride;
3566                         srcBlock= tempSrc;
3567                 }
3568
3569                 // From this point on it is guaranteed that we can read and write 16 lines downward
3570                 // finish 1 block before the next otherwise we might have a problem
3571                 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3572                 for(x=0; x<width; x+=BLOCK_SIZE)
3573                 {
3574                         const int stride= dstStride;
3575 #ifdef HAVE_MMX
3576                         uint8_t *tmpXchg;
3577 #endif
3578                         if(isColor)
3579                         {
3580                                 QP= QPptr[x>>qpHShift];
3581                                 c.nonBQP= nonBQPptr[x>>qpHShift];
3582                         }
3583                         else
3584                         {
3585                                 QP= QPptr[x>>4];
3586                                 QP= (QP* QPCorrecture + 256*128)>>16;
3587                                 c.nonBQP= nonBQPptr[x>>4];
3588                                 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
3589                                 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
3590                         }
3591                         c.QP= QP;
3592 #ifdef HAVE_MMX
3593                         asm volatile(
3594                                 "movd %1, %%mm7         \n\t"
3595                                 "packuswb %%mm7, %%mm7  \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3596                                 "packuswb %%mm7, %%mm7  \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3597                                 "packuswb %%mm7, %%mm7  \n\t" // QP,..., QP
3598                                 "movq %%mm7, %0         \n\t"
3599                                 : "=m" (c.pQPb)
3600                                 : "r" (QP)
3601                         );
3602 #endif
3603
3604
3605 #ifdef HAVE_MMX2
3606 /*
3607                         prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
3608                         prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
3609                         prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
3610                         prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
3611 */
3612
3613                         asm(
3614                                 "mov %4, %%"REG_a"              \n\t"
3615                                 "shr $2, %%"REG_a"              \n\t"
3616                                 "and $6, %%"REG_a"              \n\t"
3617                                 "add %5, %%"REG_a"              \n\t"
3618                                 "mov %%"REG_a", %%"REG_d"       \n\t"
3619                                 "imul %1, %%"REG_a"             \n\t"
3620                                 "imul %3, %%"REG_d"             \n\t"
3621                                 "prefetchnta 32(%%"REG_a", %0)  \n\t"
3622                                 "prefetcht0 32(%%"REG_d", %2)   \n\t"
3623                                 "add %1, %%"REG_a"              \n\t"
3624                                 "add %3, %%"REG_d"              \n\t"
3625                                 "prefetchnta 32(%%"REG_a", %0)  \n\t"
3626                                 "prefetcht0 32(%%"REG_d", %2)   \n\t"
3627                         :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride),
3628                          "g" ((long)x), "g" ((long)copyAhead)
3629                         : "%"REG_a, "%"REG_d
3630                         );
3631
3632 #elif defined(HAVE_3DNOW)
3633 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
3634 /*                        prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
3635                         prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
3636                         prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
3637                         prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
3638 */
3639 #endif
3640
3641                         RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3642                                 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3643
3644                         if(mode & LINEAR_IPOL_DEINT_FILTER)
3645                                 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3646                         else if(mode & LINEAR_BLEND_DEINT_FILTER)
3647                                 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3648                         else if(mode & MEDIAN_DEINT_FILTER)
3649                                 RENAME(deInterlaceMedian)(dstBlock, dstStride);
3650                         else if(mode & CUBIC_IPOL_DEINT_FILTER)
3651                                 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3652                         else if(mode & FFMPEG_DEINT_FILTER)
3653                                 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3654                         else if(mode & LOWPASS5_DEINT_FILTER)
3655                                 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3656 /*                        else if(mode & CUBIC_BLEND_DEINT_FILTER)
3657                                 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3658 */
3659
3660                         /* only deblock if we have 2 blocks */
3661                         if(y + 8 < height)
3662                         {
3663                                 if(mode & V_X1_FILTER)
3664                                         RENAME(vertX1Filter)(dstBlock, stride, &c);
3665                                 else if(mode & V_DEBLOCK)
3666                                 {
3667                                         const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3668
3669                                         if(t==1)
3670                                                 RENAME(doVertLowPass)(dstBlock, stride, &c);
3671                                         else if(t==2)
3672                                                 RENAME(doVertDefFilter)(dstBlock, stride, &c);
3673                                 }else if(mode & V_A_DEBLOCK){
3674                                         RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
3675                                 }
3676                         }
3677
3678 #ifdef HAVE_MMX
3679                         RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3680 #endif
3681                         /* check if we have a previous block to deblock it with dstBlock */
3682                         if(x - 8 >= 0)
3683                         {
3684 #ifdef HAVE_MMX
3685                                 if(mode & H_X1_FILTER)
3686                                         RENAME(vertX1Filter)(tempBlock1, 16, &c);
3687                                 else if(mode & H_DEBLOCK)
3688                                 {
3689 //START_TIMER
3690                                         const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3691 //STOP_TIMER("dc & minmax")
3692                                         if(t==1)
3693                                                 RENAME(doVertLowPass)(tempBlock1, 16, &c);
3694                                         else if(t==2)
3695                                                 RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3696                                 }else if(mode & H_A_DEBLOCK){
3697                                         RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
3698                                 }
3699
3700                                 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3701
3702 #else
3703                                 if(mode & H_X1_FILTER)
3704                                         horizX1Filter(dstBlock-4, stride, QP);
3705                                 else if(mode & H_DEBLOCK)
3706                                 {
3707 #ifdef HAVE_ALTIVEC
3708                                         DECLARE_ALIGNED(16, unsigned char, tempBlock[272]);
3709                                         transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3710
3711                                         const int t=vertClassify_altivec(tempBlock-48, 16, &c);
3712                                         if(t==1) {
3713                                                 doVertLowPass_altivec(tempBlock-48, 16, &c);
3714                                                 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3715                                         }
3716                                         else if(t==2) {
3717                                                 doVertDefFilter_altivec(tempBlock-48, 16, &c);
3718                                                 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3719                                         }
3720 #else
3721                                         const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3722
3723                                         if(t==1)
3724                                                 RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3725                                         else if(t==2)
3726                                                 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3727 #endif
3728                                 }else if(mode & H_A_DEBLOCK){
3729                                         RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
3730                                 }
3731 #endif //HAVE_MMX
3732                                 if(mode & DERING)
3733                                 {
3734                                 //FIXME filter first line
3735                                         if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3736                                 }
3737
3738                                 if(mode & TEMP_NOISE_FILTER)
3739                                 {
3740                                         RENAME(tempNoiseReducer)(dstBlock-8, stride,
3741                                                 c.tempBlured[isColor] + y*dstStride + x,
3742                                                 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3743                                                 c.ppMode.maxTmpNoise);
3744                                 }
3745                         }
3746
3747                         dstBlock+=8;
3748                         srcBlock+=8;
3749
3750 #ifdef HAVE_MMX
3751                         tmpXchg= tempBlock1;
3752                         tempBlock1= tempBlock2;
3753                         tempBlock2 = tmpXchg;
3754 #endif
3755                 }
3756
3757                 if(mode & DERING)
3758                 {
3759                                 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3760                 }
3761
3762                 if((mode & TEMP_NOISE_FILTER))
3763                 {
3764                         RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3765                                 c.tempBlured[isColor] + y*dstStride + x,
3766                                 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3767                                 c.ppMode.maxTmpNoise);
3768                 }
3769
3770                 /* did we use a tmp buffer for the last lines*/
3771                 if(y+15 >= height)
3772                 {
3773                         uint8_t *dstBlock= &(dst[y*dstStride]);
3774                         if(width==FFABS(dstStride))
3775                                 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3776                         else
3777                         {
3778                                 int i;
3779                                 for(i=0; i<height-y; i++)
3780                                 {
3781                                         memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3782                                 }
3783                         }
3784                 }
3785 /*
3786                 for(x=0; x<width; x+=32)
3787                 {
3788                         volatile int i;
3789                         i+=        + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3790                                 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3791                                 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3792 //                                + dstBlock[x +13*dstStride]
3793 //                                + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3794                 }*/
3795         }
3796 #ifdef HAVE_3DNOW
3797         asm volatile("femms");
3798 #elif defined (HAVE_MMX)
3799         asm volatile("emms");
3800 #endif
3801
3802 #ifdef DEBUG_BRIGHTNESS
3803         if(!isColor)
3804         {
3805                 int max=1;
3806                 int i;
3807                 for(i=0; i<256; i++)
3808                         if(yHistogram[i] > max) max=yHistogram[i];
3809
3810                 for(i=1; i<256; i++)
3811                 {
3812                         int x;
3813                         int start=yHistogram[i-1]/(max/256+1);
3814                         int end=yHistogram[i]/(max/256+1);
3815                         int inc= end > start ? 1 : -1;
3816                         for(x=start; x!=end+inc; x+=inc)
3817                                 dst[ i*dstStride + x]+=128;
3818                 }
3819
3820                 for(i=0; i<100; i+=2)
3821                 {
3822                         dst[ (white)*dstStride + i]+=128;
3823                         dst[ (black)*dstStride + i]+=128;
3824                 }
3825
3826         }
3827 #endif
3828
3829         *c2= c; //copy local context back
3830
3831 }