2 * Copyright (C) 2004 the ffmpeg project
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * MMX-optimized functions cribbed from the original VP3 source code.
26 #include "libavcodec/dsputil.h"
27 #include "dsputil_mmx.h"
30 extern const uint16_t ff_vp3_idct_data[];
41 /* from original comments: The Macro does IDct on 4 1-D Dcts */
/* NOTE(review): BeginIDCT runs the shared first stage of four parallel 1-D
 * IDCTs, one per 16-bit lane of each MMX register, via the mmx.h
 * movq/pmulhw macro wrappers.  The pmulhw-based products read as
 * "c*x - x" because the cosine constants appear to be stored biased
 * (c - 1) in 16-bit fixed point, with a following paddw restoring the
 * full product — TODO confirm against the ff_vp3_idct_data table, which
 * is defined elsewhere.  This excerpt is elided: the macro's closing
 * brace and some interior lines are not visible here. */
42 #define BeginIDCT() { \
43 movq_m2r(*I(3), r2); \
44 movq_m2r(*C(3), r6); \
46 movq_m2r(*J(5), r7); \
47 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
48 movq_m2r(*C(5), r1); \
49 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
51 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
52 movq_m2r(*I(1), r3); \
53 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
54 movq_m2r(*C(1), r0); /* (all registers are in use) */ \
55 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
56 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
57 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
58 movq_m2r(*J(7), r1); \
59 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
60 movq_r2r(r0, r5); /* r5 = c1 */ \
61 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
62 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
63 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
64 movq_m2r(*C(7), r7); \
65 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
66 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
67 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
68 movq_m2r(*I(2), r2); \
69 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
70 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
71 movq_r2r(r2, r1); /* r1 = i2 */ \
72 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
73 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
74 movq_m2r(*J(6), r5); \
75 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
76 movq_r2r(r5, r7); /* r7 = i6 */ \
77 psubsw_r2r(r4, r0); /* r0 = A - C */ \
78 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
79 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
80 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
81 paddsw_r2r(r4, r4); /* r4 = C + C */ \
82 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
83 psubsw_r2r(r6, r3); /* r3 = B - D */ \
84 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
85 paddsw_r2r(r6, r6); /* r6 = D + D */ \
86 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
87 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
88 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
89 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
90 movq_m2r(*C(4), r4); \
91 movq_r2r(r3, r5); /* r5 = B - D */ \
92 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
93 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
94 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
95 movq_r2r(r0, r2); /* r2 = A - C */ \
96 movq_m2r(*I(0), r6); \
97 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
98 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
99 movq_m2r(*J(4), r3); \
100 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
101 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
102 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
104 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
105 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
106 paddsw_r2r(r1, r1); /* r1 = H + H */ \
107 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
108 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
109 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
110 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
111 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
112 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
113 movq_m2r(*I(1), r0); /* r0 = C. */ \
114 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
115 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
116 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
119 /* RowIDCT gets ready to transpose */
/* NOTE(review): RowIDCT finishes the butterfly started by BeginIDCT for a
 * row pass; it produces R0..R7 without rounding/shifting, since the row
 * results are intermediate values that will be transposed and fed to the
 * column pass.  Presumably it begins by invoking BeginIDCT() — the macro's
 * opening body lines and closing brace are elided in this excerpt. */
120 #define RowIDCT() { \
124 movq_m2r(*I(2), r3); /* r3 = D. */ \
125 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
126 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
127 paddsw_r2r(r7, r7); /* r7 = G + G */ \
128 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
129 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
130 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
131 paddsw_r2r(r3, r3); \
132 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
133 paddsw_r2r(r5, r5); \
134 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
135 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
136 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
137 paddsw_r2r(r0, r0); \
138 movq_r2m(r1, *I(1)); /* save R1 */ \
139 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
142 /* Column IDCT normalizes and stores final results */
/* NOTE(review): ColumnIDCT is the final pass: it adds the rounding bias
 * (*Eight, i.e. ff_pw_8) then arithmetic-shifts each result right by 4
 * (psraw) to normalize, storing NR0..NR7 back into the coefficient block
 * via the I()/J() addressing macros.  Like RowIDCT it presumably starts
 * with BeginIDCT(); the opening lines and closing brace are elided in
 * this excerpt. */
143 #define ColumnIDCT() { \
147 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
148 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
149 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
150 psraw_i2r(4, r2); /* r2 = NR2 */ \
151 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
152 psraw_i2r(4, r1); /* r1 = NR1 */ \
153 movq_m2r(*I(2), r3); /* r3 = D. */ \
154 paddsw_r2r(r7, r7); /* r7 = G + G */ \
155 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
156 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
157 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
158 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
159 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
160 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
161 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
162 psraw_i2r(4, r4); /* r4 = NR4 */ \
163 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
164 psraw_i2r(4, r3); /* r3 = NR3 */ \
165 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
166 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
167 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
168 psraw_i2r(4, r6); /* r6 = NR6 */ \
169 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
170 psraw_i2r(4, r5); /* r5 = NR5 */ \
171 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
172 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
173 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
174 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
175 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
176 psraw_i2r(4, r7); /* r7 = NR7 */ \
177 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
178 psraw_i2r(4, r0); /* r0 = NR0 */ \
179 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
180 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
181 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
184 /* Following macro does two 4x4 transposes in place.
186 At entry (we assume):
210 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
211 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
213 Since r1 is free at entry, we calculate the Js first. */
/* NOTE(review): Transpose performs two in-place 4x4 transposes of 16-bit
 * elements using punpcklwd/punpckhwd (interleave words) followed by
 * punpckldq/punpckhdq (interleave dwords).  The J quadrant (r4..r7) is
 * transposed first because r1 is free at entry, as the preceding comment
 * states; I(0) is used as scratch to spill r0 while the second quadrant
 * is built.  The macro's closing brace is elided in this excerpt. */
215 #define Transpose() { \
216 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
217 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
218 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
219 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
220 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
221 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
222 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
223 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
224 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
225 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
226 movq_r2m(r4, *J(4)); \
227 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
228 movq_r2m(r5, *J(5)); \
229 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
230 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
231 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
232 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
233 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
234 movq_r2m(r6, *J(7)); \
235 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
236 movq_r2m(r1, *J(6)); \
237 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
238 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
239 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
240 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
241 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
242 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
243 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
244 movq_r2m(r0, *I(0)); \
245 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
246 movq_r2m(r1, *I(1)); \
247 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
248 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
249 movq_r2m(r4, *I(3)); \
250 movq_r2m(r2, *I(2)); \
/* NOTE(review): ff_vp3_idct_mmx runs the full 8x8 VP3 IDCT in-place on
 * output_data.  The body is heavily elided in this excerpt: the #undef
 * lines and the RowIDCT()/Transpose()/ColumnIDCT() invocations that
 * presumably sit between the successive I()/J() redefinitions are not
 * visible — verify against the full file.  Each I(K)/J(K) redefinition
 * re-targets the macros at a different quadrant of the 8x8 int16 block
 * (8 coefficients per row). */
253 void ff_vp3_idct_mmx(int16_t *output_data)
255 /* eax = quantized input
256 * ebx = dequantizer matrix
257 * ecx = IDCT constants
258 * M(I) = ecx + MaskOffset(0) + I * 8
259 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
264 #define C(x) (ff_vp3_idct_data + (x - 1) * 8)
265 #define Eight (&ff_pw_8)
267 /* at this point, function has completed dequantization + dezigzag +
268 * partial transposition; now do the idct itself */
/* Row pass, top half: rows 0-3 (I) paired with rows 0-3's right columns (J). */
269 #define I(K) (output_data + K * 8)
270 #define J(K) (output_data + ((K - 4) * 8) + 4)
/* Row pass, bottom half: +32 int16 = 4 rows down. */
277 #define I(K) (output_data + (K * 8) + 32)
278 #define J(K) (output_data + ((K - 4) * 8) + 36)
/* Column pass, left half: I and J address the same rows. */
285 #define I(K) (output_data + K * 8)
286 #define J(K) (output_data + K * 8)
/* Column pass, right half: offset 4 columns into each row. */
292 #define I(K) (output_data + (K * 8) + 4)
293 #define J(K) (output_data + (K * 8) + 4)
/* Compute the IDCT of 'block' in place, then write the clamped, signed
 * result into 'dest' (stride 'line_size').  Braces are elided in this
 * excerpt. */
302 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
304 ff_vp3_idct_mmx(block);
305 put_signed_pixels_clamped_mmx(block, dest, line_size);
/* Compute the IDCT of 'block' in place, then add the clamped result onto
 * the existing pixels in 'dest' (stride 'line_size').  Braces are elided
 * in this excerpt. */
308 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
310 ff_vp3_idct_mmx(block);
311 add_pixels_clamped_mmx(block, dest, line_size);