2 * Copyright (C) 2004 the ffmpeg project
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * MMX-optimized functions cribbed from the original VP3 source code.
26 #include "libavcodec/dsputil.h"
27 #include "dsputil_mmx.h"
30 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
31 * idct_constants[0..15] = Mask table (M(I))
32 * idct_constants[16..43] = Cosine table (C(I))
33 * idct_constants[44..47] = 8
35 static uint16_t idct_constants[(4 + 7) * 4];
36 static const uint16_t idct_cosine_table[7] = {
37 64277, 60547, 54491, 46341, 36410, 25080, 12785
49 /* from original comments: The Macro does IDct on 4 1-D Dcts */
50 #define BeginIDCT() { \
51 movq_m2r(*I(3), r2); \
52 movq_m2r(*C(3), r6); \
54 movq_m2r(*J(5), r7); \
55 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
56 movq_m2r(*C(5), r1); \
57 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
59 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
60 movq_m2r(*I(1), r3); \
61 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
62 movq_m2r(*C(1), r0); /* (all registers are in use) */ \
63 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
64 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
65 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
66 movq_m2r(*J(7), r1); \
67 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
68 movq_r2r(r0, r5); /* r5 = c1 */ \
69 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
70 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
71 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
72 movq_m2r(*C(7), r7); \
73 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
74 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
75 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
76 movq_m2r(*I(2), r2); \
77 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
78 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
79 movq_r2r(r2, r1); /* r1 = i2 */ \
80 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
81 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
82 movq_m2r(*J(6), r5); \
83 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
84 movq_r2r(r5, r7); /* r7 = i6 */ \
85 psubsw_r2r(r4, r0); /* r0 = A - C */ \
86 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
87 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
88 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
89 paddsw_r2r(r4, r4); /* r4 = C + C */ \
90 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
91 psubsw_r2r(r6, r3); /* r3 = B - D */ \
92 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
93 paddsw_r2r(r6, r6); /* r6 = D + D */ \
94 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
95 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
96 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
97 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
98 movq_m2r(*C(4), r4); \
99 movq_r2r(r3, r5); /* r5 = B - D */ \
100 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
101 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
102 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
103 movq_r2r(r0, r2); /* r2 = A - C */ \
104 movq_m2r(*I(0), r6); \
105 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
106 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
107 movq_m2r(*J(4), r3); \
108 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
109 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
110 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
112 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
113 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
114 paddsw_r2r(r1, r1); /* r1 = H + H */ \
115 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
116 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
117 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
118 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
119 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
120 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
121 movq_m2r(*I(1), r0); /* r0 = C. */ \
122 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
123 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
124 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
127 /* RowIDCT gets ready to transpose */
128 #define RowIDCT() { \
132 movq_m2r(*I(2), r3); /* r3 = D. */ \
133 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
134 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
135 paddsw_r2r(r7, r7); /* r7 = G + G */ \
136 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
137 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
138 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
139 paddsw_r2r(r3, r3); \
140 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
141 paddsw_r2r(r5, r5); \
142 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
143 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
144 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
145 paddsw_r2r(r0, r0); \
146 movq_r2m(r1, *I(1)); /* save R1 */ \
147 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
150 /* Column IDCT normalizes and stores final results */
151 #define ColumnIDCT() { \
155 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
156 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
157 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
158 psraw_i2r(4, r2); /* r2 = NR2 */ \
159 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
160 psraw_i2r(4, r1); /* r1 = NR1 */ \
161 movq_m2r(*I(2), r3); /* r3 = D. */ \
162 paddsw_r2r(r7, r7); /* r7 = G + G */ \
163 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
164 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
165 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
166 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
167 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
168 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
169 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
170 psraw_i2r(4, r4); /* r4 = NR4 */ \
171 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
172 psraw_i2r(4, r3); /* r3 = NR3 */ \
173 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
174 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
175 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
176 psraw_i2r(4, r6); /* r6 = NR6 */ \
177 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
178 psraw_i2r(4, r5); /* r5 = NR5 */ \
179 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
180 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
181 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
182 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
183 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
184 psraw_i2r(4, r7); /* r7 = NR7 */ \
185 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
186 psraw_i2r(4, r0); /* r0 = NR0 */ \
187 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
188 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
189 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
192 /* Following macro does two 4x4 transposes in place.
194 At entry (we assume):
218 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
219 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
221 Since r1 is free at entry, we calculate the Js first. */
223 #define Transpose() { \
224 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
225 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
226 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
227 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
228 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
229 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
230 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
231 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
232 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
233 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
234 movq_r2m(r4, *J(4)); \
235 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
236 movq_r2m(r5, *J(5)); \
237 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
238 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
239 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
240 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
241 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
242 movq_r2m(r6, *J(7)); \
243 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
244 movq_r2m(r1, *J(6)); \
245 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
246 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
247 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
248 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
249 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
250 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
251 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
252 movq_r2m(r0, *I(0)); \
253 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
254 movq_r2m(r1, *I(1)); \
255 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
256 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
257 movq_r2m(r4, *I(3)); \
258 movq_r2m(r2, *I(2)); \
261 void ff_vp3_dsp_init_mmx(void)
268 p = idct_constants + ((j + 3) << 2);
269 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
273 void ff_vp3_idct_mmx(int16_t *output_data)
275 /* eax = quantized input
276 * ebx = dequantizer matrix
277 * ecx = IDCT constants
278 * M(I) = ecx + MaskOffset(0) + I * 8
279 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
284 #define C(x) (idct_constants + 16 + (x - 1) * 4)
285 #define Eight (&ff_pw_8)
287 /* at this point, function has completed dequantization + dezigzag +
288 * partial transposition; now do the idct itself */
289 #define I(K) (output_data + K * 8)
290 #define J(K) (output_data + ((K - 4) * 8) + 4)
297 #define I(K) (output_data + (K * 8) + 32)
298 #define J(K) (output_data + ((K - 4) * 8) + 36)
305 #define I(K) (output_data + K * 8)
306 #define J(K) (output_data + K * 8)
312 #define I(K) (output_data + (K * 8) + 4)
313 #define J(K) (output_data + (K * 8) + 4)
322 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
324 ff_vp3_idct_mmx(block);
325 put_signed_pixels_clamped_mmx(block, dest, line_size);
328 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
330 ff_vp3_idct_mmx(block);
331 add_pixels_clamped_mmx(block, dest, line_size);