Mercurial > libavcodec.hg
annotate i386/idct_mmx.c @ 2892:41315d0120b3 libavcodec
replace a few mov + psrlq with pshufw, there are more cases which could benefit from this but they would require us to duplicate some functions ...
the trick is from various places (my own code in libpostproc, a patch on the x264 list, ...)
author | michael |
---|---|
date | Wed, 21 Sep 2005 21:17:09 +0000 |
parents | 00ff749b33b6 |
children | bfabfdf9ce55 |
rev | line source |
---|---|
30 | 1 /* |
2 * Note: For libavcodec, this code can also be used under the LGPL license | |
3 */ | |
4 /* | |
5 * idct_mmx.c | |
6 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | |
7 * | |
8 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
9 * | |
10 * mpeg2dec is free software; you can redistribute it and/or modify | |
11 * it under the terms of the GNU General Public License as published by | |
12 * the Free Software Foundation; either version 2 of the License, or | |
13 * (at your option) any later version. | |
14 * | |
15 * mpeg2dec is distributed in the hope that it will be useful, | |
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 * GNU General Public License for more details. | |
19 * | |
20 * You should have received a copy of the GNU General Public License | |
21 * along with this program; if not, write to the Free Software | |
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
23 */ | |
24 | |
2817
b128802eb77b
libavutil: Utility code from libavcodec moved to a separate library.
al
parents:
2754
diff
changeset
|
25 #include "common.h" |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
436
diff
changeset
|
26 #include "../dsputil.h" |
30 | 27 |
28 #include "mmx.h" | |
29 | |
/* Attach a GCC `aligned` attribute of `align` bytes to a declaration. */
30 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) | |
31 | |
/* Right-shift counts: ROW_SHIFT is applied with psrad to the 32-bit results
 * of the row pass, COL_SHIFT with psraw to the 16-bit results of the column
 * pass. */
32 #define ROW_SHIFT 11 | |
33 #define COL_SHIFT 6 | |
34 | |
/* round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int.
 * rounder(bias): brace initializer holding that value twice, so a single
 *                64-bit paddd adds it to both 32-bit lanes. */
35 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) | |
36 #define rounder(bias) {round (bias), round (bias)} | |
37 | |
38 #if 0 | |
39 /* C row IDCT - it's just here to document the MMXEXT and MMX versions */ | |
/* Compiled out (#if 0); reference only.
 * a0..a3 combine the even-indexed inputs plus *rounder, b0..b3 combine the
 * odd-indexed inputs.  Output k is (a_k + b_k) >> ROW_SHIFT and output 7-k
 * is (a_k - b_k) >> ROW_SHIFT, for k = 0..3.
 * Only table[1]..table[7] are read here (as C1..C7). */
40 static inline void idct_row (int16_t * row, int offset, | |
41 int16_t * table, int32_t * rounder) | |
42 { | |
43 int C1, C2, C3, C4, C5, C6, C7; | |
44 int a0, a1, a2, a3, b0, b1, b2, b3; | |
45 | |
46 row += offset; | |
47 | |
48 C1 = table[1]; | |
49 C2 = table[2]; | |
50 C3 = table[3]; | |
51 C4 = table[4]; | |
52 C5 = table[5]; | |
53 C6 = table[6]; | |
54 C7 = table[7]; | |
55 | |
56 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; | |
57 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; | |
58 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; | |
59 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; | |
60 | |
61 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; | |
62 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; | |
63 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; | |
64 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; | |
65 | |
66 row[0] = (a0 + b0) >> ROW_SHIFT; | |
67 row[1] = (a1 + b1) >> ROW_SHIFT; | |
68 row[2] = (a2 + b2) >> ROW_SHIFT; | |
69 row[3] = (a3 + b3) >> ROW_SHIFT; | |
70 row[4] = (a3 - b3) >> ROW_SHIFT; | |
71 row[5] = (a2 - b2) >> ROW_SHIFT; | |
72 row[6] = (a1 - b1) >> ROW_SHIFT; | |
73 row[7] = (a0 - b0) >> ROW_SHIFT; | |
74 } | |
75 #endif | |
76 | |
77 | |
78 /* MMXEXT row IDCT */ | |
79 | |
/* Builds the 32-entry int16 coefficient table used by the MMXEXT row code.
 * Each line below is one 4-word group; the row code reads the groups with
 * movq/pmaddwd at table+0, +4, +8, +12, +16, +20, +24 and +28.  The order
 * and signs match the pshufw-swapped operand layouts noted in the
 * per-instruction comments of mmxext_row_head/mmxext_row. */
80 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ | |
81 c4, c6, c4, c6, \ | |
82 c1, c3, -c1, -c5, \ | |
83 c5, c7, c3, -c7, \ | |
84 c4, -c6, c4, -c6, \ | |
85 -c4, c2, c4, -c2, \ | |
86 c5, -c1, c3, -c1, \ | |
87 c7, c3, c7, -c5 } | |
88 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Begin the MMXEXT row transform for the 8 coefficients at row+offset.
 * Loads the two input quadwords into mm2/mm5 (with copies in mm0/mm6), loads
 * the coefficient groups at table+0 and table+4 into mm3/mm4, issues the
 * first pmaddwd and swaps the dword halves of mm2.  Nothing is written to
 * memory; all results are left in MMX registers for mmxext_row().
 * NOTE(review): the register comments label the first quadword as
 * x0 x2 x4 x6 and the second as x1 x3 x5 x7, which suggests the caller
 * supplies each row in that permuted order - confirm against the caller. */
89 static inline void mmxext_row_head (int16_t * row, int offset, const int16_t * table) |
30 | 90 { |
91 movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 | |
92 | |
93 movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 | |
94 movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 | |
95 | |
96 movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 | |
97 movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 | |
98 | |
99 movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 | |
100 pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 | |
101 | |
102 pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 | |
103 } | |
104 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Core of the MMXEXT row transform.  Continues from the register state left
 * by mmxext_row_head() or mmxext_row_mid() (mm0, mm2, mm3, mm4, mm5, mm6) and
 * reads the remaining coefficient groups from table+8 .. table+28.
 * *rounder is added to both even-part accumulators (mm3 and mm0).
 * On exit: mm1 = y1 y0 and mm3 = y6 y7, already shifted by ROW_SHIFT;
 *          mm0 = a3+b3 a2+b2 and mm4 = a3-b3 a2-b2, still unshifted - the
 *          shift and the stores are done by mmxext_row_tail()/_mid(). */
105 static inline void mmxext_row (const int16_t * table, const int32_t * rounder) |
30 | 106 { |
107 movq_m2r (*(table+8), mm1); // mm1 = -C5 -C1 C3 C1 | |
108 pmaddwd_r2r (mm2, mm4); // mm4 = C4*x0+C6*x2 C4*x4+C6*x6 | |
109 | |
110 pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x4-C6*x6 C4*x0-C6*x2 | |
111 pshufw_r2r (mm6, mm6, 0x4e); // mm6 = x3 x1 x7 x5 | |
112 | |
113 movq_m2r (*(table+12), mm7); // mm7 = -C7 C3 C7 C5 | |
114 pmaddwd_r2r (mm5, mm1); // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 | |
115 | |
116 paddd_m2r (*rounder, mm3); // mm3 += rounder | |
117 pmaddwd_r2r (mm6, mm7); // mm7 = C3*x1-C7*x3 C5*x5+C7*x7 | |
118 | |
119 pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 | |
120 paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder | |
121 | |
122 pmaddwd_m2r (*(table+24), mm5); // mm5 = C3*x5-C1*x7 C5*x1-C1*x3 | |
123 movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder | |
124 | |
125 pmaddwd_m2r (*(table+28), mm6); // mm6 = C7*x1-C5*x3 C7*x5+C3*x7 | |
126 paddd_r2r (mm7, mm1); // mm1 = b1 b0 | |
127 | |
128 paddd_m2r (*rounder, mm0); // mm0 += rounder | |
129 psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder | |
130 | |
131 psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 | |
132 paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder | |
133 | |
134 paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder | |
135 psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 | |
136 | |
137 paddd_r2r (mm6, mm5); // mm5 = b3 b2 | |
138 movq_r2r (mm0, mm4); // mm4 = a3 a2 + rounder | |
139 | |
140 paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder | |
141 psubd_r2r (mm5, mm4); // mm4 = a3-b3 a2-b2 + rounder | |
142 } | |
143 | |
/* Finish the MMXEXT row transform: shifts the two results mmxext_row() left
 * unshifted (mm0, mm4), packs the four 32-bit pairs to int16 with signed
 * saturation, swaps word pairs with pshufw so the upper half is in y4..y7
 * order, and stores the 8 outputs at row+store .. row+store+7. */
144 static inline void mmxext_row_tail (int16_t * row, int store) | |
145 { | |
146 psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 | |
147 | |
148 psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 | |
149 | |
150 packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 | |
151 | |
152 packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 | |
153 | |
154 movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 | |
155 pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 | |
156 | |
157 /* slot */ | |
158 | |
159 movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 | |
160 } | |
161 | |
/* Fused "tail of the previous row + head of the next row" for MMXEXT.
 * Performs the same shift/pack/shuffle/store sequence as mmxext_row_tail()
 * for the row being written to row+store, interleaved with the same
 * load/pmaddwd/pshufw sequence as mmxext_row_head() for the row read from
 * row+offset using `table`.  Leaves the register state mmxext_row() expects. */
162 static inline void mmxext_row_mid (int16_t * row, int store, | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
163 int offset, const int16_t * table) |
30 | 164 { |
165 movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 | |
166 psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 | |
167 | |
168 movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 | |
169 psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 | |
170 | |
171 packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 | |
172 movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 | |
173 | |
174 packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 | |
175 movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 | |
176 | |
177 movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 | |
178 pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 | |
179 | |
180 movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 | |
181 movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 | |
182 | |
183 pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 | |
184 | |
185 movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 | |
186 pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 | |
187 } | |
188 | |
189 | |
190 /* MMX row IDCT */ | |
191 | |
/* Builds the 32-entry int16 coefficient table used by the plain-MMX row
 * code.  Same coefficients as the MMXEXT table but in a different order,
 * matching the punpckldq/punpckhdq-duplicated operands (x2 x0 x2 x0 etc.)
 * that the MMX row functions multiply against.  Groups are read at
 * table+0, +4, +8, +12, +16, +20, +24 and +28. */
192 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ | |
193 c4, c6, -c4, -c2, \ | |
194 c1, c3, c3, -c7, \ | |
195 c5, c7, -c1, -c5, \ | |
196 c4, -c6, c4, -c2, \ | |
197 -c4, c2, c4, -c6, \ | |
198 c5, -c1, c7, -c5, \ | |
199 c7, c3, c3, -c1 } | |
200 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Begin the plain-MMX row transform for the 8 coefficients at row+offset.
 * Loads the two input quadwords into mm2/mm5 (copies in mm0/mm6), duplicates
 * the low dword of mm0 and the high dword of mm2 with punpck{l,h}dq (MMX has
 * no pshufw), loads coefficient groups table+0/+4/+8 into mm3/mm4/mm1 and
 * issues the first pmaddwd.  Results stay in MMX registers for mmx_row(). */
201 static inline void mmx_row_head (int16_t * row, int offset, const int16_t * table) |
30 | 202 { |
203 movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 | |
204 | |
205 movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 | |
206 movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 | |
207 | |
208 movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 | |
209 movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 | |
210 | |
211 punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 | |
212 | |
213 movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 | |
214 pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 | |
215 | |
216 movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 | |
217 punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 | |
218 } | |
219 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Core of the plain-MMX row transform.  Continues from the register state
 * left by mmx_row_head() or mmx_row_mid() (mm0..mm6) and reads the remaining
 * coefficient groups from table+12 .. table+28.  *rounder is added to both
 * even-part accumulators (mm3 and mm0).
 * On exit: mm1 = y1 y0 and mm3 = y6 y7, already shifted by ROW_SHIFT;
 *          mm0 = a3+b3 a2+b2 and mm7 = a3-b3 a2-b2, still unshifted.
 * Unlike the MMXEXT variant, the a-b result is left in mm7 rather than mm4. */
220 static inline void mmx_row (const int16_t * table, const int32_t * rounder) |
30 | 221 { |
222 pmaddwd_r2r (mm2, mm4); // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 | |
223 punpckldq_r2r (mm5, mm5); // mm5 = x3 x1 x3 x1 | |
224 | |
225 pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x0-C2*x2 C4*x0-C6*x2 | |
226 punpckhdq_r2r (mm6, mm6); // mm6 = x7 x5 x7 x5 | |
227 | |
228 movq_m2r (*(table+12), mm7); // mm7 = -C5 -C1 C7 C5 | |
229 pmaddwd_r2r (mm5, mm1); // mm1 = C3*x1-C7*x3 C1*x1+C3*x3 | |
230 | |
231 paddd_m2r (*rounder, mm3); // mm3 += rounder | |
232 pmaddwd_r2r (mm6, mm7); // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 | |
233 | |
234 pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 | |
235 paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder | |
236 | |
237 pmaddwd_m2r (*(table+24), mm5); // mm5 = C7*x1-C5*x3 C5*x1-C1*x3 | |
238 movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder | |
239 | |
240 pmaddwd_m2r (*(table+28), mm6); // mm6 = C3*x5-C1*x7 C7*x5+C3*x7 | |
241 paddd_r2r (mm7, mm1); // mm1 = b1 b0 | |
242 | |
243 paddd_m2r (*rounder, mm0); // mm0 += rounder | |
244 psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder | |
245 | |
246 psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 | |
247 paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder | |
248 | |
249 paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder | |
250 psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 | |
251 | |
252 paddd_r2r (mm6, mm5); // mm5 = b3 b2 | |
253 movq_r2r (mm0, mm7); // mm7 = a3 a2 + rounder | |
254 | |
255 paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder | |
256 psubd_r2r (mm5, mm7); // mm7 = a3-b3 a2-b2 + rounder | |
257 } | |
258 | |
/* Finish the plain-MMX row transform: shifts the two results mmx_row() left
 * unshifted (mm0, mm7), packs to int16 with signed saturation and stores the
 * 8 outputs at row+store .. row+store+7.  The word swap inside each dword of
 * the upper half is done with pslld/psrld/por because plain MMX has no
 * pshufw. */
259 static inline void mmx_row_tail (int16_t * row, int store) | |
260 { | |
261 psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 | |
262 | |
263 psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 | |
264 | |
265 packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 | |
266 | |
267 packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 | |
268 | |
269 movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 | |
270 movq_r2r (mm7, mm4); // mm4 = y6 y7 y4 y5 | |
271 | |
272 pslld_i2r (16, mm7); // mm7 = y7 0 y5 0 | |
273 | |
274 psrld_i2r (16, mm4); // mm4 = 0 y6 0 y4 | |
275 | |
276 por_r2r (mm4, mm7); // mm7 = y7 y6 y5 y4 | |
277 | |
278 /* slot */ | |
279 | |
280 movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 | |
281 } | |
282 | |
/* Fused "tail of the previous row + head of the next row" for plain MMX.
 * Performs the same shift/pack/swap/store sequence as mmx_row_tail() for the
 * row written to row+store, interleaved with the same load/punpck/pmaddwd
 * sequence as mmx_row_head() for the row read from row+offset using `table`.
 * mm1 is used as the temporary for the word swap here (mmx_row_tail uses
 * mm4) and is reloaded from table+8 afterwards.  Leaves the register state
 * mmx_row() expects. */
283 static inline void mmx_row_mid (int16_t * row, int store, | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
284 int offset, const int16_t * table) |
30 | 285 { |
286 movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 | |
287 psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 | |
288 | |
289 movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 | |
290 psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 | |
291 | |
292 packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 | |
293 movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 | |
294 | |
295 packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 | |
296 movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 | |
297 | |
298 movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 | |
299 movq_r2r (mm7, mm1); // mm1 = y6 y7 y4 y5 | |
300 | |
301 punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 | |
302 psrld_i2r (16, mm7); // mm7 = 0 y6 0 y4 | |
303 | |
304 movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 | |
305 pslld_i2r (16, mm1); // mm1 = y7 0 y5 0 | |
306 | |
307 movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 | |
308 por_r2r (mm1, mm7); // mm7 = y7 y6 y5 y4 | |
309 | |
310 movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 | |
311 punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 | |
312 | |
313 movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 | |
314 pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 | |
315 } | |
316 | |
317 | |
318 #if 0 | |
319 // C column IDCT - it's just here to document the MMXEXT and MMX versions | |
/* Compiled out (#if 0); reference only.
 * Transforms one column: reads col[k*8] for k = 0..7 after advancing col by
 * offset, and writes the 8 results back to the same positions.
 * F() models a 16x16 -> high-16 multiply and S() saturates to the int16
 * range.  T1, T2, T3 and C4 are not defined inside this block; they are the
 * constants #defined in the real MMX idct_col. */
320 static inline void idct_col (int16_t * col, int offset) | |
321 { | |
322 /* multiplication - as implemented on mmx */ | |
323 #define F(c,x) (((c) * (x)) >> 16) | |
324 | |
325 /* saturation - it helps us handle torture test cases */ | |
326 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) | |
327 | |
328 int16_t x0, x1, x2, x3, x4, x5, x6, x7; | |
329 int16_t y0, y1, y2, y3, y4, y5, y6, y7; | |
330 int16_t a0, a1, a2, a3, b0, b1, b2, b3; | |
331 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; | |
332 | |
333 col += offset; | |
334 | |
335 x0 = col[0*8]; | |
336 x1 = col[1*8]; | |
337 x2 = col[2*8]; | |
338 x3 = col[3*8]; | |
339 x4 = col[4*8]; | |
340 x5 = col[5*8]; | |
341 x6 = col[6*8]; | |
342 x7 = col[7*8]; | |
343 | |
344 u04 = S (x0 + x4); | |
345 v04 = S (x0 - x4); | |
346 u26 = S (F (T2, x6) + x2); | |
347 v26 = S (F (T2, x2) - x6); | |
348 | |
349 a0 = S (u04 + u26); | |
350 a1 = S (v04 + v26); | |
351 a2 = S (v04 - v26); | |
352 a3 = S (u04 - u26); | |
353 | |
354 u17 = S (F (T1, x7) + x1); | |
355 v17 = S (F (T1, x1) - x7); | |
356 u35 = S (F (T3, x5) + x3); | |
357 v35 = S (F (T3, x3) - x5); | |
358 | |
359 b0 = S (u17 + u35); | |
360 b3 = S (v17 - v35); | |
361 u12 = S (u17 - u35); | |
362 v12 = S (v17 + v35); | |
363 u12 = S (2 * F (C4, u12)); | |
364 v12 = S (2 * F (C4, v12)); | |
365 b1 = S (u12 + v12); | |
366 b2 = S (u12 - v12); | |
367 | |
368 y0 = S (a0 + b0) >> COL_SHIFT; | |
369 y1 = S (a1 + b1) >> COL_SHIFT; | |
370 y2 = S (a2 + b2) >> COL_SHIFT; | |
371 y3 = S (a3 + b3) >> COL_SHIFT; | |
372 | |
373 y4 = S (a3 - b3) >> COL_SHIFT; | |
374 y5 = S (a2 - b2) >> COL_SHIFT; | |
375 y6 = S (a1 - b1) >> COL_SHIFT; | |
376 y7 = S (a0 - b0) >> COL_SHIFT; | |
377 | |
378 col[0*8] = y0; | |
379 col[1*8] = y1; | |
380 col[2*8] = y2; | |
381 col[3*8] = y3; | |
382 col[4*8] = y4; | |
383 col[5*8] = y5; | |
384 col[6*8] = y6; | |
385 col[7*8] = y7; | |
386 } | |
387 #endif | |
388 | |
389 | |
390 // MMX column IDCT | |
/* In-place column pass over four adjacent columns at once: every movq moves
 * four int16 values, taken from col+offset+k*8 for k = 0..7.
 * Multiplies are pmulhw (high 16 bits of a 16x16 product); adds/subtracts are
 * the saturating paddsw/psubsw; results are shifted right by COL_SHIFT.
 * T3 (43790) does not fit a signed 16-bit value; the register comments treat
 * the loaded constant as "T3-1" and add x back after the multiply to obtain
 * T3*x.  Likewise pmulhw by C4 is commented as C4/2, and the result is
 * doubled afterwards with paddsw r,r.
 * Rows 3 and 5 of the block are reused as scratch for b3 and b0 before they
 * are overwritten with the final y3 and y5.
 * NOTE(review): the local names _T1/_T2/_T3/_C4 start with an underscore
 * followed by an uppercase letter, which is a reserved identifier form in C. */
391 static inline void idct_col (int16_t * col, int offset) | |
392 { | |
393 #define T1 13036 | |
394 #define T2 27146 | |
395 #define T3 43790 | |
396 #define C4 23170 | |
397 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
398 static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
399 static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; |
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
400 static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; |
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
401 static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; |
30 | 402 |
403 /* column code adapted from peter gubanov */ | |
404 /* http://www.elecard.com/peter/idct.shtml */ | |
405 | |
406 movq_m2r (*_T1, mm0); // mm0 = T1 | |
407 | |
408 movq_m2r (*(col+offset+1*8), mm1); // mm1 = x1 | |
409 movq_r2r (mm0, mm2); // mm2 = T1 | |
410 | |
411 movq_m2r (*(col+offset+7*8), mm4); // mm4 = x7 | |
412 pmulhw_r2r (mm1, mm0); // mm0 = T1*x1 | |
413 | |
414 movq_m2r (*_T3, mm5); // mm5 = T3 | |
415 pmulhw_r2r (mm4, mm2); // mm2 = T1*x7 | |
416 | |
417 movq_m2r (*(col+offset+5*8), mm6); // mm6 = x5 | |
418 movq_r2r (mm5, mm7); // mm7 = T3-1 | |
419 | |
420 movq_m2r (*(col+offset+3*8), mm3); // mm3 = x3 | |
421 psubsw_r2r (mm4, mm0); // mm0 = v17 | |
422 | |
423 movq_m2r (*_T2, mm4); // mm4 = T2 | |
424 pmulhw_r2r (mm3, mm5); // mm5 = (T3-1)*x3 | |
425 | |
426 paddsw_r2r (mm2, mm1); // mm1 = u17 | |
427 pmulhw_r2r (mm6, mm7); // mm7 = (T3-1)*x5 | |
428 | |
429 /* slot */ | |
430 | |
431 movq_r2r (mm4, mm2); // mm2 = T2 | |
432 paddsw_r2r (mm3, mm5); // mm5 = T3*x3 | |
433 | |
434 pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2 | |
435 paddsw_r2r (mm6, mm7); // mm7 = T3*x5 | |
436 | |
437 psubsw_r2r (mm6, mm5); // mm5 = v35 | |
438 paddsw_r2r (mm3, mm7); // mm7 = u35 | |
439 | |
440 movq_m2r (*(col+offset+6*8), mm3); // mm3 = x6 | |
441 movq_r2r (mm0, mm6); // mm6 = v17 | |
442 | |
443 pmulhw_r2r (mm3, mm2); // mm2 = T2*x6 | |
444 psubsw_r2r (mm5, mm0); // mm0 = b3 | |
445 | |
446 psubsw_r2r (mm3, mm4); // mm4 = v26 | |
447 paddsw_r2r (mm6, mm5); // mm5 = v12 | |
448 | |
449 movq_r2m (mm0, *(col+offset+3*8)); // save b3 in scratch0 | |
450 movq_r2r (mm1, mm6); // mm6 = u17 | |
451 | |
452 paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26 | |
453 paddsw_r2r (mm7, mm6); // mm6 = b0 | |
454 | |
455 psubsw_r2r (mm7, mm1); // mm1 = u12 | |
456 movq_r2r (mm1, mm7); // mm7 = u12 | |
457 | |
458 movq_m2r (*(col+offset+0*8), mm3); // mm3 = x0 | |
459 paddsw_r2r (mm5, mm1); // mm1 = u12+v12 | |
460 | |
461 movq_m2r (*_C4, mm0); // mm0 = C4/2 | |
462 psubsw_r2r (mm5, mm7); // mm7 = u12-v12 | |
463 | |
464 movq_r2m (mm6, *(col+offset+5*8)); // save b0 in scratch1 | |
465 pmulhw_r2r (mm0, mm1); // mm1 = b1/2 | |
466 | |
467 movq_r2r (mm4, mm6); // mm6 = v26 | |
468 pmulhw_r2r (mm0, mm7); // mm7 = b2/2 | |
469 | |
470 movq_m2r (*(col+offset+4*8), mm5); // mm5 = x4 | |
471 movq_r2r (mm3, mm0); // mm0 = x0 | |
472 | |
473 psubsw_r2r (mm5, mm3); // mm3 = v04 | |
474 paddsw_r2r (mm5, mm0); // mm0 = u04 | |
475 | |
476 paddsw_r2r (mm3, mm4); // mm4 = a1 | |
477 movq_r2r (mm0, mm5); // mm5 = u04 | |
478 | |
479 psubsw_r2r (mm6, mm3); // mm3 = a2 | |
480 paddsw_r2r (mm2, mm5); // mm5 = a0 | |
481 | |
482 paddsw_r2r (mm1, mm1); // mm1 = b1 | |
483 psubsw_r2r (mm2, mm0); // mm0 = a3 | |
484 | |
485 paddsw_r2r (mm7, mm7); // mm7 = b2 | |
486 movq_r2r (mm3, mm2); // mm2 = a2 | |
487 | |
488 movq_r2r (mm4, mm6); // mm6 = a1 | |
489 paddsw_r2r (mm7, mm3); // mm3 = a2+b2 | |
490 | |
491 psraw_i2r (COL_SHIFT, mm3); // mm3 = y2 | |
492 paddsw_r2r (mm1, mm4); // mm4 = a1+b1 | |
493 | |
494 psraw_i2r (COL_SHIFT, mm4); // mm4 = y1 | |
495 psubsw_r2r (mm1, mm6); // mm6 = a1-b1 | |
496 | |
497 movq_m2r (*(col+offset+5*8), mm1); // mm1 = b0 | |
498 psubsw_r2r (mm7, mm2); // mm2 = a2-b2 | |
499 | |
500 psraw_i2r (COL_SHIFT, mm6); // mm6 = y6 | |
501 movq_r2r (mm5, mm7); // mm7 = a0 | |
502 | |
503 movq_r2m (mm4, *(col+offset+1*8)); // save y1 | |
504 psraw_i2r (COL_SHIFT, mm2); // mm2 = y5 | |
505 | |
506 movq_r2m (mm3, *(col+offset+2*8)); // save y2 | |
507 paddsw_r2r (mm1, mm5); // mm5 = a0+b0 | |
508 | |
509 movq_m2r (*(col+offset+3*8), mm4); // mm4 = b3 | |
510 psubsw_r2r (mm1, mm7); // mm7 = a0-b0 | |
511 | |
512 psraw_i2r (COL_SHIFT, mm5); // mm5 = y0 | |
513 movq_r2r (mm0, mm3); // mm3 = a3 | |
514 | |
515 movq_r2m (mm2, *(col+offset+5*8)); // save y5 | |
516 psubsw_r2r (mm4, mm3); // mm3 = a3-b3 | |
517 | |
518 psraw_i2r (COL_SHIFT, mm7); // mm7 = y7 | |
519 paddsw_r2r (mm0, mm4); // mm4 = a3+b3 | |
520 | |
521 movq_r2m (mm5, *(col+offset+0*8)); // save y0 | |
522 psraw_i2r (COL_SHIFT, mm3); // mm3 = y4 | |
523 | |
524 movq_r2m (mm6, *(col+offset+6*8)); // save y6 | |
525 psraw_i2r (COL_SHIFT, mm4); // mm4 = y3 | |
526 | |
527 movq_r2m (mm7, *(col+offset+7*8)); // save y7 | |
528 | |
529 movq_r2m (mm3, *(col+offset+4*8)); // save y4 | |
530 | |
531 movq_r2m (mm4, *(col+offset+3*8)); // save y3 | |
436
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
532 |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
533 #undef T1 |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
534 #undef T2 |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
535 #undef T3 |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
536 #undef C4 |
30 | 537 } |
538 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Per-row bias tables passed to the row functions; rounderN is used for row
 * N of the block (see declare_idct).  Each holds two identical int32 values
 * built by rounder(), i.e. (bias + 0.5) * 2^ROW_SHIFT truncated to int.
 * NOTE(review): rounder0 additionally carries 2^(COL_SHIFT-1), presumably
 * the column-pass rounding term folded into the DC row - confirm.
 * The formulas in the trailing comments are the original author's
 * derivations and are reproduced unchanged. */
539 static const int32_t rounder0[] ATTR_ALIGN(8) = |
30 | 540 rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
541 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); |
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
542 static const int32_t rounder1[] ATTR_ALIGN(8) = |
30 | 543 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
544 static const int32_t rounder7[] ATTR_ALIGN(8) = |
30 | 545 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
546 static const int32_t rounder2[] ATTR_ALIGN(8) = |
30 | 547 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
548 static const int32_t rounder6[] ATTR_ALIGN(8) = |
30 | 549 rounder (-0.25); /* C2 * (C6-C2)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
550 static const int32_t rounder3[] ATTR_ALIGN(8) = |
30 | 551 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
552 static const int32_t rounder5[] ATTR_ALIGN(8) = |
30 | 553 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
554 | |
436
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
/* The shift constants are no longer needed past this point.  Note that the
 * round()/rounder() macros still name ROW_SHIFT in their bodies, so they
 * cannot be expanded after these #undefs. */
555 #undef COL_SHIFT |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
556 #undef ROW_SHIFT |
30 | 557 |
/* Defines `void idct (int16_t * block)`, a full in-place 8x8 inverse DCT.
 *   idct           name of the function to define
 *   table          macro that lays out the seven coefficients (mmxext_table
 *                  or mmx_table)
 *   idct_row_head, idct_row, idct_row_tail, idct_row_mid
 *                  the matching row-pass building blocks
 * Four coefficient tables are instantiated, each shared by two rows
 * (0/4, 1/7, 2/6, 3/5).  Rows are processed in the order 0,4,1,7,2,6,3,5:
 * head starts row 0, each mid stores the previous row while starting the
 * next, and tail stores the last one.  The column pass then runs twice,
 * at column offsets 0 and 4. */
558 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ | |
559 void idct (int16_t * block) \ | |
560 { \ | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
561 static const int16_t table04[] ATTR_ALIGN(16) = \ |
30 | 562 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
563 static const int16_t table17[] ATTR_ALIGN(16) = \ |
30 | 564 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
565 static const int16_t table26[] ATTR_ALIGN(16) = \ |
30 | 566 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
567 static const int16_t table35[] ATTR_ALIGN(16) = \ |
30 | 568 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ |
569 \ | |
570 idct_row_head (block, 0*8, table04); \ | |
571 idct_row (table04, rounder0); \ | |
572 idct_row_mid (block, 0*8, 4*8, table04); \ | |
573 idct_row (table04, rounder4); \ | |
574 idct_row_mid (block, 4*8, 1*8, table17); \ | |
575 idct_row (table17, rounder1); \ | |
576 idct_row_mid (block, 1*8, 7*8, table17); \ | |
577 idct_row (table17, rounder7); \ | |
578 idct_row_mid (block, 7*8, 2*8, table26); \ | |
579 idct_row (table26, rounder2); \ | |
580 idct_row_mid (block, 2*8, 6*8, table26); \ | |
581 idct_row (table26, rounder6); \ | |
582 idct_row_mid (block, 6*8, 3*8, table35); \ | |
583 idct_row (table35, rounder3); \ | |
584 idct_row_mid (block, 3*8, 5*8, table35); \ | |
585 idct_row (table35, rounder5); \ | |
586 idct_row_tail (block, 5*8); \ | |
587 \ | |
588 idct_col (block, 0); \ | |
589 idct_col (block, 4); \ | |
590 } | |
591 | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
436
diff
changeset
|
/* Prototypes for the two public entry points, followed by their definitions
 * via declare_idct: one built from the MMXEXT row helpers, one from the
 * plain-MMX row helpers.  Both share idct_col for the column pass.
 * NOTE(review): the prototypes take DCTELEM * while declare_idct defines the
 * functions with int16_t *; these only agree if DCTELEM is int16_t, which is
 * not visible here - confirm in dsputil.h. */
592 void ff_mmx_idct(DCTELEM *block); |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
436
diff
changeset
|
593 void ff_mmxext_idct(DCTELEM *block); |
30 | 594 |
595 declare_idct (ff_mmxext_idct, mmxext_table, | |
596 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) | |
597 | |
598 declare_idct (ff_mmx_idct, mmx_table, | |
599 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) | |
2745 | 600 |