Mercurial > libavcodec.hg
annotate x86/idct_mmx.c @ 12494:94eaea836bf4 libavcodec
Check avctx width/height more thoroughly (e.g. all values 0 except width would
have been accepted before).
Also do not fail if they are invalid but instead override them to 0.
This allows decoding e.g. MPEG video when only the container values are corrupted.
For encoding a value of 0,0 of course makes no sense, but was allowed
through before and will be caught by an extra check in the encode function.
author | reimar |
---|---|
date | Wed, 15 Sep 2010 04:46:55 +0000 |
parents | be9129c5503e |
children |
rev | line source |
---|---|
8430 | 1 /* |
2 * idct_mmx.c | |
3 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | |
4 * | |
5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
6 * See http://libmpeg2.sourceforge.net/ for updates. | |
7 * | |
8 * mpeg2dec is free software; you can redistribute it and/or modify | |
9 * it under the terms of the GNU General Public License as published by | |
10 * the Free Software Foundation; either version 2 of the License, or | |
11 * (at your option) any later version. | |
12 * | |
13 * mpeg2dec is distributed in the hope that it will be useful, | |
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 * GNU General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU General Public License | |
19 * along with mpeg2dec; if not, write to the Free Software | |
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
21 */ | |
22 | |
23 #include "libavutil/common.h" | |
24 #include "libavcodec/dsputil.h" | |
25 | |
12296 | 26 #include "libavutil/x86_cpu.h" |
11381 | 27 #include "dsputil_mmx.h" |
8430 | 28 |
/* Fixed-point precision of the two IDCT passes: the row pass keeps 11
 * fractional bits, the column pass 6. */
#define ROW_SHIFT 11
#define COL_SHIFT  6

/* Convert a floating-point rounding bias into the row pass's fixed-point
 * domain (the +0.5 makes the truncating cast round to nearest). */
#define round(bias)   ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
/* Emit the bias twice so the initializer fills both 32-bit lanes of an
 * 8-byte MMX operand. */
#define rounder(bias) {round (bias), round (bias)}
35 | |
#if 0
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    /* even part: butterflies on the even input coefficients */
    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    /* odd part */
    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    /* output butterfly, scaled back down by ROW_SHIFT */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif
74 | |
75 | |
/* MMXEXT row IDCT */

/* Coefficient layout consumed by the MMXEXT row code: eight groups of four
 * int16_t words, pre-arranged (including signs) to match the operand
 * pairing of the pmaddwd instructions in mmxext_row(). */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2, -c4, -c2,   \
                                              c4,  c6,  c4,  c6,   \
                                              c1,  c3, -c1, -c5,   \
                                              c5,  c7,  c3, -c7,   \
                                              c4, -c6,  c4, -c6,   \
                                             -c4,  c2,  c4, -c2,   \
                                              c5, -c1,  c3, -c1,   \
                                              c7,  c3,  c7, -c5 }
86 | |
/* First stage of the MMXEXT row pass: load one row of 8 coefficients and
 * the first coefficient words, and start the even-part multiplies.
 * NOTE: results are handed to mmxext_row() purely via MMX register state
 * (mm0-mm6), so the call sequence head/row/mid/tail must not be broken up. */
static inline void mmxext_row_head (int16_t * const row, const int offset,
                                    const int16_t * const table)
{
    __asm__ volatile(
        "movq (%0), %%mm2 \n\t"            /* mm2 = x6 x4 x2 x0 */

        "movq 8(%0), %%mm5 \n\t"           /* mm5 = x7 x5 x3 x1 */
        "movq %%mm2, %%mm0 \n\t"           /* mm0 = x6 x4 x2 x0 */

        "movq (%1), %%mm3 \n\t"            /* mm3 = -C2 -C4 C2 C4 */
        "movq %%mm5, %%mm6 \n\t"           /* mm6 = x7 x5 x3 x1 */

        "movq 8(%1), %%mm4 \n\t"           /* mm4 = C6 C4 C6 C4 */
        "pmaddwd %%mm0, %%mm3 \n\t"        /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */

        "pshufw $0x4e, %%mm2, %%mm2 \n\t"  /* mm2 = x2 x0 x6 x4 */
        :: "r" ((row+offset)), "r" (table)
        );
}
106 | |
107 static inline void mmxext_row (const int16_t * const table, | |
108 const int32_t * const rounder) | |
109 { | |
12296 | 110 __asm__ volatile ( |
111 "movq 16(%0), %%mm1 \n\t" /* mm1 = -C5 -C1 C3 C1 */ | |
112 "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ | |
8430 | 113 |
12296 | 114 "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ |
115 "pshufw $0x4e, %%mm6, %%mm6 \n\t" /* mm6 = x3 x1 x7 x5 */ | |
8430 | 116 |
12296 | 117 "movq 24(%0), %%mm7 \n\t" /* mm7 = -C7 C3 C7 C5 */ |
118 "pmaddwd %%mm5, %%mm1 \n\t" /* mm1= -C1*x5-C5*x7 C1*x1+C3*x3 */ | |
119 | |
120 "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */ | |
121 "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ | |
8430 | 122 |
12296 | 123 "pmaddwd 40(%0), %%mm2 \n\t" /* mm2= C4*x0-C2*x2 -C4*x4+C2*x6 */ |
124 "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */ | |
8430 | 125 |
12296 | 126 "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ |
127 "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */ | |
8430 | 128 |
12296 | 129 "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ |
130 "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */ | |
8430 | 131 |
12296 | 132 "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */ |
133 "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */ | |
8430 | 134 |
12296 | 135 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */ |
136 "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */ | |
8430 | 137 |
12296 | 138 "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */ |
139 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */ | |
8430 | 140 |
12296 | 141 "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */ |
142 "movq %%mm0, %%mm4 \n\t" /* mm4 = a3 a2 + rounder */ | |
8430 | 143 |
12296 | 144 "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */ |
145 "psubd %%mm5, %%mm4 \n\t" /* mm4 = a3-b3 a2-b2 + rounder */ | |
146 : : "r" (table), "r" (rounder)); | |
8430 | 147 } |
148 | |
149 static inline void mmxext_row_tail (int16_t * const row, const int store) | |
150 { | |
12296 | 151 __asm__ volatile ( |
152 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ | |
8430 | 153 |
12296 | 154 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */ |
8430 | 155 |
12296 | 156 "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ |
157 | |
158 "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ | |
8430 | 159 |
12296 | 160 "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */ |
161 "pshufw $0xb1, %%mm4, %%mm4 \n\t" /* mm4 = y7 y6 y5 y4 */ | |
8430 | 162 |
12296 | 163 /* slot */ |
8430 | 164 |
12296 | 165 "movq %%mm4, 8(%0) \n\t" /* save y7 y6 y5 y4 */ |
166 :: "r" (row+store) | |
167 ); | |
8430 | 168 } |
169 | |
170 static inline void mmxext_row_mid (int16_t * const row, const int store, | |
171 const int offset, | |
172 const int16_t * const table) | |
173 { | |
12296 | 174 __asm__ volatile ( |
175 "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ | |
176 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ | |
8430 | 177 |
12296 | 178 "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ |
179 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */ | |
8430 | 180 |
12296 | 181 "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ |
182 "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ | |
183 | |
184 "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ | |
185 "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ | |
8430 | 186 |
12296 | 187 "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */ |
188 "pshufw $0xb1, %%mm4, %%mm4\n\t" /* mm4 = y7 y6 y5 y4 */ | |
8430 | 189 |
12296 | 190 "movq (%3), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */ |
191 "movq %%mm4, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */ | |
8430 | 192 |
12296 | 193 "pmaddwd %%mm0, %%mm3 \n\t" /* mm3= -C4*x4-C2*x6 C4*x0+C2*x2 */ |
8430 | 194 |
12296 | 195 "movq 8(%3), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */ |
196 "pshufw $0x4e, %%mm2, %%mm2\n\t" /* mm2 = x2 x0 x6 x4 */ | |
12297 | 197 :: "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table) |
12296 | 198 ); |
8430 | 199 } |
200 | |
201 | |
/* MMX row IDCT */

/* Coefficient layout for the plain-MMX row code.  Same coefficients as
 * mmxext_table() but in a different word order, because without pshufw the
 * MMX variant pairs the inputs with punpckl/hdq instead. */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
                                           c4,  c6, -c4, -c2,   \
                                           c1,  c3,  c3, -c7,   \
                                           c5,  c7, -c1, -c5,   \
                                           c4, -c6,  c4, -c2,   \
                                          -c4,  c2,  c4, -c6,   \
                                           c5, -c1,  c7, -c5,   \
                                           c7,  c3,  c3, -c1 }
212 | |
/* First stage of the plain-MMX row pass: load one row of 8 coefficients,
 * duplicate the low/high input pairs with punpck, and start the first
 * multiply.  Hands its results to mmx_row() via MMX register state. */
static inline void mmx_row_head (int16_t * const row, const int offset,
                                 const int16_t * const table)
{
    __asm__ volatile (
        "movq (%0), %%mm2 \n\t"            /* mm2 = x6 x4 x2 x0 */

        "movq 8(%0), %%mm5 \n\t"           /* mm5 = x7 x5 x3 x1 */
        "movq %%mm2, %%mm0 \n\t"           /* mm0 = x6 x4 x2 x0 */

        "movq (%1), %%mm3 \n\t"            /* mm3 = C6 C4 C2 C4 */
        "movq %%mm5, %%mm6 \n\t"           /* mm6 = x7 x5 x3 x1 */

        "punpckldq %%mm0, %%mm0 \n\t"      /* mm0 = x2 x0 x2 x0 */

        "movq 8(%1), %%mm4 \n\t"           /* mm4 = -C2 -C4 C6 C4 */
        "pmaddwd %%mm0, %%mm3 \n\t"        /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

        "movq 16(%1), %%mm1 \n\t"          /* mm1 = -C7 C3 C3 C1 */
        "punpckhdq %%mm2, %%mm2 \n\t"      /* mm2 = x6 x4 x6 x4 */
        :: "r" ((row+offset)), "r" (table)
        );
}
235 | |
236 static inline void mmx_row (const int16_t * const table, | |
237 const int32_t * const rounder) | |
238 { | |
12296 | 239 __asm__ volatile ( |
240 "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ | |
241 "punpckldq %%mm5, %%mm5 \n\t" /* mm5 = x3 x1 x3 x1 */ | |
8430 | 242 |
12296 | 243 "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ |
244 "punpckhdq %%mm6, %%mm6 \n\t" /* mm6 = x7 x5 x7 x5 */ | |
8430 | 245 |
12296 | 246 "movq 24(%0), %%mm7 \n\t" /* mm7 = -C5 -C1 C7 C5 */ |
247 "pmaddwd %%mm5, %%mm1 \n\t" /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ | |
248 | |
249 "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */ | |
250 "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ | |
8430 | 251 |
12296 | 252 "pmaddwd 40(%0), %%mm2 \n\t" /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ |
253 "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */ | |
8430 | 254 |
12296 | 255 "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ |
256 "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */ | |
8430 | 257 |
12296 | 258 "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ |
259 "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */ | |
8430 | 260 |
12296 | 261 "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */ |
262 "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */ | |
8430 | 263 |
12296 | 264 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */ |
265 "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */ | |
8430 | 266 |
12296 | 267 "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */ |
268 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */ | |
8430 | 269 |
12296 | 270 "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */ |
271 "movq %%mm0, %%mm7 \n\t" /* mm7 = a3 a2 + rounder */ | |
8430 | 272 |
12296 | 273 "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */ |
274 "psubd %%mm5, %%mm7 \n\t" /* mm7 = a3-b3 a2-b2 + rounder */ | |
275 :: "r" (table), "r" (rounder) | |
276 ); | |
8430 | 277 } |
278 | |
279 static inline void mmx_row_tail (int16_t * const row, const int store) | |
280 { | |
12296 | 281 __asm__ volatile ( |
282 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ | |
8430 | 283 |
12296 | 284 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */ |
8430 | 285 |
12296 | 286 "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ |
8430 | 287 |
12296 | 288 "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */ |
289 | |
290 "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */ | |
291 "movq %%mm7, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */ | |
8430 | 292 |
12296 | 293 "pslld $16, %%mm7 \n\t" /* mm7 = y7 0 y5 0 */ |
8430 | 294 |
12296 | 295 "psrld $16, %%mm4 \n\t" /* mm4 = 0 y6 0 y4 */ |
296 | |
297 "por %%mm4, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */ | |
8430 | 298 |
12296 | 299 /* slot */ |
8430 | 300 |
12296 | 301 "movq %%mm7, 8(%0) \n\t" /* save y7 y6 y5 y4 */ |
302 :: "r" (row+store) | |
303 ); | |
8430 | 304 } |
305 | |
306 static inline void mmx_row_mid (int16_t * const row, const int store, | |
307 const int offset, const int16_t * const table) | |
308 { | |
309 | |
12296 | 310 __asm__ volatile ( |
311 "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */ | |
312 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */ | |
313 | |
314 "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */ | |
315 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */ | |
8430 | 316 |
12296 | 317 "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */ |
318 "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */ | |
8430 | 319 |
12296 | 320 "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */ |
321 "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */ | |
8430 | 322 |
12296 | 323 "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */ |
324 "movq %%mm7, %%mm1 \n\t" /* mm1 = y6 y7 y4 y5 */ | |
8430 | 325 |
12296 | 326 "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */ |
327 "psrld $16, %%mm7 \n\t" /* mm7 = 0 y6 0 y4 */ | |
8430 | 328 |
12296 | 329 "movq (%3), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */ |
330 "pslld $16, %%mm1 \n\t" /* mm1 = y7 0 y5 0 */ | |
8430 | 331 |
12296 | 332 "movq 8(%3), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */ |
333 "por %%mm1, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */ | |
8430 | 334 |
12296 | 335 "movq 16(%3), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */ |
336 "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */ | |
8430 | 337 |
12296 | 338 "movq %%mm7, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */ |
339 "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
12297 | 340 : : "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table) |
12296 | 341 ); |
8430 | 342 } |
343 | |
344 | |
#if 0
/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
static inline void idct_col (int16_t * col, int offset)
{
    /* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

    /* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    /* even part */
    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    /* odd part */
    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    /* output butterfly, scaled down by COL_SHIFT */
    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;
}
#endif
415 | |
416 | |
417 /* MMX column IDCT */ | |
418 static inline void idct_col (int16_t * const col, const int offset) | |
419 { | |
420 #define T1 13036 | |
421 #define T2 27146 | |
422 #define T3 43790 | |
423 #define C4 23170 | |
424 | |
12296 | 425 DECLARE_ALIGNED(8, static const short, t1_vector)[] = { |
426 T1,T1,T1,T1, | |
427 T2,T2,T2,T2, | |
428 T3,T3,T3,T3, | |
429 C4,C4,C4,C4 | |
430 }; | |
8430 | 431 |
432 /* column code adapted from Peter Gubanov */ | |
433 /* http://www.elecard.com/peter/idct.shtml */ | |
434 | |
12296 | 435 __asm__ volatile ( |
436 "movq (%0), %%mm0 \n\t" /* mm0 = T1 */ | |
8430 | 437 |
12296 | 438 "movq 2*8(%1), %%mm1 \n\t" /* mm1 = x1 */ |
439 "movq %%mm0, %%mm2 \n\t" /* mm2 = T1 */ | |
8430 | 440 |
12296 | 441 "movq 7*2*8(%1), %%mm4 \n\t" /* mm4 = x7 */ |
442 "pmulhw %%mm1, %%mm0 \n\t" /* mm0 = T1*x1 */ | |
8430 | 443 |
12296 | 444 "movq 16(%0), %%mm5 \n\t" /* mm5 = T3 */ |
445 "pmulhw %%mm4, %%mm2 \n\t" /* mm2 = T1*x7 */ | |
8430 | 446 |
12296 | 447 "movq 2*5*8(%1), %%mm6 \n\t" /* mm6 = x5 */ |
448 "movq %%mm5, %%mm7 \n\t" /* mm7 = T3-1 */ | |
8430 | 449 |
12296 | 450 "movq 3*8*2(%1), %%mm3 \n\t" /* mm3 = x3 */ |
451 "psubsw %%mm4, %%mm0 \n\t" /* mm0 = v17 */ | |
8430 | 452 |
12296 | 453 "movq 8(%0), %%mm4 \n\t" /* mm4 = T2 */ |
454 "pmulhw %%mm3, %%mm5 \n\t" /* mm5 = (T3-1)*x3 */ | |
8430 | 455 |
12296 | 456 "paddsw %%mm2, %%mm1 \n\t" /* mm1 = u17 */ |
457 "pmulhw %%mm6, %%mm7 \n\t" /* mm7 = (T3-1)*x5 */ | |
458 | |
459 /* slot */ | |
8430 | 460 |
12296 | 461 "movq %%mm4, %%mm2 \n\t" /* mm2 = T2 */ |
462 "paddsw %%mm3, %%mm5 \n\t" /* mm5 = T3*x3 */ | |
8430 | 463 |
12296 | 464 "pmulhw 2*8*2(%1), %%mm4 \n\t" /* mm4 = T2*x2 */ |
465 "paddsw %%mm6, %%mm7 \n\t" /* mm7 = T3*x5 */ | |
8430 | 466 |
12296 | 467 "psubsw %%mm6, %%mm5 \n\t" /* mm5 = v35 */ |
468 "paddsw %%mm3, %%mm7 \n\t" /* mm7 = u35 */ | |
8430 | 469 |
12296 | 470 "movq 6*8*2(%1), %%mm3 \n\t" /* mm3 = x6 */ |
471 "movq %%mm0, %%mm6 \n\t" /* mm6 = v17 */ | |
8430 | 472 |
12296 | 473 "pmulhw %%mm3, %%mm2 \n\t" /* mm2 = T2*x6 */ |
474 "psubsw %%mm5, %%mm0 \n\t" /* mm0 = b3 */ | |
8430 | 475 |
12296 | 476 "psubsw %%mm3, %%mm4 \n\t" /* mm4 = v26 */ |
477 "paddsw %%mm6, %%mm5 \n\t" /* mm5 = v12 */ | |
8430 | 478 |
12296 | 479 "movq %%mm0, 3*8*2(%1)\n\t" /* save b3 in scratch0 */ |
480 "movq %%mm1, %%mm6 \n\t" /* mm6 = u17 */ | |
8430 | 481 |
12296 | 482 "paddsw 2*8*2(%1), %%mm2 \n\t" /* mm2 = u26 */ |
483 "paddsw %%mm7, %%mm6 \n\t" /* mm6 = b0 */ | |
8430 | 484 |
12296 | 485 "psubsw %%mm7, %%mm1 \n\t" /* mm1 = u12 */ |
486 "movq %%mm1, %%mm7 \n\t" /* mm7 = u12 */ | |
8430 | 487 |
12296 | 488 "movq 0*8(%1), %%mm3 \n\t" /* mm3 = x0 */ |
489 "paddsw %%mm5, %%mm1 \n\t" /* mm1 = u12+v12 */ | |
8430 | 490 |
12296 | 491 "movq 24(%0), %%mm0 \n\t" /* mm0 = C4/2 */ |
492 "psubsw %%mm5, %%mm7 \n\t" /* mm7 = u12-v12 */ | |
8430 | 493 |
12296 | 494 "movq %%mm6, 5*8*2(%1)\n\t" /* save b0 in scratch1 */ |
495 "pmulhw %%mm0, %%mm1 \n\t" /* mm1 = b1/2 */ | |
8430 | 496 |
12296 | 497 "movq %%mm4, %%mm6 \n\t" /* mm6 = v26 */ |
498 "pmulhw %%mm0, %%mm7 \n\t" /* mm7 = b2/2 */ | |
8430 | 499 |
12296 | 500 "movq 4*8*2(%1), %%mm5 \n\t" /* mm5 = x4 */ |
501 "movq %%mm3, %%mm0 \n\t" /* mm0 = x0 */ | |
8430 | 502 |
12296 | 503 "psubsw %%mm5, %%mm3 \n\t" /* mm3 = v04 */ |
504 "paddsw %%mm5, %%mm0 \n\t" /* mm0 = u04 */ | |
8430 | 505 |
12296 | 506 "paddsw %%mm3, %%mm4 \n\t" /* mm4 = a1 */ |
507 "movq %%mm0, %%mm5 \n\t" /* mm5 = u04 */ | |
8430 | 508 |
12296 | 509 "psubsw %%mm6, %%mm3 \n\t" /* mm3 = a2 */ |
510 "paddsw %%mm2, %%mm5 \n\t" /* mm5 = a0 */ | |
8430 | 511 |
12296 | 512 "paddsw %%mm1, %%mm1 \n\t" /* mm1 = b1 */ |
513 "psubsw %%mm2, %%mm0 \n\t" /* mm0 = a3 */ | |
8430 | 514 |
12296 | 515 "paddsw %%mm7, %%mm7 \n\t" /* mm7 = b2 */ |
516 "movq %%mm3, %%mm2 \n\t" /* mm2 = a2 */ | |
8430 | 517 |
12296 | 518 "movq %%mm4, %%mm6 \n\t" /* mm6 = a1 */ |
519 "paddsw %%mm7, %%mm3 \n\t" /* mm3 = a2+b2 */ | |
8430 | 520 |
12296 | 521 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y2 */ |
522 "paddsw %%mm1, %%mm4\n\t" /* mm4 = a1+b1 */ | |
8430 | 523 |
12296 | 524 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y1 */ |
525 "psubsw %%mm1, %%mm6 \n\t" /* mm6 = a1-b1 */ | |
8430 | 526 |
12296 | 527 "movq 5*8*2(%1), %%mm1 \n\t" /* mm1 = b0 */ |
528 "psubsw %%mm7, %%mm2 \n\t" /* mm2 = a2-b2 */ | |
8430 | 529 |
12296 | 530 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm6\n\t" /* mm6 = y6 */ |
531 "movq %%mm5, %%mm7 \n\t" /* mm7 = a0 */ | |
8430 | 532 |
12296 | 533 "movq %%mm4, 1*8*2(%1)\n\t" /* save y1 */ |
534 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm2\n\t" /* mm2 = y5 */ | |
8430 | 535 |
12296 | 536 "movq %%mm3, 2*8*2(%1)\n\t" /* save y2 */ |
537 "paddsw %%mm1, %%mm5 \n\t" /* mm5 = a0+b0 */ | |
8430 | 538 |
12296 | 539 "movq 3*8*2(%1), %%mm4 \n\t" /* mm4 = b3 */ |
540 "psubsw %%mm1, %%mm7 \n\t" /* mm7 = a0-b0 */ | |
8430 | 541 |
12296 | 542 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm5\n\t" /* mm5 = y0 */ |
543 "movq %%mm0, %%mm3 \n\t" /* mm3 = a3 */ | |
8430 | 544 |
12296 | 545 "movq %%mm2, 5*8*2(%1)\n\t" /* save y5 */ |
546 "psubsw %%mm4, %%mm3 \n\t" /* mm3 = a3-b3 */ | |
8430 | 547 |
12296 | 548 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm7\n\t" /* mm7 = y7 */ |
549 "paddsw %%mm0, %%mm4 \n\t" /* mm4 = a3+b3 */ | |
8430 | 550 |
12296 | 551 "movq %%mm5, 0*8*2(%1)\n\t" /* save y0 */ |
552 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y4 */ | |
8430 | 553 |
12296 | 554 "movq %%mm6, 6*8*2(%1)\n\t" /* save y6 */ |
555 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y3 */ | |
556 | |
557 "movq %%mm7, 7*8*2(%1)\n\t" /* save y7 */ | |
8430 | 558 |
12296 | 559 "movq %%mm3, 4*8*2(%1)\n\t" /* save y4 */ |
8430 | 560 |
12296 | 561 "movq %%mm4, 3*8*2(%1)\n\t" /* save y3 */ |
562 :: "r" (t1_vector), "r" (col+offset) | |
563 ); | |
8430 | 564 |
565 #undef T1 | |
566 #undef T2 | |
567 #undef T3 | |
568 #undef C4 | |
569 } | |
570 | |
571 | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
572 DECLARE_ALIGNED(8, static const int32_t, rounder0)[] = |
8430 | 573 rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
574 DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0); |
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
575 DECLARE_ALIGNED(8, static const int32_t, rounder1)[] = |
8430 | 576 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
577 DECLARE_ALIGNED(8, static const int32_t, rounder7)[] = |
8430 | 578 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
579 DECLARE_ALIGNED(8, static const int32_t, rounder2)[] = |
8430 | 580 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
581 DECLARE_ALIGNED(8, static const int32_t, rounder6)[] = |
8430 | 582 rounder (-0.25); /* C2 * (C6-C2)/2 */ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
583 DECLARE_ALIGNED(8, static const int32_t, rounder3)[] = |
8430 | 584 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
585 DECLARE_ALIGNED(8, static const int32_t, rounder5)[] = |
8430 | 586 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
587 | |
588 #undef COL_SHIFT | |
589 #undef ROW_SHIFT | |
590 | |
591 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ | |
592 void idct (int16_t * const block) \ | |
593 { \ | |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
594 DECLARE_ALIGNED(16, static const int16_t, table04)[] = \ |
8430 | 595 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
596 DECLARE_ALIGNED(16, static const int16_t, table17)[] = \ |
8430 | 597 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
598 DECLARE_ALIGNED(16, static const int16_t, table26)[] = \ |
8430 | 599 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ |
11508
7be32921237f
Replace remaining uses of ATTR_ALIGNED with DECLARE_ALIGNED
mru
parents:
11381
diff
changeset
|
600 DECLARE_ALIGNED(16, static const int16_t, table35)[] = \ |
8430 | 601 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ |
602 \ | |
603 idct_row_head (block, 0*8, table04); \ | |
604 idct_row (table04, rounder0); \ | |
605 idct_row_mid (block, 0*8, 4*8, table04); \ | |
606 idct_row (table04, rounder4); \ | |
607 idct_row_mid (block, 4*8, 1*8, table17); \ | |
608 idct_row (table17, rounder1); \ | |
609 idct_row_mid (block, 1*8, 7*8, table17); \ | |
610 idct_row (table17, rounder7); \ | |
611 idct_row_mid (block, 7*8, 2*8, table26); \ | |
612 idct_row (table26, rounder2); \ | |
613 idct_row_mid (block, 2*8, 6*8, table26); \ | |
614 idct_row (table26, rounder6); \ | |
615 idct_row_mid (block, 6*8, 3*8, table35); \ | |
616 idct_row (table35, rounder3); \ | |
617 idct_row_mid (block, 3*8, 5*8, table35); \ | |
618 idct_row (table35, rounder5); \ | |
619 idct_row_tail (block, 5*8); \ | |
620 \ | |
621 idct_col (block, 0); \ | |
622 idct_col (block, 4); \ | |
623 } | |
624 | |
625 declare_idct (ff_mmxext_idct, mmxext_table, | |
626 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) | |
627 | |
628 declare_idct (ff_mmx_idct, mmx_table, | |
629 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) | |
630 |