Mercurial > libavcodec.hg
annotate i386/idct_mmx.c @ 5876:731ee5ad6bde libavcodec
Correct assignment of interlaced_frame; was being set on output frames,
in display order, based on decoding information in decoding order. Now
set properly, immediately upon completion of decode.
Based on original patch from Reinhard Nissl, rnisssl % gmx , de
Original Thread: [FFmpeg-devel] H.264 + PAFF: BBC HD recording shows
extreme interlacing artefacts, Thu, 01 Nov 2007 22:43:09
author | heydowns |
---|---|
date | Mon, 05 Nov 2007 18:16:42 +0000 |
parents | 0244bba24b43 |
children | 80103098c797 |
rev | line source |
---|---|
30 | 1 /* |
2 * idct_mmx.c | |
3 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | |
4 * | |
5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
6 * | |
7 * mpeg2dec is free software; you can redistribute it and/or modify | |
8 * it under the terms of the GNU General Public License as published by | |
9 * the Free Software Foundation; either version 2 of the License, or | |
10 * (at your option) any later version. | |
11 * | |
12 * mpeg2dec is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 * GNU General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU General Public License | |
4384 | 18 * along with mpeg2dec; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
30 | 20 */ |
21 | |
2817
b128802eb77b
libavutil: Utility code from libavcodec moved to a separate library.
al
parents:
2754
diff
changeset
|
22 #include "common.h" |
5010
d5ba514e3f4a
Add libavcodec to compiler include flags in order to simplify header
diego
parents:
4384
diff
changeset
|
23 #include "dsputil.h" |
30 | 24 |
25 #include "mmx.h" | |
26 | |
/* ATTR_ALIGN(align): GCC-style attribute requesting align-byte alignment. */
27 #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) | |
28 | |
/* Right-shift amounts: ROW_SHIFT is applied with psrad after the row pass, COL_SHIFT with psraw after the column pass. */
29 #define ROW_SHIFT 11 | |
30 #define COL_SHIFT 6 | |
31 | |
/* round(bias): (bias + 0.5) scaled by 2^ROW_SHIFT, truncated to int. */
32 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) | |
/* rounder(bias): brace initializer holding that rounded value twice. */
33 #define rounder(bias) {round (bias), round (bias)} | |
34 | |
35 #if 0 | |
5129 | 36 /* C row IDCT - it is just here to document the MMXEXT and MMX versions */ |
/* Reference row transform (compiled out). Reads eight coefficients at row+offset, builds the even part a0..a3 from inputs 0,2,4,6 plus *rounder and the odd part b0..b3 from inputs 1,3,5,7 using table[1..7], then writes (a+b)>>ROW_SHIFT and (a-b)>>ROW_SHIFT back in place. */
30 | 37 static inline void idct_row (int16_t * row, int offset, |
2979 | 38 int16_t * table, int32_t * rounder) |
30 | 39 { |
40 int C1, C2, C3, C4, C5, C6, C7; | |
41 int a0, a1, a2, a3, b0, b1, b2, b3; | |
42 | |
43 row += offset; | |
44 | |
45 C1 = table[1]; | |
46 C2 = table[2]; | |
47 C3 = table[3]; | |
48 C4 = table[4]; | |
49 C5 = table[5]; | |
50 C6 = table[6]; | |
51 C7 = table[7]; | |
52 | |
/* even part: inputs 0,2,4,6, with the rounding bias folded in */
53 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; | |
54 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; | |
55 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; | |
56 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; | |
57 | |
/* odd part: inputs 1,3,5,7 */
58 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; | |
59 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; | |
60 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; | |
61 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; | |
62 | |
/* outputs 0..3 use a+b, outputs 7..4 use a-b */
63 row[0] = (a0 + b0) >> ROW_SHIFT; | |
64 row[1] = (a1 + b1) >> ROW_SHIFT; | |
65 row[2] = (a2 + b2) >> ROW_SHIFT; | |
66 row[3] = (a3 + b3) >> ROW_SHIFT; | |
67 row[4] = (a3 - b3) >> ROW_SHIFT; | |
68 row[5] = (a2 - b2) >> ROW_SHIFT; | |
69 row[6] = (a1 - b1) >> ROW_SHIFT; | |
70 row[7] = (a0 - b0) >> ROW_SHIFT; | |
71 } | |
72 #endif | |
73 | |
74 | |
75 /* MMXEXT row IDCT */ | |
76 | |
/* Coefficient layout for the MMXEXT row code: c1..c7 arranged as eight groups of four 16-bit words. The row helpers read these groups at table+0, +4, +8, +12, +16, +20, +24 and +28. */
2979 | 77 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ |
78 c4, c6, c4, c6, \ | |
79 c1, c3, -c1, -c5, \ | |
80 c5, c7, c3, -c7, \ | |
81 c4, -c6, c4, -c6, \ | |
82 -c4, c2, c4, -c2, \ | |
83 c5, -c1, c3, -c1, \ | |
84 c7, c3, c7, -c5 } | |
30 | 85 |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgotten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Starts the first row of the MMXEXT row pass. Loads the eight words at row+offset into mm2/mm5 (copies kept in mm0/mm6), loads the table groups at table+0 and table+4 into mm3/mm4, issues the first pmaddwd and leaves mm2 shuffled for mmxext_row. Results stay in MMX registers; nothing is written to memory. */
86 static inline void mmxext_row_head (int16_t * row, int offset, const int16_t * table) |
30 | 87 { |
2979 | 88 movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 |
30 | 89 |
2979 | 90 movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 |
91 movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 | |
30 | 92 |
2979 | 93 movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 |
94 movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 | |
30 | 95 |
2979 | 96 movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 |
97 pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 | |
30 | 98 |
2979 | 99 pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 |
30 | 100 } |
101 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgotten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Core of the MMXEXT row transform. Expects the register state left by mmxext_row_head or mmxext_row_mid, reads the remaining table groups (table+8 .. table+28) and adds *rounder to the even sums. On exit mm1 and mm3 have already been shifted by ROW_SHIFT, while mm0 and mm4 still hold unshifted sums that mmxext_row_tail / mmxext_row_mid shift. */
102 static inline void mmxext_row (const int16_t * table, const int32_t * rounder) |
30 | 103 { |
2979 | 104 movq_m2r (*(table+8), mm1); // mm1 = -C5 -C1 C3 C1 |
105 pmaddwd_r2r (mm2, mm4); // mm4 = C4*x0+C6*x2 C4*x4+C6*x6 | |
30 | 106 |
2979 | 107 pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x4-C6*x6 C4*x0-C6*x2 |
108 pshufw_r2r (mm6, mm6, 0x4e); // mm6 = x3 x1 x7 x5 | |
30 | 109 |
2979 | 110 movq_m2r (*(table+12), mm7); // mm7 = -C7 C3 C7 C5 |
111 pmaddwd_r2r (mm5, mm1); // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 | |
30 | 112 |
2979 | 113 paddd_m2r (*rounder, mm3); // mm3 += rounder |
114 pmaddwd_r2r (mm6, mm7); // mm7 = C3*x1-C7*x3 C5*x5+C7*x7 | |
30 | 115 |
2979 | 116 pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 |
117 paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder | |
30 | 118 |
2979 | 119 pmaddwd_m2r (*(table+24), mm5); // mm5 = C3*x5-C1*x7 C5*x1-C1*x3 |
120 movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder | |
30 | 121 |
2979 | 122 pmaddwd_m2r (*(table+28), mm6); // mm6 = C7*x1-C5*x3 C7*x5+C3*x7 |
123 paddd_r2r (mm7, mm1); // mm1 = b1 b0 | |
30 | 124 |
2979 | 125 paddd_m2r (*rounder, mm0); // mm0 += rounder |
126 psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder | |
30 | 127 |
2979 | 128 psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 |
129 paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder | |
30 | 130 |
2979 | 131 paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder |
132 psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 | |
30 | 133 |
2979 | 134 paddd_r2r (mm6, mm5); // mm5 = b3 b2 |
135 movq_r2r (mm0, mm4); // mm4 = a3 a2 + rounder | |
30 | 136 |
2979 | 137 paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder |
138 psubd_r2r (mm5, mm4); // mm4 = a3-b3 a2-b2 + rounder | |
30 | 139 } |
140 | |
/* Finishes the last row of the MMXEXT row pass. Shifts the two pending sums (mm0, mm4) by ROW_SHIFT, packs the four result registers to 16-bit words with packssdw, swaps the word pairs of the upper half with pshufw, and stores eight outputs at row+store. */
141 static inline void mmxext_row_tail (int16_t * row, int store) | |
142 { | |
2979 | 143 psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 |
30 | 144 |
2979 | 145 psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 |
30 | 146 |
2979 | 147 packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 |
30 | 148 |
2979 | 149 packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 |
30 | 150 |
2979 | 151 movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 |
152 pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 | |
30 | 153 |
154 /* slot */ | |
155 | |
2979 | 156 movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 |
30 | 157 } |
158 | |
/* Joins two consecutive rows of the MMXEXT row pass. Interleaves the steps of mmxext_row_tail for the row just computed (shift, pack, store at row+store) with the steps of mmxext_row_head for the next row (load from row+offset, load table+0 and table+4, first pmaddwd, shuffle of mm2). */
159 static inline void mmxext_row_mid (int16_t * row, int store, | |
2979 | 160 int offset, const int16_t * table) |
30 | 161 { |
2979 | 162 movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 |
163 psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 | |
30 | 164 |
2979 | 165 movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 |
166 psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 | |
30 | 167 |
2979 | 168 packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 |
169 movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 | |
30 | 170 |
2979 | 171 packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 |
172 movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 | |
30 | 173 |
2979 | 174 movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 |
175 pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 | |
30 | 176 |
2979 | 177 movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 |
178 movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 | |
30 | 179 |
2979 | 180 pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 |
30 | 181 |
2979 | 182 movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 |
183 pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 | |
30 | 184 } |
185 | |
186 | |
187 /* MMX row IDCT */ | |
188 | |
/* Coefficient layout for the plain-MMX row code: the same c1..c7 as mmxext_table, but grouped to match the punpckldq/punpckhdq-duplicated inputs used by the mmx_row_* helpers. The groups are read at table+0, +4, ... +28. */
2979 | 189 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ |
190 c4, c6, -c4, -c2, \ | |
191 c1, c3, c3, -c7, \ | |
192 c5, c7, -c1, -c5, \ | |
193 c4, -c6, c4, -c2, \ | |
194 -c4, c2, c4, -c6, \ | |
195 c5, -c1, c7, -c5, \ | |
196 c7, c3, c3, -c1 } | |
30 | 197 |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgotten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Starts the first row of the plain-MMX row pass. Loads the eight words at row+offset into mm2/mm5 (copies in mm0/mm6), duplicates the low and high halves with punpckldq/punpckhdq, loads the table groups at table+0, +4 and +8 into mm3/mm4/mm1 and issues the first pmaddwd. Nothing is written to memory. */
198 static inline void mmx_row_head (int16_t * row, int offset, const int16_t * table) |
30 | 199 { |
2979 | 200 movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 |
30 | 201 |
2979 | 202 movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 |
203 movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 | |
30 | 204 |
2979 | 205 movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 |
206 movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 | |
30 | 207 |
2979 | 208 punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 |
30 | 209 |
2979 | 210 movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 |
211 pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 | |
30 | 212 |
2979 | 213 movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 |
214 punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 | |
30 | 215 } |
216 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgotten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Core of the plain-MMX row transform. Expects the register state left by mmx_row_head or mmx_row_mid, reads the table groups at table+12 .. table+28 and adds *rounder to the even sums. On exit mm1 and mm3 have already been shifted by ROW_SHIFT, while mm0 and mm7 still hold unshifted sums that mmx_row_tail / mmx_row_mid shift. */
217 static inline void mmx_row (const int16_t * table, const int32_t * rounder) |
30 | 218 { |
2979 | 219 pmaddwd_r2r (mm2, mm4); // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 |
220 punpckldq_r2r (mm5, mm5); // mm5 = x3 x1 x3 x1 | |
30 | 221 |
2979 | 222 pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x0-C2*x2 C4*x0-C6*x2 |
223 punpckhdq_r2r (mm6, mm6); // mm6 = x7 x5 x7 x5 | |
30 | 224 |
2979 | 225 movq_m2r (*(table+12), mm7); // mm7 = -C5 -C1 C7 C5 |
226 pmaddwd_r2r (mm5, mm1); // mm1 = C3*x1-C7*x3 C1*x1+C3*x3 | |
30 | 227 |
2979 | 228 paddd_m2r (*rounder, mm3); // mm3 += rounder |
229 pmaddwd_r2r (mm6, mm7); // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 | |
30 | 230 |
2979 | 231 pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 |
232 paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder | |
30 | 233 |
2979 | 234 pmaddwd_m2r (*(table+24), mm5); // mm5 = C7*x1-C5*x3 C5*x1-C1*x3 |
235 movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder | |
30 | 236 |
2979 | 237 pmaddwd_m2r (*(table+28), mm6); // mm6 = C3*x5-C1*x7 C7*x5+C3*x7 |
238 paddd_r2r (mm7, mm1); // mm1 = b1 b0 | |
30 | 239 |
2979 | 240 paddd_m2r (*rounder, mm0); // mm0 += rounder |
241 psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder | |
30 | 242 |
2979 | 243 psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 |
244 paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder | |
30 | 245 |
2979 | 246 paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder |
247 psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 | |
30 | 248 |
2979 | 249 paddd_r2r (mm6, mm5); // mm5 = b3 b2 |
250 movq_r2r (mm0, mm7); // mm7 = a3 a2 + rounder | |
30 | 251 |
2979 | 252 paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder |
253 psubd_r2r (mm5, mm7); // mm7 = a3-b3 a2-b2 + rounder | |
30 | 254 } |
255 | |
/* Finishes the last row of the plain-MMX row pass. Shifts the two pending sums (mm0, mm7) by ROW_SHIFT, packs the results to 16-bit words with packssdw, and stores eight outputs at row+store. The word swap that the MMXEXT version does with pshufw is done here with a copy, two 16-bit shifts and por. */
256 static inline void mmx_row_tail (int16_t * row, int store) | |
257 { | |
2979 | 258 psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 |
30 | 259 |
2979 | 260 psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 |
30 | 261 |
2979 | 262 packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 |
30 | 263 |
2979 | 264 packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 |
30 | 265 |
2979 | 266 movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 |
267 movq_r2r (mm7, mm4); // mm4 = y6 y7 y4 y5 | |
30 | 268 |
2979 | 269 pslld_i2r (16, mm7); // mm7 = y7 0 y5 0 |
30 | 270 |
2979 | 271 psrld_i2r (16, mm4); // mm4 = 0 y6 0 y4 |
30 | 272 |
2979 | 273 por_r2r (mm4, mm7); // mm7 = y7 y6 y5 y4 |
30 | 274 |
275 /* slot */ | |
276 | |
2979 | 277 movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 |
30 | 278 } |
279 | |
/* Joins two consecutive rows of the plain-MMX row pass. Interleaves the steps of mmx_row_tail for the row just computed (shift, pack, word swap, store at row+store) with the steps of mmx_row_head for the next row (load from row+offset, duplicate halves, load table+0/+4/+8, first pmaddwd). */
280 static inline void mmx_row_mid (int16_t * row, int store, | |
2979 | 281 int offset, const int16_t * table) |
30 | 282 { |
2979 | 283 movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 |
284 psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 | |
30 | 285 |
2979 | 286 movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 |
287 psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 | |
30 | 288 |
2979 | 289 packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 |
290 movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 | |
30 | 291 |
2979 | 292 packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 |
293 movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 | |
30 | 294 |
2979 | 295 movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 |
296 movq_r2r (mm7, mm1); // mm1 = y6 y7 y4 y5 | |
30 | 297 |
2979 | 298 punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 |
299 psrld_i2r (16, mm7); // mm7 = 0 y6 0 y4 | |
30 | 300 |
2979 | 301 movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 |
302 pslld_i2r (16, mm1); // mm1 = y7 0 y5 0 | |
30 | 303 |
2979 | 304 movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 |
305 por_r2r (mm1, mm7); // mm7 = y7 y6 y5 y4 | |
30 | 306 |
2979 | 307 movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 |
308 punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 | |
30 | 309 |
2979 | 310 movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 |
311 pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 | |
30 | 312 } |
313 | |
314 | |
315 #if 0 | |
5129 | 316 // C column IDCT - it is just here to document the MMXEXT and MMX versions |
/* Reference column transform (compiled out). Works in place on the eight values col[offset + k*8], k = 0..7. Every intermediate is clamped to the int16 range by S, and the outputs are shifted right by COL_SHIFT. NOTE(review): T1, T2, T3 and C4 are not defined in this block; presumably the constants #defined inside the MMX idct_col are meant - confirm before enabling. */
30 | 317 static inline void idct_col (int16_t * col, int offset) |
318 { | |
319 /* multiplication - as implemented on mmx */ | |
320 #define F(c,x) (((c) * (x)) >> 16) | |
321 | |
322 /* saturation - it helps us handle torture test cases */ | |
323 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) | |
324 | |
325 int16_t x0, x1, x2, x3, x4, x5, x6, x7; | |
326 int16_t y0, y1, y2, y3, y4, y5, y6, y7; | |
327 int16_t a0, a1, a2, a3, b0, b1, b2, b3; | |
328 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; | |
329 | |
330 col += offset; | |
331 | |
332 x0 = col[0*8]; | |
333 x1 = col[1*8]; | |
334 x2 = col[2*8]; | |
335 x3 = col[3*8]; | |
336 x4 = col[4*8]; | |
337 x5 = col[5*8]; | |
338 x6 = col[6*8]; | |
339 x7 = col[7*8]; | |
340 | |
/* even part: inputs 0,4 and 2,6 */
341 u04 = S (x0 + x4); | |
342 v04 = S (x0 - x4); | |
343 u26 = S (F (T2, x6) + x2); | |
344 v26 = S (F (T2, x2) - x6); | |
345 | |
346 a0 = S (u04 + u26); | |
347 a1 = S (v04 + v26); | |
348 a2 = S (v04 - v26); | |
349 a3 = S (u04 - u26); | |
350 | |
/* odd part: inputs 1,7 and 3,5 */
351 u17 = S (F (T1, x7) + x1); | |
352 v17 = S (F (T1, x1) - x7); | |
353 u35 = S (F (T3, x5) + x3); | |
354 v35 = S (F (T3, x3) - x5); | |
355 | |
356 b0 = S (u17 + u35); | |
357 b3 = S (v17 - v35); | |
358 u12 = S (u17 - u35); | |
359 v12 = S (v17 + v35); | |
360 u12 = S (2 * F (C4, u12)); | |
361 v12 = S (2 * F (C4, v12)); | |
362 b1 = S (u12 + v12); | |
363 b2 = S (u12 - v12); | |
364 | |
/* outputs 0..3 use a+b, outputs 7..4 use a-b */
365 y0 = S (a0 + b0) >> COL_SHIFT; | |
366 y1 = S (a1 + b1) >> COL_SHIFT; | |
367 y2 = S (a2 + b2) >> COL_SHIFT; | |
368 y3 = S (a3 + b3) >> COL_SHIFT; | |
369 | |
370 y4 = S (a3 - b3) >> COL_SHIFT; | |
371 y5 = S (a2 - b2) >> COL_SHIFT; | |
372 y6 = S (a1 - b1) >> COL_SHIFT; | |
373 y7 = S (a0 - b0) >> COL_SHIFT; | |
374 | |
375 col[0*8] = y0; | |
376 col[1*8] = y1; | |
377 col[2*8] = y2; | |
378 col[3*8] = y3; | |
379 col[4*8] = y4; | |
380 col[5*8] = y5; | |
381 col[6*8] = y6; | |
382 col[7*8] = y7; | |
383 } | |
384 #endif | |
385 | |
386 | |
387 // MMX column IDCT | |
/* MMX column transform. Each movq at col+offset+k*8 moves four adjacent 16-bit values of row k, so one call handles four columns; declare_idct calls it with offset 0 and offset 4. Works in place, using saturating adds/subtracts (paddsw/psubsw) and a final psraw by COL_SHIFT. Rows 3 and 5 of the block double as scratch for b3 and b0 until they are overwritten with y3 and y5. NOTE(review): 43790 (T3) is outside the int16 range, so _T3 presumably stores a wrapped value; the register comments treat it as T3-1 and add x3/x5 back after pmulhw - confirm. */
388 static inline void idct_col (int16_t * col, int offset) | |
389 { | |
390 #define T1 13036 | |
391 #define T2 27146 | |
392 #define T3 43790 | |
393 #define C4 23170 | |
394 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
395 static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
396 static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; |
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
397 static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; |
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
398 static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; |
30 | 399 |
400 /* column code adapted from peter gubanov */ | |
401 /* http://www.elecard.com/peter/idct.shtml */ | |
402 | |
2979 | 403 movq_m2r (*_T1, mm0); // mm0 = T1 |
30 | 404 |
2979 | 405 movq_m2r (*(col+offset+1*8), mm1); // mm1 = x1 |
406 movq_r2r (mm0, mm2); // mm2 = T1 | |
30 | 407 |
2979 | 408 movq_m2r (*(col+offset+7*8), mm4); // mm4 = x7 |
409 pmulhw_r2r (mm1, mm0); // mm0 = T1*x1 | |
30 | 410 |
2979 | 411 movq_m2r (*_T3, mm5); // mm5 = T3 |
412 pmulhw_r2r (mm4, mm2); // mm2 = T1*x7 | |
30 | 413 |
2979 | 414 movq_m2r (*(col+offset+5*8), mm6); // mm6 = x5 |
415 movq_r2r (mm5, mm7); // mm7 = T3-1 | |
30 | 416 |
2979 | 417 movq_m2r (*(col+offset+3*8), mm3); // mm3 = x3 |
418 psubsw_r2r (mm4, mm0); // mm0 = v17 | |
30 | 419 |
2979 | 420 movq_m2r (*_T2, mm4); // mm4 = T2 |
421 pmulhw_r2r (mm3, mm5); // mm5 = (T3-1)*x3 | |
30 | 422 |
2979 | 423 paddsw_r2r (mm2, mm1); // mm1 = u17 |
424 pmulhw_r2r (mm6, mm7); // mm7 = (T3-1)*x5 | |
30 | 425 |
426 /* slot */ | |
427 | |
2979 | 428 movq_r2r (mm4, mm2); // mm2 = T2 |
429 paddsw_r2r (mm3, mm5); // mm5 = T3*x3 | |
30 | 430 |
431 pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2 | |
2979 | 432 paddsw_r2r (mm6, mm7); // mm7 = T3*x5 |
30 | 433 |
2979 | 434 psubsw_r2r (mm6, mm5); // mm5 = v35 |
435 paddsw_r2r (mm3, mm7); // mm7 = u35 | |
30 | 436 |
2979 | 437 movq_m2r (*(col+offset+6*8), mm3); // mm3 = x6 |
438 movq_r2r (mm0, mm6); // mm6 = v17 | |
30 | 439 |
2979 | 440 pmulhw_r2r (mm3, mm2); // mm2 = T2*x6 |
441 psubsw_r2r (mm5, mm0); // mm0 = b3 | |
30 | 442 |
2979 | 443 psubsw_r2r (mm3, mm4); // mm4 = v26 |
444 paddsw_r2r (mm6, mm5); // mm5 = v12 | |
30 | 445 |
2979 | 446 movq_r2m (mm0, *(col+offset+3*8)); // save b3 in scratch0 |
447 movq_r2r (mm1, mm6); // mm6 = u17 | |
30 | 448 |
449 paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26 | |
2979 | 450 paddsw_r2r (mm7, mm6); // mm6 = b0 |
30 | 451 |
2979 | 452 psubsw_r2r (mm7, mm1); // mm1 = u12 |
453 movq_r2r (mm1, mm7); // mm7 = u12 | |
30 | 454 |
2979 | 455 movq_m2r (*(col+offset+0*8), mm3); // mm3 = x0 |
456 paddsw_r2r (mm5, mm1); // mm1 = u12+v12 | |
30 | 457 |
2979 | 458 movq_m2r (*_C4, mm0); // mm0 = C4/2 |
459 psubsw_r2r (mm5, mm7); // mm7 = u12-v12 | |
30 | 460 |
2979 | 461 movq_r2m (mm6, *(col+offset+5*8)); // save b0 in scratch1 |
462 pmulhw_r2r (mm0, mm1); // mm1 = b1/2 | |
30 | 463 |
2979 | 464 movq_r2r (mm4, mm6); // mm6 = v26 |
465 pmulhw_r2r (mm0, mm7); // mm7 = b2/2 | |
30 | 466 |
2979 | 467 movq_m2r (*(col+offset+4*8), mm5); // mm5 = x4 |
468 movq_r2r (mm3, mm0); // mm0 = x0 | |
30 | 469 |
2979 | 470 psubsw_r2r (mm5, mm3); // mm3 = v04 |
471 paddsw_r2r (mm5, mm0); // mm0 = u04 | |
30 | 472 |
2979 | 473 paddsw_r2r (mm3, mm4); // mm4 = a1 |
474 movq_r2r (mm0, mm5); // mm5 = u04 | |
30 | 475 |
2979 | 476 psubsw_r2r (mm6, mm3); // mm3 = a2 |
477 paddsw_r2r (mm2, mm5); // mm5 = a0 | |
30 | 478 |
2979 | 479 paddsw_r2r (mm1, mm1); // mm1 = b1 |
480 psubsw_r2r (mm2, mm0); // mm0 = a3 | |
30 | 481 |
2979 | 482 paddsw_r2r (mm7, mm7); // mm7 = b2 |
483 movq_r2r (mm3, mm2); // mm2 = a2 | |
30 | 484 |
2979 | 485 movq_r2r (mm4, mm6); // mm6 = a1 |
486 paddsw_r2r (mm7, mm3); // mm3 = a2+b2 | |
30 | 487 |
2979 | 488 psraw_i2r (COL_SHIFT, mm3); // mm3 = y2 |
489 paddsw_r2r (mm1, mm4); // mm4 = a1+b1 | |
30 | 490 |
2979 | 491 psraw_i2r (COL_SHIFT, mm4); // mm4 = y1 |
492 psubsw_r2r (mm1, mm6); // mm6 = a1-b1 | |
30 | 493 |
2979 | 494 movq_m2r (*(col+offset+5*8), mm1); // mm1 = b0 |
495 psubsw_r2r (mm7, mm2); // mm2 = a2-b2 | |
30 | 496 |
2979 | 497 psraw_i2r (COL_SHIFT, mm6); // mm6 = y6 |
498 movq_r2r (mm5, mm7); // mm7 = a0 | |
30 | 499 |
2979 | 500 movq_r2m (mm4, *(col+offset+1*8)); // save y1 |
501 psraw_i2r (COL_SHIFT, mm2); // mm2 = y5 | |
30 | 502 |
2979 | 503 movq_r2m (mm3, *(col+offset+2*8)); // save y2 |
504 paddsw_r2r (mm1, mm5); // mm5 = a0+b0 | |
30 | 505 |
2979 | 506 movq_m2r (*(col+offset+3*8), mm4); // mm4 = b3 |
507 psubsw_r2r (mm1, mm7); // mm7 = a0-b0 | |
30 | 508 |
2979 | 509 psraw_i2r (COL_SHIFT, mm5); // mm5 = y0 |
510 movq_r2r (mm0, mm3); // mm3 = a3 | |
30 | 511 |
2979 | 512 movq_r2m (mm2, *(col+offset+5*8)); // save y5 |
513 psubsw_r2r (mm4, mm3); // mm3 = a3-b3 | |
30 | 514 |
2979 | 515 psraw_i2r (COL_SHIFT, mm7); // mm7 = y7 |
516 paddsw_r2r (mm0, mm4); // mm4 = a3+b3 | |
30 | 517 |
2979 | 518 movq_r2m (mm5, *(col+offset+0*8)); // save y0 |
519 psraw_i2r (COL_SHIFT, mm3); // mm3 = y4 | |
30 | 520 |
2979 | 521 movq_r2m (mm6, *(col+offset+6*8)); // save y6 |
522 psraw_i2r (COL_SHIFT, mm4); // mm4 = y3 | |
30 | 523 |
2979 | 524 movq_r2m (mm7, *(col+offset+7*8)); // save y7 |
30 | 525 |
2979 | 526 movq_r2m (mm3, *(col+offset+4*8)); // save y4 |
30 | 527 |
2979 | 528 movq_r2m (mm4, *(col+offset+3*8)); // save y3 |
436
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
529 |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
530 #undef T1 |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
531 #undef T2 |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
532 #undef T3 |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
533 #undef C4 |
30 | 534 } |
535 | |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
/* Per-row rounding constants for the row pass. declare_idct passes rounderN together with the row at offset N*8. Each array holds the same 32-bit value twice (see the rounder macro) and is added to the even sums with paddd. rounder0 additionally carries the (1 << (COL_SHIFT - 1)) term. */
536 static const int32_t rounder0[] ATTR_ALIGN(8) = |
30 | 537 rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
538 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); |
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
539 static const int32_t rounder1[] ATTR_ALIGN(8) = |
2979 | 540 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
541 static const int32_t rounder7[] ATTR_ALIGN(8) = |
2979 | 542 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
543 static const int32_t rounder2[] ATTR_ALIGN(8) = |
2979 | 544 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
545 static const int32_t rounder6[] ATTR_ALIGN(8) = |
2979 | 546 rounder (-0.25); /* C2 * (C6-C2)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
547 static const int32_t rounder3[] ATTR_ALIGN(8) = |
2979 | 548 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
2854
00ff749b33b6
add consts (this was in my local tree, dunno where it came from, probably forgoten from some const patch)
michael
parents:
2817
diff
changeset
|
549 static const int32_t rounder5[] ATTR_ALIGN(8) = |
2979 | 550 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
30 | 551 |
436
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
552 #undef COL_SHIFT |
35de17dd6ed8
* undefine local defines when they are no longer needed
kabi
parents:
76
diff
changeset
|
553 #undef ROW_SHIFT |
30 | 554 |
/* declare_idct: generates an in-place IDCT entry point named idct for a block of int16_t. Builds four coefficient tables with the given table macro (shared by row pairs 0/4, 1/7, 2/6, 3/5). Runs the eight row transforms in the order 0,4,1,7,2,6,3,5, each row stored back at the offset it was loaded from. idct_row_head starts the first row, idct_row_mid stores one row while loading the next, idct_row_tail stores the last. Then runs idct_col twice, at offsets 0 and 4. */
2979 | 555 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ |
556 void idct (int16_t * block) \ | |
557 { \ | |
558 static const int16_t table04[] ATTR_ALIGN(16) = \ | |
559 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ | |
560 static const int16_t table17[] ATTR_ALIGN(16) = \ | |
561 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ | |
562 static const int16_t table26[] ATTR_ALIGN(16) = \ | |
563 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ | |
564 static const int16_t table35[] ATTR_ALIGN(16) = \ | |
565 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ | |
566 \ | |
567 idct_row_head (block, 0*8, table04); \ | |
568 idct_row (table04, rounder0); \ | |
569 idct_row_mid (block, 0*8, 4*8, table04); \ | |
570 idct_row (table04, rounder4); \ | |
571 idct_row_mid (block, 4*8, 1*8, table17); \ | |
572 idct_row (table17, rounder1); \ | |
573 idct_row_mid (block, 1*8, 7*8, table17); \ | |
574 idct_row (table17, rounder7); \ | |
575 idct_row_mid (block, 7*8, 2*8, table26); \ | |
576 idct_row (table26, rounder2); \ | |
577 idct_row_mid (block, 2*8, 6*8, table26); \ | |
578 idct_row (table26, rounder6); \ | |
579 idct_row_mid (block, 6*8, 3*8, table35); \ | |
580 idct_row (table35, rounder3); \ | |
581 idct_row_mid (block, 3*8, 5*8, table35); \ | |
582 idct_row (table35, rounder5); \ | |
583 idct_row_tail (block, 5*8); \ | |
584 \ | |
585 idct_col (block, 0); \ | |
586 idct_col (block, 4); \ | |
30 | 587 } |
588 | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
436
diff
changeset
|
/* Prototypes for the two externally visible entry points, followed by their definitions: each declare_idct expansion below produces one function, wired to the MMXEXT or the plain-MMX table macro and row helpers. */
589 void ff_mmx_idct(DCTELEM *block); |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
436
diff
changeset
|
590 void ff_mmxext_idct(DCTELEM *block); |
30 | 591 |
592 declare_idct (ff_mmxext_idct, mmxext_table, | |
2979 | 593 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) |
30 | 594 |
595 declare_idct (ff_mmx_idct, mmx_table, | |
2979 | 596 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) |
2745 | 597 |