Mercurial > mplayer.hg
annotate libmpeg2/idct_mmx.c @ 19762:59de9ee0ce47
FFmpeg VMware video codec
author | diego |
---|---|
date | Sat, 09 Sep 2006 18:45:23 +0000 |
parents | 0783dd397f74 |
children | 60a39d71e247 |
rev | line source |
---|---|
1 | 1 /* |
2 * idct_mmx.c | |
10303 | 3 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> |
9852 | 4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
1 | 5 * |
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
9852 | 7 * See http://libmpeg2.sourceforge.net/ for updates. |
1 | 8 * |
9 * mpeg2dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * mpeg2dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
14732
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
22 * |
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
23 * Modified for use with MPlayer, see libmpeg-0.4.0.diff for the exact changes. |
18783 | 24 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ |
14732
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
25 * $Id$ |
1 | 26 */ |
27 | |
28 #include "config.h" | |
29 | |
13864 | 30 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1 | 31 |
32 #include <inttypes.h> | |
33 | |
9852 | 34 #include "mpeg2.h" |
12932 | 35 #include "attributes.h" |
1 | 36 #include "mpeg2_internal.h" |
37 #include "mmx.h" | |
38 | |
/*
 * ROW_SHIFT / COL_SHIFT: right-shift counts applied to the results of the
 * row pass (psrad) and of the column pass (psraw) respectively.
 */
12932 | 39 #define ROW_SHIFT 15 |
1 | 40 #define COL_SHIFT 6 |
41 | |
/*
 * round(bias):   (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int.
 * rounder(bias): brace initializer holding that value twice; it is used
 *                below to initialize the two-element int32_t rounderN[]
 *                arrays that the row code adds with paddd.
 * NOTE(review): the macro name round shadows C99 round() should <math.h>
 * ever be pulled into this file - confirm before adding such an include.
 */
42 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) | |
43 #define rounder(bias) {round (bias), round (bias)} | |
44 | |
45 | |
46 #if 0 | |
36 | 47 /* C row IDCT - it's just here to document the MMXEXT and MMX versions */ |
/*
 * Reference (compiled out) one-dimensional IDCT of a single row.
 *   row     - base pointer; the eight values at row[offset..offset+7] are
 *             transformed in place
 *   table   - coefficients C1..C7 are read from table[1]..table[7];
 *             table[0] is not used
 *   rounder - *rounder is added to each of the even-part sums a0..a3
 * a0..a3 are built from the even inputs, b0..b3 from the odd inputs;
 * outputs k and 7-k are (a_k + b_k) and (a_k - b_k), shifted right by
 * ROW_SHIFT.
 */
1 | 48 static inline void idct_row (int16_t * row, int offset, |
49 int16_t * table, int32_t * rounder) | |
50 { | |
51 int C1, C2, C3, C4, C5, C6, C7; | |
52 int a0, a1, a2, a3, b0, b1, b2, b3; | |
53 | |
54 row += offset; | |
55 | |
56 C1 = table[1]; | |
57 C2 = table[2]; | |
58 C3 = table[3]; | |
59 C4 = table[4]; | |
60 C5 = table[5]; | |
61 C6 = table[6]; | |
62 C7 = table[7]; | |
63 | |
/* even part, with the rounding bias folded in */
64 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; | |
65 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; | |
66 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; | |
67 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; | |
68 | |
/* odd part */
69 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; | |
70 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; | |
71 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; | |
72 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; | |
73 | |
/* butterfly and scale back down */
74 row[0] = (a0 + b0) >> ROW_SHIFT; | |
75 row[1] = (a1 + b1) >> ROW_SHIFT; | |
76 row[2] = (a2 + b2) >> ROW_SHIFT; | |
77 row[3] = (a3 + b3) >> ROW_SHIFT; | |
78 row[4] = (a3 - b3) >> ROW_SHIFT; | |
79 row[5] = (a2 - b2) >> ROW_SHIFT; | |
80 row[6] = (a1 - b1) >> ROW_SHIFT; | |
81 row[7] = (a0 - b0) >> ROW_SHIFT; | |
82 } | |
83 #endif | |
84 | |
85 | |
36 | 86 /* MMXEXT row IDCT */ |
1 | 87 |
/*
 * mmxext_table() arranges the seven row coefficients as eight groups of
 * four int16 words, signs included, so that each group can be used as one
 * pmaddwd operand.  The groups are fetched by fixed offset:
 * mmxext_row_head() and mmxext_row_mid() read table+0 and table+4,
 * mmxext_row() reads table+8, +12, +16, +20, +24 and +28.
 */
88 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ | |
89 c4, c6, c4, c6, \ | |
90 c1, c3, -c1, -c5, \ | |
91 c5, c7, c3, -c7, \ | |
92 c4, -c6, c4, -c6, \ | |
93 -c4, c2, c4, -c2, \ | |
94 c5, -c1, c3, -c1, \ | |
95 c7, c3, c7, -c5 } | |
96 | |
/*
 * Start the first row of a pipelined MMXEXT row pass.
 *   row, offset - the eight int16 inputs are read from row+offset
 *                 (four words) and row+offset+4 (four words)
 *   table       - coefficient table for this row (mmxext_table layout)
 * Nothing is stored.  On return mm0/mm2 hold the first four words,
 * mm5/mm6 the second four, mm3 the first pmaddwd result and mm4 the
 * table+4 coefficients, which is the state mmxext_row() continues from.
 */
9852 | 97 static inline void mmxext_row_head (int16_t * const row, const int offset, |
98 const int16_t * const table) | |
1 | 99 { |
9852 | 100 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 101 |
9852 | 102 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
103 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 104 |
9852 | 105 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
106 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 107 |
9852 | 108 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
109 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ | |
1 | 110 |
/* 0x4e swaps the two 32-bit halves of the register */
9852 | 111 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
1 | 112 } |
113 | |
/*
 * Core of the MMXEXT row IDCT.  Must follow mmxext_row_head() or
 * mmxext_row_mid(), whose register state (mm0, mm2..mm6) it consumes.
 *   table   - coefficient table for the current row (groups at +8..+28)
 *   rounder - pair of 32-bit biases added to the even-part sums
 * On return mm1 (y1 y0) and mm3 (y6 y7) are already shifted by ROW_SHIFT;
 * mm0 (a3+b3 a2+b2) and mm4 (a3-b3 a2-b2) are still unshifted and are
 * finished by mmxext_row_tail() or mmxext_row_mid().
 */
9852 | 114 static inline void mmxext_row (const int16_t * const table, |
115 const int32_t * const rounder) | |
1 | 116 { |
9852 | 117 movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ |
118 pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ | |
1 | 119 |
9852 | 120 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ |
121 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ | |
1 | 122 |
9852 | 123 movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ |
124 pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ | |
1 | 125 |
9852 | 126 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
127 pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ | |
1 | 128 |
9852 | 129 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ |
130 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 131 |
9852 | 132 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ |
133 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 134 |
9852 | 135 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ |
136 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 137 |
9852 | 138 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
139 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 140 |
9852 | 141 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
142 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 143 |
9852 | 144 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
145 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 146 |
9852 | 147 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
148 movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ | |
1 | 149 |
9852 | 150 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
151 psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ | |
1 | 152 } |
153 | |
/*
 * Finish the last row of an MMXEXT row pass: shift the two sums that
 * mmxext_row() left unshifted (mm0, mm4), pack all eight results to int16
 * with signed saturation, and store them at row+store and row+store+4.
 */
9852 | 154 static inline void mmxext_row_tail (int16_t * const row, const int store) |
1 | 155 { |
9852 | 156 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 157 |
9852 | 158 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ |
1 | 159 |
9852 | 160 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 161 |
9852 | 162 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
1 | 163 |
9852 | 164 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
/* 0xb1 swaps the two words inside each 32-bit half */
165 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 166 |
36 | 167 /* slot */ |
1 | 168 |
9852 | 169 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 170 } |
171 | |
/*
 * Pipeline step between two rows: the work of mmxext_row_tail() for the
 * row just computed (results stored at row+store) interleaved with the
 * work of mmxext_row_head() for the next row (inputs read from
 * row+offset, coefficients from table).  Leaves the same register state
 * as mmxext_row_head(), ready for the next mmxext_row().
 */
9852 | 172 static inline void mmxext_row_mid (int16_t * const row, const int store, |
173 const int offset, | |
174 const int16_t * const table) | |
1 | 175 { |
9852 | 176 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
177 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 178 |
9852 | 179 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
180 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ | |
1 | 181 |
9852 | 182 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
183 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 184 |
9852 | 185 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
186 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 187 |
9852 | 188 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
189 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 190 |
9852 | 191 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
192 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ | |
1 | 193 |
9852 | 194 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
1 | 195 |
9852 | 196 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
197 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ | |
1 | 198 } |
199 | |
200 | |
36 | 201 /* MMX row IDCT */ |
1 | 202 |
/*
 * mmx_table() is the plain-MMX counterpart of mmxext_table(): the same
 * seven coefficients as eight groups of four int16 words, but ordered for
 * inputs duplicated with punpckldq/punpckhdq instead of shuffled with
 * pshufw.  mmx_row_head() and mmx_row_mid() read table+0, +4 and +8;
 * mmx_row() reads table+12, +16, +20, +24 and +28.
 */
203 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ | |
204 c4, c6, -c4, -c2, \ | |
205 c1, c3, c3, -c7, \ | |
206 c5, c7, -c1, -c5, \ | |
207 c4, -c6, c4, -c2, \ | |
208 -c4, c2, c4, -c6, \ | |
209 c5, -c1, c7, -c5, \ | |
210 c7, c3, c3, -c1 } | |
211 | |
/*
 * Start the first row of a pipelined plain-MMX row pass.
 *   row, offset - the eight int16 inputs are read from row+offset and
 *                 row+offset+4
 *   table       - coefficient table for this row (mmx_table layout)
 * Nothing is stored.  Leaves mm0 (x2 x0 duplicated), mm2 (x6 x4
 * duplicated), mm5/mm6 (second four inputs), mm3 (first pmaddwd result),
 * mm4 (table+4) and mm1 (table+8) set up for mmx_row().
 */
9852 | 212 static inline void mmx_row_head (int16_t * const row, const int offset, |
213 const int16_t * const table) | |
1 | 214 { |
9852 | 215 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 216 |
9852 | 217 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
218 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 219 |
9852 | 220 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
221 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 222 |
9852 | 223 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
1 | 224 |
9852 | 225 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
226 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 227 |
9852 | 228 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
229 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 230 } |
231 | |
/*
 * Core of the plain-MMX row IDCT.  Must follow mmx_row_head() or
 * mmx_row_mid(), whose register state (mm0..mm6) it consumes.
 *   table   - coefficient table for the current row (groups at +12..+28)
 *   rounder - pair of 32-bit biases added to the even-part sums
 * On return mm1 (y1 y0) and mm3 (y6 y7) are already shifted by ROW_SHIFT;
 * mm0 (a3+b3 a2+b2) and mm7 (a3-b3 a2-b2) are still unshifted and are
 * finished by mmx_row_tail() or mmx_row_mid().
 */
9852 | 232 static inline void mmx_row (const int16_t * const table, |
233 const int32_t * const rounder) | |
1 | 234 { |
9852 | 235 pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ |
236 punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ | |
1 | 237 |
9852 | 238 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ |
239 punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ | |
1 | 240 |
9852 | 241 movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ |
242 pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ | |
1 | 243 |
9852 | 244 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
245 pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ | |
1 | 246 |
9852 | 247 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ |
248 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 249 |
9852 | 250 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ |
251 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 252 |
9852 | 253 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ |
254 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 255 |
9852 | 256 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
257 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 258 |
9852 | 259 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
260 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 261 |
9852 | 262 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
263 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 264 |
9852 | 265 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
266 movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ | |
1 | 267 |
9852 | 268 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
269 psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ | |
1 | 270 } |
271 | |
/*
 * Finish the last row of a plain-MMX row pass: shift the two sums that
 * mmx_row() left unshifted (mm0, mm7), pack all eight results to int16
 * with signed saturation, and store them at row+store and row+store+4.
 * The word swap that the MMXEXT version does with pshufw is done here
 * with a 16-bit shift left, a 16-bit shift right and an OR.
 */
9852 | 272 static inline void mmx_row_tail (int16_t * const row, const int store) |
1 | 273 { |
9852 | 274 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 275 |
9852 | 276 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ |
1 | 277 |
9852 | 278 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 279 |
9852 | 280 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
1 | 281 |
9852 | 282 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
283 movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ | |
1 | 284 |
9852 | 285 pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ |
1 | 286 |
9852 | 287 psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ |
1 | 288 |
9852 | 289 por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ |
1 | 290 |
36 | 291 /* slot */ |
1 | 292 |
9852 | 293 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 294 } |
295 | |
/*
 * Pipeline step between two rows (plain MMX): the work of mmx_row_tail()
 * for the row just computed (results stored at row+store) interleaved
 * with the work of mmx_row_head() for the next row (inputs read from
 * row+offset, coefficients from table).  Leaves the same register state
 * as mmx_row_head(), ready for the next mmx_row().
 */
9852 | 296 static inline void mmx_row_mid (int16_t * const row, const int store, |
297 const int offset, const int16_t * const table) | |
1 | 298 { |
9852 | 299 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
300 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 301 |
9852 | 302 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
303 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ | |
1 | 304 |
9852 | 305 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
306 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 307 |
9852 | 308 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
309 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 310 |
9852 | 311 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
312 movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ | |
1 | 313 |
9852 | 314 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
315 psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ | |
1 | 316 |
9852 | 317 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
318 pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ | |
1 | 319 |
9852 | 320 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
321 por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ | |
1 | 322 |
9852 | 323 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
324 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 325 |
9852 | 326 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
327 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 328 } |
329 | |
330 | |
331 #if 0 | |
9852 | 332 /* C column IDCT - it's just here to document the MMXEXT and MMX versions */ |
/*
 * Reference (compiled out) IDCT of one column.  The eight inputs are
 * col[offset + k*8] for k = 0..7 and are overwritten with the results.
 * All intermediates are int16_t and every sum goes through S(), matching
 * the saturating 16-bit adds of the MMX code; F() matches pmulhw.
 * NOTE: T1, T2, T3 and C4 are only #defined further down, inside the MMX
 * idct_col(), so this block does not compile as-is if the #if 0 is lifted.
 */
1 | 333 static inline void idct_col (int16_t * col, int offset) |
334 { | |
36 | 335 /* multiplication - as implemented on mmx */ |
1 | 336 #define F(c,x) (((c) * (x)) >> 16) |
337 | |
36 | 338 /* saturation - it helps us handle torture test cases */ |
1 | 339 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) |
340 | |
341 int16_t x0, x1, x2, x3, x4, x5, x6, x7; | |
342 int16_t y0, y1, y2, y3, y4, y5, y6, y7; | |
343 int16_t a0, a1, a2, a3, b0, b1, b2, b3; | |
344 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; | |
345 | |
346 col += offset; | |
347 | |
348 x0 = col[0*8]; | |
349 x1 = col[1*8]; | |
350 x2 = col[2*8]; | |
351 x3 = col[3*8]; | |
352 x4 = col[4*8]; | |
353 x5 = col[5*8]; | |
354 x6 = col[6*8]; | |
355 x7 = col[7*8]; | |
356 | |
/* even part: a0..a3 from x0, x2, x4, x6 */
357 u04 = S (x0 + x4); | |
358 v04 = S (x0 - x4); | |
36 | 359 u26 = S (F (T2, x6) + x2); |
360 v26 = S (F (T2, x2) - x6); | |
1 | 361 |
362 a0 = S (u04 + u26); | |
363 a1 = S (v04 + v26); | |
364 a2 = S (v04 - v26); | |
365 a3 = S (u04 - u26); | |
366 | |
/* odd part: b0..b3 from x1, x3, x5, x7 */
36 | 367 u17 = S (F (T1, x7) + x1); |
368 v17 = S (F (T1, x1) - x7); | |
369 u35 = S (F (T3, x5) + x3); | |
370 v35 = S (F (T3, x3) - x5); | |
1 | 371 |
372 b0 = S (u17 + u35); | |
373 b3 = S (v17 - v35); | |
374 u12 = S (u17 - u35); | |
375 v12 = S (v17 + v35); | |
36 | 376 u12 = S (2 * F (C4, u12)); |
377 v12 = S (2 * F (C4, v12)); | |
1 | 378 b1 = S (u12 + v12); |
379 b2 = S (u12 - v12); | |
380 | |
/* butterfly, then scale down by COL_SHIFT */
381 y0 = S (a0 + b0) >> COL_SHIFT; | |
382 y1 = S (a1 + b1) >> COL_SHIFT; | |
383 y2 = S (a2 + b2) >> COL_SHIFT; | |
384 y3 = S (a3 + b3) >> COL_SHIFT; | |
385 | |
386 y4 = S (a3 - b3) >> COL_SHIFT; | |
387 y5 = S (a2 - b2) >> COL_SHIFT; | |
388 y6 = S (a1 - b1) >> COL_SHIFT; | |
389 y7 = S (a0 - b0) >> COL_SHIFT; | |
390 | |
391 col[0*8] = y0; | |
392 col[1*8] = y1; | |
393 col[2*8] = y2; | |
394 col[3*8] = y3; | |
395 col[4*8] = y4; | |
396 col[5*8] = y5; | |
397 col[6*8] = y6; | |
398 col[7*8] = y7; | |
399 } | |
400 #endif | |
401 | |
402 | |
9852 | 403 /* MMX column IDCT */ |
404 static inline void idct_col (int16_t * const col, const int offset) | |
1 | 405 { |
406 #define T1 13036 | |
407 #define T2 27146 | |
408 #define T3 43790 | |
409 #define C4 23170 | |
410 | |
9852 | 411 static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
412 static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; | |
413 static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; | |
414 static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; | |
1 | 415 |
416 /* column code adapted from peter gubanov */ | |
417 /* http://www.elecard.com/peter/idct.shtml */ | |
418 | |
9852 | 419 movq_m2r (*_T1, mm0); /* mm0 = T1 */ |
1 | 420 |
9852 | 421 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ |
422 movq_r2r (mm0, mm2); /* mm2 = T1 */ | |
1 | 423 |
9852 | 424 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ |
425 pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ | |
1 | 426 |
9852 | 427 movq_m2r (*_T3, mm5); /* mm5 = T3 */ |
428 pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ | |
1 | 429 |
9852 | 430 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ |
431 movq_r2r (mm5, mm7); /* mm7 = T3-1 */ | |
1 | 432 |
9852 | 433 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ |
434 psubsw_r2r (mm4, mm0); /* mm0 = v17 */ | |
1 | 435 |
9852 | 436 movq_m2r (*_T2, mm4); /* mm4 = T2 */ |
437 pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ | |
1 | 438 |
9852 | 439 paddsw_r2r (mm2, mm1); /* mm1 = u17 */ |
440 pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ | |
1 | 441 |
36 | 442 /* slot */ |
1 | 443 |
9852 | 444 movq_r2r (mm4, mm2); /* mm2 = T2 */ |
445 paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ | |
1 | 446 |
9852 | 447 pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ |
448 paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ | |
1 | 449 |
9852 | 450 psubsw_r2r (mm6, mm5); /* mm5 = v35 */ |
451 paddsw_r2r (mm3, mm7); /* mm7 = u35 */ | |
1 | 452 |
9852 | 453 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ |
454 movq_r2r (mm0, mm6); /* mm6 = v17 */ | |
1 | 455 |
9852 | 456 pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ |
457 psubsw_r2r (mm5, mm0); /* mm0 = b3 */ | |
1 | 458 |
9852 | 459 psubsw_r2r (mm3, mm4); /* mm4 = v26 */ |
460 paddsw_r2r (mm6, mm5); /* mm5 = v12 */ | |
1 | 461 |
9852 | 462 movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ |
463 movq_r2r (mm1, mm6); /* mm6 = u17 */ | |
1 | 464 |
9852 | 465 paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ |
466 paddsw_r2r (mm7, mm6); /* mm6 = b0 */ | |
1 | 467 |
9852 | 468 psubsw_r2r (mm7, mm1); /* mm1 = u12 */ |
469 movq_r2r (mm1, mm7); /* mm7 = u12 */ | |
1 | 470 |
9852 | 471 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ |
472 paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ | |
1 | 473 |
9852 | 474 movq_m2r (*_C4, mm0); /* mm0 = C4/2 */ |
475 psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ | |
1 | 476 |
9852 | 477 movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ |
478 pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ | |
1 | 479 |
9852 | 480 movq_r2r (mm4, mm6); /* mm6 = v26 */ |
481 pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ | |
1 | 482 |
9852 | 483 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ |
484 movq_r2r (mm3, mm0); /* mm0 = x0 */ | |
1 | 485 |
9852 | 486 psubsw_r2r (mm5, mm3); /* mm3 = v04 */ |
487 paddsw_r2r (mm5, mm0); /* mm0 = u04 */ | |
1 | 488 |
9852 | 489 paddsw_r2r (mm3, mm4); /* mm4 = a1 */ |
490 movq_r2r (mm0, mm5); /* mm5 = u04 */ | |
1 | 491 |
9852 | 492 psubsw_r2r (mm6, mm3); /* mm3 = a2 */ |
493 paddsw_r2r (mm2, mm5); /* mm5 = a0 */ | |
1 | 494 |
9852 | 495 paddsw_r2r (mm1, mm1); /* mm1 = b1 */ |
496 psubsw_r2r (mm2, mm0); /* mm0 = a3 */ | |
1 | 497 |
9852 | 498 paddsw_r2r (mm7, mm7); /* mm7 = b2 */ |
499 movq_r2r (mm3, mm2); /* mm2 = a2 */ | |
1 | 500 |
9852 | 501 movq_r2r (mm4, mm6); /* mm6 = a1 */ |
502 paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ | |
1 | 503 |
9852 | 504 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ |
505 paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ | |
1 | 506 |
9852 | 507 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ |
508 psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ | |
1 | 509 |
9852 | 510 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ |
511 psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ | |
1 | 512 |
9852 | 513 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ |
514 movq_r2r (mm5, mm7); /* mm7 = a0 */ | |
1 | 515 |
9852 | 516 movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ |
517 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ | |
1 | 518 |
9852 | 519 movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ |
520 paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ | |
1 | 521 |
9852 | 522 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ |
523 psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ | |
1 | 524 |
9852 | 525 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ |
526 movq_r2r (mm0, mm3); /* mm3 = a3 */ | |
1 | 527 |
9852 | 528 movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ |
529 psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ | |
1 | 530 |
9852 | 531 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ |
532 paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ | |
1 | 533 |
9852 | 534 movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ |
535 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ | |
1 | 536 |
9852 | 537 movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ |
538 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ | |
1 | 539 |
9852 | 540 movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ |
1 | 541 |
9852 | 542 movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ |
1 | 543 |
9852 | 544 movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ |
1 | 545 } |
546 | |
547 | |
/*
 * Row-pass rounding biases.  rounderN is the bias passed to idct_row for
 * input row N by declare_idct().  Each array holds the same 32-bit value
 * twice (see the rounder() macro) and is added to the even-part sums with
 * paddd.  rounder0 is built from (1 << (COL_SHIFT - 1)) - 0.5 rather than
 * from a plain constant - presumably so that the rounding for the later
 * column shift is injected once through row 0; confirm against the
 * upstream derivation.  The trailing comments give the formulas the other
 * constants were derived from.
 */
9852 | 548 static const int32_t rounder0[] ATTR_ALIGN(8) = |
1 | 549 rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
9852 | 550 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); |
551 static const int32_t rounder1[] ATTR_ALIGN(8) = | |
36 | 552 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
9852 | 553 static const int32_t rounder7[] ATTR_ALIGN(8) = |
36 | 554 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
9852 | 555 static const int32_t rounder2[] ATTR_ALIGN(8) = |
36 | 556 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
9852 | 557 static const int32_t rounder6[] ATTR_ALIGN(8) = |
36 | 558 rounder (-0.25); /* C2 * (C6-C2)/2 */ |
9852 | 559 static const int32_t rounder3[] ATTR_ALIGN(8) = |
36 | 560 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
9852 | 561 static const int32_t rounder5[] ATTR_ALIGN(8) = |
36 | 562 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
1 | 563 |
564 | |
/*
 * declare_idct() expands to a static inline function named `idct` that
 * performs the whole 8x8 inverse DCT in place on `block`.
 *
 *   table          - layout macro (mmxext_table or mmx_table) used to
 *                    build the four per-row coefficient tables
 *   idct_row_head, idct_row, idct_row_tail, idct_row_mid
 *                  - the matching row primitives
 *
 * Row pass: rows are visited in the order 0, 4, 1, 7, 2, 6, 3, 5 so that
 * each pair of consecutive rows shares one table (table04, table17,
 * table26, table35).  The pass is software-pipelined: idct_row_head loads
 * the first row, each idct_row_mid stores the row just computed while
 * loading the next one, and idct_row_tail stores the last row.
 *
 * Column pass: two idct_col calls, at offsets 0 and 4, each handling four
 * adjacent columns.
 */
565 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ | |
9852 | 566 static inline void idct (int16_t * const block) \ |
1 | 567 { \ |
9852 | 568 static const int16_t table04[] ATTR_ALIGN(16) = \ |
1 | 569 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ |
9852 | 570 static const int16_t table17[] ATTR_ALIGN(16) = \ |
1 | 571 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ |
9852 | 572 static const int16_t table26[] ATTR_ALIGN(16) = \ |
1 | 573 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ |
9852 | 574 static const int16_t table35[] ATTR_ALIGN(16) = \ |
1 | 575 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ |
576 \ | |
577 idct_row_head (block, 0*8, table04); \ | |
578 idct_row (table04, rounder0); \ | |
579 idct_row_mid (block, 0*8, 4*8, table04); \ | |
580 idct_row (table04, rounder4); \ | |
581 idct_row_mid (block, 4*8, 1*8, table17); \ | |
582 idct_row (table17, rounder1); \ | |
583 idct_row_mid (block, 1*8, 7*8, table17); \ | |
584 idct_row (table17, rounder7); \ | |
585 idct_row_mid (block, 7*8, 2*8, table26); \ | |
586 idct_row (table26, rounder2); \ | |
587 idct_row_mid (block, 2*8, 6*8, table26); \ | |
588 idct_row (table26, rounder6); \ | |
589 idct_row_mid (block, 6*8, 3*8, table35); \ | |
590 idct_row (table35, rounder3); \ | |
591 idct_row_mid (block, 3*8, 5*8, table35); \ | |
592 idct_row (table35, rounder5); \ | |
593 idct_row_tail (block, 5*8); \ | |
594 \ | |
595 idct_col (block, 0); \ | |
596 idct_col (block, 4); \ | |
597 } | |
598 | |
599 | |
/*
 * COPY_MMX: one pipelined step of block_copy().  Loads the eight int16
 * values at block+offset into r0 and r1, advances dest by one line,
 * stores the previously packed line held in r2 to *dest, then packs r0/r1
 * into eight unsigned-saturated bytes in r0.  Relies on `block`, `dest`
 * and `stride` from the enclosing scope and modifies `dest`.
 */
600 #define COPY_MMX(offset,r0,r1,r2) \ | |
601 do { \ | |
602 movq_m2r (*(block+offset), r0); \ | |
603 dest += stride; \ | |
604 movq_m2r (*(block+offset+4), r1); \ | |
605 movq_r2m (r2, *dest); \ | |
606 packuswb_r2r (r1, r0); \ | |
607 } while (0) | |
608 | |
/*
 * Write the 8x8 int16 block to dest as bytes.
 *   block  - 64 int16 values, eight per line
 *   dest   - first output line; eight bytes are written per line
 *   stride - distance in bytes between consecutive output lines
 * Values are clamped to 0..255 by packuswb.  The stores are delayed by one
 * line relative to the loads, which is why the last line is written
 * separately after the final COPY_MMX.
 */
9852 | 609 static inline void block_copy (int16_t * const block, uint8_t * dest, |
610 const int stride) | |
1 | 611 { |
612 movq_m2r (*(block+0*8), mm0); | |
613 movq_m2r (*(block+0*8+4), mm1); | |
614 movq_m2r (*(block+1*8), mm2); | |
615 packuswb_r2r (mm1, mm0); | |
616 movq_m2r (*(block+1*8+4), mm3); | |
617 movq_r2m (mm0, *dest); | |
618 packuswb_r2r (mm3, mm2); | |
619 COPY_MMX (2*8, mm0, mm1, mm2); | |
620 COPY_MMX (3*8, mm2, mm3, mm0); | |
621 COPY_MMX (4*8, mm0, mm1, mm2); | |
622 COPY_MMX (5*8, mm2, mm3, mm0); | |
623 COPY_MMX (6*8, mm0, mm1, mm2); | |
624 COPY_MMX (7*8, mm2, mm3, mm0); | |
625 movq_r2m (mm2, *(dest+stride)); | |
626 } | |
627 | |
628 | |
/*
 * ADD_MMX: one pipelined step of block_add().  Loads the eight bytes two
 * lines below dest into r1, packs the previous line's sums (r3/r4) to
 * bytes, advances dest by one line and stores them, then widens the newly
 * loaded bytes to int16 in r1/r2 and adds the block values at
 * block+offset with signed saturation.  Requires mm0 == 0 (used as the
 * high half when unpacking) and relies on `block`, `dest` and `stride`
 * from the enclosing scope; modifies `dest`.
 */
629 #define ADD_MMX(offset,r1,r2,r3,r4) \ | |
630 do { \ | |
631 movq_m2r (*(dest+2*stride), r1); \ | |
632 packuswb_r2r (r4, r3); \ | |
633 movq_r2r (r1, r2); \ | |
634 dest += stride; \ | |
635 movq_r2m (r3, *dest); \ | |
636 punpcklbw_r2r (mm0, r1); \ | |
637 paddsw_m2r (*(block+offset), r1); \ | |
638 punpckhbw_r2r (mm0, r2); \ | |
639 paddsw_m2r (*(block+offset+4), r2); \ | |
640 } while (0) | |
641 | |
/*
 * Add the 8x8 int16 block to the 8x8 bytes at dest, in place.
 *   block  - 64 int16 values, eight per line
 *   dest   - first line of the picture area to update
 *   stride - distance in bytes between consecutive lines
 * Each byte is zero-extended to 16 bits, added to the corresponding block
 * value with signed saturation, and clamped back to 0..255 by packuswb.
 */
9852 | 642 static inline void block_add (int16_t * const block, uint8_t * dest, |
643 const int stride) | |
1 | 644 { |
645 movq_m2r (*dest, mm1); | |
646 pxor_r2r (mm0, mm0); | |
647 movq_m2r (*(dest+stride), mm3); | |
648 movq_r2r (mm1, mm2); | |
649 punpcklbw_r2r (mm0, mm1); | |
650 movq_r2r (mm3, mm4); | |
651 paddsw_m2r (*(block+0*8), mm1); | |
652 punpckhbw_r2r (mm0, mm2); | |
653 paddsw_m2r (*(block+0*8+4), mm2); | |
654 punpcklbw_r2r (mm0, mm3); | |
655 paddsw_m2r (*(block+1*8), mm3); | |
656 packuswb_r2r (mm2, mm1); | |
657 punpckhbw_r2r (mm0, mm4); | |
658 movq_r2m (mm1, *dest); | |
659 paddsw_m2r (*(block+1*8+4), mm4); | |
660 ADD_MMX (2*8, mm1, mm2, mm3, mm4); | |
661 ADD_MMX (3*8, mm3, mm4, mm1, mm2); | |
662 ADD_MMX (4*8, mm1, mm2, mm3, mm4); | |
663 ADD_MMX (5*8, mm3, mm4, mm1, mm2); | |
664 ADD_MMX (6*8, mm1, mm2, mm3, mm4); | |
665 ADD_MMX (7*8, mm3, mm4, mm1, mm2); | |
666 packuswb_r2r (mm4, mm3); | |
667 movq_r2m (mm3, *(dest+stride)); | |
668 } | |
669 | |
670 | |
/*
 * Clear the whole coefficient block: sixteen 8-byte stores of a zeroed
 * mm0, i.e. 64 int16 values.  Clobbers mm0.
 */
9852 | 671 static inline void block_zero (int16_t * const block) |
672 { | |
673 pxor_r2r (mm0, mm0); | |
674 movq_r2m (mm0, *(block+0*4)); | |
675 movq_r2m (mm0, *(block+1*4)); | |
676 movq_r2m (mm0, *(block+2*4)); | |
677 movq_r2m (mm0, *(block+3*4)); | |
678 movq_r2m (mm0, *(block+4*4)); | |
679 movq_r2m (mm0, *(block+5*4)); | |
680 movq_r2m (mm0, *(block+6*4)); | |
681 movq_r2m (mm0, *(block+7*4)); | |
682 movq_r2m (mm0, *(block+8*4)); | |
683 movq_r2m (mm0, *(block+9*4)); | |
684 movq_r2m (mm0, *(block+10*4)); | |
685 movq_r2m (mm0, *(block+11*4)); | |
686 movq_r2m (mm0, *(block+12*4)); | |
687 movq_r2m (mm0, *(block+13*4)); | |
688 movq_r2m (mm0, *(block+14*4)); | |
689 movq_r2m (mm0, *(block+15*4)); | |
690 } | |
691 | |
692 | |
/* Selector values for the `cpu` argument of block_add_DC(). */
693 #define CPU_MMXEXT 0 | |
694 #define CPU_MMX 1 | |
695 | |
/*
 * dup4(reg): replicate the low 16-bit word of reg into all four words.
 * Uses a single pshufw when `cpu` is CPU_MMXEXT and two unpack
 * instructions otherwise.  `cpu` is taken from the enclosing scope.
 */
696 #define dup4(reg) \ | |
697 do { \ | |
698 if (cpu != CPU_MMXEXT) { \ | |
699 punpcklwd_r2r (reg, reg); \ | |
700 punpckldq_r2r (reg, reg); \ | |
701 } else \ | |
702 pshufw_r2r (reg, reg, 0x00); \ | |
703 } while (0) | |
704 | |
/*
 * Add a single constant to all 8x8 bytes at dest.
 *   block  - only block[0] is read; block[0] and block[63] are set to 0
 *   dest   - first line of the picture area to update
 *   stride - distance in bytes between consecutive lines
 *   cpu    - CPU_MMXEXT or CPU_MMX, selects the dup4() variant
 * The constant is (block[0] + 64) >> 7.  It is split into a non-negative
 * byte vector in mm0 (the value, clamped at 0 when negative) and a
 * non-negative byte vector in mm1 (the negated value, clamped likewise),
 * so each line becomes dest + mm0 - mm1 using unsigned saturating byte
 * arithmetic.
 * NOTE(review): why block[63] needs clearing is not visible in this file;
 * presumably the caller stores state there - confirm against the decoder.
 */
705 static inline void block_add_DC (int16_t * const block, uint8_t * dest, | |
706 const int stride, const int cpu) | |
707 { | |
12932 | 708 movd_v2r ((block[0] + 64) >> 7, mm0); |
9852 | 709 pxor_r2r (mm1, mm1); |
710 movq_m2r (*dest, mm2); | |
711 dup4 (mm0); | |
712 psubsw_r2r (mm0, mm1); | |
713 packuswb_r2r (mm0, mm0); | |
714 paddusb_r2r (mm0, mm2); | |
715 packuswb_r2r (mm1, mm1); | |
716 movq_m2r (*(dest + stride), mm3); | |
717 psubusb_r2r (mm1, mm2); | |
718 block[0] = 0; | |
719 paddusb_r2r (mm0, mm3); | |
720 movq_r2m (mm2, *dest); | |
721 psubusb_r2r (mm1, mm3); | |
722 movq_m2r (*(dest + 2*stride), mm2); | |
723 dest += stride; | |
724 movq_r2m (mm3, *dest); | |
725 paddusb_r2r (mm0, mm2); | |
726 movq_m2r (*(dest + 2*stride), mm3); | |
727 psubusb_r2r (mm1, mm2); | |
728 dest += stride; | |
729 paddusb_r2r (mm0, mm3); | |
730 movq_r2m (mm2, *dest); | |
731 psubusb_r2r (mm1, mm3); | |
732 movq_m2r (*(dest + 2*stride), mm2); | |
733 dest += stride; | |
734 movq_r2m (mm3, *dest); | |
735 paddusb_r2r (mm0, mm2); | |
736 movq_m2r (*(dest + 2*stride), mm3); | |
737 psubusb_r2r (mm1, mm2); | |
738 dest += stride; | |
739 paddusb_r2r (mm0, mm3); | |
740 movq_r2m (mm2, *dest); | |
741 psubusb_r2r (mm1, mm3); | |
742 movq_m2r (*(dest + 2*stride), mm2); | |
743 dest += stride; | |
744 movq_r2m (mm3, *dest); | |
745 paddusb_r2r (mm0, mm2); | |
746 movq_m2r (*(dest + 2*stride), mm3); | |
747 psubusb_r2r (mm1, mm2); | |
748 block[63] = 0; | |
749 paddusb_r2r (mm0, mm3); | |
750 movq_r2m (mm2, *(dest + stride)); | |
751 psubusb_r2r (mm1, mm3); | |
752 movq_r2m (mm3, *(dest + 2*stride)); | |
753 } | |
754 | |
755 | |
/* Instantiate mmxext_idct() from the MMXEXT row primitives. */
1 | 756 declare_idct (mmxext_idct, mmxext_table, |
757 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) | |
758 | |
/*
 * MMXEXT entry point: inverse-transform `block` in place, write the
 * result to dest as 8x8 clamped bytes (lines `stride` bytes apart), then
 * clear `block`.
 */
9852 | 759 void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest, |
760 const int stride) | |
1 | 761 { |
762 mmxext_idct (block); | |
763 block_copy (block, dest, stride); | |
9852 | 764 block_zero (block); |
1 | 765 } |
766 | |
9852 | 767 void mpeg2_idct_add_mmxext (const int last, int16_t * const block, |
768 uint8_t * const dest, const int stride) | |
1 | 769 { |
12932 | 770 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
9852 | 771 mmxext_idct (block); |
772 block_add (block, dest, stride); | |
773 block_zero (block); | |
774 } else | |
775 block_add_DC (block, dest, stride, CPU_MMXEXT); | |
1 | 776 } |
777 | |
778 | |
/* Instantiate mmx_idct() from the plain-MMX row primitives. */
779 declare_idct (mmx_idct, mmx_table, | |
780 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) | |
781 | |
/*
 * Plain-MMX entry point: inverse-transform `block` in place, write the
 * result to dest as 8x8 clamped bytes (lines `stride` bytes apart), then
 * clear `block`.
 */
9852 | 782 void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest, |
783 const int stride) | |
1 | 784 { |
785 mmx_idct (block); | |
786 block_copy (block, dest, stride); | |
9852 | 787 block_zero (block); |
1 | 788 } |
789 | |
9852 | 790 void mpeg2_idct_add_mmx (const int last, int16_t * const block, |
791 uint8_t * const dest, const int stride) | |
1 | 792 { |
12932 | 793 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
9852 | 794 mmx_idct (block); |
795 block_add (block, dest, stride); | |
796 block_zero (block); | |
797 } else | |
798 block_add_DC (block, dest, stride, CPU_MMX); | |
1 | 799 } |
800 | |
801 | |
/*
 * Patch the two global scan tables for the MMX/MMXEXT IDCT, which expects
 * each row of coefficients in the order x0 x2 x4 x6 x1 x3 x5 x7.
 *
 * Every table entry is a position 0..63.  Its row bits (0x38) are kept,
 * and its three column bits are rotated right by one: column c moves to
 * c/2 when c is even and to 4 + c/2 when c is odd.
 *
 * The tables are rewritten in place, so this is not idempotent: calling
 * it a second time would rotate the column bits again.  The caller is
 * responsible for invoking it once.
 */
void mpeg2_idct_mmx_init (void)
{
    extern uint8_t mpeg2_scan_norm[64];
    extern uint8_t mpeg2_scan_alt[64];
    uint8_t * const scan_tables[2] = { mpeg2_scan_norm, mpeg2_scan_alt };
    int pos, t;

    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */

    for (pos = 0; pos < 64; pos++) {
        for (t = 0; t < 2; t++) {
            const int entry = scan_tables[t][pos];
            const int row_bits = entry & 0x38;          /* unchanged */
            const int upper_col = (entry & 6) >> 1;     /* column / 2 */
            const int odd_col = (entry & 1) << 2;       /* odd -> +4 */

            scan_tables[t][pos] = row_bits | upper_col | odd_col;
        }
    }
}
817 | |
818 #endif |