Mercurial > mplayer.hg
annotate libmpeg2/idct_mmx.c @ 34410:bb440bfcade9
configure: Simplify Theora check.
The Theora check does some very elaborate tests to account for issues in
old libtheora versions. Nowadays libtheora is stable and easy to test
for, so there is no longer a need for the extra complexity.
Also drop the Theora check that linked against tremor/bitwise.c.
Its purpose is now lost in the mists of time.
author | diego |
---|---|
date | Wed, 04 Jan 2012 10:39:38 +0000 |
parents | 25337a2147e7 |
children |
rev | line source |
---|---|
1 | 1 /* |
2 * idct_mmx.c | |
10303 | 3 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> |
9852 | 4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
1 | 5 * |
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
9852 | 7 * See http://libmpeg2.sourceforge.net/ for updates. |
1 | 8 * |
9 * mpeg2dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * mpeg2dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
22 */ | |
23 | |
24 #include "config.h" | |
25 | |
28290 | 26 #if ARCH_X86 || ARCH_X86_64 |
1 | 27 |
28 #include <inttypes.h> | |
29 | |
9852 | 30 #include "mpeg2.h" |
12932 | 31 #include "attributes.h" |
1 | 32 #include "mpeg2_internal.h" |
33 #include "mmx.h" | |
34 | |
/*
 * Fixed-point scaling of the two IDCT passes.
 * ROW_SHIFT is the right shift applied to the 32-bit sums of the row pass,
 * COL_SHIFT the right shift applied to the results of the column pass.
 */
#define ROW_SHIFT 15
#define COL_SHIFT 6

/* (bias + 0.5) scaled by 2^ROW_SHIFT and truncated to int. */
#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
/* Brace initializer holding two copies of round (bias). */
#define rounder(bias) {round (bias), round (bias)}
/* Brace initializer holding four copies of round (bias). */
#define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)}
1 | 41 |
42 | |
#if 0
/*
 * C row IDCT - it is just here to document the MMXEXT and MMX versions.
 *
 * Transforms the eight coefficients at row[offset..offset+7] in place.
 *   table   - cosine coefficients; entries 1..7 are read as C1..C7
 *   rounder - points to the rounding bias added to every even-part sum
 * Each output is (a +/- b) >> ROW_SHIFT, where a0..a3 are built from the
 * even inputs (0, 2, 4, 6) and b0..b3 from the odd inputs (1, 3, 5, 7).
 */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    /* even part, including the rounding bias */
    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    /* odd part */
    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    /* butterfly: outputs 0..3 use a+b, outputs 7..4 use a-b */
    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif
81 | |
82 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* SSE2 row IDCT */

/*
 * Expands to a brace initializer of 32 coefficients, laid out as four
 * groups of eight.  SSE2_IDCT_2ROW multiplies (pmaddwd) group 0 with the
 * x0/x2 pair, group 1 with x4/x6, group 2 with x1/x3 and group 3 with
 * x5/x7; within a group, each pair of entries feeds one 32-bit lane.
 */
#define sse2_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
                                            c4, -c6,  c4, -c2,   \
                                            c4,  c6, -c4, -c2,   \
                                           -c4,  c2,  c4, -c6,   \
                                            c1,  c3,  c3, -c7,   \
                                            c5, -c1,  c7, -c5,   \
                                            c5,  c7, -c1, -c5,   \
                                            c7,  c3,  c3, -c1 }
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
92 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * Row IDCT of two rows at once, with the instructions of row 1 ("1:") and
 * row 2 ("2:") interleaved.
 *   table          - coefficient table laid out as by sse2_table; indexed at
 *                    [0], [1*8], [2*8] and [3*8]
 *   row1, row2     - register operands holding the input rows; each is
 *                    overwritten with its eight packed 16-bit results
 *   round1, round2 - memory operands with the rounding bias of each row
 * Uses xmm1, xmm2, xmm3, xmm5, xmm6 and xmm7 as scratch, so row1/row2 must
 * not name any of those.  The *_r2r / *_m2r / *_i2r helpers are not defined
 * in this part of the file (presumably they come from mmx.h).
 */
#define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do {               \
    /* no scheduling: trust in out of order execution */                     \
    /* based on Intel AP-945 */                                              \
    /* (http://cache-www.intel.com/cd/00/00/01/76/17680_w_idct.pdf) */       \
                                                                             \
    /* input */                     /* 1: row1= x7 x5 x3 x1  x6 x4 x2 x0 */  \
    pshufd_r2r   (row1, xmm1, 0);   /* 1: xmm1= x2 x0 x2 x0  x2 x0 x2 x0 */  \
    pmaddwd_m2r  (table[0], xmm1);  /* 1: xmm1= x2*C + x0*C ... */           \
    pshufd_r2r   (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1  x3 x1 x3 x1 */ \
    pmaddwd_m2r  (table[2*8], xmm3); /* 1: xmm3= x3*C + x1*C ... */          \
    pshufd_r2r   (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4  x6 x4 x6 x4 */ \
    pshufd_r2r   (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5  x7 x5 x7 x5 */ \
    pmaddwd_m2r  (table[1*8], xmm2); /* 1: xmm2= x6*C + x4*C ... */          \
    paddd_m2r    (round1, xmm1);    /* 1: xmm1= x2*C + x0*C + round ... */   \
    pmaddwd_m2r  (table[3*8], row1); /* 1: row1= x7*C + x5*C ... */          \
    pshufd_r2r   (row2, xmm5, 0);   /*    2: */                              \
    pshufd_r2r   (row2, xmm6, 0x55); /*   2: */                              \
    pmaddwd_m2r  (table[0], xmm5);  /*    2: */                              \
    paddd_r2r    (xmm2, xmm1);      /* 1: xmm1= a[] */                       \
    movdqa_r2r   (xmm1, xmm2);      /* 1: xmm2= a[] */                       \
    pshufd_r2r   (row2, xmm7, 0xaa); /*   2: */                              \
    pmaddwd_m2r  (table[1*8], xmm6); /*   2: */                              \
    paddd_r2r    (xmm3, row1);      /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */  \
    pshufd_r2r   (row2, row2, 0xff); /*   2: */                              \
    psubd_r2r    (row1, xmm2);      /* 1: xmm2= a[] - b[] */                 \
    pmaddwd_m2r  (table[2*8], xmm7); /*   2: */                              \
    paddd_r2r    (xmm1, row1);      /* 1: row1= a[] + b[] */                 \
    psrad_i2r    (ROW_SHIFT, xmm2); /* 1: xmm2= result 4...7 */              \
    paddd_m2r    (round2, xmm5);    /*    2: */                              \
    pmaddwd_m2r  (table[3*8], row2); /*   2: */                              \
    paddd_r2r    (xmm6, xmm5);      /*    2: */                              \
    movdqa_r2r   (xmm5, xmm6);      /*    2: */                              \
    psrad_i2r    (ROW_SHIFT, row1); /* 1: row1= result 0...3 */              \
    pshufd_r2r   (xmm2, xmm2, 0x1b); /* 1: [0 1 2 3] -> [3 2 1 0] */         \
    packssdw_r2r (xmm2, row1);      /* 1: row1= result[] */                  \
    paddd_r2r    (xmm7, row2);      /*    2: */                              \
    psubd_r2r    (row2, xmm6);      /*    2: */                              \
    paddd_r2r    (xmm5, row2);      /*    2: */                              \
    psrad_i2r    (ROW_SHIFT, xmm6); /*    2: */                              \
    psrad_i2r    (ROW_SHIFT, row2); /*    2: */                              \
    pshufd_r2r   (xmm6, xmm6, 0x1b); /*   2: */                              \
    packssdw_r2r (xmm6, row2);      /*    2: */                              \
} while (0)
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
136 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
137 |
/* MMXEXT row IDCT */

/*
 * Expands to a brace initializer of 32 coefficients in the order the MMXEXT
 * row code reads them: mmxext_row_head / mmxext_row_mid load table+0 and
 * table+4, mmxext_row uses table+8, +12, +16, +20, +24 and +28.
 */
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2, -c4, -c2,   \
                                              c4,  c6,  c4,  c6,   \
                                              c1,  c3, -c1, -c5,   \
                                              c5,  c7,  c3, -c7,   \
                                              c4, -c6,  c4, -c6,   \
                                             -c4,  c2,  c4, -c2,   \
                                              c5, -c1,  c3, -c1,   \
                                              c7,  c3,  c7, -c5 }
148 | |
9852 | 149 static inline void mmxext_row_head (int16_t * const row, const int offset, |
150 const int16_t * const table) | |
1 | 151 { |
9852 | 152 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 153 |
9852 | 154 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
155 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 156 |
9852 | 157 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
158 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 159 |
9852 | 160 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
161 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ | |
1 | 162 |
9852 | 163 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
1 | 164 } |
165 | |
9852 | 166 static inline void mmxext_row (const int16_t * const table, |
167 const int32_t * const rounder) | |
1 | 168 { |
9852 | 169 movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ |
170 pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ | |
1 | 171 |
9852 | 172 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ |
173 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ | |
1 | 174 |
9852 | 175 movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ |
176 pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ | |
1 | 177 |
9852 | 178 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
179 pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ | |
1 | 180 |
9852 | 181 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ |
182 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 183 |
9852 | 184 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ |
185 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 186 |
9852 | 187 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ |
188 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 189 |
9852 | 190 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
191 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 192 |
9852 | 193 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
194 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 195 |
9852 | 196 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
197 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 198 |
9852 | 199 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
200 movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ | |
1 | 201 |
9852 | 202 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
203 psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ | |
1 | 204 } |
205 | |
9852 | 206 static inline void mmxext_row_tail (int16_t * const row, const int store) |
1 | 207 { |
9852 | 208 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 209 |
9852 | 210 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ |
1 | 211 |
9852 | 212 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 213 |
9852 | 214 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
1 | 215 |
9852 | 216 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
217 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 218 |
36 | 219 /* slot */ |
1 | 220 |
9852 | 221 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 222 } |
223 | |
9852 | 224 static inline void mmxext_row_mid (int16_t * const row, const int store, |
225 const int offset, | |
226 const int16_t * const table) | |
1 | 227 { |
9852 | 228 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
229 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 230 |
9852 | 231 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
232 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ | |
1 | 233 |
9852 | 234 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
235 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 236 |
9852 | 237 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
238 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 239 |
9852 | 240 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
241 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 242 |
9852 | 243 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
244 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ | |
1 | 245 |
9852 | 246 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
1 | 247 |
9852 | 248 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
249 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ | |
1 | 250 } |
251 | |
252 | |
/* MMX row IDCT */

/*
 * Expands to a brace initializer of 32 coefficients in the order the plain
 * MMX row code reads them: mmx_row_head / mmx_row_mid load table+0, +4 and
 * +8, mmx_row uses table+12, +16, +20, +24 and +28.
 */
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
                                           c4,  c6, -c4, -c2,   \
                                           c1,  c3,  c3, -c7,   \
                                           c5,  c7, -c1, -c5,   \
                                           c4, -c6,  c4, -c2,   \
                                          -c4,  c2,  c4, -c6,   \
                                           c5, -c1,  c7, -c5,   \
                                           c7,  c3,  c3, -c1 }
263 | |
9852 | 264 static inline void mmx_row_head (int16_t * const row, const int offset, |
265 const int16_t * const table) | |
1 | 266 { |
9852 | 267 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 268 |
9852 | 269 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
270 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 271 |
9852 | 272 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
273 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 274 |
9852 | 275 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
1 | 276 |
9852 | 277 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
278 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 279 |
9852 | 280 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
281 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 282 } |
283 | |
9852 | 284 static inline void mmx_row (const int16_t * const table, |
285 const int32_t * const rounder) | |
1 | 286 { |
9852 | 287 pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ |
288 punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ | |
1 | 289 |
9852 | 290 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ |
291 punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ | |
1 | 292 |
9852 | 293 movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ |
294 pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ | |
1 | 295 |
9852 | 296 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
297 pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ | |
1 | 298 |
9852 | 299 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ |
300 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 301 |
9852 | 302 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ |
303 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 304 |
9852 | 305 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ |
306 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 307 |
9852 | 308 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
309 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 310 |
9852 | 311 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
312 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 313 |
9852 | 314 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
315 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 316 |
9852 | 317 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
318 movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ | |
1 | 319 |
9852 | 320 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
321 psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ | |
1 | 322 } |
323 | |
9852 | 324 static inline void mmx_row_tail (int16_t * const row, const int store) |
1 | 325 { |
9852 | 326 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 327 |
9852 | 328 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ |
1 | 329 |
9852 | 330 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 331 |
9852 | 332 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
1 | 333 |
9852 | 334 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
335 movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ | |
1 | 336 |
9852 | 337 pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ |
1 | 338 |
9852 | 339 psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ |
1 | 340 |
9852 | 341 por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ |
1 | 342 |
36 | 343 /* slot */ |
1 | 344 |
9852 | 345 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 346 } |
347 | |
9852 | 348 static inline void mmx_row_mid (int16_t * const row, const int store, |
349 const int offset, const int16_t * const table) | |
1 | 350 { |
9852 | 351 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
352 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 353 |
9852 | 354 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
355 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ | |
1 | 356 |
9852 | 357 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
358 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 359 |
9852 | 360 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
361 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 362 |
9852 | 363 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
364 movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ | |
1 | 365 |
9852 | 366 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
367 psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ | |
1 | 368 |
9852 | 369 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
370 pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ | |
1 | 371 |
9852 | 372 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
373 por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ | |
1 | 374 |
9852 | 375 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
376 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 377 |
9852 | 378 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
379 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 380 } |
381 | |
382 | |
#if 0
/*
 * C column IDCT - it is just here to document the MMXEXT and MMX versions.
 *
 * Transforms in place the eight values col[offset + k*8], k = 0..7.
 * All intermediates are kept in 16 bits and saturated after every
 * addition, mirroring the saturating SIMD arithmetic.
 * NOTE: T1, T2, T3 and C4 are #defined further down in this file, after
 * this block, so enabling it as-is requires moving those definitions up.
 */
static inline void idct_col (int16_t * col, int offset)
{
/* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

/* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    /* even part */
    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    /* odd part */
    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    /* butterfly: outputs 0..3 use a+b, outputs 7..4 use a-b */
    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;
}
#endif
453 | |
454 | |
/*
 * Fixed-point constants of the column IDCT.
 * T1, T2 and T3 numerically match tan(pi/16), tan(2*pi/16) and
 * tan(3*pi/16) scaled by 2^16; C4 matches cos(pi/4) scaled by 2^15.
 * Note that T3 does not fit in a signed 16-bit value.
 */
#define T1 13036
#define T2 27146
#define T3 43790
#define C4 23170
459 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
460 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
461 /* SSE2 column IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
462 static inline void sse2_idct_col (int16_t * const col) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
463 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
464 /* Almost identical to mmxext version: */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
465 /* just do both 4x8 columns in parallel */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
466 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
467 static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
468 static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
469 static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
470 static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
471 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
472 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
473 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
474 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
475 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
476 movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
477 movdqa_r2r (xmm9, xmm1); /* xmm1 = x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
478 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
479 movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
480 pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
481 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
482 movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
483 pmulhw_r2r (xmm15, xmm2); /* xmm2 = T1*x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
484 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
485 movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
486 psubsw_r2r (xmm15, xmm0); /* xmm0 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
487 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
488 movdqa_m2r (*t2_vector, xmm9); /* xmm9 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
489 pmulhw_r2r (xmm11, xmm5); /* xmm5 = (T3-1)*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
490 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
491 paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
492 pmulhw_r2r (xmm13, xmm7); /* xmm7 = (T3-1)*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
493 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
494 movdqa_r2r (xmm9, xmm2); /* xmm2 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
495 paddsw_r2r (xmm11, xmm5); /* xmm5 = T3*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
496 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
497 pmulhw_r2r (xmm10, xmm9); /* xmm9 = T2*x2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
498 paddsw_r2r (xmm13, xmm7); /* xmm7 = T3*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
499 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
500 psubsw_r2r (xmm13, xmm5); /* xmm5 = v35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
501 paddsw_r2r (xmm11, xmm7); /* xmm7 = u35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
502 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
503 movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
504 pmulhw_r2r (xmm14, xmm2); /* xmm2 = T2*x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
505 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
506 psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
507 psubsw_r2r (xmm14, xmm9); /* xmm9 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
508 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
509 paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
510 movdqa_r2r (xmm0, xmm11); /* xmm11 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
511 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
512 movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
513 paddsw_r2r (xmm10, xmm2); /* xmm2 = u26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
514 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
515 paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
516 psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
517 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
518 movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
519 paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
520 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
521 movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
522 psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
523 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
524 movdqa_r2r (xmm6, xmm4); /* xmm4 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
525 pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
526 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
527 movdqa_r2r (xmm9, xmm6); /* xmm6 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
528 pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
529 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
530 movdqa_r2r (xmm8, xmm10); /* xmm10 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
531 movdqa_r2r (xmm8, xmm0); /* xmm0 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
532 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
533 psubsw_r2r (xmm12, xmm10); /* xmm10 = v04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
534 paddsw_r2r (xmm12, xmm0); /* xmm0 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
535 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
536 paddsw_r2r (xmm10, xmm9); /* xmm9 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
537 movdqa_r2r (xmm0, xmm8); /* xmm8 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
538 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
539 psubsw_r2r (xmm6, xmm10); /* xmm10 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
540 paddsw_r2r (xmm2, xmm8); /* xmm8 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
541 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
542 paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
543 psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
544 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
545 paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
546 movdqa_r2r (xmm10, xmm13); /* xmm13 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
547 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
548 movdqa_r2r (xmm9, xmm14); /* xmm14 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
549 paddsw_r2r (xmm7, xmm10); /* xmm10 = a2+b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
550 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
551 psraw_i2r (COL_SHIFT,xmm10); /* xmm10 = y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
552 paddsw_r2r (xmm1, xmm9); /* xmm9 = a1+b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
553 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
554 psraw_i2r (COL_SHIFT, xmm9); /* xmm9 = y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
555 psubsw_r2r (xmm1, xmm14); /* xmm14 = a1-b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
556 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
557 psubsw_r2r (xmm7, xmm13); /* xmm13 = a2-b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
558 psraw_i2r (COL_SHIFT,xmm14); /* xmm14 = y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
559 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
560 movdqa_r2r (xmm8, xmm15); /* xmm15 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
561 psraw_i2r (COL_SHIFT,xmm13); /* xmm13 = y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
562 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
563 paddsw_r2r (xmm4, xmm8); /* xmm8 = a0+b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
564 psubsw_r2r (xmm4, xmm15); /* xmm15 = a0-b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
565 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
566 psraw_i2r (COL_SHIFT, xmm8); /* xmm8 = y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
567 movdqa_r2r (xmm0, xmm12); /* xmm12 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
568 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
569 psubsw_r2r (xmm11, xmm12); /* xmm12 = a3-b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
570 psraw_i2r (COL_SHIFT,xmm15); /* xmm15 = y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
571 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
572 paddsw_r2r (xmm0, xmm11); /* xmm11 = a3+b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
573 psraw_i2r (COL_SHIFT,xmm12); /* xmm12 = y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
574 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
575 psraw_i2r (COL_SHIFT,xmm11); /* xmm11 = y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
576 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
577 /* OUTPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
578 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
579 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
580 movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
581 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
582 movdqa_m2r (*(col+1*8), xmm1); /* xmm1 = x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
583 movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
584 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
585 movdqa_m2r (*(col+7*8), xmm4); /* xmm4 = x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
586 pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
587 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
588 movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
589 pmulhw_r2r (xmm4, xmm2); /* xmm2 = T1*x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
590 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
591 movdqa_m2r (*(col+5*8), xmm6); /* xmm6 = x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
592 movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
593 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
594 movdqa_m2r (*(col+3*8), xmm3); /* xmm3 = x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
595 psubsw_r2r (xmm4, xmm0); /* xmm0 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
596 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
597 movdqa_m2r (*t2_vector, xmm4); /* xmm4 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
598 pmulhw_r2r (xmm3, xmm5); /* xmm5 = (T3-1)*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
599 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
600 paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
601 pmulhw_r2r (xmm6, xmm7); /* xmm7 = (T3-1)*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
602 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
603 /* slot */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
604 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
605 movdqa_r2r (xmm4, xmm2); /* xmm2 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
606 paddsw_r2r (xmm3, xmm5); /* xmm5 = T3*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
607 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
608 pmulhw_m2r (*(col+2*8), xmm4); /* xmm4 = T2*x2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
609 paddsw_r2r (xmm6, xmm7); /* xmm7 = T3*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
610 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
611 psubsw_r2r (xmm6, xmm5); /* xmm5 = v35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
612 paddsw_r2r (xmm3, xmm7); /* xmm7 = u35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
613 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
614 movdqa_m2r (*(col+6*8), xmm3); /* xmm3 = x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
615 movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
616 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
617 pmulhw_r2r (xmm3, xmm2); /* xmm2 = T2*x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
618 psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
619 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
620 psubsw_r2r (xmm3, xmm4); /* xmm4 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
621 paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
622 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
623 movdqa_r2m (xmm0, *(col+3*8)); /* save b3 in scratch0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
624 movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
625 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
626 paddsw_m2r (*(col+2*8), xmm2); /* xmm2 = u26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
627 paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
628 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
629 psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
630 movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
631 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
632 movdqa_m2r (*(col+0*8), xmm3); /* xmm3 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
633 paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
634 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
635 movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
636 psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
637 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
638 movdqa_r2m (xmm6, *(col+5*8)); /* save b0 in scratch1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
639 pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
640 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
641 movdqa_r2r (xmm4, xmm6); /* xmm6 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
642 pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
643 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
644 movdqa_m2r (*(col+4*8), xmm5); /* xmm5 = x4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
645 movdqa_r2r (xmm3, xmm0); /* xmm0 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
646 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
647 psubsw_r2r (xmm5, xmm3); /* xmm3 = v04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
648 paddsw_r2r (xmm5, xmm0); /* xmm0 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
649 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
650 paddsw_r2r (xmm3, xmm4); /* xmm4 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
651 movdqa_r2r (xmm0, xmm5); /* xmm5 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
652 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
653 psubsw_r2r (xmm6, xmm3); /* xmm3 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
654 paddsw_r2r (xmm2, xmm5); /* xmm5 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
655 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
656 paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
657 psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
658 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
659 paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
660 movdqa_r2r (xmm3, xmm2); /* xmm2 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
661 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
662 movdqa_r2r (xmm4, xmm6); /* xmm6 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
663 paddsw_r2r (xmm7, xmm3); /* xmm3 = a2+b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
664 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
665 psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
666 paddsw_r2r (xmm1, xmm4); /* xmm4 = a1+b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
667 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
668 psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
669 psubsw_r2r (xmm1, xmm6); /* xmm6 = a1-b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
670 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
671 movdqa_m2r (*(col+5*8), xmm1); /* xmm1 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
672 psubsw_r2r (xmm7, xmm2); /* xmm2 = a2-b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
673 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
674 psraw_i2r (COL_SHIFT, xmm6); /* xmm6 = y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
675 movdqa_r2r (xmm5, xmm7); /* xmm7 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
676 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
677 movdqa_r2m (xmm4, *(col+1*8)); /* save y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
678 psraw_i2r (COL_SHIFT, xmm2); /* xmm2 = y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
679 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
680 movdqa_r2m (xmm3, *(col+2*8)); /* save y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
681 paddsw_r2r (xmm1, xmm5); /* xmm5 = a0+b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
682 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
683 movdqa_m2r (*(col+3*8), xmm4); /* xmm4 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
684 psubsw_r2r (xmm1, xmm7); /* xmm7 = a0-b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
685 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
686 psraw_i2r (COL_SHIFT, xmm5); /* xmm5 = y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
687 movdqa_r2r (xmm0, xmm3); /* xmm3 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
688 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
689 movdqa_r2m (xmm2, *(col+5*8)); /* save y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
690 psubsw_r2r (xmm4, xmm3); /* xmm3 = a3-b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
691 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
692 psraw_i2r (COL_SHIFT, xmm7); /* xmm7 = y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
693 paddsw_r2r (xmm0, xmm4); /* xmm4 = a3+b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
694 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
695 movdqa_r2m (xmm5, *(col+0*8)); /* save y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
696 psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
697 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
698 movdqa_r2m (xmm6, *(col+6*8)); /* save y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
699 psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
700 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
701 movdqa_r2m (xmm7, *(col+7*8)); /* save y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
702 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
703 movdqa_r2m (xmm3, *(col+4*8)); /* save y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
704 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
705 movdqa_r2m (xmm4, *(col+3*8)); /* save y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
706 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
707 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
708 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
709 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
710 /* MMX column IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
711 static inline void idct_col (int16_t * const col, const int offset) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
712 { |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
713 static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
714 static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
715 static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
716 static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; |
1 | 717 |
718 /* column code adapted from peter gubanov */ | |
719 /* http://www.elecard.com/peter/idct.shtml */ | |
720 | |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
721 movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ |
1 | 722 |
9852 | 723 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ |
724 movq_r2r (mm0, mm2); /* mm2 = T1 */ | |
1 | 725 |
9852 | 726 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ |
727 pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ | |
1 | 728 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
729 movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ |
9852 | 730 pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ |
1 | 731 |
9852 | 732 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ |
733 movq_r2r (mm5, mm7); /* mm7 = T3-1 */ | |
1 | 734 |
9852 | 735 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ |
736 psubsw_r2r (mm4, mm0); /* mm0 = v17 */ | |
1 | 737 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
738 movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ |
9852 | 739 pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ |
1 | 740 |
9852 | 741 paddsw_r2r (mm2, mm1); /* mm1 = u17 */ |
742 pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ | |
1 | 743 |
36 | 744 /* slot */ |
1 | 745 |
9852 | 746 movq_r2r (mm4, mm2); /* mm2 = T2 */ |
747 paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ | |
1 | 748 |
9852 | 749 pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ |
750 paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ | |
1 | 751 |
9852 | 752 psubsw_r2r (mm6, mm5); /* mm5 = v35 */ |
753 paddsw_r2r (mm3, mm7); /* mm7 = u35 */ | |
1 | 754 |
9852 | 755 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ |
756 movq_r2r (mm0, mm6); /* mm6 = v17 */ | |
1 | 757 |
9852 | 758 pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ |
759 psubsw_r2r (mm5, mm0); /* mm0 = b3 */ | |
1 | 760 |
9852 | 761 psubsw_r2r (mm3, mm4); /* mm4 = v26 */ |
762 paddsw_r2r (mm6, mm5); /* mm5 = v12 */ | |
1 | 763 |
9852 | 764 movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ |
765 movq_r2r (mm1, mm6); /* mm6 = u17 */ | |
1 | 766 |
9852 | 767 paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ |
768 paddsw_r2r (mm7, mm6); /* mm6 = b0 */ | |
1 | 769 |
9852 | 770 psubsw_r2r (mm7, mm1); /* mm1 = u12 */ |
771 movq_r2r (mm1, mm7); /* mm7 = u12 */ | |
1 | 772 |
9852 | 773 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ |
774 paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ | |
1 | 775 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
776 movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */ |
9852 | 777 psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ |
1 | 778 |
9852 | 779 movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ |
780 pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ | |
1 | 781 |
9852 | 782 movq_r2r (mm4, mm6); /* mm6 = v26 */ |
783 pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ | |
1 | 784 |
9852 | 785 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ |
786 movq_r2r (mm3, mm0); /* mm0 = x0 */ | |
1 | 787 |
9852 | 788 psubsw_r2r (mm5, mm3); /* mm3 = v04 */ |
789 paddsw_r2r (mm5, mm0); /* mm0 = u04 */ | |
1 | 790 |
9852 | 791 paddsw_r2r (mm3, mm4); /* mm4 = a1 */ |
792 movq_r2r (mm0, mm5); /* mm5 = u04 */ | |
1 | 793 |
9852 | 794 psubsw_r2r (mm6, mm3); /* mm3 = a2 */ |
795 paddsw_r2r (mm2, mm5); /* mm5 = a0 */ | |
1 | 796 |
9852 | 797 paddsw_r2r (mm1, mm1); /* mm1 = b1 */ |
798 psubsw_r2r (mm2, mm0); /* mm0 = a3 */ | |
1 | 799 |
9852 | 800 paddsw_r2r (mm7, mm7); /* mm7 = b2 */ |
801 movq_r2r (mm3, mm2); /* mm2 = a2 */ | |
1 | 802 |
9852 | 803 movq_r2r (mm4, mm6); /* mm6 = a1 */ |
804 paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ | |
1 | 805 |
9852 | 806 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ |
807 paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ | |
1 | 808 |
9852 | 809 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ |
810 psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ | |
1 | 811 |
9852 | 812 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ |
813 psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ | |
1 | 814 |
9852 | 815 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ |
816 movq_r2r (mm5, mm7); /* mm7 = a0 */ | |
1 | 817 |
9852 | 818 movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ |
819 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ | |
1 | 820 |
9852 | 821 movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ |
822 paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ | |
1 | 823 |
9852 | 824 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ |
825 psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ | |
1 | 826 |
9852 | 827 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ |
828 movq_r2r (mm0, mm3); /* mm3 = a3 */ | |
1 | 829 |
9852 | 830 movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ |
831 psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ | |
1 | 832 |
9852 | 833 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ |
834 paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ | |
1 | 835 |
9852 | 836 movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ |
837 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ | |
1 | 838 |
9852 | 839 movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ |
840 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ | |
1 | 841 |
9852 | 842 movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ |
1 | 843 |
9852 | 844 movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ |
1 | 845 |
9852 | 846 movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ |
1 | 847 } |
848 | |
849 | |
9852 | 850 static const int32_t rounder0[] ATTR_ALIGN(8) = |
1 | 851 rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
9852 | 852 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); |
853 static const int32_t rounder1[] ATTR_ALIGN(8) = | |
36 | 854 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
9852 | 855 static const int32_t rounder7[] ATTR_ALIGN(8) = |
36 | 856 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
9852 | 857 static const int32_t rounder2[] ATTR_ALIGN(8) = |
36 | 858 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
9852 | 859 static const int32_t rounder6[] ATTR_ALIGN(8) = |
36 | 860 rounder (-0.25); /* C2 * (C6-C2)/2 */ |
9852 | 861 static const int32_t rounder3[] ATTR_ALIGN(8) = |
36 | 862 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
9852 | 863 static const int32_t rounder5[] ATTR_ALIGN(8) = |
36 | 864 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
1 | 865 |
866 | |
/*
 * declare_idct (idct, table, idct_row_head, idct_row, idct_row_tail,
 *               idct_row_mid)
 *
 * Expands to the definition of
 *     static inline void idct (int16_t * const block)
 *
 *   idct           name of the function to define
 *   table          macro that builds a coefficient table from seven constants
 *   idct_row_head  invoked once, before the first row:  (block, offset, table)
 *   idct_row       row transform:                       (table, rounder)
 *   idct_row_mid   invoked between two rows:            (block, offset of the
 *                  row just done, offset of the next row, table of the next
 *                  row)
 *   idct_row_tail  invoked once, after the last row:    (block, offset)
 *
 * Rows are handled in the order 0, 4, 1, 7, 2, 6, 3, 5, so that the two rows
 * sharing a table (04, 17, 26, 35) are adjacent.  Afterwards idct_col is run
 * twice, with offsets 0 and 4.
 */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
static inline void idct (int16_t * const block) \
{ \
    static const int16_t table04[] ATTR_ALIGN(16) = \
        table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
    static const int16_t table17[] ATTR_ALIGN(16) = \
        table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
    static const int16_t table26[] ATTR_ALIGN(16) = \
        table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
    static const int16_t table35[] ATTR_ALIGN(16) = \
        table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
 \
    /* rows 0 and 4 */ \
    idct_row_head (block, 0*8, table04); \
    idct_row (table04, rounder0); \
    idct_row_mid (block, 0*8, 4*8, table04); \
    idct_row (table04, rounder4); \
 \
    /* rows 1 and 7 */ \
    idct_row_mid (block, 4*8, 1*8, table17); \
    idct_row (table17, rounder1); \
    idct_row_mid (block, 1*8, 7*8, table17); \
    idct_row (table17, rounder7); \
 \
    /* rows 2 and 6 */ \
    idct_row_mid (block, 7*8, 2*8, table26); \
    idct_row (table26, rounder2); \
    idct_row_mid (block, 2*8, 6*8, table26); \
    idct_row (table26, rounder6); \
 \
    /* rows 3 and 5 */ \
    idct_row_mid (block, 6*8, 3*8, table35); \
    idct_row (table35, rounder3); \
    idct_row_mid (block, 3*8, 5*8, table35); \
    idct_row (table35, rounder5); \
    idct_row_tail (block, 5*8); \
 \
    /* column pass */ \
    idct_col (block, 0); \
    idct_col (block, 4); \
}
900 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
901 static inline void sse2_idct (int16_t * const block) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
902 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
903 static const int16_t table04[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
904 sse2_table (22725, 21407, 19266, 16384, 12873, 8867, 4520); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
905 static const int16_t table17[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
906 sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
907 static const int16_t table26[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
908 sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
909 static const int16_t table35[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
910 sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
911 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
912 static const int32_t rounder0_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
913 rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
914 static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
915 static const int32_t rounder1_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
916 rounder_sse2 (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
917 static const int32_t rounder7_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
918 rounder_sse2 (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
919 static const int32_t rounder2_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
920 rounder_sse2 (0.60355339059); /* C2 * (C6+C2)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
921 static const int32_t rounder6_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
922 rounder_sse2 (-0.25); /* C2 * (C6-C2)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
923 static const int32_t rounder3_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
924 rounder_sse2 (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
925 static const int32_t rounder5_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
926 rounder_sse2 (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
927 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
928 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
929 movdqa_m2r (block[0*8], xmm8); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
930 movdqa_m2r (block[4*8], xmm12); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
931 SSE2_IDCT_2ROW (table04, xmm8, xmm12, *rounder0_128, *rounder4_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
932 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
933 movdqa_m2r (block[1*8], xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
934 movdqa_m2r (block[7*8], xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
935 SSE2_IDCT_2ROW (table17, xmm9, xmm15, *rounder1_128, *rounder7_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
936 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
937 movdqa_m2r (block[2*8], xmm10); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
938 movdqa_m2r (block[6*8], xmm14); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
939 SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
940 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
941 movdqa_m2r (block[3*8], xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
942 movdqa_m2r (block[5*8], xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
943 SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
944 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
945 /* OUTPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
946 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
947 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
948 movdqa_m2r (block[0*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
949 movdqa_m2r (block[4*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
950 SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
951 movdqa_r2m (xmm0, block[0*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
952 movdqa_r2m (xmm4, block[4*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
953 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
954 movdqa_m2r (block[1*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
955 movdqa_m2r (block[7*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
956 SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
957 movdqa_r2m (xmm0, block[1*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
958 movdqa_r2m (xmm4, block[7*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
959 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
960 movdqa_m2r (block[2*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
961 movdqa_m2r (block[6*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
962 SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
963 movdqa_r2m (xmm0, block[2*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
964 movdqa_r2m (xmm4, block[6*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
965 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
966 movdqa_m2r (block[3*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
967 movdqa_m2r (block[5*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
968 SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
969 movdqa_r2m (xmm0, block[3*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
970 movdqa_r2m (xmm4, block[5*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
971 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
972 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
973 sse2_idct_col (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
974 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
975 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
976 static void sse2_block_copy (int16_t * const block, uint8_t * dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
977 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
978 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
979 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
980 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
981 packuswb_r2r (xmm8, xmm8); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
982 packuswb_r2r (xmm9, xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
983 movq_r2m (xmm8, *(dest+0*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
984 packuswb_r2r (xmm10, xmm10); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
985 movq_r2m (xmm9, *(dest+1*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
986 packuswb_r2r (xmm11, xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
987 movq_r2m (xmm10, *(dest+2*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
988 packuswb_r2r (xmm12, xmm12); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
989 movq_r2m (xmm11, *(dest+3*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
990 packuswb_r2r (xmm13, xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
991 movq_r2m (xmm12, *(dest+4*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
992 packuswb_r2r (xmm14, xmm14); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
993 movq_r2m (xmm13, *(dest+5*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
994 packuswb_r2r (xmm15, xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
995 movq_r2m (xmm14, *(dest+6*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
996 movq_r2m (xmm15, *(dest+7*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
997 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
998 movdqa_m2r (*(block+0*8), xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
999 movdqa_m2r (*(block+1*8), xmm1); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1000 movdqa_m2r (*(block+2*8), xmm2); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1001 packuswb_r2r (xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1002 movdqa_m2r (*(block+3*8), xmm3); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1003 packuswb_r2r (xmm1, xmm1); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1004 movdqa_m2r (*(block+4*8), xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1005 packuswb_r2r (xmm2, xmm2); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1006 movdqa_m2r (*(block+5*8), xmm5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1007 packuswb_r2r (xmm3, xmm3); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1008 movdqa_m2r (*(block+6*8), xmm6); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1009 packuswb_r2r (xmm4, xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1010 movdqa_m2r (*(block+7*8), xmm7); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1011 movq_r2m (xmm0, *(dest+0*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1012 packuswb_r2r (xmm5, xmm5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1013 movq_r2m (xmm1, *(dest+1*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1014 packuswb_r2r (xmm6, xmm6); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1015 movq_r2m (xmm2, *(dest+2*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1016 packuswb_r2r (xmm7, xmm7); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1017 movq_r2m (xmm3, *(dest+3*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1018 movq_r2m (xmm4, *(dest+4*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1019 movq_r2m (xmm5, *(dest+5*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1020 movq_r2m (xmm6, *(dest+6*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1021 movq_r2m (xmm7, *(dest+7*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1022 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1023 } |
1 | 1024 |
/*
 * COPY_MMX (offset, r0, r1, r2): one step of the pipelined row copy in
 * block_copy.
 *
 *   offset  index into block of the first sample of the row to fetch
 *   r0, r1  MMX registers that receive the low / high four samples of that
 *           row; on exit r0 holds the row packed to eight unsigned bytes
 *   r2      MMX register holding the previously packed row, which is stored
 *
 * Uses and modifies the caller's variables: reads block and stride, and
 * advances dest by one stride before storing r2 through it.
 *
 * offset is parenthesized so that an argument containing an operator of
 * lower precedence than '+' is still evaluated as one operand.  The register
 * arguments are bare register names and must stay unparenthesized.
 */
#define COPY_MMX(offset,r0,r1,r2) \
do { \
    movq_m2r (*(block+(offset)), r0); \
    dest += stride; \
    movq_m2r (*(block+(offset)+4), r1); \
    movq_r2m (r2, *dest); \
    packuswb_r2r (r1, r0); \
} while (0)
1033 | |
9852 | 1034 static inline void block_copy (int16_t * const block, uint8_t * dest, |
1035 const int stride) | |
1 | 1036 { |
1037 movq_m2r (*(block+0*8), mm0); | |
1038 movq_m2r (*(block+0*8+4), mm1); | |
1039 movq_m2r (*(block+1*8), mm2); | |
1040 packuswb_r2r (mm1, mm0); | |
1041 movq_m2r (*(block+1*8+4), mm3); | |
1042 movq_r2m (mm0, *dest); | |
1043 packuswb_r2r (mm3, mm2); | |
1044 COPY_MMX (2*8, mm0, mm1, mm2); | |
1045 COPY_MMX (3*8, mm2, mm3, mm0); | |
1046 COPY_MMX (4*8, mm0, mm1, mm2); | |
1047 COPY_MMX (5*8, mm2, mm3, mm0); | |
1048 COPY_MMX (6*8, mm0, mm1, mm2); | |
1049 COPY_MMX (7*8, mm2, mm3, mm0); | |
1050 movq_r2m (mm2, *(dest+stride)); | |
1051 } | |
1052 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * ADD_SSE2_2ROW (op, block0, block1): add two rows of int16_t residual to
 * two rows of 8-bit pixels, in place.
 *
 *   op      operand form of the paddsw used for the residual: r2r when
 *           block0/block1 are registers, m2r when they are memory operands
 *   block0  residual for the row at dest
 *   block1  residual for the row at dest + stride
 *
 * Requires xmm0 to be zero: it is the high byte when the pixels are widened
 * to words with punpcklbw.  Clobbers xmm1 and xmm2.  The sums saturate
 * (paddsw, then packuswb back to unsigned bytes).
 * Uses the caller's dest and stride, and advances dest by two rows.
 */
#define ADD_SSE2_2ROW(op, block0, block1) \
do { \
    /* fetch eight pixels of each row */ \
    movq_m2r (dest[0], xmm1); \
    movq_m2r (dest[stride], xmm2); \
    /* bytes -> words */ \
    punpcklbw_r2r (xmm0, xmm1); \
    punpcklbw_r2r (xmm0, xmm2); \
    /* add the residual */ \
    paddsw_##op (block0, xmm1); \
    paddsw_##op (block1, xmm2); \
    /* words -> bytes */ \
    packuswb_r2r (xmm1, xmm1); \
    packuswb_r2r (xmm2, xmm2); \
    /* write both rows back */ \
    movq_r2m (xmm1, dest[0]); \
    movq_r2m (xmm2, dest[stride]); \
    dest += 2*stride; \
} while (0)
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1067 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1068 static void sse2_block_add (int16_t * const block, uint8_t * dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1069 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1070 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1071 pxor_r2r(xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1072 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1073 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1074 ADD_SSE2_2ROW(r2r, xmm8, xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1075 ADD_SSE2_2ROW(r2r, xmm10, xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1076 ADD_SSE2_2ROW(r2r, xmm12, xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1077 ADD_SSE2_2ROW(r2r, xmm14, xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1078 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1079 ADD_SSE2_2ROW(m2r, *(block+0*8), *(block+1*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1080 ADD_SSE2_2ROW(m2r, *(block+2*8), *(block+3*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1081 ADD_SSE2_2ROW(m2r, *(block+4*8), *(block+5*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1082 ADD_SSE2_2ROW(m2r, *(block+6*8), *(block+7*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1083 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1084 } |
1 | 1085 |
/*
 * ADD_MMX (offset, r1, r2, r3, r4): one step of the pipelined add in
 * block_add.
 *
 *   offset  index into block of the first residual sample of the row that
 *           is fetched in this step
 *   r1, r2  MMX registers that receive the low / high four pixels of that
 *           row (two strides past dest on entry), widened to words, with
 *           the residual added
 *   r3, r4  MMX registers holding the low / high sums of the previous row;
 *           they are packed into r3 and stored one stride past the entry
 *           value of dest
 *
 * Requires mm0 to be zero (high byte for the widening).  Uses the caller's
 * block, dest and stride, and advances dest by one stride.
 *
 * offset is parenthesized so that an argument containing an operator of
 * lower precedence than '+' is still evaluated as one operand.  The register
 * arguments are bare register names and must stay unparenthesized.
 */
#define ADD_MMX(offset,r1,r2,r3,r4) \
do { \
    movq_m2r (*(dest+2*stride), r1); \
    packuswb_r2r (r4, r3); \
    movq_r2r (r1, r2); \
    dest += stride; \
    movq_r2m (r3, *dest); \
    punpcklbw_r2r (mm0, r1); \
    paddsw_m2r (*(block+(offset)), r1); \
    punpckhbw_r2r (mm0, r2); \
    paddsw_m2r (*(block+(offset)+4), r2); \
} while (0)
1098 | |
/*
 * block_add: add an 8x8 block of int16_t values to the 8x8 byte area at
 * dest (row pitch = stride), in place.
 *
 * block  - 64 values, read at block+0*8 .. block+7*8 (+4 for the upper
 *          half of each row); not written here.
 * dest   - first row of the destination; the local copy is advanced by
 *          the ADD_MMX expansions.
 * stride - distance in bytes between consecutive destination rows.
 *
 * mm0 is zeroed and used to widen bytes to words. Sums use paddsw (signed
 * saturating add) and are written back with packuswb (unsigned saturating
 * pack). Rows 0 and 1 are set up inline and row 0 is stored directly. The
 * six ADD_MMX steps store rows 1..6 and prepare rows 2..7. The final two
 * statements store row 7. Loads, arithmetic and stores of neighbouring rows
 * are interleaved, so statement order matters.
 */
9852 | 1099 static inline void block_add (int16_t * const block, uint8_t * dest,
1100 const int stride) | |
1 | 1101 {
1102 movq_m2r (*dest, mm1); | |
1103 pxor_r2r (mm0, mm0); | |
1104 movq_m2r (*(dest+stride), mm3); | |
1105 movq_r2r (mm1, mm2); | |
1106 punpcklbw_r2r (mm0, mm1); | |
1107 movq_r2r (mm3, mm4); | |
1108 paddsw_m2r (*(block+0*8), mm1); | |
1109 punpckhbw_r2r (mm0, mm2); | |
1110 paddsw_m2r (*(block+0*8+4), mm2); | |
1111 punpcklbw_r2r (mm0, mm3); | |
1112 paddsw_m2r (*(block+1*8), mm3); | |
1113 packuswb_r2r (mm2, mm1); | |
1114 punpckhbw_r2r (mm0, mm4); | |
1115 movq_r2m (mm1, *dest); | |
1116 paddsw_m2r (*(block+1*8+4), mm4); | |
1117 ADD_MMX (2*8, mm1, mm2, mm3, mm4); | |
1118 ADD_MMX (3*8, mm3, mm4, mm1, mm2); | |
1119 ADD_MMX (4*8, mm1, mm2, mm3, mm4); | |
1120 ADD_MMX (5*8, mm3, mm4, mm1, mm2); | |
1121 ADD_MMX (6*8, mm1, mm2, mm3, mm4); | |
1122 ADD_MMX (7*8, mm3, mm4, mm1, mm2); | |
1123 packuswb_r2r (mm4, mm3); | |
1124 movq_r2m (mm3, *(dest+stride)); | |
1125 } | |
1126 | |
1127 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_block_zero: clear the 64-entry int16_t block using SSE2 stores.
 *
 * xmm0 is zeroed with pxor and then stored eight times, at block + k*8 for
 * k = 0..7. Each store covers eight int16_t values (128 bits).
 * NOTE(review): movdqa is the aligned 128-bit move, so block is presumably
 * required to be 16-byte aligned - confirm at the callers.
 */
1128 static inline void sse2_block_zero (int16_t * const block) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1129 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1130 pxor_r2r (xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1131 movdqa_r2m (xmm0, *(block+0*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1132 movdqa_r2m (xmm0, *(block+1*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1133 movdqa_r2m (xmm0, *(block+2*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1134 movdqa_r2m (xmm0, *(block+3*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1135 movdqa_r2m (xmm0, *(block+4*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1136 movdqa_r2m (xmm0, *(block+5*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1137 movdqa_r2m (xmm0, *(block+6*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1138 movdqa_r2m (xmm0, *(block+7*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1139 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1140 |
/*
 * block_zero: clear the 64-entry int16_t block using MMX stores.
 *
 * mm0 is zeroed with pxor and then stored sixteen times, at block + k*4 for
 * k = 0..15. Each 64-bit movq covers four int16_t values, so the sixteen
 * stores cover all 64 coefficients.
 */
9852 | 1141 static inline void block_zero (int16_t * const block)
1142 { | |
1143 pxor_r2r (mm0, mm0); | |
1144 movq_r2m (mm0, *(block+0*4)); | |
1145 movq_r2m (mm0, *(block+1*4)); | |
1146 movq_r2m (mm0, *(block+2*4)); | |
1147 movq_r2m (mm0, *(block+3*4)); | |
1148 movq_r2m (mm0, *(block+4*4)); | |
1149 movq_r2m (mm0, *(block+5*4)); | |
1150 movq_r2m (mm0, *(block+6*4)); | |
1151 movq_r2m (mm0, *(block+7*4)); | |
1152 movq_r2m (mm0, *(block+8*4)); | |
1153 movq_r2m (mm0, *(block+9*4)); | |
1154 movq_r2m (mm0, *(block+10*4)); | |
1155 movq_r2m (mm0, *(block+11*4)); | |
1156 movq_r2m (mm0, *(block+12*4)); | |
1157 movq_r2m (mm0, *(block+13*4)); | |
1158 movq_r2m (mm0, *(block+14*4)); | |
1159 movq_r2m (mm0, *(block+15*4)); | |
1160 } | |
1161 | |
1162 | |
/* Selector values for the `cpu` argument tested by dup4() below. */
1163 #define CPU_MMXEXT 0 | |
1164 #define CPU_MMX 1 | |
1165 | |
/*
 * dup4(reg): replicate the low 16-bit word of reg into all four words.
 *
 * Reads a variable named `cpu` from the expansion site.
 *  - cpu != CPU_MMXEXT: two unpack steps (punpcklwd, then punpckldq),
 *    each interleaving reg with itself.
 *  - otherwise: a single pshufw with selector 0x00.
 */
1166 #define dup4(reg) \ | |
1167 do { \ | |
1168 if (cpu != CPU_MMXEXT) { \ | |
1169 punpcklwd_r2r (reg, reg); \ | |
1170 punpckldq_r2r (reg, reg); \ | |
1171 } else \ | |
1172 pshufw_r2r (reg, reg, 0x00); \ | |
1173 } while (0) | |
1174 | |
/*
 * block_add_DC: add a single constant, derived from block[0], to every byte
 * of the 8x8 area at dest (row pitch = stride).
 *
 * block  - only block[0] is read; block[0] and block[63] are set to 0.
 *          No other entry is touched here (the reason block[63] is reset
 *          is not visible in this excerpt).
 * dest   - first destination row; the local copy is advanced row by row.
 * stride - distance in bytes between consecutive destination rows.
 * cpu    - CPU_MMX or CPU_MMXEXT; selects the dup4() variant.
 *
 * dc = (block[0] + 64) >> 7 is placed in mm0 and replicated with dup4.
 * mm1 = 0 - dc (psubsw). Packing each register with unsigned saturation
 * leaves dc clamped to 0..255 in the bytes of mm0 and -dc clamped to 0..255
 * in the bytes of mm1. For each row, paddusb mm0 followed by psubusb mm1
 * therefore applies a dc of either sign with clamping to 0..255.
 *
 * Eight rows are processed with two rows in flight (mm2 / mm3). Loads and
 * stores are interleaved with the dest updates, so statement order matters.
 */
1175 static inline void block_add_DC (int16_t * const block, uint8_t * dest, | |
1176 const int stride, const int cpu) | |
1177 { | |
12932 | 1178 movd_v2r ((block[0] + 64) >> 7, mm0);
9852 | 1179 pxor_r2r (mm1, mm1);
1180 movq_m2r (*dest, mm2); | |
1181 dup4 (mm0); | |
1182 psubsw_r2r (mm0, mm1); | |
1183 packuswb_r2r (mm0, mm0); | |
1184 paddusb_r2r (mm0, mm2); | |
1185 packuswb_r2r (mm1, mm1); | |
1186 movq_m2r (*(dest + stride), mm3); | |
1187 psubusb_r2r (mm1, mm2); | |
1188 block[0] = 0; | |
1189 paddusb_r2r (mm0, mm3); | |
1190 movq_r2m (mm2, *dest); | |
1191 psubusb_r2r (mm1, mm3); | |
1192 movq_m2r (*(dest + 2*stride), mm2); | |
1193 dest += stride; | |
1194 movq_r2m (mm3, *dest); | |
1195 paddusb_r2r (mm0, mm2); | |
1196 movq_m2r (*(dest + 2*stride), mm3); | |
1197 psubusb_r2r (mm1, mm2); | |
1198 dest += stride; | |
1199 paddusb_r2r (mm0, mm3); | |
1200 movq_r2m (mm2, *dest); | |
1201 psubusb_r2r (mm1, mm3); | |
1202 movq_m2r (*(dest + 2*stride), mm2); | |
1203 dest += stride; | |
1204 movq_r2m (mm3, *dest); | |
1205 paddusb_r2r (mm0, mm2); | |
1206 movq_m2r (*(dest + 2*stride), mm3); | |
1207 psubusb_r2r (mm1, mm2); | |
1208 dest += stride; | |
1209 paddusb_r2r (mm0, mm3); | |
1210 movq_r2m (mm2, *dest); | |
1211 psubusb_r2r (mm1, mm3); | |
1212 movq_m2r (*(dest + 2*stride), mm2); | |
1213 dest += stride; | |
1214 movq_r2m (mm3, *dest); | |
1215 paddusb_r2r (mm0, mm2); | |
1216 movq_m2r (*(dest + 2*stride), mm3); | |
1217 psubusb_r2r (mm1, mm2); | |
1218 block[63] = 0; | |
1219 paddusb_r2r (mm0, mm3); | |
1220 movq_r2m (mm2, *(dest + stride)); | |
1221 psubusb_r2r (mm1, mm3); | |
1222 movq_r2m (mm3, *(dest + 2*stride)); | |
1223 } | |
1224 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * mpeg2_idct_copy_sse2: SSE2 "IDCT and copy" entry point.
 *
 * Calls, in order: sse2_idct(block), sse2_block_copy(block, dest, stride)
 * and sse2_block_zero(block). The first two helpers are defined outside
 * this excerpt. block is cleared on return.
 *
 * block  - 64 int16_t coefficients, transformed in place and then zeroed.
 * dest   - destination passed to sse2_block_copy.
 * stride - row pitch passed to sse2_block_copy.
 */
1225 void mpeg2_idct_copy_sse2 (int16_t * const block, uint8_t * const dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1226 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1227 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1228 sse2_idct (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1229 sse2_block_copy (block, dest, stride); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1230 sse2_block_zero (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1231 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1232 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * mpeg2_idct_add_sse2: SSE2 "IDCT and add to dest" entry point.
 *
 * Full path, taken when last != 129 or when bits 4..6 of block[0] equal
 * binary 100: sse2_idct(block), sse2_block_add(block, dest, stride), then
 * sse2_block_zero(block).
 * Otherwise only block_add_DC() runs, with the CPU_MMXEXT variant of dup4.
 *
 * NOTE(review): last == 129 presumably marks a block whose only non-zero
 * coefficient is block[0] - confirm against the caller that computes `last`.
 *
 * last   - selector supplied by the caller (see note above).
 * block  - 64 int16_t coefficients. The full path zeroes the whole block;
 *          the shortcut path resets block[0] and block[63] only.
 * dest   - destination whose bytes are modified in place.
 * stride - row pitch of dest.
 */
1233 void mpeg2_idct_add_sse2 (const int last, int16_t * const block, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1234 uint8_t * const dest, const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1235 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1236 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1237 sse2_idct (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1238 sse2_block_add (block, dest, stride); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1239 sse2_block_zero (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1240 } else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1241 block_add_DC (block, dest, stride, CPU_MMXEXT); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1242 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1243 |
9852 | 1244 |
/*
 * Invokes declare_idct (a macro defined outside this excerpt) with the
 * MMXEXT table and row helpers.
 * NOTE(review): presumably this expands to the definition of mmxext_idct(),
 * which the two entry points below call - confirm against the macro.
 */
1 | 1245 declare_idct (mmxext_idct, mmxext_table,
1246 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) | |
1247 | |
/*
 * mpeg2_idct_copy_mmxext: MMXEXT "IDCT and copy" entry point.
 *
 * Calls, in order: mmxext_idct(block), block_copy(block, dest, stride)
 * (defined outside this excerpt) and block_zero(block). block is cleared
 * on return.
 */
9852 | 1248 void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
1249 const int stride) | |
1 | 1250 {
1251 mmxext_idct (block); | |
1252 block_copy (block, dest, stride); | |
9852 | 1253 block_zero (block);
1 | 1254 }
1255 | |
/*
 * mpeg2_idct_add_mmxext: MMXEXT "IDCT and add to dest" entry point.
 *
 * Full path, taken when last != 129 or when bits 4..6 of block[0] equal
 * binary 100: mmxext_idct(block), block_add(block, dest, stride), then
 * block_zero(block).
 * Otherwise only block_add_DC() runs, with cpu = CPU_MMXEXT. That path
 * resets block[0] and block[63] rather than the whole block.
 *
 * NOTE(review): last == 129 presumably marks a block whose only non-zero
 * coefficient is block[0] - confirm against the caller.
 */
9852 | 1256 void mpeg2_idct_add_mmxext (const int last, int16_t * const block,
1257 uint8_t * const dest, const int stride) | |
1 | 1258 {
12932 | 1259 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
9852 | 1260 mmxext_idct (block);
1261 block_add (block, dest, stride); | |
1262 block_zero (block); | |
1263 } else | |
1264 block_add_DC (block, dest, stride, CPU_MMXEXT); | |
1 | 1265 }
1266 | |
1267 | |
/*
 * Invokes declare_idct (a macro defined outside this excerpt) with the
 * plain-MMX table and row helpers.
 * NOTE(review): presumably this expands to the definition of mmx_idct(),
 * which the two entry points below call - confirm against the macro.
 */
1268 declare_idct (mmx_idct, mmx_table, | |
1269 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) | |
1270 | |
/*
 * mpeg2_idct_copy_mmx: plain-MMX "IDCT and copy" entry point.
 *
 * Calls, in order: mmx_idct(block), block_copy(block, dest, stride)
 * (defined outside this excerpt) and block_zero(block). block is cleared
 * on return.
 */
9852 | 1271 void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
1272 const int stride) | |
1 | 1273 {
1274 mmx_idct (block); | |
1275 block_copy (block, dest, stride); | |
9852 | 1276 block_zero (block);
1 | 1277 }
1278 | |
/*
 * mpeg2_idct_add_mmx: plain-MMX "IDCT and add to dest" entry point.
 *
 * Full path, taken when last != 129 or when bits 4..6 of block[0] equal
 * binary 100: mmx_idct(block), block_add(block, dest, stride), then
 * block_zero(block).
 * Otherwise only block_add_DC() runs, with cpu = CPU_MMX, so dup4 uses the
 * two-step unpack sequence instead of pshufw. That path resets block[0] and
 * block[63] rather than the whole block.
 *
 * NOTE(review): last == 129 presumably marks a block whose only non-zero
 * coefficient is block[0] - confirm against the caller.
 */
9852 | 1279 void mpeg2_idct_add_mmx (const int last, int16_t * const block,
1280 uint8_t * const dest, const int stride) | |
1 | 1281 {
12932 | 1282 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
9852 | 1283 mmx_idct (block);
1284 block_add (block, dest, stride); | |
1285 block_zero (block); | |
1286 } else | |
1287 block_add_DC (block, dest, stride, CPU_MMX); | |
1 | 1288 }
1289 | |
1290 | |
/*
 * mpeg2_idct_mmx_init: remap the scan tables for the MMX/MMXEXT IDCT.
 *
 * Each of the 64 entries of mpeg2_scan_norm and mpeg2_scan_alt is rewritten
 * in place:
 *   - bits 3..5 (mask 0x38) are kept,
 *   - bits 1..2 move down to bits 0..1,
 *   - bit 0 moves up to bit 2.
 * In other words, the low three bits of every entry are rotated right by
 * one position.
 *
 * The new values are computed from the current table contents, so calling
 * this more than once applies the permutation again.
 */
9852 | 1291 void mpeg2_idct_mmx_init (void)
1 | 1292 {
1293 int i, j; | |
1294 | |
36 | 1295 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
1 | 1296
1297 for (i = 0; i < 64; i++) { | |
9852 | 1298 j = mpeg2_scan_norm[i];
1299 mpeg2_scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |
1300 j = mpeg2_scan_alt[i]; | |
1301 mpeg2_scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |
1 | 1302 }
1303 } | |
1304 | |
1305 #endif |