Mercurial > mplayer.hg
annotate libmpeg2/idct_mmx.c @ 35524:dd7de6052739
Add doxygen comments.
author | ib |
---|---|
date | Thu, 06 Dec 2012 02:30:19 +0000 |
parents | 25337a2147e7 |
children |
rev | line source |
---|---|
1 | 1 /* |
2 * idct_mmx.c | |
10303 | 3 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> |
9852 | 4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
1 | 5 * |
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
9852 | 7 * See http://libmpeg2.sourceforge.net/ for updates. |
1 | 8 * |
9 * mpeg2dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * mpeg2dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
22 */ | |
23 | |
24 #include "config.h" | |
25 | |
28290 | 26 #if ARCH_X86 || ARCH_X86_64 |
1 | 27 |
28 #include <inttypes.h> | |
29 | |
9852 | 30 #include "mpeg2.h" |
12932 | 31 #include "attributes.h" |
1 | 32 #include "mpeg2_internal.h" |
33 #include "mmx.h" | |
34 | |
/* Right shift applied to the 32-bit sums of the row pass; see the
 * psrad_i2r (ROW_SHIFT, ...) calls in the row routines of this file. */
12932 | 35 #define ROW_SHIFT 15 |
/* Right shift applied to the column results in the reference idct_col. */
1 | 36 #define COL_SHIFT 6 |
37 | |
/* round(bias) yields (bias + 0.5) scaled by 2^ROW_SHIFT, truncated to int.
 * NOTE(review): this macro reuses the name of the C99 math function round;
 * presumably math.h is never included after this point - confirm. */
38 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) | |
/* Brace initializer holding the rounded bias twice (one per 32-bit lane of
 * a 64-bit operand, matching the paddd_m2r (*rounder, ...) uses below). */
39 #define rounder(bias) {round (bias), round (bias)} | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* Same as rounder() but with four copies of the rounded bias. */
40 #define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)} |
1 | 41 |
42 | |
43 #if 0 | |
25998 | 44 /* C row IDCT - it is just here to document the MMXEXT and MMX versions */ |
/*
 * Reference (compiled out by the surrounding #if 0) 8-point row IDCT that
 * works in place on row[offset] .. row[offset+7].
 *
 * row     - base pointer to the 16-bit coefficients; advanced by offset
 * offset  - index of the first coefficient of the row to transform
 * table   - coefficients; entries 1..7 are read as C1..C7, entry 0 is unused
 * rounder - points to the bias that is added to each even-part sum a0..a3
 *
 * Inputs 0,2,4,6 form the even part a0..a3 and inputs 1,3,5,7 form the odd
 * part b0..b3. Output k is (a_k + b_k) >> ROW_SHIFT and output 7-k is
 * (a_k - b_k) >> ROW_SHIFT. No saturation is performed in this C version.
 */
1 | 45 static inline void idct_row (int16_t * row, int offset, |
46 int16_t * table, int32_t * rounder) | |
47 { | |
48 int C1, C2, C3, C4, C5, C6, C7; | |
49 int a0, a1, a2, a3, b0, b1, b2, b3; | |
50 | |
51 row += offset; | |
52 | |
53 C1 = table[1]; | |
54 C2 = table[2]; | |
55 C3 = table[3]; | |
56 C4 = table[4]; | |
57 C5 = table[5]; | |
58 C6 = table[6]; | |
59 C7 = table[7]; | |
60 | |
/* even part, bias included */
61 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; | |
62 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; | |
63 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; | |
64 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; | |
65 | |
/* odd part */
66 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; | |
67 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; | |
68 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; | |
69 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; | |
70 | |
/* butterfly: sums give outputs 0..3, differences give outputs 7..4 */
71 row[0] = (a0 + b0) >> ROW_SHIFT; | |
72 row[1] = (a1 + b1) >> ROW_SHIFT; | |
73 row[2] = (a2 + b2) >> ROW_SHIFT; | |
74 row[3] = (a3 + b3) >> ROW_SHIFT; | |
75 row[4] = (a3 - b3) >> ROW_SHIFT; | |
76 row[5] = (a2 - b2) >> ROW_SHIFT; | |
77 row[6] = (a1 - b1) >> ROW_SHIFT; | |
78 row[7] = (a0 - b0) >> ROW_SHIFT; | |
79 } | |
80 #endif | |
81 | |
82 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
83 /* SSE2 row IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * Expands to a 32-entry brace initializer of row-pass coefficients built
 * from c1..c7, arranged as four groups of eight values. SSE2_IDCT_2ROW
 * addresses the groups as table[0], table[1*8], table[2*8] and table[3*8]
 * and multiplies them with the (x2 x0), (x6 x4), (x3 x1) and (x7 x5) input
 * pairs respectively. The first two groups therefore produce the even part
 * and the last two the odd part, with the same sign pattern as the
 * reference idct_row.
 */
84 #define sse2_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
85 c4, -c6, c4, -c2, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
86 c4, c6, -c4, -c2, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
87 -c4, c2, c4, -c6, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
88 c1, c3, c3, -c7, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
89 c5, -c1, c7, -c5, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
90 c5, c7, -c1, -c5, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
91 c7, c3, c3, -c1 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
92 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * Row IDCT of two rows at once, both using the same coefficient table.
 *
 * table          - array laid out by sse2_table; read at [0], [1*8], [2*8]
 *                  and [3*8] as memory operands of pmaddwd_m2r
 * row1, row2     - register names (they are used as both source and
 *                  destination of the *_r2r macros); on entry each holds one
 *                  input row in the order x7 x5 x3 x1 x6 x4 x2 x0, on exit
 *                  each holds the eight packed 16-bit results of that row
 * round1, round2 - memory operands added to the even-part sums of row1 and
 *                  row2 respectively
 *
 * Overwrites xmm1, xmm2, xmm3, xmm5, xmm6 and xmm7 as scratch. Comments
 * tagged 1: belong to row1, comments tagged 2: to row2; the row2 steps
 * repeat the row1 steps using xmm5/xmm6/xmm7 in place of xmm1/xmm2/xmm3.
 * The exact behaviour of the instruction wrappers comes from mmx.h, which
 * is not part of this file.
 */
93 #define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do { \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
94 /* no scheduling: trust in out of order execution */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
95 /* based on Intel AP-945 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
96 /* (http://cache-www.intel.com/cd/00/00/01/76/17680_w_idct.pdf) */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
97 \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
98 /* input */ /* 1: row1= x7 x5 x3 x1 x6 x4 x2 x0 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
99 pshufd_r2r (row1, xmm1, 0); /* 1: xmm1= x2 x0 x2 x0 x2 x0 x2 x0 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
100 pmaddwd_m2r (table[0], xmm1); /* 1: xmm1= x2*C + x0*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
101 pshufd_r2r (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1 x3 x1 x3 x1 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
102 pmaddwd_m2r (table[2*8], xmm3); /* 1: xmm3= x3*C + x1*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
103 pshufd_r2r (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4 x6 x4 x6 x4 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
104 pshufd_r2r (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5 x7 x5 x7 x5 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
105 pmaddwd_m2r (table[1*8], xmm2); /* 1: xmm2= x6*C + x4*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
106 paddd_m2r (round1, xmm1); /* 1: xmm1= x2*C + x0*C + round ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
107 pmaddwd_m2r (table[3*8], row1); /* 1: row1= x7*C + x5*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
108 pshufd_r2r (row2, xmm5, 0); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
109 pshufd_r2r (row2, xmm6, 0x55); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
110 pmaddwd_m2r (table[0], xmm5); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
111 paddd_r2r (xmm2, xmm1); /* 1: xmm1= a[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
112 movdqa_r2r (xmm1, xmm2); /* 1: xmm2= a[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
113 pshufd_r2r (row2, xmm7, 0xaa); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
114 pmaddwd_m2r (table[1*8], xmm6); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
115 paddd_r2r (xmm3, row1); /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
116 pshufd_r2r (row2, row2, 0xff); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
117 psubd_r2r (row1, xmm2); /* 1: xmm2= a[] - b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
118 pmaddwd_m2r (table[2*8], xmm7); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
119 paddd_r2r (xmm1, row1); /* 1: row1= a[] + b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
120 psrad_i2r (ROW_SHIFT, xmm2); /* 1: xmm2= result 4...7 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
121 paddd_m2r (round2, xmm5); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
122 pmaddwd_m2r (table[3*8], row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
123 paddd_r2r (xmm6, xmm5); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
124 movdqa_r2r (xmm5, xmm6); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
125 psrad_i2r (ROW_SHIFT, row1); /* 1: row1= result 0...3 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
126 pshufd_r2r (xmm2, xmm2, 0x1b); /* 1: [0 1 2 3] -> [3 2 1 0] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
127 packssdw_r2r (xmm2, row1); /* 1: row1= result[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
128 paddd_r2r (xmm7, row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
129 psubd_r2r (row2, xmm6); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
130 paddd_r2r (xmm5, row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
131 psrad_i2r (ROW_SHIFT, xmm6); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
132 psrad_i2r (ROW_SHIFT, row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
133 pshufd_r2r (xmm6, xmm6, 0x1b); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
134 packssdw_r2r (xmm6, row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
135 } while (0) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
136 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
137 |
36 | 138 /* MMXEXT row IDCT */ |
1 | 139 |
/*
 * Expands to a 32-entry brace initializer of row-pass coefficients built
 * from c1..c7, arranged as eight groups of four values. The MMXEXT row
 * routines read the groups as 64-bit operands at table, table+4, table+8,
 * table+12, table+16, table+20, table+24 and table+28. The ordering differs
 * from mmx_table because the MMXEXT code swaps register halves with pshufw
 * instead of duplicating them.
 */
140 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ | |
141 c4, c6, c4, c6, \ | |
142 c1, c3, -c1, -c5, \ | |
143 c5, c7, c3, -c7, \ | |
144 c4, -c6, c4, -c6, \ | |
145 -c4, c2, c4, -c2, \ | |
146 c5, -c1, c3, -c1, \ | |
147 c7, c3, c7, -c5 } | |
148 | |
/*
 * First stage of the MMXEXT row IDCT: loads one input row and starts the
 * even-part multiplies.
 *
 * row    - base pointer to the 16-bit coefficient block
 * offset - index (in int16_t elements) of the row to load; elements
 *          offset..offset+3 go to mm2 and offset+4..offset+7 go to mm5
 * table  - coefficient table, expected to be laid out by mmxext_table
 *
 * Nothing is returned or stored. The result is the MMX register state
 * (mm0, mm2..mm6) described by the inline comments, which mmxext_row
 * consumes next.
 */
9852 | 149 static inline void mmxext_row_head (int16_t * const row, const int offset, |
150 const int16_t * const table) | |
1 | 151 { |
9852 | 152 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 153 |
9852 | 154 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
155 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 156 |
9852 | 157 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
158 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 159 |
9852 | 160 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
161 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ | |
1 | 162 |
/* swap the two 32-bit halves so the next multiply pairs x4/x6 with the
 * low lane and x0/x2 with the high lane */
9852 | 163 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
1 | 164 } |
165 | |
/*
 * Core of the MMXEXT row IDCT. Continues from the register state left by
 * mmxext_row_head or mmxext_row_mid.
 *
 * table   - coefficient table, expected to be laid out by mmxext_table;
 *           read here at table+8, +12, +16, +20, +24 and +28
 * rounder - points to the bias added to the even-part sums. It is read as a
 *           64-bit operand, i.e. two 32-bit values, as produced by the
 *           rounder() macro
 *
 * On exit, per the inline comments: mm1 = y1 y0 and mm3 = y6 y7 are already
 * shifted by ROW_SHIFT, while mm0 = a3+b3 a2+b2 and mm4 = a3-b3 a2-b2 are
 * still unshifted. The final shift, pack and store are left to
 * mmxext_row_tail or mmxext_row_mid.
 */
9852 | 166 static inline void mmxext_row (const int16_t * const table, |
167 const int32_t * const rounder) | |
1 | 168 { |
9852 | 169 movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ |
170 pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ | |
1 | 171 |
9852 | 172 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ |
173 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ | |
1 | 174 |
9852 | 175 movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ |
176 pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ | |
1 | 177 |
9852 | 178 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
179 pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ | |
1 | 180 |
9852 | 181 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ |
182 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 183 |
9852 | 184 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ |
185 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 186 |
9852 | 187 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ |
188 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 189 |
9852 | 190 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
191 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 192 |
9852 | 193 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
194 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 195 |
9852 | 196 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
197 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 198 |
9852 | 199 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
200 movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ | |
1 | 201 |
9852 | 202 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
203 psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ | |
1 | 204 } |
205 | |
/*
 * Last stage of the MMXEXT row IDCT: finishes the shifts left pending by
 * mmxext_row, packs the 32-bit results to 16 bits and writes the row out.
 *
 * row   - base pointer to the 16-bit coefficient block
 * store - index (in int16_t elements) at which the eight results are
 *         written: y0..y3 at row+store and y4..y7 at row+store+4
 *
 * Expects mm0, mm1, mm3 and mm4 as left by mmxext_row.
 */
9852 | 206 static inline void mmxext_row_tail (int16_t * const row, const int store) |
1 | 207 { |
9852 | 208 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 209 |
9852 | 210 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ |
1 | 211 |
9852 | 212 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 213 |
9852 | 214 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
1 | 215 |
9852 | 216 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
/* swap the words within each 32-bit half to restore ascending order */
217 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 218 |
36 | 219 /* slot */ |
1 | 220 |
9852 | 221 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 222 } |
223 | |
/*
 * Combined tail-and-head step of the MMXEXT row IDCT. It issues the same
 * instructions as mmxext_row_tail (for the row just computed) interleaved
 * with those of mmxext_row_head (for the next row).
 *
 * row    - base pointer to the 16-bit coefficient block
 * store  - index at which the finished row is written (row+store and
 *          row+store+4)
 * offset - index of the next row to load (row+offset and row+offset+4)
 * table  - coefficient table, expected to be laid out by mmxext_table
 *
 * Expects mm0, mm1, mm3 and mm4 as left by mmxext_row, and leaves the
 * registers in the state mmxext_row expects for the newly loaded row.
 */
9852 | 224 static inline void mmxext_row_mid (int16_t * const row, const int store, |
225 const int offset, | |
226 const int16_t * const table) | |
1 | 227 { |
9852 | 228 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
229 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 230 |
9852 | 231 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
232 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ | |
1 | 233 |
9852 | 234 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
235 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 236 |
9852 | 237 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
238 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 239 |
9852 | 240 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
241 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 242 |
9852 | 243 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
244 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ | |
1 | 245 |
9852 | 246 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
1 | 247 |
9852 | 248 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
249 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ | |
1 | 250 } |
251 | |
252 | |
36 | 253 /* MMX row IDCT */ |
1 | 254 |
/*
 * Expands to a 32-entry brace initializer of row-pass coefficients built
 * from c1..c7, arranged as eight groups of four values. The plain MMX row
 * routines read the groups as 64-bit operands at table, table+4, table+8,
 * table+12, table+16, table+20, table+24 and table+28. The values are the
 * same as in sse2_table but the groups are stored in a different order.
 */
255 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ | |
256 c4, c6, -c4, -c2, \ | |
257 c1, c3, c3, -c7, \ | |
258 c5, c7, -c1, -c5, \ | |
259 c4, -c6, c4, -c2, \ | |
260 -c4, c2, c4, -c6, \ | |
261 c5, -c1, c7, -c5, \ | |
262 c7, c3, c3, -c1 } | |
263 | |
/*
 * First stage of the plain MMX row IDCT: loads one input row, duplicates
 * the register halves with punpckldq/punpckhdq and starts the even-part
 * multiplies.
 *
 * row    - base pointer to the 16-bit coefficient block
 * offset - index (in int16_t elements) of the row to load; elements
 *          offset..offset+3 go to mm2 and offset+4..offset+7 go to mm5
 * table  - coefficient table, expected to be laid out by mmx_table; read
 *          here at table, table+4 and table+8
 *
 * Nothing is returned or stored. The result is the MMX register state
 * (mm0..mm6) described by the inline comments, which mmx_row consumes next.
 */
9852 | 264 static inline void mmx_row_head (int16_t * const row, const int offset, |
265 const int16_t * const table) | |
1 | 266 { |
9852 | 267 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 268 |
9852 | 269 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
270 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 271 |
9852 | 272 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
273 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 274 |
9852 | 275 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
1 | 276 |
9852 | 277 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
278 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 279 |
9852 | 280 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
281 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 282 } |
283 | |
/*
 * Core of the plain MMX row IDCT. Continues from the register state left by
 * mmx_row_head or mmx_row_mid.
 *
 * table   - coefficient table, expected to be laid out by mmx_table; read
 *           here at table+12, +16, +20, +24 and +28
 * rounder - points to the bias added to the even-part sums. It is read as a
 *           64-bit operand, i.e. two 32-bit values, as produced by the
 *           rounder() macro
 *
 * On exit, per the inline comments: mm1 = y1 y0 and mm3 = y6 y7 are already
 * shifted by ROW_SHIFT, while mm0 = a3+b3 a2+b2 and mm7 = a3-b3 a2-b2 are
 * still unshifted. Note that the difference lives in mm7 here, not in mm4
 * as in the MMXEXT variant. The final shift, pack and store are left to
 * mmx_row_tail or mmx_row_mid.
 */
9852 | 284 static inline void mmx_row (const int16_t * const table, |
285 const int32_t * const rounder) | |
1 | 286 { |
9852 | 287 pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ |
288 punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ | |
1 | 289 |
9852 | 290 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ |
291 punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ | |
1 | 292 |
9852 | 293 movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ |
294 pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ | |
1 | 295 |
9852 | 296 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
297 pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ | |
1 | 298 |
9852 | 299 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ |
300 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 301 |
9852 | 302 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ |
303 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 304 |
9852 | 305 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ |
306 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 307 |
9852 | 308 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
309 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 310 |
9852 | 311 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
312 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 313 |
9852 | 314 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
315 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 316 |
9852 | 317 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
318 movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ | |
1 | 319 |
9852 | 320 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
321 psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ | |
1 | 322 } |
323 | |
/*
 * Last stage of the plain MMX row IDCT: finishes the shifts left pending by
 * mmx_row, packs the 32-bit results to 16 bits and writes the row out.
 *
 * row   - base pointer to the 16-bit coefficient block
 * store - index (in int16_t elements) at which the eight results are
 *         written: y0..y3 at row+store and y4..y7 at row+store+4
 *
 * Expects mm0, mm1, mm3 and mm7 as left by mmx_row. Also overwrites mm4.
 */
9852 | 324 static inline void mmx_row_tail (int16_t * const row, const int store) |
1 | 325 { |
9852 | 326 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 327 |
9852 | 328 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ |
1 | 329 |
9852 | 330 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 331 |
9852 | 332 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
1 | 333 |
9852 | 334 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
335 movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ | |
1 | 336 |
/* swap the words within each 32-bit half using two shifts and an OR;
 * the MMXEXT variant does the same job with a single pshufw */
9852 | 337 pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ |
1 | 338 |
9852 | 339 psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ |
1 | 340 |
9852 | 341 por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ |
1 | 342 |
36 | 343 /* slot */ |
1 | 344 |
9852 | 345 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 346 } |
347 | |
/*
 * Combined tail-and-head step of the plain MMX row IDCT. It finishes and
 * stores the row just computed (as mmx_row_tail does) while loading the
 * next row and starting its multiplies (as mmx_row_head does).
 *
 * row    - base pointer to the 16-bit coefficient block
 * store  - index at which the finished row is written (row+store and
 *          row+store+4)
 * offset - index of the next row to load (row+offset and row+offset+4)
 * table  - coefficient table, expected to be laid out by mmx_table
 *
 * Expects mm0, mm1, mm3 and mm7 as left by mmx_row, and leaves the
 * registers in the state mmx_row expects for the newly loaded row. Unlike
 * mmx_row_tail, the word swap here uses mm1 as the scratch copy.
 */
9852 | 348 static inline void mmx_row_mid (int16_t * const row, const int store, |
349 const int offset, const int16_t * const table) | |
1 | 350 { |
9852 | 351 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
352 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 353 |
9852 | 354 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
355 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ | |
1 | 356 |
9852 | 357 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
358 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 359 |
9852 | 360 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
361 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 362 |
9852 | 363 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
364 movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ | |
1 | 365 |
9852 | 366 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
367 psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ | |
1 | 368 |
9852 | 369 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
370 pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ | |
1 | 371 |
9852 | 372 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
373 por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ | |
1 | 374 |
9852 | 375 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
376 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 377 |
9852 | 378 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
379 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 380 } |
381 | |
382 | |
383 #if 0 | |
25998 | 384 /* C column IDCT - it is just here to document the MMXEXT and MMX versions */ |
/*
 * Reference (compiled out by the surrounding #if 0) 8-point column IDCT
 * that works in place on col[offset + k*8] for k = 0..7, i.e. on one column
 * of a block whose rows are 8 elements apart.
 *
 * col    - base pointer to the 16-bit coefficient block; advanced by offset
 * offset - index of the column to transform
 *
 * All intermediates are int16_t and every sum or difference goes through S
 * so that it saturates instead of wrapping. The final results are saturated
 * first and then shifted right by COL_SHIFT. T1, T2, T3 and C4 are the
 * file-level constants defined after this function.
 */
1 | 385 static inline void idct_col (int16_t * col, int offset) |
386 { | |
36 | 387 /* multiplication - as implemented on mmx */ |
1 | 388 #define F(c,x) (((c) * (x)) >> 16) |
389 | |
36 | 390 /* saturation - it helps us handle torture test cases */ |
1 | 391 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) |
392 | |
393 int16_t x0, x1, x2, x3, x4, x5, x6, x7; | |
394 int16_t y0, y1, y2, y3, y4, y5, y6, y7; | |
395 int16_t a0, a1, a2, a3, b0, b1, b2, b3; | |
396 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; | |
397 | |
398 col += offset; | |
399 | |
400 x0 = col[0*8]; | |
401 x1 = col[1*8]; | |
402 x2 = col[2*8]; | |
403 x3 = col[3*8]; | |
404 x4 = col[4*8]; | |
405 x5 = col[5*8]; | |
406 x6 = col[6*8]; | |
407 x7 = col[7*8]; | |
408 | |
/* even part: built from inputs 0, 4, 2 and 6 */
409 u04 = S (x0 + x4); | |
410 v04 = S (x0 - x4); | |
36 | 411 u26 = S (F (T2, x6) + x2); |
412 v26 = S (F (T2, x2) - x6); | |
1 | 413 |
414 a0 = S (u04 + u26); | |
415 a1 = S (v04 + v26); | |
416 a2 = S (v04 - v26); | |
417 a3 = S (u04 - u26); | |
418 | |
/* odd part: built from inputs 1, 7, 3 and 5 */
36 | 419 u17 = S (F (T1, x7) + x1); |
420 v17 = S (F (T1, x1) - x7); | |
421 u35 = S (F (T3, x5) + x3); | |
422 v35 = S (F (T3, x3) - x5); | |
1 | 423 |
424 b0 = S (u17 + u35); | |
425 b3 = S (v17 - v35); | |
426 u12 = S (u17 - u35); | |
427 v12 = S (v17 + v35); | |
/* F drops 16 bits, so the product is doubled to apply C4 at its
 * intended scale */
36 | 428 u12 = S (2 * F (C4, u12)); |
429 v12 = S (2 * F (C4, v12)); | |
1 | 430 b1 = S (u12 + v12); |
431 b2 = S (u12 - v12); | |
432 | |
/* butterfly: sums give outputs 0..3, differences give outputs 7..4 */
433 y0 = S (a0 + b0) >> COL_SHIFT; | |
434 y1 = S (a1 + b1) >> COL_SHIFT; | |
435 y2 = S (a2 + b2) >> COL_SHIFT; | |
436 y3 = S (a3 + b3) >> COL_SHIFT; | |
437 | |
438 y4 = S (a3 - b3) >> COL_SHIFT; | |
439 y5 = S (a2 - b2) >> COL_SHIFT; | |
440 y6 = S (a1 - b1) >> COL_SHIFT; | |
441 y7 = S (a0 - b0) >> COL_SHIFT; | |
442 | |
443 col[0*8] = y0; | |
444 col[1*8] = y1; | |
445 col[2*8] = y2; | |
446 col[3*8] = y3; | |
447 col[4*8] = y4; | |
448 col[5*8] = y5; | |
449 col[6*8] = y6; | |
450 col[7*8] = y7; | |
451 } | |
452 #endif | |
453 | |
454 | |
/*
 * Fixed-point multipliers of the column pass. Numerically T1, T2 and T3
 * equal tan(pi/16), tan(2*pi/16) and tan(3*pi/16) scaled by 65536, and C4
 * equals cos(pi/4) scaled by 32768 (the reference idct_col doubles the
 * F (C4, x) product accordingly). T3 is larger than 32767, so it does not
 * fit a signed 16-bit lane; the comments in the SSE2 column code label the
 * loaded value T3-1 and add the operand back after pmulhw.
 */
455 #define T1 13036 | |
456 #define T2 27146 | |
457 #define T3 43790 | |
458 #define C4 23170 | |
459 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
460 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
461 /* SSE2 column IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
462 static inline void sse2_idct_col (int16_t * const col) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
463 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
464 /* Almost identical to mmxext version: */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
465 /* just do both 4x8 columns in parallel */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
466 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
467 static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
468 static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
469 static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
470 static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
471 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
472 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
473 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
474 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
475 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
476 movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
477 movdqa_r2r (xmm9, xmm1); /* xmm1 = x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
478 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
479 movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
480 pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
481 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
482 movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
483 pmulhw_r2r (xmm15, xmm2); /* xmm2 = T1*x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
484 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
485 movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
486 psubsw_r2r (xmm15, xmm0); /* xmm0 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
487 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
488 movdqa_m2r (*t2_vector, xmm9); /* xmm9 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
489 pmulhw_r2r (xmm11, xmm5); /* xmm5 = (T3-1)*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
490 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
491 paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
492 pmulhw_r2r (xmm13, xmm7); /* xmm7 = (T3-1)*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
493 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
494 movdqa_r2r (xmm9, xmm2); /* xmm2 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
495 paddsw_r2r (xmm11, xmm5); /* xmm5 = T3*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
496 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
497 pmulhw_r2r (xmm10, xmm9); /* xmm9 = T2*x2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
498 paddsw_r2r (xmm13, xmm7); /* xmm7 = T3*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
499 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
500 psubsw_r2r (xmm13, xmm5); /* xmm5 = v35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
501 paddsw_r2r (xmm11, xmm7); /* xmm7 = u35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
502 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
503 movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
504 pmulhw_r2r (xmm14, xmm2); /* xmm2 = T2*x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
505 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
506 psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
507 psubsw_r2r (xmm14, xmm9); /* xmm9 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
508 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
509 paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
510 movdqa_r2r (xmm0, xmm11); /* xmm11 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
511 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
512 movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
513 paddsw_r2r (xmm10, xmm2); /* xmm2 = u26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
514 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
515 paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
516 psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
517 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
518 movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
519 paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
520 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
521 movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
522 psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
523 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
524 movdqa_r2r (xmm6, xmm4); /* xmm4 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
525 pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
526 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
527 movdqa_r2r (xmm9, xmm6); /* xmm6 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
528 pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
529 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
530 movdqa_r2r (xmm8, xmm10); /* xmm10 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
531 movdqa_r2r (xmm8, xmm0); /* xmm0 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
532 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
533 psubsw_r2r (xmm12, xmm10); /* xmm10 = v04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
534 paddsw_r2r (xmm12, xmm0); /* xmm0 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
535 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
536 paddsw_r2r (xmm10, xmm9); /* xmm9 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
537 movdqa_r2r (xmm0, xmm8); /* xmm8 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
538 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
539 psubsw_r2r (xmm6, xmm10); /* xmm10 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
540 paddsw_r2r (xmm2, xmm8); /* xmm8 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
541 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
542 paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
543 psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
544 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
545 paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
546 movdqa_r2r (xmm10, xmm13); /* xmm13 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
547 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
548 movdqa_r2r (xmm9, xmm14); /* xmm14 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
549 paddsw_r2r (xmm7, xmm10); /* xmm10 = a2+b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
550 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
551 psraw_i2r (COL_SHIFT,xmm10); /* xmm10 = y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
552 paddsw_r2r (xmm1, xmm9); /* xmm9 = a1+b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
553 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
554 psraw_i2r (COL_SHIFT, xmm9); /* xmm9 = y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
555 psubsw_r2r (xmm1, xmm14); /* xmm14 = a1-b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
556 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
557 psubsw_r2r (xmm7, xmm13); /* xmm13 = a2-b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
558 psraw_i2r (COL_SHIFT,xmm14); /* xmm14 = y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
559 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
560 movdqa_r2r (xmm8, xmm15); /* xmm15 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
561 psraw_i2r (COL_SHIFT,xmm13); /* xmm13 = y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
562 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
563 paddsw_r2r (xmm4, xmm8); /* xmm8 = a0+b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
564 psubsw_r2r (xmm4, xmm15); /* xmm15 = a0-b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
565 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
566 psraw_i2r (COL_SHIFT, xmm8); /* xmm8 = y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
567 movdqa_r2r (xmm0, xmm12); /* xmm12 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
568 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
569 psubsw_r2r (xmm11, xmm12); /* xmm12 = a3-b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
570 psraw_i2r (COL_SHIFT,xmm15); /* xmm15 = y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
571 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
572 paddsw_r2r (xmm0, xmm11); /* xmm11 = a3+b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
573 psraw_i2r (COL_SHIFT,xmm12); /* xmm12 = y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
574 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
575 psraw_i2r (COL_SHIFT,xmm11); /* xmm11 = y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
576 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
577 /* OUTPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
578 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
579 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
580 movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
581 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
582 movdqa_m2r (*(col+1*8), xmm1); /* xmm1 = x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
583 movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
584 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
585 movdqa_m2r (*(col+7*8), xmm4); /* xmm4 = x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
586 pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
587 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
588 movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
589 pmulhw_r2r (xmm4, xmm2); /* xmm2 = T1*x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
590 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
591 movdqa_m2r (*(col+5*8), xmm6); /* xmm6 = x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
592 movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
593 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
594 movdqa_m2r (*(col+3*8), xmm3); /* xmm3 = x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
595 psubsw_r2r (xmm4, xmm0); /* xmm0 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
596 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
597 movdqa_m2r (*t2_vector, xmm4); /* xmm4 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
598 pmulhw_r2r (xmm3, xmm5); /* xmm5 = (T3-1)*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
599 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
600 paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
601 pmulhw_r2r (xmm6, xmm7); /* xmm7 = (T3-1)*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
602 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
603 /* slot */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
604 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
605 movdqa_r2r (xmm4, xmm2); /* xmm2 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
606 paddsw_r2r (xmm3, xmm5); /* xmm5 = T3*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
607 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
608 pmulhw_m2r (*(col+2*8), xmm4); /* xmm4 = T2*x2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
609 paddsw_r2r (xmm6, xmm7); /* xmm7 = T3*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
610 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
611 psubsw_r2r (xmm6, xmm5); /* xmm5 = v35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
612 paddsw_r2r (xmm3, xmm7); /* xmm7 = u35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
613 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
614 movdqa_m2r (*(col+6*8), xmm3); /* xmm3 = x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
615 movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
616 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
617 pmulhw_r2r (xmm3, xmm2); /* xmm2 = T2*x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
618 psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
619 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
620 psubsw_r2r (xmm3, xmm4); /* xmm4 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
621 paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
622 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
623 movdqa_r2m (xmm0, *(col+3*8)); /* save b3 in scratch0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
624 movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
625 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
626 paddsw_m2r (*(col+2*8), xmm2); /* xmm2 = u26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
627 paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
628 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
629 psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
630 movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
631 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
632 movdqa_m2r (*(col+0*8), xmm3); /* xmm3 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
633 paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
634 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
635 movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
636 psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
637 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
638 movdqa_r2m (xmm6, *(col+5*8)); /* save b0 in scratch1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
639 pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
640 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
641 movdqa_r2r (xmm4, xmm6); /* xmm6 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
642 pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
643 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
644 movdqa_m2r (*(col+4*8), xmm5); /* xmm5 = x4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
645 movdqa_r2r (xmm3, xmm0); /* xmm0 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
646 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
647 psubsw_r2r (xmm5, xmm3); /* xmm3 = v04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
648 paddsw_r2r (xmm5, xmm0); /* xmm0 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
649 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
650 paddsw_r2r (xmm3, xmm4); /* xmm4 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
651 movdqa_r2r (xmm0, xmm5); /* xmm5 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
652 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
653 psubsw_r2r (xmm6, xmm3); /* xmm3 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
654 paddsw_r2r (xmm2, xmm5); /* xmm5 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
655 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
656 paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
657 psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
658 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
659 paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
660 movdqa_r2r (xmm3, xmm2); /* xmm2 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
661 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
662 movdqa_r2r (xmm4, xmm6); /* xmm6 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
663 paddsw_r2r (xmm7, xmm3); /* xmm3 = a2+b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
664 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
665 psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
666 paddsw_r2r (xmm1, xmm4); /* xmm4 = a1+b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
667 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
668 psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
669 psubsw_r2r (xmm1, xmm6); /* xmm6 = a1-b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
670 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
671 movdqa_m2r (*(col+5*8), xmm1); /* xmm1 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
672 psubsw_r2r (xmm7, xmm2); /* xmm2 = a2-b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
673 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
674 psraw_i2r (COL_SHIFT, xmm6); /* xmm6 = y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
675 movdqa_r2r (xmm5, xmm7); /* xmm7 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
676 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
677 movdqa_r2m (xmm4, *(col+1*8)); /* save y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
678 psraw_i2r (COL_SHIFT, xmm2); /* xmm2 = y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
679 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
680 movdqa_r2m (xmm3, *(col+2*8)); /* save y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
681 paddsw_r2r (xmm1, xmm5); /* xmm5 = a0+b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
682 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
683 movdqa_m2r (*(col+3*8), xmm4); /* xmm4 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
684 psubsw_r2r (xmm1, xmm7); /* xmm7 = a0-b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
685 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
686 psraw_i2r (COL_SHIFT, xmm5); /* xmm5 = y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
687 movdqa_r2r (xmm0, xmm3); /* xmm3 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
688 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
689 movdqa_r2m (xmm2, *(col+5*8)); /* save y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
690 psubsw_r2r (xmm4, xmm3); /* xmm3 = a3-b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
691 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
692 psraw_i2r (COL_SHIFT, xmm7); /* xmm7 = y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
693 paddsw_r2r (xmm0, xmm4); /* xmm4 = a3+b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
694 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
695 movdqa_r2m (xmm5, *(col+0*8)); /* save y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
696 psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
697 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
698 movdqa_r2m (xmm6, *(col+6*8)); /* save y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
699 psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
700 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
701 movdqa_r2m (xmm7, *(col+7*8)); /* save y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
702 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
703 movdqa_r2m (xmm3, *(col+4*8)); /* save y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
704 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
705 movdqa_r2m (xmm4, *(col+3*8)); /* save y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
706 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
707 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
708 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
709 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
710 /* MMX column IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
711 static inline void idct_col (int16_t * const col, const int offset) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
712 { |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
713 static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
714 static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
715 static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
716 static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; |
1 | 717 |
718 /* column code adapted from peter gubanov */ | |
719 /* http://www.elecard.com/peter/idct.shtml */ | |
720 | |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
721 movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ |
1 | 722 |
9852 | 723 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ |
724 movq_r2r (mm0, mm2); /* mm2 = T1 */ | |
1 | 725 |
9852 | 726 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ |
727 pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ | |
1 | 728 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
729 movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ |
9852 | 730 pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ |
1 | 731 |
9852 | 732 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ |
733 movq_r2r (mm5, mm7); /* mm7 = T3-1 */ | |
1 | 734 |
9852 | 735 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ |
736 psubsw_r2r (mm4, mm0); /* mm0 = v17 */ | |
1 | 737 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
738 movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ |
9852 | 739 pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ |
1 | 740 |
9852 | 741 paddsw_r2r (mm2, mm1); /* mm1 = u17 */ |
742 pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ | |
1 | 743 |
36 | 744 /* slot */ |
1 | 745 |
9852 | 746 movq_r2r (mm4, mm2); /* mm2 = T2 */ |
747 paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ | |
1 | 748 |
9852 | 749 pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ |
750 paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ | |
1 | 751 |
9852 | 752 psubsw_r2r (mm6, mm5); /* mm5 = v35 */ |
753 paddsw_r2r (mm3, mm7); /* mm7 = u35 */ | |
1 | 754 |
9852 | 755 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ |
756 movq_r2r (mm0, mm6); /* mm6 = v17 */ | |
1 | 757 |
9852 | 758 pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ |
759 psubsw_r2r (mm5, mm0); /* mm0 = b3 */ | |
1 | 760 |
9852 | 761 psubsw_r2r (mm3, mm4); /* mm4 = v26 */ |
762 paddsw_r2r (mm6, mm5); /* mm5 = v12 */ | |
1 | 763 |
9852 | 764 movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ |
765 movq_r2r (mm1, mm6); /* mm6 = u17 */ | |
1 | 766 |
9852 | 767 paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ |
768 paddsw_r2r (mm7, mm6); /* mm6 = b0 */ | |
1 | 769 |
9852 | 770 psubsw_r2r (mm7, mm1); /* mm1 = u12 */ |
771 movq_r2r (mm1, mm7); /* mm7 = u12 */ | |
1 | 772 |
9852 | 773 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ |
774 paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ | |
1 | 775 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
776 movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */ |
9852 | 777 psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ |
1 | 778 |
9852 | 779 movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ |
780 pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ | |
1 | 781 |
9852 | 782 movq_r2r (mm4, mm6); /* mm6 = v26 */ |
783 pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ | |
1 | 784 |
9852 | 785 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ |
786 movq_r2r (mm3, mm0); /* mm0 = x0 */ | |
1 | 787 |
9852 | 788 psubsw_r2r (mm5, mm3); /* mm3 = v04 */ |
789 paddsw_r2r (mm5, mm0); /* mm0 = u04 */ | |
1 | 790 |
9852 | 791 paddsw_r2r (mm3, mm4); /* mm4 = a1 */ |
792 movq_r2r (mm0, mm5); /* mm5 = u04 */ | |
1 | 793 |
9852 | 794 psubsw_r2r (mm6, mm3); /* mm3 = a2 */ |
795 paddsw_r2r (mm2, mm5); /* mm5 = a0 */ | |
1 | 796 |
9852 | 797 paddsw_r2r (mm1, mm1); /* mm1 = b1 */ |
798 psubsw_r2r (mm2, mm0); /* mm0 = a3 */ | |
1 | 799 |
9852 | 800 paddsw_r2r (mm7, mm7); /* mm7 = b2 */ |
801 movq_r2r (mm3, mm2); /* mm2 = a2 */ | |
1 | 802 |
9852 | 803 movq_r2r (mm4, mm6); /* mm6 = a1 */ |
804 paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ | |
1 | 805 |
9852 | 806 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ |
807 paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ | |
1 | 808 |
9852 | 809 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ |
810 psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ | |
1 | 811 |
9852 | 812 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ |
813 psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ | |
1 | 814 |
9852 | 815 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ |
816 movq_r2r (mm5, mm7); /* mm7 = a0 */ | |
1 | 817 |
9852 | 818 movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ |
819 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ | |
1 | 820 |
9852 | 821 movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ |
822 paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ | |
1 | 823 |
9852 | 824 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ |
825 psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ | |
1 | 826 |
9852 | 827 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ |
828 movq_r2r (mm0, mm3); /* mm3 = a3 */ | |
1 | 829 |
9852 | 830 movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ |
831 psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ | |
1 | 832 |
9852 | 833 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ |
834 paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ | |
1 | 835 |
9852 | 836 movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ |
837 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ | |
1 | 838 |
9852 | 839 movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ |
840 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ | |
1 | 841 |
9852 | 842 movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ |
1 | 843 |
9852 | 844 movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ |
1 | 845 |
9852 | 846 movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ |
1 | 847 } |
848 | |
849 | |
/*
 * Rounding terms for the MMX row pass.  declare_idct hands rounderN to
 * idct_row together with the table used for row N.
 * Each array holds two copies of round (bias), i.e. (bias + 0.5) scaled by
 * (1 << ROW_SHIFT) and truncated to int, so rounder4 (bias 0) is the plain
 * half-unit rounding term.
 * rounder0 uses a bias of (1 << (COL_SHIFT - 1)) - 0.5.
 * NOTE(review): that presumably pre-loads the rounding for the later
 * COL_SHIFT right shift in the column pass -- confirm.
 */
9852 | 850 static const int32_t rounder0[] ATTR_ALIGN(8) = |
1 | 851 rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
9852 | 852 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); |
853 static const int32_t rounder1[] ATTR_ALIGN(8) = | |
36 | 854 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
9852 | 855 static const int32_t rounder7[] ATTR_ALIGN(8) = |
36 | 856 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
9852 | 857 static const int32_t rounder2[] ATTR_ALIGN(8) = |
36 | 858 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
9852 | 859 static const int32_t rounder6[] ATTR_ALIGN(8) = |
36 | 860 rounder (-0.25); /* C2 * (C6-C2)/2 */ |
9852 | 861 static const int32_t rounder3[] ATTR_ALIGN(8) = |
36 | 862 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
9852 | 863 static const int32_t rounder5[] ATTR_ALIGN(8) = |
36 | 864 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
1 | 865 |
866 | |
/*
 * declare_idct: expands to a static inline function
 *     void idct (int16_t * const block)
 * that runs a row pass followed by a column pass over `block`.
 *
 *   idct          - name given to the generated function
 *   table         - macro turning seven constants into a table initializer
 *   idct_row_head - invoked once before the first row: (block, offset, table)
 *   idct_row      - invoked once per row: (table, rounder)
 *   idct_row_mid  - invoked between two rows: (block, offset of the row just
 *                   processed, offset of the next row, table of the next row)
 *   idct_row_tail - invoked once after the last row: (block, offset)
 *
 * Rows are visited in the order 0, 4, 1, 7, 2, 6, 3, 5, so the two rows that
 * share a coefficient table (table04, table17, table26, table35) are
 * adjacent.  Each row is paired with the rounderN array of the same index.
 * The column pass is idct_col (block, 0) followed by idct_col (block, 4).
 * NOTE(review): what the row macros do with their arguments is defined
 * outside this macro -- check their definitions before changing the order.
 */
867 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ | |
9852 | 868 static inline void idct (int16_t * const block) \ |
1 | 869 { \ |
9852 | 870 static const int16_t table04[] ATTR_ALIGN(16) = \ |
1 | 871 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ |
9852 | 872 static const int16_t table17[] ATTR_ALIGN(16) = \ |
1 | 873 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ |
9852 | 874 static const int16_t table26[] ATTR_ALIGN(16) = \ |
1 | 875 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ |
9852 | 876 static const int16_t table35[] ATTR_ALIGN(16) = \ |
1 | 877 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ |
878 \ | |
879 idct_row_head (block, 0*8, table04); \ | |
880 idct_row (table04, rounder0); \ | |
881 idct_row_mid (block, 0*8, 4*8, table04); \ | |
882 idct_row (table04, rounder4); \ | |
883 idct_row_mid (block, 4*8, 1*8, table17); \ | |
884 idct_row (table17, rounder1); \ | |
885 idct_row_mid (block, 1*8, 7*8, table17); \ | |
886 idct_row (table17, rounder7); \ | |
887 idct_row_mid (block, 7*8, 2*8, table26); \ | |
888 idct_row (table26, rounder2); \ | |
889 idct_row_mid (block, 2*8, 6*8, table26); \ | |
890 idct_row (table26, rounder6); \ | |
891 idct_row_mid (block, 6*8, 3*8, table35); \ | |
892 idct_row (table35, rounder3); \ | |
893 idct_row_mid (block, 3*8, 5*8, table35); \ | |
894 idct_row (table35, rounder5); \ | |
895 idct_row_tail (block, 5*8); \ | |
896 \ | |
897 idct_col (block, 0); \ | |
898 idct_col (block, 4); \ | |
899 } | |
900 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_idct: SSE2 row pass over the eight rows of `block`, followed by the
 * column pass sse2_idct_col (block).
 *
 * Rows are processed two at a time with SSE2_IDCT_2ROW, paired as
 * (0,4), (1,7), (2,6), (3,5); each pair shares one coefficient table and
 * uses the matching rounderN_128 constants (four copies of round (bias)).
 *
 * x86_64 build: each row is loaded into its own register, row k into
 * xmm(8+k), and nothing is stored back here -- the results stay in
 * xmm8..xmm15.
 * Other builds: every pair goes through xmm0/xmm4 and is written back to
 * `block` before the next pair is loaded.
 *
 * NOTE(review): the movdqa_* wrappers presumably emit the aligned MOVDQA
 * instruction, which would require `block` to be 16-byte aligned -- confirm
 * in mmx.h and at the call sites.
 */
901 static inline void sse2_idct (int16_t * const block) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
902 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
903 static const int16_t table04[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
904 sse2_table (22725, 21407, 19266, 16384, 12873, 8867, 4520); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
905 static const int16_t table17[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
906 sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
907 static const int16_t table26[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
908 sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
909 static const int16_t table35[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
910 sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
911 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
912 static const int32_t rounder0_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
913 rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
914 static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
915 static const int32_t rounder1_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
916 rounder_sse2 (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
917 static const int32_t rounder7_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
918 rounder_sse2 (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
919 static const int32_t rounder2_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
920 rounder_sse2 (0.60355339059); /* C2 * (C6+C2)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
921 static const int32_t rounder6_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
922 rounder_sse2 (-0.25); /* C2 * (C6-C2)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
923 static const int32_t rounder3_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
924 rounder_sse2 (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
925 static const int32_t rounder5_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
926 rounder_sse2 (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
927 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* x86_64: one register per row, results are kept in xmm8..xmm15 */
928 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
929 movdqa_m2r (block[0*8], xmm8); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
930 movdqa_m2r (block[4*8], xmm12); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
931 SSE2_IDCT_2ROW (table04, xmm8, xmm12, *rounder0_128, *rounder4_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
932 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
933 movdqa_m2r (block[1*8], xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
934 movdqa_m2r (block[7*8], xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
935 SSE2_IDCT_2ROW (table17, xmm9, xmm15, *rounder1_128, *rounder7_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
936 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
937 movdqa_m2r (block[2*8], xmm10); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
938 movdqa_m2r (block[6*8], xmm14); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
939 SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
940 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
941 movdqa_m2r (block[3*8], xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
942 movdqa_m2r (block[5*8], xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
943 SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
944 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
945 /* OUTPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
946 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* other builds: reuse xmm0/xmm4 and store each row pair back to block */
947 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
948 movdqa_m2r (block[0*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
949 movdqa_m2r (block[4*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
950 SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
951 movdqa_r2m (xmm0, block[0*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
952 movdqa_r2m (xmm4, block[4*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
953 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
954 movdqa_m2r (block[1*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
955 movdqa_m2r (block[7*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
956 SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
957 movdqa_r2m (xmm0, block[1*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
958 movdqa_r2m (xmm4, block[7*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
959 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
960 movdqa_m2r (block[2*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
961 movdqa_m2r (block[6*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
962 SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
963 movdqa_r2m (xmm0, block[2*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
964 movdqa_r2m (xmm4, block[6*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
965 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
966 movdqa_m2r (block[3*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
967 movdqa_m2r (block[5*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
968 SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
969 movdqa_r2m (xmm0, block[3*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
970 movdqa_r2m (xmm4, block[5*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
971 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
972 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
973 sse2_idct_col (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
974 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
975 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_block_copy: write an 8x8 block of 16-bit values to `dest` as bytes,
 * 8 bytes per row, with consecutive rows `stride` bytes apart
 * (row k goes to dest + k*stride).
 *
 *   block  - source rows; read only in the non-x86_64 branch
 *   dest   - address of the first output row
 *   stride - distance in bytes between output rows
 *
 * x86_64 build: the rows are taken from xmm8..xmm15 (see the INPUT note
 * below), so a preceding step must have left them there; `block` is not
 * read in this branch.
 * Other builds: the rows are loaded from block+k*8 into xmm0..xmm7.
 *
 * Each row register is packed with itself by packuswb and stored with a
 * 64-bit movq.  Loads, packs and stores are interleaved by hand.
 * NOTE(review): packuswb is expected to pack words to bytes with unsigned
 * saturation -- the wrapper itself lives in mmx.h.
 */
976 static void sse2_block_copy (int16_t * const block, uint8_t * dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
977 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
978 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
979 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
980 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
981 packuswb_r2r (xmm8, xmm8); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
982 packuswb_r2r (xmm9, xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
983 movq_r2m (xmm8, *(dest+0*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
984 packuswb_r2r (xmm10, xmm10); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
985 movq_r2m (xmm9, *(dest+1*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
986 packuswb_r2r (xmm11, xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
987 movq_r2m (xmm10, *(dest+2*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
988 packuswb_r2r (xmm12, xmm12); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
989 movq_r2m (xmm11, *(dest+3*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
990 packuswb_r2r (xmm13, xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
991 movq_r2m (xmm12, *(dest+4*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
992 packuswb_r2r (xmm14, xmm14); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
993 movq_r2m (xmm13, *(dest+5*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
994 packuswb_r2r (xmm15, xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
995 movq_r2m (xmm14, *(dest+6*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
996 movq_r2m (xmm15, *(dest+7*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
997 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
998 movdqa_m2r (*(block+0*8), xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
999 movdqa_m2r (*(block+1*8), xmm1); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1000 movdqa_m2r (*(block+2*8), xmm2); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1001 packuswb_r2r (xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1002 movdqa_m2r (*(block+3*8), xmm3); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1003 packuswb_r2r (xmm1, xmm1); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1004 movdqa_m2r (*(block+4*8), xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1005 packuswb_r2r (xmm2, xmm2); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1006 movdqa_m2r (*(block+5*8), xmm5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1007 packuswb_r2r (xmm3, xmm3); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1008 movdqa_m2r (*(block+6*8), xmm6); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1009 packuswb_r2r (xmm4, xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1010 movdqa_m2r (*(block+7*8), xmm7); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1011 movq_r2m (xmm0, *(dest+0*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1012 packuswb_r2r (xmm5, xmm5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1013 movq_r2m (xmm1, *(dest+1*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1014 packuswb_r2r (xmm6, xmm6); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1015 movq_r2m (xmm2, *(dest+2*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1016 packuswb_r2r (xmm7, xmm7); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1017 movq_r2m (xmm3, *(dest+3*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1018 movq_r2m (xmm4, *(dest+4*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1019 movq_r2m (xmm5, *(dest+5*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1020 movq_r2m (xmm6, *(dest+6*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1021 movq_r2m (xmm7, *(dest+7*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1022 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1023 } |
1 | 1024 |
/*
 * COPY_MMX: one pipelined step of block_copy.  Uses `block`, `dest` and
 * `stride` from the enclosing scope and advances `dest` by `stride`.
 *
 *   offset - index into block of the row to load
 *   r0, r1 - registers loaded from block+offset and block+offset+4
 *   r2     - register holding an already packed row; stored to the new *dest
 *
 * The store of r2 happens before r1 is packed into r0, so the row packed by
 * this step is written out by the following step, or by the caller after
 * the last one.
 */
1025 #define COPY_MMX(offset,r0,r1,r2) \ | |
1026 do { \ | |
1027 movq_m2r (*(block+offset), r0); \ | |
1028 dest += stride; \ | |
1029 movq_m2r (*(block+offset+4), r1); \ | |
1030 movq_r2m (r2, *dest); \ | |
1031 packuswb_r2r (r1, r0); \ | |
1032 } while (0) | |
1033 | |
/*
 * block_copy: MMX version of the block store.  Writes the 8x8 block of
 * 16-bit values to `dest` as bytes, one 8-byte row every `stride` bytes.
 *
 *   block  - source rows, row k starting at block + k*8
 *   dest   - address of the first output row (the local copy is advanced)
 *   stride - distance in bytes between output rows
 *
 * Every row is read as two movq loads (block+k*8 and block+k*8+4) that are
 * combined by packuswb.  Rows 0 and 1 are loaded inline and row 0 is stored
 * directly.  Each COPY_MMX step then stores the previously packed row while
 * loading the next one, alternating between the mm0/mm1 and mm2/mm3
 * register pairs.  The final movq_r2m writes row 7.
 * NOTE(review): packuswb is expected to saturate to the unsigned byte
 * range -- the wrapper itself lives in mmx.h.
 */
9852 | 1034 static inline void block_copy (int16_t * const block, uint8_t * dest, |
1035 const int stride) | |
1 | 1036 { |
1037 movq_m2r (*(block+0*8), mm0); | |
1038 movq_m2r (*(block+0*8+4), mm1); | |
1039 movq_m2r (*(block+1*8), mm2); | |
1040 packuswb_r2r (mm1, mm0); | |
1041 movq_m2r (*(block+1*8+4), mm3); | |
1042 movq_r2m (mm0, *dest); | |
1043 packuswb_r2r (mm3, mm2); | |
1044 COPY_MMX (2*8, mm0, mm1, mm2); | |
1045 COPY_MMX (3*8, mm2, mm3, mm0); | |
1046 COPY_MMX (4*8, mm0, mm1, mm2); | |
1047 COPY_MMX (5*8, mm2, mm3, mm0); | |
1048 COPY_MMX (6*8, mm0, mm1, mm2); | |
1049 COPY_MMX (7*8, mm2, mm3, mm0); | |
1050 movq_r2m (mm2, *(dest+stride)); | |
1051 } | |
1052 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * ADD_SSE2_2ROW: add two rows of 16-bit values to the two byte rows at
 * `dest` and `dest+stride`, write the packed result back to the same
 * places, then advance `dest` by 2*stride.
 *
 *   op     - r2r or m2r; pasted onto paddsw_ to select the register or
 *            memory form of the addition
 *   block0 - operand added to the row at dest
 *   block1 - operand added to the row at dest+stride
 *
 * Uses `dest` and `stride` from the enclosing scope.  xmm1 and xmm2 are
 * overwritten.  xmm0 is the other punpcklbw operand; sse2_block_add clears
 * it with pxor before the first use.
 */
1053 #define ADD_SSE2_2ROW(op, block0, block1)\ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1054 do { \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1055 movq_m2r (*(dest), xmm1); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1056 movq_m2r (*(dest+stride), xmm2); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1057 punpcklbw_r2r (xmm0, xmm1); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1058 punpcklbw_r2r (xmm0, xmm2); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1059 paddsw_##op (block0, xmm1); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1060 paddsw_##op (block1, xmm2); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1061 packuswb_r2r (xmm1, xmm1); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1062 packuswb_r2r (xmm2, xmm2); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1063 movq_r2m (xmm1, *(dest)); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1064 movq_r2m (xmm2, *(dest+stride)); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1065 dest += 2*stride; \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1066 } while (0) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1067 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_block_add: add an 8x8 block of 16-bit values to the 8x8 byte area at
 * `dest` (rows `stride` bytes apart) and store the result back to `dest`.
 *
 *   block  - source rows; read only in the non-x86_64 branch
 *   dest   - address of the first row to update (the local copy is advanced)
 *   stride - distance in bytes between rows of dest
 *
 * xmm0 is cleared first because ADD_SSE2_2ROW uses it as the punpcklbw
 * operand.  The work is done two rows at a time by four ADD_SSE2_2ROW steps.
 * x86_64 build: rows come from xmm8..xmm15 (see the INPUT note below), so a
 * preceding step must have left them there.
 * Other builds: rows are read from block+k*8 as memory operands.
 */
1068 static void sse2_block_add (int16_t * const block, uint8_t * dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1069 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1070 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1071 pxor_r2r(xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1072 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1073 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1074 ADD_SSE2_2ROW(r2r, xmm8, xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1075 ADD_SSE2_2ROW(r2r, xmm10, xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1076 ADD_SSE2_2ROW(r2r, xmm12, xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1077 ADD_SSE2_2ROW(r2r, xmm14, xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1078 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1079 ADD_SSE2_2ROW(m2r, *(block+0*8), *(block+1*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1080 ADD_SSE2_2ROW(m2r, *(block+2*8), *(block+3*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1081 ADD_SSE2_2ROW(m2r, *(block+4*8), *(block+5*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1082 ADD_SSE2_2ROW(m2r, *(block+6*8), *(block+7*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1083 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1084 } |
1 | 1085 |
/*
 * ADD_MMX: one pipelined step of block_add.  Uses `block`, `dest` and
 * `stride` from the enclosing scope and advances `dest` by `stride`.
 *
 *   offset - index into block of the row to add
 *   r1, r2 - receive the 8 bytes at dest+2*stride (address taken before
 *            dest is advanced); r1 is unpacked with punpcklbw, r2 with
 *            punpckhbw, then block+offset and block+offset+4 are added
 *   r3, r4 - sums left by the previous step; packed into r3 and stored at
 *            the advanced dest
 *
 * mm0 is the other unpack operand; block_add clears it with pxor first.
 * The sums in r1/r2 are not stored here: the next step (or the caller after
 * the last one) packs and writes them.
 */
1086 #define ADD_MMX(offset,r1,r2,r3,r4) \ | |
1087 do { \ | |
1088 movq_m2r (*(dest+2*stride), r1); \ | |
1089 packuswb_r2r (r4, r3); \ | |
1090 movq_r2r (r1, r2); \ | |
1091 dest += stride; \ | |
1092 movq_r2m (r3, *dest); \ | |
1093 punpcklbw_r2r (mm0, r1); \ | |
1094 paddsw_m2r (*(block+offset), r1); \ | |
1095 punpckhbw_r2r (mm0, r2); \ | |
1096 paddsw_m2r (*(block+offset+4), r2); \ | |
1097 } while (0) | |
1098 | |
/*
 * block_add: MMX version of the block add.  Adds the 8x8 block of 16-bit
 * values to the 8x8 byte area at `dest` (rows `stride` bytes apart) and
 * stores the packed result back to `dest`.
 *
 *   block  - source rows, row k starting at block + k*8
 *   dest   - address of the first row to update (the local copy is advanced)
 *   stride - distance in bytes between rows of dest
 *
 * mm0 is cleared and used as the second operand of every unpack.  Rows 0
 * and 1 are loaded and summed inline and row 0 is stored directly.  Each
 * ADD_MMX step then stores the previous row's sums while loading and
 * summing the next row, alternating between mm1/mm2 and mm3/mm4.  The final
 * packuswb/movq_r2m pair writes row 7.
 */
9852 | 1099 static inline void block_add (int16_t * const block, uint8_t * dest, |
1100 const int stride) | |
1 | 1101 { |
1102 movq_m2r (*dest, mm1); | |
1103 pxor_r2r (mm0, mm0); | |
1104 movq_m2r (*(dest+stride), mm3); | |
1105 movq_r2r (mm1, mm2); | |
1106 punpcklbw_r2r (mm0, mm1); | |
1107 movq_r2r (mm3, mm4); | |
1108 paddsw_m2r (*(block+0*8), mm1); | |
1109 punpckhbw_r2r (mm0, mm2); | |
1110 paddsw_m2r (*(block+0*8+4), mm2); | |
1111 punpcklbw_r2r (mm0, mm3); | |
1112 paddsw_m2r (*(block+1*8), mm3); | |
1113 packuswb_r2r (mm2, mm1); | |
1114 punpckhbw_r2r (mm0, mm4); | |
1115 movq_r2m (mm1, *dest); | |
1116 paddsw_m2r (*(block+1*8+4), mm4); | |
1117 ADD_MMX (2*8, mm1, mm2, mm3, mm4); | |
1118 ADD_MMX (3*8, mm3, mm4, mm1, mm2); | |
1119 ADD_MMX (4*8, mm1, mm2, mm3, mm4); | |
1120 ADD_MMX (5*8, mm3, mm4, mm1, mm2); | |
1121 ADD_MMX (6*8, mm1, mm2, mm3, mm4); | |
1122 ADD_MMX (7*8, mm3, mm4, mm1, mm2); | |
1123 packuswb_r2r (mm4, mm3); | |
1124 movq_r2m (mm3, *(dest+stride)); | |
1125 } | |
1126 | |
1127 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1128 static inline void sse2_block_zero (int16_t * const block) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1129 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1130 pxor_r2r (xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1131 movdqa_r2m (xmm0, *(block+0*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1132 movdqa_r2m (xmm0, *(block+1*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1133 movdqa_r2m (xmm0, *(block+2*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1134 movdqa_r2m (xmm0, *(block+3*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1135 movdqa_r2m (xmm0, *(block+4*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1136 movdqa_r2m (xmm0, *(block+5*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1137 movdqa_r2m (xmm0, *(block+6*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1138 movdqa_r2m (xmm0, *(block+7*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1139 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1140 |
9852 | 1141 static inline void block_zero (int16_t * const block) |
1142 { | |
1143 pxor_r2r (mm0, mm0); | |
1144 movq_r2m (mm0, *(block+0*4)); | |
1145 movq_r2m (mm0, *(block+1*4)); | |
1146 movq_r2m (mm0, *(block+2*4)); | |
1147 movq_r2m (mm0, *(block+3*4)); | |
1148 movq_r2m (mm0, *(block+4*4)); | |
1149 movq_r2m (mm0, *(block+5*4)); | |
1150 movq_r2m (mm0, *(block+6*4)); | |
1151 movq_r2m (mm0, *(block+7*4)); | |
1152 movq_r2m (mm0, *(block+8*4)); | |
1153 movq_r2m (mm0, *(block+9*4)); | |
1154 movq_r2m (mm0, *(block+10*4)); | |
1155 movq_r2m (mm0, *(block+11*4)); | |
1156 movq_r2m (mm0, *(block+12*4)); | |
1157 movq_r2m (mm0, *(block+13*4)); | |
1158 movq_r2m (mm0, *(block+14*4)); | |
1159 movq_r2m (mm0, *(block+15*4)); | |
1160 } | |
1161 | |
1162 | |
/*
 * Values for the `cpu` argument of block_add_DC(); they select which
 * instruction sequence dup4() expands to.
 */
1163 #define CPU_MMXEXT 0 | |
1164 #define CPU_MMX 1 | |
1165 | |
/*
 * dup4(reg): replicate the low 16-bit word of MMX register `reg` into
 * all four of its words.
 *
 * Requires an integer `cpu` in scope at the expansion site. With
 * cpu == CPU_MMXEXT a single pshufw (selector 0x00) is used; otherwise
 * the same result is built from punpcklwd + punpckldq.
 */
1166 #define dup4(reg) \ | |
1167 do { \ | |
1168 if (cpu != CPU_MMXEXT) { \ | |
1169 punpcklwd_r2r (reg, reg); \ | |
1170 punpckldq_r2r (reg, reg); \ | |
1171 } else \ | |
1172 pshufw_r2r (reg, reg, 0x00); \ | |
1173 } while (0) | |
1174 | |
/*
 * Add one constant, derived from block[0] only, to an 8x8 area of bytes.
 *
 * block:  coefficient block; only block[0] is read. block[0] and
 *         block[63] are set to 0 before returning (why block[63] is
 *         cleared as well is not visible in this function).
 * dest:   first of eight 8-byte rows; row r is at dest + r*stride.
 * stride: distance in bytes between consecutive rows of dest.
 * cpu:    CPU_MMX or CPU_MMXEXT, consumed by dup4().
 *
 * The constant is dc = (block[0] + 64) >> 7. It is broadcast to four
 * words in mm0, and mm1 receives 0 - dc (saturating). Both are packed to
 * bytes with unsigned saturation, so mm0 holds the positive part of dc
 * and mm1 the positive part of -dc, each clamped to 0..255 in all eight
 * bytes. Every row is then updated as
 *     row = sat_sub_u8 (sat_add_u8 (row, mm0), mm1)
 * which applies the signed offset with clamping to 0..255.
 *
 * Loads of upcoming rows are interleaved with the arithmetic and stores
 * of earlier rows (mm2 and mm3 alternate); keep the statement order.
 */
1175 static inline void block_add_DC (int16_t * const block, uint8_t * dest, | |
1176 const int stride, const int cpu) | |
1177 { | |
12932 | 1178 movd_v2r ((block[0] + 64) >> 7, mm0); |
9852 | 1179 pxor_r2r (mm1, mm1); |
1180 movq_m2r (*dest, mm2); | |
1181 dup4 (mm0); | |
1182 psubsw_r2r (mm0, mm1); | |
1183 packuswb_r2r (mm0, mm0); | |
1184 paddusb_r2r (mm0, mm2); | |
1185 packuswb_r2r (mm1, mm1); | |
1186 movq_m2r (*(dest + stride), mm3); | |
1187 psubusb_r2r (mm1, mm2); | |
1188 block[0] = 0; | |
1189 paddusb_r2r (mm0, mm3); | |
1190 movq_r2m (mm2, *dest); | |
1191 psubusb_r2r (mm1, mm3); | |
1192 movq_m2r (*(dest + 2*stride), mm2); | |
1193 dest += stride; | |
1194 movq_r2m (mm3, *dest); | |
1195 paddusb_r2r (mm0, mm2); | |
1196 movq_m2r (*(dest + 2*stride), mm3); | |
1197 psubusb_r2r (mm1, mm2); | |
1198 dest += stride; | |
1199 paddusb_r2r (mm0, mm3); | |
1200 movq_r2m (mm2, *dest); | |
1201 psubusb_r2r (mm1, mm3); | |
1202 movq_m2r (*(dest + 2*stride), mm2); | |
1203 dest += stride; | |
1204 movq_r2m (mm3, *dest); | |
1205 paddusb_r2r (mm0, mm2); | |
1206 movq_m2r (*(dest + 2*stride), mm3); | |
1207 psubusb_r2r (mm1, mm2); | |
1208 dest += stride; | |
1209 paddusb_r2r (mm0, mm3); | |
1210 movq_r2m (mm2, *dest); | |
1211 psubusb_r2r (mm1, mm3); | |
1212 movq_m2r (*(dest + 2*stride), mm2); | |
1213 dest += stride; | |
1214 movq_r2m (mm3, *dest); | |
1215 paddusb_r2r (mm0, mm2); | |
1216 movq_m2r (*(dest + 2*stride), mm3); | |
1217 psubusb_r2r (mm1, mm2); | |
1218 block[63] = 0; | |
1219 paddusb_r2r (mm0, mm3); | |
/* dest addresses row 5 here; the last two rows go to +stride and +2*stride */
1220 movq_r2m (mm2, *(dest + stride)); | |
1221 psubusb_r2r (mm1, mm3); | |
1222 movq_r2m (mm3, *(dest + 2*stride)); | |
1223 } | |
1224 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * SSE2 "copy" entry point.
 *
 * Runs sse2_idct() on `block` in place, passes the result together with
 * `dest` and `stride` to sse2_block_copy() (defined earlier in this
 * file; presumably it writes the 8x8 result to dest), and finally clears
 * all 64 entries of `block` with sse2_block_zero().
 */
void mpeg2_idct_copy_sse2 (int16_t * const block, uint8_t * const dest,
                           const int stride)
{
    sse2_idct (block);

    sse2_block_copy (block, dest, stride);

    sse2_block_zero (block);
}
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1232 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1233 void mpeg2_idct_add_sse2 (const int last, int16_t * const block, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1234 uint8_t * const dest, const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1235 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1236 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1237 sse2_idct (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1238 sse2_block_add (block, dest, stride); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1239 sse2_block_zero (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1240 } else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1241 block_add_DC (block, dest, stride, CPU_MMXEXT); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1242 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1243 |
9852 | 1244 |
/*
 * Instantiate mmxext_idct through the declare_idct macro (defined
 * earlier in this file), parameterised with the MMXEXT coefficient table
 * and row-pass macros. mmxext_idct (block) is what the
 * mpeg2_idct_*_mmxext entry points below call.
 */
1 | 1245 declare_idct (mmxext_idct, mmxext_table, |
1246 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) | |
1247 | |
/*
 * MMXEXT "copy" entry point.
 *
 * Runs mmxext_idct() on `block` in place, passes the result together
 * with `dest` and `stride` to block_copy() (defined earlier in this
 * file; presumably it writes the 8x8 result to dest), and finally clears
 * all 64 entries of `block` with block_zero().
 */
void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
                             const int stride)
{
    mmxext_idct (block);

    block_copy (block, dest, stride);

    block_zero (block);
}
1255 | |
9852 | 1256 void mpeg2_idct_add_mmxext (const int last, int16_t * const block, |
1257 uint8_t * const dest, const int stride) | |
1 | 1258 { |
12932 | 1259 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
9852 | 1260 mmxext_idct (block); |
1261 block_add (block, dest, stride); | |
1262 block_zero (block); | |
1263 } else | |
1264 block_add_DC (block, dest, stride, CPU_MMXEXT); | |
1 | 1265 } |
1266 | |
1267 | |
/*
 * Instantiate mmx_idct through the declare_idct macro (defined earlier
 * in this file), parameterised with the plain-MMX coefficient table and
 * row-pass macros. mmx_idct (block) is what the mpeg2_idct_*_mmx entry
 * points below call.
 */
1268 declare_idct (mmx_idct, mmx_table, | |
1269 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) | |
1270 | |
/*
 * Plain-MMX "copy" entry point.
 *
 * Runs mmx_idct() on `block` in place, passes the result together with
 * `dest` and `stride` to block_copy() (defined earlier in this file;
 * presumably it writes the 8x8 result to dest), and finally clears all
 * 64 entries of `block` with block_zero().
 */
void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
                          const int stride)
{
    mmx_idct (block);

    block_copy (block, dest, stride);

    block_zero (block);
}
1278 | |
9852 | 1279 void mpeg2_idct_add_mmx (const int last, int16_t * const block, |
1280 uint8_t * const dest, const int stride) | |
1 | 1281 { |
12932 | 1282 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
9852 | 1283 mmx_idct (block); |
1284 block_add (block, dest, stride); | |
1285 block_zero (block); | |
1286 } else | |
1287 block_add_DC (block, dest, stride, CPU_MMX); | |
1 | 1288 } |
1289 | |
1290 | |
9852 | 1291 void mpeg2_idct_mmx_init (void) |
1 | 1292 { |
1293 int i, j; | |
1294 | |
36 | 1295 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ |
1 | 1296 |
1297 for (i = 0; i < 64; i++) { | |
9852 | 1298 j = mpeg2_scan_norm[i]; |
1299 mpeg2_scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |
1300 j = mpeg2_scan_alt[i]; | |
1301 mpeg2_scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |
1 | 1302 } |
1303 } | |
1304 | |
1305 #endif |