Mercurial > mplayer.hg
annotate libmpeg2/idct_mmx.c @ 26686:45fc8351ca9b
usec_sleep(0) is not the same as not sleeping at all.
Fixes massive slowdown on Windows.
author | reimar |
---|---|
date | Sat, 10 May 2008 14:03:42 +0000 |
parents | 2506f1b0bdbe |
children | fd18fa10de53 |
rev | line source |
---|---|
1 | 1 /* |
2 * idct_mmx.c | |
10303 | 3 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> |
9852 | 4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
1 | 5 * |
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
9852 | 7 * See http://libmpeg2.sourceforge.net/ for updates. |
1 | 8 * |
9 * mpeg2dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * mpeg2dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
14732
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
22 * |
21526 | 23 * Modified for use with MPlayer, see libmpeg-0.4.1.diff for the exact changes. |
18783 | 24 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ |
14732
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
25 * $Id$ |
1 | 26 */ |
27 | |
28 #include "config.h" | |
29 | |
13864 | 30 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1 | 31 |
32 #include <inttypes.h> | |
33 | |
9852 | 34 #include "mpeg2.h" |
12932 | 35 #include "attributes.h" |
1 | 36 #include "mpeg2_internal.h" |
37 #include "mmx.h" | |
38 | |
12932 | 39 #define ROW_SHIFT 15 |
1 | 40 #define COL_SHIFT 6 |
41 | |
/* round(bias): (bias + 0.5) scaled by 2^ROW_SHIFT, truncated to int by the cast. */
/* rounder(bias): brace initializer holding that fixed-point value twice. */
42 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) | |
43 #define rounder(bias) {round (bias), round (bias)} | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
44 #define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)} |
1 | 45 |
46 | |
47 #if 0 | |
25998 | 48 /* C row IDCT - it is just here to document the MMXEXT and MMX versions */ |
/*
 * Reference C version of the 8-point row IDCT (compiled out by the
 * surrounding "#if 0").
 *
 * row     - coefficient buffer; the 8 values at row+offset are transformed
 *           in place.
 * offset  - element offset of the row inside the buffer.
 * table   - supplies the constants C1..C7 as table[1]..table[7]; table[0]
 *           is not read here.
 * rounder - *rounder is added to each of the four even-part sums a0..a3.
 *
 * Each output is (a +/- b) >> ROW_SHIFT, with a0..a3 built from the even
 * inputs and b0..b3 from the odd inputs.
 */
1 | 49 static inline void idct_row (int16_t * row, int offset, |
50 int16_t * table, int32_t * rounder) | |
51 { | |
52 int C1, C2, C3, C4, C5, C6, C7; | |
53 int a0, a1, a2, a3, b0, b1, b2, b3; | |
54 | |
55 row += offset; | |
56 | |
57 C1 = table[1]; | |
58 C2 = table[2]; | |
59 C3 = table[3]; | |
60 C4 = table[4]; | |
61 C5 = table[5]; | |
62 C6 = table[6]; | |
63 C7 = table[7]; | |
64 | |
/* even part: inputs 0, 2, 4, 6 plus the rounding bias */
65 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; | |
66 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; | |
67 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; | |
68 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; | |
69 | |
/* odd part: inputs 1, 3, 5, 7 (no bias) */
70 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; | |
71 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; | |
72 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; | |
73 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; | |
74 | |
/* outputs 0..3 use a+b, outputs 4..7 use a-b in mirrored order */
75 row[0] = (a0 + b0) >> ROW_SHIFT; | |
76 row[1] = (a1 + b1) >> ROW_SHIFT; | |
77 row[2] = (a2 + b2) >> ROW_SHIFT; | |
78 row[3] = (a3 + b3) >> ROW_SHIFT; | |
79 row[4] = (a3 - b3) >> ROW_SHIFT; | |
80 row[5] = (a2 - b2) >> ROW_SHIFT; | |
81 row[6] = (a1 - b1) >> ROW_SHIFT; | |
82 row[7] = (a0 - b0) >> ROW_SHIFT; | |
83 } | |
84 #endif | |
85 | |
86 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
87 /* SSE2 row IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
88 #define sse2_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
89 c4, -c6, c4, -c2, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
90 c4, c6, -c4, -c2, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
91 -c4, c2, c4, -c6, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
92 c1, c3, c3, -c7, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
93 c5, -c1, c7, -c5, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
94 c5, c7, -c1, -c5, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
95 c7, c3, c3, -c1 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
96 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
97 #define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do { \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
98 /* no scheduling: trust in out of order execution */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
99 /* based on Intel AP-945 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
100 /* (http://cache-www.intel.com/cd/00/00/01/76/17680_w_idct.pdf) */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
101 \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
102 /* input */ /* 1: row1= x7 x5 x3 x1 x6 x4 x2 x0 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
103 pshufd_r2r (row1, xmm1, 0); /* 1: xmm1= x2 x0 x2 x0 x2 x0 x2 x0 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
104 pmaddwd_m2r (table[0], xmm1); /* 1: xmm1= x2*C + x0*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
105 pshufd_r2r (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1 x3 x1 x3 x1 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
106 pmaddwd_m2r (table[2*8], xmm3); /* 1: xmm3= x3*C + x1*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
107 pshufd_r2r (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4 x6 x4 x6 x4 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
108 pshufd_r2r (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5 x7 x5 x7 x5 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
109 pmaddwd_m2r (table[1*8], xmm2); /* 1: xmm2= x6*C + x4*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
110 paddd_m2r (round1, xmm1); /* 1: xmm1= x2*C + x0*C + round ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
111 pmaddwd_m2r (table[3*8], row1); /* 1: row1= x7*C + x5*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
112 pshufd_r2r (row2, xmm5, 0); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
113 pshufd_r2r (row2, xmm6, 0x55); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
114 pmaddwd_m2r (table[0], xmm5); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
115 paddd_r2r (xmm2, xmm1); /* 1: xmm1= a[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
116 movdqa_r2r (xmm1, xmm2); /* 1: xmm2= a[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
117 pshufd_r2r (row2, xmm7, 0xaa); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
118 pmaddwd_m2r (table[1*8], xmm6); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
119 paddd_r2r (xmm3, row1); /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
120 pshufd_r2r (row2, row2, 0xff); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
121 psubd_r2r (row1, xmm2); /* 1: xmm2= a[] - b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
122 pmaddwd_m2r (table[2*8], xmm7); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
123 paddd_r2r (xmm1, row1); /* 1: row1= a[] + b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
124 psrad_i2r (ROW_SHIFT, xmm2); /* 1: xmm2= result 4...7 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
125 paddd_m2r (round2, xmm5); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
126 pmaddwd_m2r (table[3*8], row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
127 paddd_r2r (xmm6, xmm5); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
128 movdqa_r2r (xmm5, xmm6); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
129 psrad_i2r (ROW_SHIFT, row1); /* 1: row1= result 0...3 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
130 pshufd_r2r (xmm2, xmm2, 0x1b); /* 1: [0 1 2 3] -> [3 2 1 0] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
131 packssdw_r2r (xmm2, row1); /* 1: row1= result[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
132 paddd_r2r (xmm7, row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
133 psubd_r2r (row2, xmm6); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
134 paddd_r2r (xmm5, row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
135 psrad_i2r (ROW_SHIFT, xmm6); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
136 psrad_i2r (ROW_SHIFT, row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
137 pshufd_r2r (xmm6, xmm6, 0x1b); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
138 packssdw_r2r (xmm6, row2); /* 2: */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
139 } while (0) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
140 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
141 |
36 | 142 /* MMXEXT row IDCT */ |
1 | 143 |
/*
 * Expands to a brace initializer of 32 values (8 groups of 4) built from
 * c1..c7. The groups are in the order the MMXEXT row code reads them:
 * table, table+4, table+8, ... table+28.
 * The arguments are used unparenthesized after a unary minus, so pass
 * plain constants rather than expressions.
 */
144 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ | |
145 c4, c6, c4, c6, \ | |
146 c1, c3, -c1, -c5, \ | |
147 c5, c7, c3, -c7, \ | |
148 c4, -c6, c4, -c6, \ | |
149 -c4, c2, c4, -c2, \ | |
150 c5, -c1, c3, -c1, \ | |
151 c7, c3, c7, -c5 } | |
152 | |
/*
 * First stage of the MMXEXT row IDCT for the row at row+offset.
 *
 * Loads the two 4-element halves of the row into mm2 and mm5 (copies are
 * kept in mm0 and mm6), loads the coefficient groups at table and table+4
 * into mm3 and mm4, issues the first pmaddwd and swaps the halves of mm2.
 * Nothing is written to memory; mmxext_row() reads mm0 and mm2..mm6
 * without reloading them, so it is meant to run next.
 *
 * NOTE(review): the _m2r / _r2r macros and the mm0..mm7 names come from
 * mmx.h, which is not visible here -- presumably inline-asm wrappers
 * operating on the MMX registers; confirm there.
 */
9852 | 153 static inline void mmxext_row_head (int16_t * const row, const int offset, |
154 const int16_t * const table) | |
1 | 155 { |
9852 | 156 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 157 |
9852 | 158 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
159 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 160 |
9852 | 161 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
162 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 163 |
9852 | 164 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
165 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ | |
1 | 166 |
9852 | 167 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
1 | 168 } |
169 | |
/*
 * Main stage of the MMXEXT row IDCT.
 *
 * Takes no row pointer: it works on the register contents left by
 * mmxext_row_head() or mmxext_row_mid() (mm0 and mm2..mm6 are read before
 * anything is loaded into them here). It reads the remaining coefficient
 * groups at table+8 ... table+28 and adds *rounder to both even-part
 * accumulators (mm3 and mm0).
 *
 * On exit, per the inline comments: mm1 (y1 y0) and mm3 (y6 y7) have
 * already been shifted by ROW_SHIFT, while mm0 (a3+b3 a2+b2) and
 * mm4 (a3-b3 a2-b2) are still unshifted. Nothing is written to memory;
 * mmxext_row_tail() or mmxext_row_mid() finishes and stores the row.
 */
9852 | 170 static inline void mmxext_row (const int16_t * const table, |
171 const int32_t * const rounder) | |
1 | 172 { |
9852 | 173 movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ |
174 pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ | |
1 | 175 |
9852 | 176 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ |
177 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ | |
1 | 178 |
9852 | 179 movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ |
180 pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ | |
1 | 181 |
9852 | 182 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
183 pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ | |
1 | 184 |
9852 | 185 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ |
186 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 187 |
9852 | 188 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ |
189 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 190 |
9852 | 191 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ |
192 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 193 |
9852 | 194 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
195 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 196 |
9852 | 197 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
198 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 199 |
9852 | 200 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
201 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 202 |
9852 | 203 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
204 movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ | |
1 | 205 |
9852 | 206 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
207 psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ | |
1 | 208 } |
209 | |
/*
 * Final stage of the MMXEXT row IDCT: finish and store one row.
 *
 * Applies the pending ROW_SHIFT to mm0 and mm4, packs the four 32-bit
 * pairs down to eight 16-bit values with signed saturation (packssdw),
 * swaps adjacent words of the upper half with pshufw 0xb1 so they are in
 * y7 y6 y5 y4 order, and stores the results at row+store and row+store+4.
 * Expects the register state left by mmxext_row().
 */
9852 | 210 static inline void mmxext_row_tail (int16_t * const row, const int store) |
1 | 211 { |
9852 | 212 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 213 |
9852 | 214 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ |
1 | 215 |
9852 | 216 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 217 |
9852 | 218 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
1 | 219 |
9852 | 220 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
221 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 222 |
36 | 223 /* slot */ |
1 | 224 |
9852 | 225 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 226 } |
227 | |
/*
 * Combined tail + head step of the MMXEXT row IDCT.
 *
 * Contains the same operations as mmxext_row_tail (row, store) followed
 * by mmxext_row_head (row, offset, table), interleaved instruction by
 * instruction: the finished row is shifted, packed and stored at
 * row+store / row+store+4 while the next row is loaded from row+offset
 * and its first pmaddwd is started.
 *
 * The statement order matters: each register is reloaded only after its
 * previous contents have been packed or stored.
 */
9852 | 228 static inline void mmxext_row_mid (int16_t * const row, const int store, |
229 const int offset, | |
230 const int16_t * const table) | |
1 | 231 { |
9852 | 232 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
233 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 234 |
9852 | 235 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
236 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ | |
1 | 237 |
9852 | 238 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
239 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 240 |
9852 | 241 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
242 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 243 |
9852 | 244 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
245 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 246 |
9852 | 247 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
248 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ | |
1 | 249 |
9852 | 250 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
1 | 251 |
9852 | 252 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
253 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ | |
1 | 254 } |
255 | |
256 | |
36 | 257 /* MMX row IDCT */ |
1 | 258 |
/*
 * Expands to a brace initializer of 32 values (8 groups of 4) built from
 * c1..c7. The groups are in the order the plain-MMX row code reads them:
 * table, table+4, table+8, ... table+28.
 * The arguments are used unparenthesized after a unary minus, so pass
 * plain constants rather than expressions.
 */
259 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ | |
260 c4, c6, -c4, -c2, \ | |
261 c1, c3, c3, -c7, \ | |
262 c5, c7, -c1, -c5, \ | |
263 c4, -c6, c4, -c2, \ | |
264 -c4, c2, c4, -c6, \ | |
265 c5, -c1, c7, -c5, \ | |
266 c7, c3, c3, -c1 } | |
267 | |
/*
 * First stage of the plain-MMX row IDCT for the row at row+offset.
 *
 * Loads the two 4-element halves of the row into mm2 and mm5 (copies in
 * mm0 and mm6), duplicates the low pair of mm0 and the high pair of mm2
 * with punpckldq / punpckhdq (used here where the MMXEXT variant uses
 * pshufw), loads the coefficient groups at table, table+4 and table+8
 * into mm3, mm4 and mm1, and issues the first pmaddwd.
 * Nothing is written to memory; mmx_row() continues from this register
 * state.
 */
9852 | 268 static inline void mmx_row_head (int16_t * const row, const int offset, |
269 const int16_t * const table) | |
1 | 270 { |
9852 | 271 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 272 |
9852 | 273 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
274 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 275 |
9852 | 276 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
277 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 278 |
9852 | 279 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
1 | 280 |
9852 | 281 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
282 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 283 |
9852 | 284 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
285 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 286 } |
287 | |
/*
 * Main stage of the plain-MMX row IDCT.
 *
 * Takes no row pointer: it works on the register contents left by
 * mmx_row_head() or mmx_row_mid() (mm0..mm6 are read before anything is
 * loaded into them here; the table+8 group is already in mm1). It reads
 * the coefficient groups at table+12 ... table+28 and adds *rounder to
 * both even-part accumulators (mm3 and mm0).
 *
 * On exit, per the inline comments: mm1 (y1 y0) and mm3 (y6 y7) have
 * already been shifted by ROW_SHIFT, while mm0 (a3+b3 a2+b2) and
 * mm7 (a3-b3 a2-b2) are still unshifted. Unlike mmxext_row(), the
 * difference ends up in mm7 rather than mm4. Nothing is written to memory.
 */
9852 | 288 static inline void mmx_row (const int16_t * const table, |
289 const int32_t * const rounder) | |
1 | 290 { |
9852 | 291 pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ |
292 punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ | |
1 | 293 |
9852 | 294 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ |
295 punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ | |
1 | 296 |
9852 | 297 movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ |
298 pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ | |
1 | 299 |
9852 | 300 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
301 pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ | |
1 | 302 |
9852 | 303 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ |
304 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 305 |
9852 | 306 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ |
307 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 308 |
9852 | 309 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ |
310 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 311 |
9852 | 312 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
313 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 314 |
9852 | 315 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
316 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 317 |
9852 | 318 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
319 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 320 |
9852 | 321 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
322 movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ | |
1 | 323 |
9852 | 324 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
325 psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ | |
1 | 326 } |
327 | |
/*
 * Final stage of the plain-MMX row IDCT: finish and store one row.
 *
 * Applies the pending ROW_SHIFT to mm0 and mm7 and packs the 32-bit
 * pairs to 16-bit values with signed saturation (packssdw). The upper
 * half comes out as y6 y7 y4 y5; since pshufw is not used in this
 * variant, the two words of each 32-bit lane are swapped with a
 * shift-left / shift-right / OR sequence, using mm4 as scratch.
 * Results are stored at row+store and row+store+4.
 * Expects the register state left by mmx_row().
 */
9852 | 328 static inline void mmx_row_tail (int16_t * const row, const int store) |
1 | 329 { |
9852 | 330 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 331 |
9852 | 332 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ |
1 | 333 |
9852 | 334 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 335 |
9852 | 336 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
1 | 337 |
9852 | 338 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
339 movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ | |
1 | 340 |
9852 | 341 pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ |
1 | 342 |
9852 | 343 psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ |
1 | 344 |
9852 | 345 por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ |
1 | 346 |
36 | 347 /* slot */ |
1 | 348 |
9852 | 349 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 350 } |
351 | |
/*
 * Combined tail + head step of the plain-MMX row IDCT.
 *
 * Does the work of mmx_row_tail (row, store) and
 * mmx_row_head (row, offset, table) interleaved: the finished row is
 * shifted, packed, word-swapped and stored at row+store / row+store+4
 * while the next row is loaded from row+offset and its first pmaddwd is
 * issued.
 *
 * Differs from mmx_row_tail() in using mm1 (free once y3..y0 is stored)
 * instead of mm4 as scratch for the word swap. The statement order
 * matters: mm1 is reloaded from table+8 only after the swap has
 * consumed it.
 */
9852 | 352 static inline void mmx_row_mid (int16_t * const row, const int store, |
353 const int offset, const int16_t * const table) | |
1 | 354 { |
9852 | 355 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
356 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 357 |
9852 | 358 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
359 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ | |
1 | 360 |
9852 | 361 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
362 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 363 |
9852 | 364 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
365 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 366 |
9852 | 367 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
368 movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ | |
1 | 369 |
9852 | 370 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
371 psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ | |
1 | 372 |
9852 | 373 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
374 pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ | |
1 | 375 |
9852 | 376 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
377 por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ | |
1 | 378 |
9852 | 379 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
380 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 381 |
9852 | 382 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
383 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 384 } |
385 | |
386 | |
387 #if 0 | |
25998 | 388 /* C column IDCT - it is just here to document the MMXEXT and MMX versions */ |
/*
 * Reference C version of the 8-point column IDCT (compiled out by the
 * surrounding "#if 0").
 *
 * col    - coefficient buffer; the column starting at col+offset is
 *          transformed in place, its 8 elements being 8 entries apart
 *          (col[0*8] ... col[7*8]).
 * offset - element offset of the column inside the buffer.
 *
 * All intermediates are int16_t and every sum goes through S(), so the
 * arithmetic is 16-bit with saturation. T1, T2, T3 and C4 are the
 * constants #defined right after this disabled block. Outputs are
 * S(a +/- b) >> COL_SHIFT.
 */
1 | 389 static inline void idct_col (int16_t * col, int offset) |
390 { | |
36 | 391 /* multiplication - as implemented on mmx */ |
1 | 392 #define F(c,x) (((c) * (x)) >> 16) |
393 | |
36 | 394 /* saturation - it helps us handle torture test cases */ |
1 | 395 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) |
396 | |
397 int16_t x0, x1, x2, x3, x4, x5, x6, x7; | |
398 int16_t y0, y1, y2, y3, y4, y5, y6, y7; | |
399 int16_t a0, a1, a2, a3, b0, b1, b2, b3; | |
400 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; | |
401 | |
402 col += offset; | |
403 | |
404 x0 = col[0*8]; | |
405 x1 = col[1*8]; | |
406 x2 = col[2*8]; | |
407 x3 = col[3*8]; | |
408 x4 = col[4*8]; | |
409 x5 = col[5*8]; | |
410 x6 = col[6*8]; | |
411 x7 = col[7*8]; | |
412 | |
/* even part: inputs 0, 2, 4, 6 */
413 u04 = S (x0 + x4); | |
414 v04 = S (x0 - x4); | |
36 | 415 u26 = S (F (T2, x6) + x2); |
416 v26 = S (F (T2, x2) - x6); | |
1 | 417 |
418 a0 = S (u04 + u26); | |
419 a1 = S (v04 + v26); | |
420 a2 = S (v04 - v26); | |
421 a3 = S (u04 - u26); | |
422 | |
/* odd part: inputs 1, 3, 5, 7 */
36 | 423 u17 = S (F (T1, x7) + x1); |
424 v17 = S (F (T1, x1) - x7); | |
425 u35 = S (F (T3, x5) + x3); | |
426 v35 = S (F (T3, x3) - x5); | |
1 | 427 |
428 b0 = S (u17 + u35); | |
429 b3 = S (v17 - v35); | |
430 u12 = S (u17 - u35); | |
431 v12 = S (v17 + v35); | |
/* F() drops 16 bits, so the product is doubled to scale by 2*C4/65536 */
36 | 432 u12 = S (2 * F (C4, u12)); |
433 v12 = S (2 * F (C4, v12)); | |
1 | 434 b1 = S (u12 + v12); |
435 b2 = S (u12 - v12); | |
436 | |
/* outputs 0..3 use a+b, outputs 4..7 use a-b in mirrored order */
437 y0 = S (a0 + b0) >> COL_SHIFT; | |
438 y1 = S (a1 + b1) >> COL_SHIFT; | |
439 y2 = S (a2 + b2) >> COL_SHIFT; | |
440 y3 = S (a3 + b3) >> COL_SHIFT; | |
441 | |
442 y4 = S (a3 - b3) >> COL_SHIFT; | |
443 y5 = S (a2 - b2) >> COL_SHIFT; | |
444 y6 = S (a1 - b1) >> COL_SHIFT; | |
445 y7 = S (a0 - b0) >> COL_SHIFT; | |
446 | |
447 col[0*8] = y0; | |
448 col[1*8] = y1; | |
449 col[2*8] = y2; | |
450 col[3*8] = y3; | |
451 col[4*8] = y4; | |
452 col[5*8] = y5; | |
453 col[6*8] = y6; | |
454 col[7*8] = y7; | |
455 } | |
456 #endif | |
457 | |
458 | |
459 #define T1 13036 | |
460 #define T2 27146 | |
461 #define T3 43790 | |
462 #define C4 23170 | |
463 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
464 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
465 /* SSE2 column IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
466 static inline void sse2_idct_col (int16_t * const col) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
467 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
468 /* Almost identical to mmxext version: */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
469 /* just do both 4x8 columns in parallel */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
470 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
471 static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
472 static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
473 static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
474 static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
475 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
476 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
477 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
478 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
479 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
480 movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
481 movdqa_r2r (xmm9, xmm1); /* xmm1 = x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
482 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
483 movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
484 pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
485 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
486 movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
487 pmulhw_r2r (xmm15, xmm2); /* xmm2 = T1*x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
488 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
489 movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
490 psubsw_r2r (xmm15, xmm0); /* xmm0 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
491 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
492 movdqa_m2r (*t2_vector, xmm9); /* xmm9 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
493 pmulhw_r2r (xmm11, xmm5); /* xmm5 = (T3-1)*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
494 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
495 paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
496 pmulhw_r2r (xmm13, xmm7); /* xmm7 = (T3-1)*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
497 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
498 movdqa_r2r (xmm9, xmm2); /* xmm2 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
499 paddsw_r2r (xmm11, xmm5); /* xmm5 = T3*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
500 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
501 pmulhw_r2r (xmm10, xmm9); /* xmm9 = T2*x2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
502 paddsw_r2r (xmm13, xmm7); /* xmm7 = T3*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
503 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
504 psubsw_r2r (xmm13, xmm5); /* xmm5 = v35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
505 paddsw_r2r (xmm11, xmm7); /* xmm7 = u35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
506 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
507 movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
508 pmulhw_r2r (xmm14, xmm2); /* xmm2 = T2*x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
509 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
510 psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
511 psubsw_r2r (xmm14, xmm9); /* xmm9 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
512 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
513 paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
514 movdqa_r2r (xmm0, xmm11); /* xmm11 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
515 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
516 movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
517 paddsw_r2r (xmm10, xmm2); /* xmm2 = u26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
518 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
519 paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
520 psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
521 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
522 movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
523 paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
524 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
525 movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
526 psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
527 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
528 movdqa_r2r (xmm6, xmm4); /* xmm4 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
529 pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
530 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
531 movdqa_r2r (xmm9, xmm6); /* xmm6 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
532 pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
533 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
534 movdqa_r2r (xmm8, xmm10); /* xmm10 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
535 movdqa_r2r (xmm8, xmm0); /* xmm0 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
536 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
537 psubsw_r2r (xmm12, xmm10); /* xmm10 = v04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
538 paddsw_r2r (xmm12, xmm0); /* xmm0 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
539 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
540 paddsw_r2r (xmm10, xmm9); /* xmm9 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
541 movdqa_r2r (xmm0, xmm8); /* xmm8 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
542 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
543 psubsw_r2r (xmm6, xmm10); /* xmm10 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
544 paddsw_r2r (xmm2, xmm8); /* xmm8 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
545 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
546 paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
547 psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
548 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
549 paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
550 movdqa_r2r (xmm10, xmm13); /* xmm13 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
551 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
552 movdqa_r2r (xmm9, xmm14); /* xmm14 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
553 paddsw_r2r (xmm7, xmm10); /* xmm10 = a2+b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
554 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
555 psraw_i2r (COL_SHIFT,xmm10); /* xmm10 = y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
556 paddsw_r2r (xmm1, xmm9); /* xmm9 = a1+b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
557 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
558 psraw_i2r (COL_SHIFT, xmm9); /* xmm9 = y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
559 psubsw_r2r (xmm1, xmm14); /* xmm14 = a1-b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
560 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
561 psubsw_r2r (xmm7, xmm13); /* xmm13 = a2-b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
562 psraw_i2r (COL_SHIFT,xmm14); /* xmm14 = y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
563 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
564 movdqa_r2r (xmm8, xmm15); /* xmm15 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
565 psraw_i2r (COL_SHIFT,xmm13); /* xmm13 = y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
566 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
567 paddsw_r2r (xmm4, xmm8); /* xmm8 = a0+b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
568 psubsw_r2r (xmm4, xmm15); /* xmm15 = a0-b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
569 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
570 psraw_i2r (COL_SHIFT, xmm8); /* xmm8 = y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
571 movdqa_r2r (xmm0, xmm12); /* xmm12 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
572 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
573 psubsw_r2r (xmm11, xmm12); /* xmm12 = a3-b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
574 psraw_i2r (COL_SHIFT,xmm15); /* xmm15 = y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
575 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
576 paddsw_r2r (xmm0, xmm11); /* xmm11 = a3+b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
577 psraw_i2r (COL_SHIFT,xmm12); /* xmm12 = y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
578 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
579 psraw_i2r (COL_SHIFT,xmm11); /* xmm11 = y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
580 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
581 /* OUTPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
582 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
583 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
584 movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
585 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
586 movdqa_m2r (*(col+1*8), xmm1); /* xmm1 = x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
587 movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
588 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
589 movdqa_m2r (*(col+7*8), xmm4); /* xmm4 = x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
590 pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
591 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
592 movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
593 pmulhw_r2r (xmm4, xmm2); /* xmm2 = T1*x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
594 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
595 movdqa_m2r (*(col+5*8), xmm6); /* xmm6 = x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
596 movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
597 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
598 movdqa_m2r (*(col+3*8), xmm3); /* xmm3 = x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
599 psubsw_r2r (xmm4, xmm0); /* xmm0 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
600 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
601 movdqa_m2r (*t2_vector, xmm4); /* xmm4 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
602 pmulhw_r2r (xmm3, xmm5); /* xmm5 = (T3-1)*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
603 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
604 paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
605 pmulhw_r2r (xmm6, xmm7); /* xmm7 = (T3-1)*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
606 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
607 /* slot */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
608 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
609 movdqa_r2r (xmm4, xmm2); /* xmm2 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
610 paddsw_r2r (xmm3, xmm5); /* xmm5 = T3*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
611 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
612 pmulhw_m2r (*(col+2*8), xmm4); /* xmm4 = T2*x2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
613 paddsw_r2r (xmm6, xmm7); /* xmm7 = T3*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
614 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
615 psubsw_r2r (xmm6, xmm5); /* xmm5 = v35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
616 paddsw_r2r (xmm3, xmm7); /* xmm7 = u35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
617 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
618 movdqa_m2r (*(col+6*8), xmm3); /* xmm3 = x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
619 movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
620 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
621 pmulhw_r2r (xmm3, xmm2); /* xmm2 = T2*x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
622 psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
623 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
624 psubsw_r2r (xmm3, xmm4); /* xmm4 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
625 paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
626 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
627 movdqa_r2m (xmm0, *(col+3*8)); /* save b3 in scratch0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
628 movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
629 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
630 paddsw_m2r (*(col+2*8), xmm2); /* xmm2 = u26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
631 paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
632 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
633 psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
634 movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
635 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
636 movdqa_m2r (*(col+0*8), xmm3); /* xmm3 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
637 paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
638 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
639 movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
640 psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
641 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
642 movdqa_r2m (xmm6, *(col+5*8)); /* save b0 in scratch1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
643 pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
644 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
645 movdqa_r2r (xmm4, xmm6); /* xmm6 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
646 pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
647 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
648 movdqa_m2r (*(col+4*8), xmm5); /* xmm5 = x4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
649 movdqa_r2r (xmm3, xmm0); /* xmm0 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
650 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
651 psubsw_r2r (xmm5, xmm3); /* xmm3 = v04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
652 paddsw_r2r (xmm5, xmm0); /* xmm0 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
653 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
654 paddsw_r2r (xmm3, xmm4); /* xmm4 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
655 movdqa_r2r (xmm0, xmm5); /* xmm5 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
656 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
657 psubsw_r2r (xmm6, xmm3); /* xmm3 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
658 paddsw_r2r (xmm2, xmm5); /* xmm5 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
659 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
660 paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
661 psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
662 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
663 paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
664 movdqa_r2r (xmm3, xmm2); /* xmm2 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
665 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
666 movdqa_r2r (xmm4, xmm6); /* xmm6 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
667 paddsw_r2r (xmm7, xmm3); /* xmm3 = a2+b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
668 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
669 psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
670 paddsw_r2r (xmm1, xmm4); /* xmm4 = a1+b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
671 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
672 psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
673 psubsw_r2r (xmm1, xmm6); /* xmm6 = a1-b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
674 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
675 movdqa_m2r (*(col+5*8), xmm1); /* xmm1 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
676 psubsw_r2r (xmm7, xmm2); /* xmm2 = a2-b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
677 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
678 psraw_i2r (COL_SHIFT, xmm6); /* xmm6 = y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
679 movdqa_r2r (xmm5, xmm7); /* xmm7 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
680 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
681 movdqa_r2m (xmm4, *(col+1*8)); /* save y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
682 psraw_i2r (COL_SHIFT, xmm2); /* xmm2 = y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
683 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
684 movdqa_r2m (xmm3, *(col+2*8)); /* save y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
685 paddsw_r2r (xmm1, xmm5); /* xmm5 = a0+b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
686 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
687 movdqa_m2r (*(col+3*8), xmm4); /* xmm4 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
688 psubsw_r2r (xmm1, xmm7); /* xmm7 = a0-b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
689 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
690 psraw_i2r (COL_SHIFT, xmm5); /* xmm5 = y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
691 movdqa_r2r (xmm0, xmm3); /* xmm3 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
692 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
693 movdqa_r2m (xmm2, *(col+5*8)); /* save y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
694 psubsw_r2r (xmm4, xmm3); /* xmm3 = a3-b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
695 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
696 psraw_i2r (COL_SHIFT, xmm7); /* xmm7 = y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
697 paddsw_r2r (xmm0, xmm4); /* xmm4 = a3+b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
698 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
699 movdqa_r2m (xmm5, *(col+0*8)); /* save y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
700 psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
701 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
702 movdqa_r2m (xmm6, *(col+6*8)); /* save y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
703 psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
704 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
705 movdqa_r2m (xmm7, *(col+7*8)); /* save y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
706 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
707 movdqa_r2m (xmm3, *(col+4*8)); /* save y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
708 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
709 movdqa_r2m (xmm4, *(col+3*8)); /* save y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
710 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
711 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
712 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
713 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
714 /* MMX column IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
715 static inline void idct_col (int16_t * const col, const int offset) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
716 { |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
717 static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
718 static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
719 static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
720 static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; |
1 | 721 |
722 /* column code adapted from peter gubanov */ | |
723 /* http://www.elecard.com/peter/idct.shtml */ | |
724 | |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
725 movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ |
1 | 726 |
9852 | 727 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ |
728 movq_r2r (mm0, mm2); /* mm2 = T1 */ | |
1 | 729 |
9852 | 730 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ |
731 pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ | |
1 | 732 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
733 movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ |
9852 | 734 pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ |
1 | 735 |
9852 | 736 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ |
737 movq_r2r (mm5, mm7); /* mm7 = T3-1 */ | |
1 | 738 |
9852 | 739 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ |
740 psubsw_r2r (mm4, mm0); /* mm0 = v17 */ | |
1 | 741 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
742 movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ |
9852 | 743 pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ |
1 | 744 |
9852 | 745 paddsw_r2r (mm2, mm1); /* mm1 = u17 */ |
746 pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ | |
1 | 747 |
36 | 748 /* slot */ |
1 | 749 |
9852 | 750 movq_r2r (mm4, mm2); /* mm2 = T2 */ |
751 paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ | |
1 | 752 |
9852 | 753 pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ |
754 paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ | |
1 | 755 |
9852 | 756 psubsw_r2r (mm6, mm5); /* mm5 = v35 */ |
757 paddsw_r2r (mm3, mm7); /* mm7 = u35 */ | |
1 | 758 |
9852 | 759 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ |
760 movq_r2r (mm0, mm6); /* mm6 = v17 */ | |
1 | 761 |
9852 | 762 pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ |
763 psubsw_r2r (mm5, mm0); /* mm0 = b3 */ | |
1 | 764 |
9852 | 765 psubsw_r2r (mm3, mm4); /* mm4 = v26 */ |
766 paddsw_r2r (mm6, mm5); /* mm5 = v12 */ | |
1 | 767 |
9852 | 768 movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ |
769 movq_r2r (mm1, mm6); /* mm6 = u17 */ | |
1 | 770 |
9852 | 771 paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ |
772 paddsw_r2r (mm7, mm6); /* mm6 = b0 */ | |
1 | 773 |
9852 | 774 psubsw_r2r (mm7, mm1); /* mm1 = u12 */ |
775 movq_r2r (mm1, mm7); /* mm7 = u12 */ | |
1 | 776 |
9852 | 777 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ |
778 paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ | |
1 | 779 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
780 movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */ |
9852 | 781 psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ |
1 | 782 |
9852 | 783 movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ |
784 pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ | |
1 | 785 |
9852 | 786 movq_r2r (mm4, mm6); /* mm6 = v26 */ |
787 pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ | |
1 | 788 |
9852 | 789 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ |
790 movq_r2r (mm3, mm0); /* mm0 = x0 */ | |
1 | 791 |
9852 | 792 psubsw_r2r (mm5, mm3); /* mm3 = v04 */ |
793 paddsw_r2r (mm5, mm0); /* mm0 = u04 */ | |
1 | 794 |
9852 | 795 paddsw_r2r (mm3, mm4); /* mm4 = a1 */ |
796 movq_r2r (mm0, mm5); /* mm5 = u04 */ | |
1 | 797 |
9852 | 798 psubsw_r2r (mm6, mm3); /* mm3 = a2 */ |
799 paddsw_r2r (mm2, mm5); /* mm5 = a0 */ | |
1 | 800 |
9852 | 801 paddsw_r2r (mm1, mm1); /* mm1 = b1 */ |
802 psubsw_r2r (mm2, mm0); /* mm0 = a3 */ | |
1 | 803 |
9852 | 804 paddsw_r2r (mm7, mm7); /* mm7 = b2 */ |
805 movq_r2r (mm3, mm2); /* mm2 = a2 */ | |
1 | 806 |
9852 | 807 movq_r2r (mm4, mm6); /* mm6 = a1 */ |
808 paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ | |
1 | 809 |
9852 | 810 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ |
811 paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ | |
1 | 812 |
9852 | 813 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ |
814 psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ | |
1 | 815 |
9852 | 816 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ |
817 psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ | |
1 | 818 |
9852 | 819 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ |
820 movq_r2r (mm5, mm7); /* mm7 = a0 */ | |
1 | 821 |
9852 | 822 movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ |
823 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ | |
1 | 824 |
9852 | 825 movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ |
826 paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ | |
1 | 827 |
9852 | 828 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ |
829 psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ | |
1 | 830 |
9852 | 831 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ |
832 movq_r2r (mm0, mm3); /* mm3 = a3 */ | |
1 | 833 |
9852 | 834 movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ |
835 psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ | |
1 | 836 |
9852 | 837 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ |
838 paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ | |
1 | 839 |
9852 | 840 movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ |
841 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ | |
1 | 842 |
9852 | 843 movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ |
844 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ | |
1 | 845 |
9852 | 846 movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ |
1 | 847 |
9852 | 848 movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ |
1 | 849 |
9852 | 850 movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ |
1 | 851 } |
852 | |
853 | |
9852 | 854 static const int32_t rounder0[] ATTR_ALIGN(8) = |
1 | 855 rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
9852 | 856 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); |
857 static const int32_t rounder1[] ATTR_ALIGN(8) = | |
36 | 858 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
9852 | 859 static const int32_t rounder7[] ATTR_ALIGN(8) = |
36 | 860 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
9852 | 861 static const int32_t rounder2[] ATTR_ALIGN(8) = |
36 | 862 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
9852 | 863 static const int32_t rounder6[] ATTR_ALIGN(8) = |
36 | 864 rounder (-0.25); /* C2 * (C6-C2)/2 */ |
9852 | 865 static const int32_t rounder3[] ATTR_ALIGN(8) = |
36 | 866 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
9852 | 867 static const int32_t rounder5[] ATTR_ALIGN(8) = |
36 | 868 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
1 | 869 |
870 | |
/*
 * declare_idct: emit "static inline void idct (int16_t * const block)".
 *
 *   idct           name of the generated function
 *   table          macro that builds a coefficient table from 7 constants
 *   idct_row_head  invoked once before the first idct_row
 *   idct_row       invoked once per row with (table, rounder)
 *   idct_row_mid   invoked between consecutive idct_row calls with
 *                  (block, offset of the row just done, offset of the
 *                  next row, table for the next row)
 *   idct_row_tail  invoked once after the last idct_row
 *
 * Rows are visited in the order 0, 4, 1, 7, 2, 6, 3, 5 so that each table
 * (04, 17, 26, 35) serves two consecutive rows.  The generated function
 * finishes with idct_col (block, 0) and idct_col (block, 4).
 */
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
static inline void idct (int16_t * const block)				\
{									\
    static const int16_t table04[] ATTR_ALIGN(16) =			\
	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
    static const int16_t table17[] ATTR_ALIGN(16) =			\
	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
    static const int16_t table26[] ATTR_ALIGN(16) =			\
	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
    static const int16_t table35[] ATTR_ALIGN(16) =			\
	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
									\
    idct_row_head (block, 0*8, table04);				\
    idct_row (table04, rounder0);					\
    idct_row_mid (block, 0*8, 4*8, table04);				\
    idct_row (table04, rounder4);					\
    idct_row_mid (block, 4*8, 1*8, table17);				\
    idct_row (table17, rounder1);					\
    idct_row_mid (block, 1*8, 7*8, table17);				\
    idct_row (table17, rounder7);					\
    idct_row_mid (block, 7*8, 2*8, table26);				\
    idct_row (table26, rounder2);					\
    idct_row_mid (block, 2*8, 6*8, table26);				\
    idct_row (table26, rounder6);					\
    idct_row_mid (block, 6*8, 3*8, table35);				\
    idct_row (table35, rounder3);					\
    idct_row_mid (block, 3*8, 5*8, table35);				\
    idct_row (table35, rounder5);					\
    idct_row_tail (block, 5*8);						\
									\
    idct_col (block, 0);						\
    idct_col (block, 4);						\
}
904 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
905 static inline void sse2_idct (int16_t * const block) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
906 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
907 static const int16_t table04[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
908 sse2_table (22725, 21407, 19266, 16384, 12873, 8867, 4520); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
909 static const int16_t table17[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
910 sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
911 static const int16_t table26[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
912 sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
913 static const int16_t table35[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
914 sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
915 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
916 static const int32_t rounder0_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
917 rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
918 static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
919 static const int32_t rounder1_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
920 rounder_sse2 (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
921 static const int32_t rounder7_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
922 rounder_sse2 (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
923 static const int32_t rounder2_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
924 rounder_sse2 (0.60355339059); /* C2 * (C6+C2)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
925 static const int32_t rounder6_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
926 rounder_sse2 (-0.25); /* C2 * (C6-C2)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
927 static const int32_t rounder3_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
928 rounder_sse2 (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
929 static const int32_t rounder5_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
930 rounder_sse2 (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
931 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
932 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
933 movdqa_m2r (block[0*8], xmm8); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
934 movdqa_m2r (block[4*8], xmm12); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
935 SSE2_IDCT_2ROW (table04, xmm8, xmm12, *rounder0_128, *rounder4_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
936 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
937 movdqa_m2r (block[1*8], xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
938 movdqa_m2r (block[7*8], xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
939 SSE2_IDCT_2ROW (table17, xmm9, xmm15, *rounder1_128, *rounder7_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
940 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
941 movdqa_m2r (block[2*8], xmm10); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
942 movdqa_m2r (block[6*8], xmm14); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
943 SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
944 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
945 movdqa_m2r (block[3*8], xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
946 movdqa_m2r (block[5*8], xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
947 SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
948 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
949 /* OUTPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
950 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
951 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
952 movdqa_m2r (block[0*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
953 movdqa_m2r (block[4*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
954 SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
955 movdqa_r2m (xmm0, block[0*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
956 movdqa_r2m (xmm4, block[4*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
957 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
958 movdqa_m2r (block[1*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
959 movdqa_m2r (block[7*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
960 SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
961 movdqa_r2m (xmm0, block[1*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
962 movdqa_r2m (xmm4, block[7*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
963 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
964 movdqa_m2r (block[2*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
965 movdqa_m2r (block[6*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
966 SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
967 movdqa_r2m (xmm0, block[2*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
968 movdqa_r2m (xmm4, block[6*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
969 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
970 movdqa_m2r (block[3*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
971 movdqa_m2r (block[5*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
972 SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
973 movdqa_r2m (xmm0, block[3*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
974 movdqa_r2m (xmm4, block[5*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
975 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
976 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
977 sse2_idct_col (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
978 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
979 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
980 static void sse2_block_copy (int16_t * const block, uint8_t * dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
981 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
982 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
983 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
984 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
985 packuswb_r2r (xmm8, xmm8); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
986 packuswb_r2r (xmm9, xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
987 movq_r2m (xmm8, *(dest+0*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
988 packuswb_r2r (xmm10, xmm10); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
989 movq_r2m (xmm9, *(dest+1*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
990 packuswb_r2r (xmm11, xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
991 movq_r2m (xmm10, *(dest+2*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
992 packuswb_r2r (xmm12, xmm12); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
993 movq_r2m (xmm11, *(dest+3*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
994 packuswb_r2r (xmm13, xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
995 movq_r2m (xmm12, *(dest+4*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
996 packuswb_r2r (xmm14, xmm14); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
997 movq_r2m (xmm13, *(dest+5*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
998 packuswb_r2r (xmm15, xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
999 movq_r2m (xmm14, *(dest+6*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1000 movq_r2m (xmm15, *(dest+7*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1001 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1002 movdqa_m2r (*(block+0*8), xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1003 movdqa_m2r (*(block+1*8), xmm1); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1004 movdqa_m2r (*(block+2*8), xmm2); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1005 packuswb_r2r (xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1006 movdqa_m2r (*(block+3*8), xmm3); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1007 packuswb_r2r (xmm1, xmm1); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1008 movdqa_m2r (*(block+4*8), xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1009 packuswb_r2r (xmm2, xmm2); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1010 movdqa_m2r (*(block+5*8), xmm5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1011 packuswb_r2r (xmm3, xmm3); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1012 movdqa_m2r (*(block+6*8), xmm6); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1013 packuswb_r2r (xmm4, xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1014 movdqa_m2r (*(block+7*8), xmm7); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1015 movq_r2m (xmm0, *(dest+0*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1016 packuswb_r2r (xmm5, xmm5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1017 movq_r2m (xmm1, *(dest+1*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1018 packuswb_r2r (xmm6, xmm6); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1019 movq_r2m (xmm2, *(dest+2*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1020 packuswb_r2r (xmm7, xmm7); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1021 movq_r2m (xmm3, *(dest+3*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1022 movq_r2m (xmm4, *(dest+4*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1023 movq_r2m (xmm5, *(dest+5*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1024 movq_r2m (xmm6, *(dest+6*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1025 movq_r2m (xmm7, *(dest+7*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1026 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1027 } |
1 | 1028 |
/*
 * One pipelined step of block_copy.  Uses `block`, `dest` and `stride`
 * from the enclosing scope and advances `dest` by one row.
 *
 *   off   element offset of the row to start (its halves are at
 *         block+off and block+off+4)
 *   lo    register that receives the first half, then the packed row
 *   hi    register that receives the second half
 *   done  register holding the previously packed row; it is stored at
 *         the new `dest`
 */
#define COPY_MMX(off,lo,hi,done)		\
do {						\
    movq_m2r (*(block+off), lo);		\
    dest += stride;				\
    movq_m2r (*(block+off+4), hi);		\
    movq_r2m (done, *dest);			\
    packuswb_r2r (hi, lo);			\
} while (0)
1037 | |
9852 | 1038 static inline void block_copy (int16_t * const block, uint8_t * dest, |
1039 const int stride) | |
1 | 1040 { |
1041 movq_m2r (*(block+0*8), mm0); | |
1042 movq_m2r (*(block+0*8+4), mm1); | |
1043 movq_m2r (*(block+1*8), mm2); | |
1044 packuswb_r2r (mm1, mm0); | |
1045 movq_m2r (*(block+1*8+4), mm3); | |
1046 movq_r2m (mm0, *dest); | |
1047 packuswb_r2r (mm3, mm2); | |
1048 COPY_MMX (2*8, mm0, mm1, mm2); | |
1049 COPY_MMX (3*8, mm2, mm3, mm0); | |
1050 COPY_MMX (4*8, mm0, mm1, mm2); | |
1051 COPY_MMX (5*8, mm2, mm3, mm0); | |
1052 COPY_MMX (6*8, mm0, mm1, mm2); | |
1053 COPY_MMX (7*8, mm2, mm3, mm0); | |
1054 movq_r2m (mm2, *(dest+stride)); | |
1055 } | |
1056 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * Add two block rows to two rows of `dest` and store the packed result.
 * Uses `dest` and `stride` from the enclosing scope, expects xmm0 to
 * have been cleared by the caller (it is the unpack partner), clobbers
 * xmm1/xmm2 and advances `dest` by two rows.
 *
 *   op          r2r or m2r - selects paddsw_r2r / paddsw_m2r
 *   row0, row1  the two block rows (registers for r2r, memory for m2r)
 */
#define ADD_SSE2_2ROW(op, row0, row1)		\
do {						\
    movq_m2r (*(dest), xmm1);			\
    movq_m2r (*(dest+stride), xmm2);		\
    punpcklbw_r2r (xmm0, xmm1);			\
    punpcklbw_r2r (xmm0, xmm2);			\
    paddsw_##op (row0, xmm1);			\
    paddsw_##op (row1, xmm2);			\
    packuswb_r2r (xmm1, xmm1);			\
    packuswb_r2r (xmm2, xmm2);			\
    movq_r2m (xmm1, *(dest));			\
    movq_r2m (xmm2, *(dest+stride));		\
    dest += 2*stride;				\
} while (0)
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1071 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1072 static void sse2_block_add (int16_t * const block, uint8_t * dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1073 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1074 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1075 pxor_r2r(xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1076 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1077 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1078 ADD_SSE2_2ROW(r2r, xmm8, xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1079 ADD_SSE2_2ROW(r2r, xmm10, xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1080 ADD_SSE2_2ROW(r2r, xmm12, xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1081 ADD_SSE2_2ROW(r2r, xmm14, xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1082 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1083 ADD_SSE2_2ROW(m2r, *(block+0*8), *(block+1*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1084 ADD_SSE2_2ROW(m2r, *(block+2*8), *(block+3*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1085 ADD_SSE2_2ROW(m2r, *(block+4*8), *(block+5*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1086 ADD_SSE2_2ROW(m2r, *(block+6*8), *(block+7*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1087 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1088 } |
1 | 1089 |
/*
 * One pipelined step of block_add.  Uses `block`, `dest` and `stride`
 * from the enclosing scope, expects mm0 to be zero, and advances `dest`
 * by one row.
 *
 *   off       element offset of the block row being added
 *   lo, hi    receive the dest row two rows ahead of the incoming `dest`,
 *             unpacked against mm0 (low / high half) and summed with
 *             block+off / block+off+4
 *   plo, phi  sums left by the previous step; they are packed and stored
 *             at the new `dest`
 */
#define ADD_MMX(off,lo,hi,plo,phi)		\
do {						\
    movq_m2r (*(dest+2*stride), lo);		\
    packuswb_r2r (phi, plo);			\
    movq_r2r (lo, hi);				\
    dest += stride;				\
    movq_r2m (plo, *dest);			\
    punpcklbw_r2r (mm0, lo);			\
    paddsw_m2r (*(block+off), lo);		\
    punpckhbw_r2r (mm0, hi);			\
    paddsw_m2r (*(block+off+4), hi);		\
} while (0)
1102 | |
/*
 * block_add: add an 8x8 block of int16 values to an 8x8 area of bytes.
 *
 * block  - 64 int16 values, read as eight rows of eight (block+row*8).
 * dest   - first of eight byte rows; consecutive rows are `stride` apart.
 *          Passed by value, so the caller's pointer is not modified.
 * stride - distance between destination rows, in bytes.
 *
 * Rows 0 and 1 are loaded and summed up front; each ADD_MMX expansion then
 * stores the previous row while preparing the next one (rows 2..7), and the
 * last row is flushed after the final expansion.  Sums use signed-saturated
 * word adds and are packed back to bytes with unsigned saturation.
 * `block` is only read here.
 */
9852 | 1103 static inline void block_add (int16_t * const block, uint8_t * dest, |
1104 const int stride) | |
1 | 1105 { |
1106 movq_m2r (*dest, mm1); | |
/* mm0 = 0: zero source for the byte->word unpacks below and in ADD_MMX */
1107 pxor_r2r (mm0, mm0); | |
1108 movq_m2r (*(dest+stride), mm3); | |
1109 movq_r2r (mm1, mm2); | |
1110 punpcklbw_r2r (mm0, mm1); | |
1111 movq_r2r (mm3, mm4); | |
1112 paddsw_m2r (*(block+0*8), mm1); | |
1113 punpckhbw_r2r (mm0, mm2); | |
1114 paddsw_m2r (*(block+0*8+4), mm2); | |
1115 punpcklbw_r2r (mm0, mm3); | |
1116 paddsw_m2r (*(block+1*8), mm3); | |
1117 packuswb_r2r (mm2, mm1); | |
1118 punpckhbw_r2r (mm0, mm4); | |
1119 movq_r2m (mm1, *dest); | |
1120 paddsw_m2r (*(block+1*8+4), mm4); | |
/* each expansion stores one finished row and advances dest by stride */
1121 ADD_MMX (2*8, mm1, mm2, mm3, mm4); | |
1122 ADD_MMX (3*8, mm3, mm4, mm1, mm2); | |
1123 ADD_MMX (4*8, mm1, mm2, mm3, mm4); | |
1124 ADD_MMX (5*8, mm3, mm4, mm1, mm2); | |
1125 ADD_MMX (6*8, mm1, mm2, mm3, mm4); | |
1126 ADD_MMX (7*8, mm3, mm4, mm1, mm2); | |
/* flush the last row, still held as words in mm3/mm4 */
1127 packuswb_r2r (mm4, mm3); | |
1128 movq_r2m (mm3, *(dest+stride)); | |
1129 } | |
1130 | |
1131 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_block_zero: clear a 64-entry int16 block.
 *
 * Zeroes xmm0 and stores it eight times, at block+0*8 .. block+7*8.
 * NOTE(review): movdqa is the aligned 128-bit move, so `block` presumably
 * has to be 16-byte aligned -- confirm against the callers' buffers.
 */
1132 static inline void sse2_block_zero (int16_t * const block) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1133 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1134 pxor_r2r (xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1135 movdqa_r2m (xmm0, *(block+0*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1136 movdqa_r2m (xmm0, *(block+1*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1137 movdqa_r2m (xmm0, *(block+2*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1138 movdqa_r2m (xmm0, *(block+3*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1139 movdqa_r2m (xmm0, *(block+4*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1140 movdqa_r2m (xmm0, *(block+5*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1141 movdqa_r2m (xmm0, *(block+6*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1142 movdqa_r2m (xmm0, *(block+7*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1143 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1144 |
/*
 * block_zero: clear a 64-entry int16 block with MMX stores.
 *
 * Zeroes mm0 and writes it sixteen times, at block+0*4 .. block+15*4
 * (four int16 entries per 64-bit store).
 */
9852 | 1145 static inline void block_zero (int16_t * const block) |
1146 { | |
1147 pxor_r2r (mm0, mm0); | |
1148 movq_r2m (mm0, *(block+0*4)); | |
1149 movq_r2m (mm0, *(block+1*4)); | |
1150 movq_r2m (mm0, *(block+2*4)); | |
1151 movq_r2m (mm0, *(block+3*4)); | |
1152 movq_r2m (mm0, *(block+4*4)); | |
1153 movq_r2m (mm0, *(block+5*4)); | |
1154 movq_r2m (mm0, *(block+6*4)); | |
1155 movq_r2m (mm0, *(block+7*4)); | |
1156 movq_r2m (mm0, *(block+8*4)); | |
1157 movq_r2m (mm0, *(block+9*4)); | |
1158 movq_r2m (mm0, *(block+10*4)); | |
1159 movq_r2m (mm0, *(block+11*4)); | |
1160 movq_r2m (mm0, *(block+12*4)); | |
1161 movq_r2m (mm0, *(block+13*4)); | |
1162 movq_r2m (mm0, *(block+14*4)); | |
1163 movq_r2m (mm0, *(block+15*4)); | |
1164 } | |
1165 | |
1166 | |
/*
 * Values for the `cpu` argument of block_add_DC; dup4 compares `cpu`
 * against CPU_MMXEXT to choose between pshufw and the plain-MMX unpack pair.
 */
1167 #define CPU_MMXEXT 0 | |
1168 #define CPU_MMX 1 | |
1169 | |
/*
 * dup4(reg): replicate the low 16-bit word of MMX register `reg` into all
 * four of its words.
 *
 * Requires a variable named `cpu` in scope at the expansion site:
 *  - cpu == CPU_MMXEXT: a single pshufw with selector 0x00;
 *  - otherwise: punpcklwd followed by punpckldq of the register with itself.
 */
1170 #define dup4(reg) \ | |
1171 do { \ | |
1172 if (cpu != CPU_MMXEXT) { \ | |
1173 punpcklwd_r2r (reg, reg); \ | |
1174 punpckldq_r2r (reg, reg); \ | |
1175 } else \ | |
1176 pshufw_r2r (reg, reg, 0x00); \ | |
1177 } while (0) | |
1178 | |
/*
 * block_add_DC: add a single constant, derived from block[0], to an 8x8
 * area of bytes.
 *
 * block  - only block[0] is read; block[0] and block[63] are set to 0.
 *          NOTE(review): why block[63] needs clearing is not visible in this
 *          chunk (possibly a coefficient touched by the decoder) -- confirm.
 * dest   - first of eight byte rows, `stride` bytes apart (local copy is
 *          advanced; the caller's pointer is unaffected).
 * stride - distance between destination rows, in bytes.
 * cpu    - CPU_MMX or CPU_MMXEXT; only consulted by dup4.
 *
 * The constant is (block[0] + 64) >> 7.  It is broadcast to four words in
 * mm0 and its negation is formed in mm1; packing both with unsigned
 * saturation leaves the positive part in mm0 and the magnitude of the
 * negative part in mm1 (the other one being zero).  Every row is then
 * updated as  row = (row +sat mm0) -sat mm1  using unsigned saturating byte
 * add/subtract, with loads and stores of neighbouring rows interleaved.
 */
1179 static inline void block_add_DC (int16_t * const block, uint8_t * dest, | |
1180 const int stride, const int cpu) | |
1181 { | |
12932 | 1182 movd_v2r ((block[0] + 64) >> 7, mm0); |
9852 | 1183 pxor_r2r (mm1, mm1); |
1184 movq_m2r (*dest, mm2); | |
1185 dup4 (mm0); | |
/* mm1 = 0 - constant (signed saturating) */
1186 psubsw_r2r (mm0, mm1); | |
1187 packuswb_r2r (mm0, mm0); | |
1188 paddusb_r2r (mm0, mm2); | |
1189 packuswb_r2r (mm1, mm1); | |
1190 movq_m2r (*(dest + stride), mm3); | |
1191 psubusb_r2r (mm1, mm2); | |
1192 block[0] = 0; | |
1193 paddusb_r2r (mm0, mm3); | |
1194 movq_r2m (mm2, *dest); | |
1195 psubusb_r2r (mm1, mm3); | |
1196 movq_m2r (*(dest + 2*stride), mm2); | |
1197 dest += stride; | |
1198 movq_r2m (mm3, *dest); | |
1199 paddusb_r2r (mm0, mm2); | |
1200 movq_m2r (*(dest + 2*stride), mm3); | |
1201 psubusb_r2r (mm1, mm2); | |
1202 dest += stride; | |
1203 paddusb_r2r (mm0, mm3); | |
1204 movq_r2m (mm2, *dest); | |
1205 psubusb_r2r (mm1, mm3); | |
1206 movq_m2r (*(dest + 2*stride), mm2); | |
1207 dest += stride; | |
1208 movq_r2m (mm3, *dest); | |
1209 paddusb_r2r (mm0, mm2); | |
1210 movq_m2r (*(dest + 2*stride), mm3); | |
1211 psubusb_r2r (mm1, mm2); | |
1212 dest += stride; | |
1213 paddusb_r2r (mm0, mm3); | |
1214 movq_r2m (mm2, *dest); | |
1215 psubusb_r2r (mm1, mm3); | |
1216 movq_m2r (*(dest + 2*stride), mm2); | |
1217 dest += stride; | |
1218 movq_r2m (mm3, *dest); | |
1219 paddusb_r2r (mm0, mm2); | |
1220 movq_m2r (*(dest + 2*stride), mm3); | |
1221 psubusb_r2r (mm1, mm2); | |
1222 block[63] = 0; | |
1223 paddusb_r2r (mm0, mm3); | |
/* dest is at row 5 here: the last two stores go to rows 6 and 7 */
1224 movq_r2m (mm2, *(dest + stride)); | |
1225 psubusb_r2r (mm1, mm3); | |
1226 movq_r2m (mm3, *(dest + 2*stride)); | |
1227 } | |
1228 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * mpeg2_idct_copy_sse2: SSE2 "IDCT and copy" entry point.
 *
 * Calls sse2_idct on `block`, hands the block to sse2_block_copy together
 * with `dest`/`stride`, then clears the block with sse2_block_zero.
 * NOTE(review): sse2_idct and sse2_block_copy are defined outside this
 * chunk; presumably the former transforms the block in place and the latter
 * writes it as 8x8 bytes to dest -- confirm.
 */
1229 void mpeg2_idct_copy_sse2 (int16_t * const block, uint8_t * const dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1230 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1231 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1232 sse2_idct (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1233 sse2_block_copy (block, dest, stride); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1234 sse2_block_zero (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1235 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1236 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * mpeg2_idct_add_sse2: SSE2 "IDCT and add" entry point.
 *
 * Full path (sse2_idct, sse2_block_add, sse2_block_zero) is taken when
 * `last` is not 129, or when bits 4..6 of block[0] equal 4
 * ((block[0] & 0x70) == 0x40).  Otherwise only block[0] is applied through
 * block_add_DC, using the CPU_MMXEXT variant of dup4.
 * NOTE(review): 129 looks like the `last` value the decoder passes for a
 * block with only a DC coefficient, and the bit test presumably excludes
 * DC values for which the shortcut would round differently -- confirm
 * against the caller.
 */
1237 void mpeg2_idct_add_sse2 (const int last, int16_t * const block, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1238 uint8_t * const dest, const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1239 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1240 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1241 sse2_idct (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1242 sse2_block_add (block, dest, stride); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1243 sse2_block_zero (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1244 } else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1245 block_add_DC (block, dest, stride, CPU_MMXEXT); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1246 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1247 |
9852 | 1248 |
/*
 * Instantiate the IDCT for MMXEXT from the mmxext_* table and row helpers.
 * NOTE(review): declare_idct is defined outside this chunk; presumably it
 * expands to the definition of mmxext_idct() called below -- confirm.
 */
1 | 1249 declare_idct (mmxext_idct, mmxext_table, |
1250 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) | |
1251 | |
/*
 * mpeg2_idct_copy_mmxext: MMXEXT "IDCT and copy" entry point.
 *
 * Calls mmxext_idct on `block`, passes the block to block_copy with
 * `dest`/`stride`, then clears all 64 entries with block_zero.
 * NOTE(review): block_copy is defined outside this chunk; presumably it
 * writes the block as 8x8 bytes to dest -- confirm.
 */
9852 | 1252 void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest, |
1253 const int stride) | |
1 | 1254 { |
1255 mmxext_idct (block); | |
1256 block_copy (block, dest, stride); | |
9852 | 1257 block_zero (block); |
1 | 1258 } |
1259 | |
/*
 * mpeg2_idct_add_mmxext: MMXEXT "IDCT and add" entry point.
 *
 * Full path (mmxext_idct, block_add, block_zero) is taken when `last` is
 * not 129, or when (block[0] & 0x70) == 0x40.  Otherwise only block[0] is
 * applied via block_add_DC with cpu = CPU_MMXEXT (pshufw broadcast).
 * NOTE(review): 129 looks like the decoder's marker for a DC-only block --
 * confirm against the caller.
 */
9852 | 1260 void mpeg2_idct_add_mmxext (const int last, int16_t * const block, |
1261 uint8_t * const dest, const int stride) | |
1 | 1262 { |
12932 | 1263 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
9852 | 1264 mmxext_idct (block); |
1265 block_add (block, dest, stride); | |
1266 block_zero (block); | |
1267 } else | |
1268 block_add_DC (block, dest, stride, CPU_MMXEXT); | |
1 | 1269 } |
1270 | |
1271 | |
/*
 * Instantiate the IDCT for plain MMX from the mmx_* table and row helpers.
 * NOTE(review): declare_idct is defined outside this chunk; presumably it
 * expands to the definition of mmx_idct() called below -- confirm.
 */
1272 declare_idct (mmx_idct, mmx_table, | |
1273 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) | |
1274 | |
/*
 * mpeg2_idct_copy_mmx: plain-MMX "IDCT and copy" entry point.
 *
 * Calls mmx_idct on `block`, passes the block to block_copy with
 * `dest`/`stride`, then clears all 64 entries with block_zero.
 * NOTE(review): block_copy is defined outside this chunk; presumably it
 * writes the block as 8x8 bytes to dest -- confirm.
 */
9852 | 1275 void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest, |
1276 const int stride) | |
1 | 1277 { |
1278 mmx_idct (block); | |
1279 block_copy (block, dest, stride); | |
9852 | 1280 block_zero (block); |
1 | 1281 } |
1282 | |
/*
 * mpeg2_idct_add_mmx: plain-MMX "IDCT and add" entry point.
 *
 * Full path (mmx_idct, block_add, block_zero) is taken when `last` is not
 * 129, or when (block[0] & 0x70) == 0x40.  Otherwise only block[0] is
 * applied via block_add_DC with cpu = CPU_MMX (punpck-based broadcast).
 * NOTE(review): 129 looks like the decoder's marker for a DC-only block --
 * confirm against the caller.
 */
9852 | 1283 void mpeg2_idct_add_mmx (const int last, int16_t * const block, |
1284 uint8_t * const dest, const int stride) | |
1 | 1285 { |
12932 | 1286 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
9852 | 1287 mmx_idct (block); |
1288 block_add (block, dest, stride); | |
1289 block_zero (block); | |
1290 } else | |
1291 block_add_DC (block, dest, stride, CPU_MMX); | |
1 | 1292 } |
1293 | |
1294 | |
/*
 * mpeg2_idct_mmx_init: rewrite the global scan tables for the coefficient
 * order expected by the MMX/MMXEXT IDCT.
 *
 * For each of the 64 entries of mpeg2_scan_norm and mpeg2_scan_alt, bits
 * 3..5 are kept and the low three bits are rotated right by one
 * (bits 2..1 move to 1..0, bit 0 moves to bit 2).
 *
 * The tables are modified in place, so every call applies the permutation
 * again; the visible code has no guard against repeated calls.
 */
9852 | 1295 void mpeg2_idct_mmx_init (void) |
1 | 1296 { |
9852 | 1297 extern uint8_t mpeg2_scan_norm[64]; |
1298 extern uint8_t mpeg2_scan_alt[64]; | |
1 | 1299 int i, j; |
1300 | |
36 | 1301 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ |
1 | 1302 |
1303 for (i = 0; i < 64; i++) { | |
9852 | 1304 j = mpeg2_scan_norm[i]; |
1305 mpeg2_scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |
1306 j = mpeg2_scan_alt[i]; | |
1307 mpeg2_scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |
1 | 1308 } |
1309 } | |
1310 | |
1311 #endif |