Mercurial > mplayer.hg
annotate libmpeg2/idct_mmx.c @ 27450:4da9ce4d8327
Fix 'cast from pointer to integer of different size' on 64bit architectures. Casting to long should work for 32bit and 64bit and not make a difference to the boolean operation (since 'format' is always 32bit (int) the upper 32bit of 'arg' won't matter, but the compiler should be happy now. Casting both to unsigned makes sure the compiler isn't messing things up by sign-extending 'format' to 64bit before masking)
author | ranma |
---|---|
date | Sun, 24 Aug 2008 13:52:54 +0000 |
parents | 2506f1b0bdbe |
children | fd18fa10de53 |
rev | line source |
---|---|
1 | 1 /* |
2 * idct_mmx.c | |
10303 | 3 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> |
9852 | 4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
1 | 5 * |
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
9852 | 7 * See http://libmpeg2.sourceforge.net/ for updates. |
1 | 8 * |
9 * mpeg2dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * mpeg2dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
14732
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
22 * |
21526 | 23 * Modified for use with MPlayer, see libmpeg-0.4.1.diff for the exact changes. |
18783 | 24 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ |
14732
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
25 * $Id$ |
1 | 26 */ |
27 | |
28 #include "config.h" | |
29 | |
13864 | 30 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1 | 31 |
32 #include <inttypes.h> | |
33 | |
9852 | 34 #include "mpeg2.h" |
12932 | 35 #include "attributes.h" |
1 | 36 #include "mpeg2_internal.h" |
37 #include "mmx.h" | |
38 | |
/* Number of low-order bits removed by the arithmetic right shift that ends the row pass. */
12932 | 39 #define ROW_SHIFT 15 |
/* Right shift applied to every output of the column pass (see the reference idct_col). */
1 | 40 #define COL_SHIFT 6 |
41 | |
/* Fixed-point rounding bias: (bias + 0.5) scaled by 2^ROW_SHIFT, truncated to int. */
42 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) | |
/* Brace initializer with two copies of round(bias). */
43 #define rounder(bias) {round (bias), round (bias)} | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* Brace initializer with four copies of round(bias). */
44 #define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)} |
1 | 45 |
46 | |
47 #if 0 | |
25998 | 48 /* C row IDCT - it is just here to document the MMXEXT and MMX versions */ |
/*
 * Reference implementation of the 8-point row IDCT, transforming the eight
 * values at row+offset in place.
 *   row     - coefficient block; the row processed starts at row[offset]
 *   offset  - element offset of the row inside the block
 *   table   - cosine coefficients; only table[1] .. table[7] are read
 *   rounder - points at the bias added to each even-part sum before the shift
 * Every output is shifted right by ROW_SHIFT before being stored.
 */
1 | 49 static inline void idct_row (int16_t * row, int offset, |
50 int16_t * table, int32_t * rounder) | |
51 { | |
52 int C1, C2, C3, C4, C5, C6, C7; | |
53 int a0, a1, a2, a3, b0, b1, b2, b3; | |
54 | |
55 row += offset; | |
56 | |
57 C1 = table[1]; | |
58 C2 = table[2]; | |
59 C3 = table[3]; | |
60 C4 = table[4]; | |
61 C5 = table[5]; | |
62 C6 = table[6]; | |
63 C7 = table[7]; | |
64 | |
/* even part: inputs 0, 2, 4, 6 plus the rounding bias */
65 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; | |
66 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; | |
67 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; | |
68 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; | |
69 | |
/* odd part: inputs 1, 3, 5, 7 */
70 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; | |
71 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; | |
72 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; | |
73 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; | |
74 | |
/* butterfly: outputs 0..3 are a+b, outputs 4..7 are a-b in reverse order */
75 row[0] = (a0 + b0) >> ROW_SHIFT; | |
76 row[1] = (a1 + b1) >> ROW_SHIFT; | |
77 row[2] = (a2 + b2) >> ROW_SHIFT; | |
78 row[3] = (a3 + b3) >> ROW_SHIFT; | |
79 row[4] = (a3 - b3) >> ROW_SHIFT; | |
80 row[5] = (a2 - b2) >> ROW_SHIFT; | |
81 row[6] = (a1 - b1) >> ROW_SHIFT; | |
82 row[7] = (a0 - b0) >> ROW_SHIFT; | |
83 } | |
84 #endif | |
85 | |
86 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
87 /* SSE2 row IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * Expands seven cosine coefficients into the 32-entry initializer consumed by
 * SSE2_IDCT_2ROW. The entries form four groups of eight, read there as
 * table[0], table[1*8], table[2*8] and table[3*8] and multiplied (pmaddwd)
 * against the input pairs (x0,x2), (x4,x6), (x1,x3) and (x5,x7) respectively.
 * The first two groups build the even part a[], the last two the odd part b[].
 */
88 #define sse2_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
89 c4, -c6, c4, -c2, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
90 c4, c6, -c4, -c2, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
91 -c4, c2, c4, -c6, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
92 c1, c3, c3, -c7, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
93 c5, -c1, c7, -c5, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
94 c5, c7, -c1, -c5, \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
95 c7, c3, c3, -c1 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
96 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * Row IDCT of two rows at once, entirely in XMM registers.
 *   table          - 32-entry coefficient array built with sse2_table()
 *   row1, row2     - XMM registers holding the input rows; each is overwritten
 *                    with its eight packed 16-bit results
 *   round1, round2 - memory operands added (paddd) to the even part of row1
 *                    and row2 respectively
 * Scratch registers: xmm1, xmm2, xmm3 for row1 and xmm5, xmm6, xmm7 for row2,
 * so none of those may be passed as row1/row2. Comments tagged "1:" belong to
 * row1 and "2:" to row2; the two instruction streams are interleaved.
 * The expected input word order is the one shown on the "input" line below.
 */
97 #define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do { \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
98 /* no scheduling: trust in out of order execution */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
99 /* based on Intel AP-945 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
100 /* (http://cache-www.intel.com/cd/00/00/01/76/17680_w_idct.pdf) */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
101 \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
102 /* input */ /* 1: row1= x7 x5 x3 x1 x6 x4 x2 x0 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
103 pshufd_r2r (row1, xmm1, 0); /* 1: xmm1= x2 x0 x2 x0 x2 x0 x2 x0 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
104 pmaddwd_m2r (table[0], xmm1); /* 1: xmm1= x2*C + x0*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
105 pshufd_r2r (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1 x3 x1 x3 x1 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
106 pmaddwd_m2r (table[2*8], xmm3); /* 1: xmm3= x3*C + x1*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
107 pshufd_r2r (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4 x6 x4 x6 x4 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
108 pshufd_r2r (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5 x7 x5 x7 x5 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
109 pmaddwd_m2r (table[1*8], xmm2); /* 1: xmm2= x6*C + x4*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
110 paddd_m2r (round1, xmm1); /* 1: xmm1= x2*C + x0*C + round ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
111 pmaddwd_m2r (table[3*8], row1); /* 1: row1= x7*C + x5*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
112 pshufd_r2r (row2, xmm5, 0); /* 2: xmm5= x2 x0 x2 x0 x2 x0 x2 x0 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
113 pshufd_r2r (row2, xmm6, 0x55); /* 2: xmm6= x6 x4 x6 x4 x6 x4 x6 x4 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
114 pmaddwd_m2r (table[0], xmm5); /* 2: xmm5= x2*C + x0*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
115 paddd_r2r (xmm2, xmm1); /* 1: xmm1= a[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
116 movdqa_r2r (xmm1, xmm2); /* 1: xmm2= a[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
117 pshufd_r2r (row2, xmm7, 0xaa); /* 2: xmm7= x3 x1 x3 x1 x3 x1 x3 x1 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
118 pmaddwd_m2r (table[1*8], xmm6); /* 2: xmm6= x6*C + x4*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
119 paddd_r2r (xmm3, row1); /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
120 pshufd_r2r (row2, row2, 0xff); /* 2: row2= x7 x5 x7 x5 x7 x5 x7 x5 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
121 psubd_r2r (row1, xmm2); /* 1: xmm2= a[] - b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
122 pmaddwd_m2r (table[2*8], xmm7); /* 2: xmm7= x3*C + x1*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
123 paddd_r2r (xmm1, row1); /* 1: row1= a[] + b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
124 psrad_i2r (ROW_SHIFT, xmm2); /* 1: xmm2= result 4...7 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
125 paddd_m2r (round2, xmm5); /* 2: xmm5= x2*C + x0*C + round ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
126 pmaddwd_m2r (table[3*8], row2); /* 2: row2= x7*C + x5*C ... */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
127 paddd_r2r (xmm6, xmm5); /* 2: xmm5= a[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
128 movdqa_r2r (xmm5, xmm6); /* 2: xmm6= a[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
129 psrad_i2r (ROW_SHIFT, row1); /* 1: row1= result 0...3 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
130 pshufd_r2r (xmm2, xmm2, 0x1b); /* 1: [0 1 2 3] -> [3 2 1 0] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
131 packssdw_r2r (xmm2, row1); /* 1: row1= result[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
132 paddd_r2r (xmm7, row2); /* 2: row2= b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
133 psubd_r2r (row2, xmm6); /* 2: xmm6= a[] - b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
134 paddd_r2r (xmm5, row2); /* 2: row2= a[] + b[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
135 psrad_i2r (ROW_SHIFT, xmm6); /* 2: xmm6= result 4...7 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
136 psrad_i2r (ROW_SHIFT, row2); /* 2: row2= result 0...3 */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
137 pshufd_r2r (xmm6, xmm6, 0x1b); /* 2: [0 1 2 3] -> [3 2 1 0] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
138 packssdw_r2r (xmm6, row2); /* 2: row2= result[] */ \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
139 } while (0) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
140 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
141 |
36 | 142 /* MMXEXT row IDCT */ |
1 | 143 |
/*
 * Expands seven cosine coefficients into the 32-entry initializer used by the
 * MMXEXT row code. The entries are laid out in groups of four in the order
 * the code fetches them: table+0 and table+4 in mmxext_row_head() /
 * mmxext_row_mid(), then table+8, +12, +16, +20, +24 and +28 in mmxext_row().
 */
144 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ | |
145 c4, c6, c4, c6, \ | |
146 c1, c3, -c1, -c5, \ | |
147 c5, c7, c3, -c7, \ | |
148 c4, -c6, c4, -c6, \ | |
149 -c4, c2, c4, -c2, \ | |
150 c5, -c1, c3, -c1, \ | |
151 c7, c3, c7, -c5 } | |
152 | |
/*
 * First stage of the MMXEXT row IDCT for the row stored at row+offset.
 * Loads the two 4-word halves of the row into mm2 and mm5, keeps copies in
 * mm0 and mm6, fetches table[0..3] into mm3 and table[4..7] into mm4 and
 * issues the first multiply-add. Nothing is written to memory; the register
 * contents on exit are the implicit input of mmxext_row().
 * NOTE(review): the per-line comments label the first half as x0 x2 x4 x6 and
 * the second as x1 x3 x5 x7, so the row is presumably stored in that permuted
 * order by the caller -- confirm against the code that fills the block.
 */
9852 | 153 static inline void mmxext_row_head (int16_t * const row, const int offset, |
154 const int16_t * const table) | |
1 | 155 { |
9852 | 156 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 157 |
9852 | 158 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
159 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 160 |
9852 | 161 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
162 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 163 |
9852 | 164 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
165 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ | |
1 | 166 |
9852 | 167 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
1 | 168 } |
169 | |
/*
 * Core of the MMXEXT row IDCT. It takes no row pointer: it works on the
 * register state left by mmxext_row_head() or mmxext_row_mid(), reads
 * table[8..31], and adds the pair of 32-bit values at *rounder to the even
 * part (mm3 and mm0).
 * On exit mm1 (y1 y0) and mm3 (y6 y7) have already been shifted by ROW_SHIFT,
 * while mm0 (a3+b3 a2+b2) and mm4 (a3-b3 a2-b2) still need that shift; it is
 * applied by mmxext_row_tail() or mmxext_row_mid().
 */
9852 | 170 static inline void mmxext_row (const int16_t * const table, |
171 const int32_t * const rounder) | |
1 | 172 { |
9852 | 173 movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ |
174 pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ | |
1 | 175 |
9852 | 176 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ |
177 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ | |
1 | 178 |
9852 | 179 movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ |
180 pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ | |
1 | 181 |
9852 | 182 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
183 pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ | |
1 | 184 |
9852 | 185 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ |
186 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 187 |
9852 | 188 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ |
189 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 190 |
9852 | 191 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ |
192 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 193 |
9852 | 194 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
195 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 196 |
9852 | 197 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
198 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 199 |
9852 | 200 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
201 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 202 |
9852 | 203 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
204 movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ | |
1 | 205 |
9852 | 206 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
207 psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ | |
1 | 208 } |
209 | |
/*
 * Last stage of the MMXEXT row IDCT. Applies the outstanding ROW_SHIFT to mm0
 * and mm4, packs the 32-bit results to 16-bit words with signed saturation,
 * swaps adjacent words of the upper half (pshufw 0xb1) to restore y4..y7
 * order, and stores the eight outputs at row+store and row+store+4.
 * Expects the register state produced by mmxext_row().
 */
9852 | 210 static inline void mmxext_row_tail (int16_t * const row, const int store) |
1 | 211 { |
9852 | 212 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 213 |
9852 | 214 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ |
1 | 215 |
9852 | 216 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 217 |
9852 | 218 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
1 | 219 |
9852 | 220 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
221 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 222 |
36 | 223 /* slot */ |
1 | 224 |
9852 | 225 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 226 } |
227 | |
/*
 * Fused mmxext_row_tail() + mmxext_row_head(): finishes the row currently in
 * flight and stores it at row+store, while loading the next row from
 * row+offset and starting its first multiply-add. The two instruction
 * sequences are interleaved; the register state on exit is the same as after
 * mmxext_row_head(), ready for mmxext_row().
 */
9852 | 228 static inline void mmxext_row_mid (int16_t * const row, const int store, |
229 const int offset, | |
230 const int16_t * const table) | |
1 | 231 { |
9852 | 232 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
233 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 234 |
9852 | 235 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
236 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ | |
1 | 237 |
9852 | 238 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
239 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 240 |
9852 | 241 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ |
242 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 243 |
9852 | 244 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
245 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ | |
1 | 246 |
9852 | 247 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
248 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ | |
1 | 249 |
9852 | 250 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
1 | 251 |
9852 | 252 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
253 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ | |
1 | 254 } |
255 | |
256 | |
36 | 257 /* MMX row IDCT */ |
1 | 258 |
/*
 * Expands seven cosine coefficients into the 32-entry initializer used by the
 * plain-MMX row code. The entries are laid out in groups of four in the order
 * the code fetches them: table+0, +4 and +8 in mmx_row_head() /
 * mmx_row_mid(), then table+12, +16, +20, +24 and +28 in mmx_row().
 */
259 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ | |
260 c4, c6, -c4, -c2, \ | |
261 c1, c3, c3, -c7, \ | |
262 c5, c7, -c1, -c5, \ | |
263 c4, -c6, c4, -c2, \ | |
264 -c4, c2, c4, -c6, \ | |
265 c5, -c1, c7, -c5, \ | |
266 c7, c3, c3, -c1 } | |
267 | |
/*
 * First stage of the plain-MMX row IDCT for the row stored at row+offset.
 * Loads the two 4-word halves of the row into mm2 and mm5 (copies in mm0 and
 * mm6), duplicates the low pair into both halves of mm0 and the high pair
 * into both halves of mm2, preloads table[0..3], table[4..7] and table[8..11]
 * into mm3, mm4 and mm1, and issues the first multiply-add. Nothing is written
 * to memory; the register contents on exit are the implicit input of
 * mmx_row().
 * NOTE(review): as in the MMXEXT variant, the comments imply the row is stored
 * as x0 x2 x4 x6 followed by x1 x3 x5 x7 -- confirm against the caller.
 */
9852 | 268 static inline void mmx_row_head (int16_t * const row, const int offset, |
269 const int16_t * const table) | |
1 | 270 { |
9852 | 271 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
1 | 272 |
9852 | 273 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
274 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 275 |
9852 | 276 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
277 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 278 |
9852 | 279 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
1 | 280 |
9852 | 281 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
282 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 283 |
9852 | 284 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
285 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 286 } |
287 | |
/*
 * Core of the plain-MMX row IDCT. It takes no row pointer: it works on the
 * register state left by mmx_row_head() or mmx_row_mid(), reads
 * table[12..31], and adds the pair of 32-bit values at *rounder to the even
 * part (mm3 and mm0).
 * On exit mm1 (y1 y0) and mm3 (y6 y7) have already been shifted by ROW_SHIFT,
 * while mm0 (a3+b3 a2+b2) and mm7 (a3-b3 a2-b2) still need that shift; it is
 * applied by mmx_row_tail() or mmx_row_mid().
 */
9852 | 288 static inline void mmx_row (const int16_t * const table, |
289 const int32_t * const rounder) | |
1 | 290 { |
9852 | 291 pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ |
292 punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ | |
1 | 293 |
9852 | 294 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ |
295 punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ | |
1 | 296 |
9852 | 297 movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ |
298 pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ | |
1 | 299 |
9852 | 300 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
301 pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ | |
1 | 302 |
9852 | 303 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ |
304 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ | |
1 | 305 |
9852 | 306 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ |
307 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ | |
1 | 308 |
9852 | 309 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ |
310 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ | |
1 | 311 |
9852 | 312 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
313 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ | |
1 | 314 |
9852 | 315 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ |
316 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ | |
1 | 317 |
9852 | 318 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ |
319 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ | |
1 | 320 |
9852 | 321 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ |
322 movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ | |
1 | 323 |
9852 | 324 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ |
325 psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ | |
1 | 326 } |
327 | |
/*
 * Last stage of the plain-MMX row IDCT. Applies the outstanding ROW_SHIFT to
 * mm0 and mm7, packs the 32-bit results to 16-bit words with signed
 * saturation and stores the eight outputs at row+store and row+store+4.
 * The upper half comes out of the pack as y6 y7 y4 y5; adjacent words are
 * swapped with a shift-left / shift-right / OR sequence, where the MMXEXT
 * variant uses a single pshufw. Expects the register state of mmx_row().
 */
9852 | 328 static inline void mmx_row_tail (int16_t * const row, const int store) |
1 | 329 { |
9852 | 330 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
1 | 331 |
9852 | 332 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ |
1 | 333 |
9852 | 334 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
1 | 335 |
9852 | 336 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
1 | 337 |
9852 | 338 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
339 movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ | |
1 | 340 |
9852 | 341 pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ |
1 | 342 |
9852 | 343 psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ |
1 | 344 |
9852 | 345 por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ |
1 | 346 |
36 | 347 /* slot */ |
1 | 348 |
9852 | 349 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
1 | 350 } |
351 | |
/*
 * Fused mmx_row_tail() + mmx_row_head(): finishes the row currently in flight
 * and stores it at row+store, while loading the next row from row+offset,
 * preloading table[0..11] and starting the first multiply-add. The two
 * instruction sequences are interleaved; mm1 doubles as the scratch register
 * for the word swap before being reloaded from table+8. The register state on
 * exit is the same as after mmx_row_head(), ready for mmx_row().
 */
9852 | 352 static inline void mmx_row_mid (int16_t * const row, const int store, |
353 const int offset, const int16_t * const table) | |
1 | 354 { |
9852 | 355 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
356 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ | |
1 | 357 |
9852 | 358 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
359 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ | |
1 | 360 |
9852 | 361 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ |
362 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ | |
1 | 363 |
9852 | 364 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ |
365 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ | |
1 | 366 |
9852 | 367 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
368 movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ | |
1 | 369 |
9852 | 370 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
371 psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ | |
1 | 372 |
9852 | 373 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
374 pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ | |
1 | 375 |
9852 | 376 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
377 por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ | |
1 | 378 |
9852 | 379 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
380 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ | |
1 | 381 |
9852 | 382 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
383 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ | |
1 | 384 } |
385 | |
386 | |
387 #if 0 | |
25998 | 388 /* C column IDCT - it is just here to document the MMXEXT and MMX versions */ |
/*
 * Reference implementation of the 8-point column IDCT, transforming in place
 * the column that starts at col[offset] and whose elements are 8 entries
 * apart (col[0*8] .. col[7*8]).
 * All intermediates are 16-bit and every add/subtract goes through S(), so
 * the arithmetic saturates instead of wrapping. Products use F(), which keeps
 * only the high part of the multiplication (>> 16). Each output is shifted
 * right by COL_SHIFT before being stored.
 * Uses the constants T1, T2, T3 and C4 defined elsewhere in this file.
 */
1 | 389 static inline void idct_col (int16_t * col, int offset) |
390 { | |
36 | 391 /* multiplication - as implemented on mmx */ |
1 | 392 #define F(c,x) (((c) * (x)) >> 16) |
393 | |
36 | 394 /* saturation - it helps us handle torture test cases */ |
1 | 395 #define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) |
396 | |
397 int16_t x0, x1, x2, x3, x4, x5, x6, x7; | |
398 int16_t y0, y1, y2, y3, y4, y5, y6, y7; | |
399 int16_t a0, a1, a2, a3, b0, b1, b2, b3; | |
400 int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; | |
401 | |
402 col += offset; | |
403 | |
404 x0 = col[0*8]; | |
405 x1 = col[1*8]; | |
406 x2 = col[2*8]; | |
407 x3 = col[3*8]; | |
408 x4 = col[4*8]; | |
409 x5 = col[5*8]; | |
410 x6 = col[6*8]; | |
411 x7 = col[7*8]; | |
412 | |
/* even part: inputs 0, 4 and 2, 6 */
413 u04 = S (x0 + x4); | |
414 v04 = S (x0 - x4); | |
36 | 415 u26 = S (F (T2, x6) + x2); |
416 v26 = S (F (T2, x2) - x6); | |
1 | 417 |
418 a0 = S (u04 + u26); | |
419 a1 = S (v04 + v26); | |
420 a2 = S (v04 - v26); | |
421 a3 = S (u04 - u26); | |
422 | |
/* odd part: inputs 1, 7 and 3, 5 */
36 | 423 u17 = S (F (T1, x7) + x1); |
424 v17 = S (F (T1, x1) - x7); | |
425 u35 = S (F (T3, x5) + x3); | |
426 v35 = S (F (T3, x3) - x5); | |
1 | 427 |
428 b0 = S (u17 + u35); | |
429 b3 = S (v17 - v35); | |
430 u12 = S (u17 - u35); | |
431 v12 = S (v17 + v35); | |
/* the factor 2 compensates for the >> 16 inside F() applied to C4 */
36 | 432 u12 = S (2 * F (C4, u12)); |
433 v12 = S (2 * F (C4, v12)); | |
1 | 434 b1 = S (u12 + v12); |
435 b2 = S (u12 - v12); | |
436 | |
/* butterfly: outputs 0..3 are a+b, outputs 4..7 are a-b in reverse order */
437 y0 = S (a0 + b0) >> COL_SHIFT; | |
438 y1 = S (a1 + b1) >> COL_SHIFT; | |
439 y2 = S (a2 + b2) >> COL_SHIFT; | |
440 y3 = S (a3 + b3) >> COL_SHIFT; | |
441 | |
442 y4 = S (a3 - b3) >> COL_SHIFT; | |
443 y5 = S (a2 - b2) >> COL_SHIFT; | |
444 y6 = S (a1 - b1) >> COL_SHIFT; | |
445 y7 = S (a0 - b0) >> COL_SHIFT; | |
446 | |
447 col[0*8] = y0; | |
448 col[1*8] = y1; | |
449 col[2*8] = y2; | |
450 col[3*8] = y3; | |
451 col[4*8] = y4; | |
452 col[5*8] = y5; | |
453 col[6*8] = y6; | |
454 col[7*8] = y7; | |
455 } | |
456 #endif | |
457 | |
458 | |
/*
 * Fixed-point constants for the column IDCT. Numerically:
 *   T1 = tan(1*pi/16) * 2^16,  T2 = tan(2*pi/16) * 2^16,
 *   T3 = tan(3*pi/16) * 2^16,  C4 = cos(pi/4)    * 2^15.
 * T3 exceeds 32767, so it does not fit a signed 16-bit lane as-is; the SIMD
 * column code's comments treat the stored value as "T3-1" and add the operand
 * back after the multiply.
 */
459 #define T1 13036 | |
460 #define T2 27146 | |
461 #define T3 43790 | |
462 #define C4 23170 | |
463 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
464 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
465 /* SSE2 column IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
466 static inline void sse2_idct_col (int16_t * const col) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
467 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
468 /* Almost identical to mmxext version: */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
469 /* just do both 4x8 columns in parallel */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
470 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
471 static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
472 static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
473 static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
474 static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4}; |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
475 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
476 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
477 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
478 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
479 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
480 movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
481 movdqa_r2r (xmm9, xmm1); /* xmm1 = x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
482 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
483 movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
484 pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
485 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
486 movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
487 pmulhw_r2r (xmm15, xmm2); /* xmm2 = T1*x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
488 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
489 movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
490 psubsw_r2r (xmm15, xmm0); /* xmm0 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
491 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
492 movdqa_m2r (*t2_vector, xmm9); /* xmm9 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
493 pmulhw_r2r (xmm11, xmm5); /* xmm5 = (T3-1)*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
494 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
495 paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
496 pmulhw_r2r (xmm13, xmm7); /* xmm7 = (T3-1)*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
497 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
498 movdqa_r2r (xmm9, xmm2); /* xmm2 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
499 paddsw_r2r (xmm11, xmm5); /* xmm5 = T3*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
500 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
501 pmulhw_r2r (xmm10, xmm9); /* xmm9 = T2*x2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
502 paddsw_r2r (xmm13, xmm7); /* xmm7 = T3*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
503 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
504 psubsw_r2r (xmm13, xmm5); /* xmm5 = v35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
505 paddsw_r2r (xmm11, xmm7); /* xmm7 = u35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
506 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
507 movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
508 pmulhw_r2r (xmm14, xmm2); /* xmm2 = T2*x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
509 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
510 psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
511 psubsw_r2r (xmm14, xmm9); /* xmm9 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
512 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
513 paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
514 movdqa_r2r (xmm0, xmm11); /* xmm11 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
515 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
516 movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
517 paddsw_r2r (xmm10, xmm2); /* xmm2 = u26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
518 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
519 paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
520 psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
521 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
522 movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
523 paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
524 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
525 movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
526 psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
527 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
528 movdqa_r2r (xmm6, xmm4); /* xmm4 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
529 pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
530 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
531 movdqa_r2r (xmm9, xmm6); /* xmm6 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
532 pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
533 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
534 movdqa_r2r (xmm8, xmm10); /* xmm10 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
535 movdqa_r2r (xmm8, xmm0); /* xmm0 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
536 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
537 psubsw_r2r (xmm12, xmm10); /* xmm10 = v04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
538 paddsw_r2r (xmm12, xmm0); /* xmm0 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
539 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
540 paddsw_r2r (xmm10, xmm9); /* xmm9 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
541 movdqa_r2r (xmm0, xmm8); /* xmm8 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
542 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
543 psubsw_r2r (xmm6, xmm10); /* xmm10 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
544 paddsw_r2r (xmm2, xmm8); /* xmm8 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
545 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
546 paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
547 psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
548 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
549 paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
550 movdqa_r2r (xmm10, xmm13); /* xmm13 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
551 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
552 movdqa_r2r (xmm9, xmm14); /* xmm14 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
553 paddsw_r2r (xmm7, xmm10); /* xmm10 = a2+b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
554 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
555 psraw_i2r (COL_SHIFT,xmm10); /* xmm10 = y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
556 paddsw_r2r (xmm1, xmm9); /* xmm9 = a1+b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
557 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
558 psraw_i2r (COL_SHIFT, xmm9); /* xmm9 = y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
559 psubsw_r2r (xmm1, xmm14); /* xmm14 = a1-b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
560 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
561 psubsw_r2r (xmm7, xmm13); /* xmm13 = a2-b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
562 psraw_i2r (COL_SHIFT,xmm14); /* xmm14 = y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
563 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
564 movdqa_r2r (xmm8, xmm15); /* xmm15 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
565 psraw_i2r (COL_SHIFT,xmm13); /* xmm13 = y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
566 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
567 paddsw_r2r (xmm4, xmm8); /* xmm8 = a0+b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
568 psubsw_r2r (xmm4, xmm15); /* xmm15 = a0-b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
569 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
570 psraw_i2r (COL_SHIFT, xmm8); /* xmm8 = y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
571 movdqa_r2r (xmm0, xmm12); /* xmm12 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
572 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
573 psubsw_r2r (xmm11, xmm12); /* xmm12 = a3-b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
574 psraw_i2r (COL_SHIFT,xmm15); /* xmm15 = y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
575 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
576 paddsw_r2r (xmm0, xmm11); /* xmm11 = a3+b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
577 psraw_i2r (COL_SHIFT,xmm12); /* xmm12 = y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
578 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
579 psraw_i2r (COL_SHIFT,xmm11); /* xmm11 = y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
580 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
581 /* OUTPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
582 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
583 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
584 movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
585 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
586 movdqa_m2r (*(col+1*8), xmm1); /* xmm1 = x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
587 movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
588 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
589 movdqa_m2r (*(col+7*8), xmm4); /* xmm4 = x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
590 pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
591 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
592 movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
593 pmulhw_r2r (xmm4, xmm2); /* xmm2 = T1*x7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
594 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
595 movdqa_m2r (*(col+5*8), xmm6); /* xmm6 = x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
596 movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
597 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
598 movdqa_m2r (*(col+3*8), xmm3); /* xmm3 = x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
599 psubsw_r2r (xmm4, xmm0); /* xmm0 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
600 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
601 movdqa_m2r (*t2_vector, xmm4); /* xmm4 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
602 pmulhw_r2r (xmm3, xmm5); /* xmm5 = (T3-1)*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
603 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
604 paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
605 pmulhw_r2r (xmm6, xmm7); /* xmm7 = (T3-1)*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
606 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
607 /* slot */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
608 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
609 movdqa_r2r (xmm4, xmm2); /* xmm2 = T2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
610 paddsw_r2r (xmm3, xmm5); /* xmm5 = T3*x3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
611 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
612 pmulhw_m2r (*(col+2*8), xmm4); /* xmm4 = T2*x2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
613 paddsw_r2r (xmm6, xmm7); /* xmm7 = T3*x5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
614 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
615 psubsw_r2r (xmm6, xmm5); /* xmm5 = v35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
616 paddsw_r2r (xmm3, xmm7); /* xmm7 = u35 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
617 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
618 movdqa_m2r (*(col+6*8), xmm3); /* xmm3 = x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
619 movdqa_r2r (xmm0, xmm6); /* xmm6 = v17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
620 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
621 pmulhw_r2r (xmm3, xmm2); /* xmm2 = T2*x6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
622 psubsw_r2r (xmm5, xmm0); /* xmm0 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
623 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
624 psubsw_r2r (xmm3, xmm4); /* xmm4 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
625 paddsw_r2r (xmm6, xmm5); /* xmm5 = v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
626 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
627 movdqa_r2m (xmm0, *(col+3*8)); /* save b3 in scratch0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
628 movdqa_r2r (xmm1, xmm6); /* xmm6 = u17 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
629 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
630 paddsw_m2r (*(col+2*8), xmm2); /* xmm2 = u26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
631 paddsw_r2r (xmm7, xmm6); /* xmm6 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
632 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
633 psubsw_r2r (xmm7, xmm1); /* xmm1 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
634 movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
635 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
636 movdqa_m2r (*(col+0*8), xmm3); /* xmm3 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
637 paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
638 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
639 movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
640 psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
641 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
642 movdqa_r2m (xmm6, *(col+5*8)); /* save b0 in scratch1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
643 pmulhw_r2r (xmm0, xmm1); /* xmm1 = b1/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
644 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
645 movdqa_r2r (xmm4, xmm6); /* xmm6 = v26 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
646 pmulhw_r2r (xmm0, xmm7); /* xmm7 = b2/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
647 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
648 movdqa_m2r (*(col+4*8), xmm5); /* xmm5 = x4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
649 movdqa_r2r (xmm3, xmm0); /* xmm0 = x0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
650 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
651 psubsw_r2r (xmm5, xmm3); /* xmm3 = v04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
652 paddsw_r2r (xmm5, xmm0); /* xmm0 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
653 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
654 paddsw_r2r (xmm3, xmm4); /* xmm4 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
655 movdqa_r2r (xmm0, xmm5); /* xmm5 = u04 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
656 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
657 psubsw_r2r (xmm6, xmm3); /* xmm3 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
658 paddsw_r2r (xmm2, xmm5); /* xmm5 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
659 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
660 paddsw_r2r (xmm1, xmm1); /* xmm1 = b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
661 psubsw_r2r (xmm2, xmm0); /* xmm0 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
662 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
663 paddsw_r2r (xmm7, xmm7); /* xmm7 = b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
664 movdqa_r2r (xmm3, xmm2); /* xmm2 = a2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
665 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
666 movdqa_r2r (xmm4, xmm6); /* xmm6 = a1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
667 paddsw_r2r (xmm7, xmm3); /* xmm3 = a2+b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
668 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
669 psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
670 paddsw_r2r (xmm1, xmm4); /* xmm4 = a1+b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
671 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
672 psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
673 psubsw_r2r (xmm1, xmm6); /* xmm6 = a1-b1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
674 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
675 movdqa_m2r (*(col+5*8), xmm1); /* xmm1 = b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
676 psubsw_r2r (xmm7, xmm2); /* xmm2 = a2-b2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
677 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
678 psraw_i2r (COL_SHIFT, xmm6); /* xmm6 = y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
679 movdqa_r2r (xmm5, xmm7); /* xmm7 = a0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
680 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
681 movdqa_r2m (xmm4, *(col+1*8)); /* save y1 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
682 psraw_i2r (COL_SHIFT, xmm2); /* xmm2 = y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
683 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
684 movdqa_r2m (xmm3, *(col+2*8)); /* save y2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
685 paddsw_r2r (xmm1, xmm5); /* xmm5 = a0+b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
686 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
687 movdqa_m2r (*(col+3*8), xmm4); /* xmm4 = b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
688 psubsw_r2r (xmm1, xmm7); /* xmm7 = a0-b0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
689 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
690 psraw_i2r (COL_SHIFT, xmm5); /* xmm5 = y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
691 movdqa_r2r (xmm0, xmm3); /* xmm3 = a3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
692 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
693 movdqa_r2m (xmm2, *(col+5*8)); /* save y5 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
694 psubsw_r2r (xmm4, xmm3); /* xmm3 = a3-b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
695 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
696 psraw_i2r (COL_SHIFT, xmm7); /* xmm7 = y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
697 paddsw_r2r (xmm0, xmm4); /* xmm4 = a3+b3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
698 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
699 movdqa_r2m (xmm5, *(col+0*8)); /* save y0 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
700 psraw_i2r (COL_SHIFT, xmm3); /* xmm3 = y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
701 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
702 movdqa_r2m (xmm6, *(col+6*8)); /* save y6 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
703 psraw_i2r (COL_SHIFT, xmm4); /* xmm4 = y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
704 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
705 movdqa_r2m (xmm7, *(col+7*8)); /* save y7 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
706 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
707 movdqa_r2m (xmm3, *(col+4*8)); /* save y4 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
708 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
709 movdqa_r2m (xmm4, *(col+3*8)); /* save y3 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
710 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
711 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
712 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
713 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
714 /* MMX column IDCT */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * idct_col: MMX column pass of the 8x8 inverse DCT, computed in place.
 *
 * col    - base of the 8x8 block of int16_t coefficients (8 values per row).
 * offset - index of the first column to process; declare_idct calls this
 *          with 0 and 4, so each call presumably covers four adjacent
 *          columns (one 64-bit MMX register of int16_t values).
 *
 * Row k of the column group is addressed as col+offset+k*8.  Inputs x0..x7
 * are read from rows 0..7 and outputs y0..y7 are written back to the same
 * rows, each shifted right by COL_SHIFT.  All arithmetic uses the saturating
 * 16-bit add/sub forms (paddsw/psubsw) and high-half multiplies (pmulhw).
 *
 * NOTE(review): the inline comments treat t3_vector as holding "T3-1" (x3 and
 * x5 are added back after the pmulhw) -- presumably because T3 does not fit a
 * signed 16-bit lane; confirm against the definition of T3.
 */
715 static inline void idct_col (int16_t * const col, const int offset) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
716 { |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
717 static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
718 static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
719 static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; |
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
720 static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; |
1 | 721 |
722 /* column code adapted from peter gubanov */ | |
723 /* http://www.elecard.com/peter/idct.shtml */ | |
724 | |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
/* Odd rows first: x1/x7 form u17,v17 and x3/x5 form u35,v35. */
725 movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ |
1 | 726 |
9852 | 727 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ |
728 movq_r2r (mm0, mm2); /* mm2 = T1 */ | |
1 | 729 |
9852 | 730 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ |
731 pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ | |
1 | 732 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
733 movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ |
9852 | 734 pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ |
1 | 735 |
9852 | 736 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ |
737 movq_r2r (mm5, mm7); /* mm7 = T3-1 */ | |
1 | 738 |
9852 | 739 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ |
740 psubsw_r2r (mm4, mm0); /* mm0 = v17 */ | |
1 | 741 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
742 movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ |
9852 | 743 pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ |
1 | 744 |
9852 | 745 paddsw_r2r (mm2, mm1); /* mm1 = u17 */ |
746 pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ | |
1 | 747 |
36 | 748 /* slot */ |
1 | 749 |
9852 | 750 movq_r2r (mm4, mm2); /* mm2 = T2 */ |
751 paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ | |
1 | 752 |
9852 | 753 pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ |
754 paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ | |
1 | 755 |
9852 | 756 psubsw_r2r (mm6, mm5); /* mm5 = v35 */ |
757 paddsw_r2r (mm3, mm7); /* mm7 = u35 */ | |
1 | 758 |
9852 | 759 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ |
760 movq_r2r (mm0, mm6); /* mm6 = v17 */ | |
1 | 761 |
9852 | 762 pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ |
763 psubsw_r2r (mm5, mm0); /* mm0 = b3 */ | |
1 | 764 |
9852 | 765 psubsw_r2r (mm3, mm4); /* mm4 = v26 */ |
766 paddsw_r2r (mm6, mm5); /* mm5 = v12 */ | |
1 | 767 |
/* x3 and x5 were already loaded above, so rows 3 and 5 are reused as
 * scratch slots (b3 here, b0 further down) until the final stores. */
9852 | 768 movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ |
769 movq_r2r (mm1, mm6); /* mm6 = u17 */ | |
1 | 770 |
9852 | 771 paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ |
772 paddsw_r2r (mm7, mm6); /* mm6 = b0 */ | |
1 | 773 |
9852 | 774 psubsw_r2r (mm7, mm1); /* mm1 = u12 */ |
775 movq_r2r (mm1, mm7); /* mm7 = u12 */ | |
1 | 776 |
9852 | 777 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ |
778 paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ | |
1 | 779 |
25997
78c7ef4854ff
Fix illegal identifiers, port of my patch to upstream libmpeg2.
diego
parents:
21526
diff
changeset
|
780 movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */ |
9852 | 781 psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ |
1 | 782 |
9852 | 783 movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ |
784 pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ | |
1 | 785 |
9852 | 786 movq_r2r (mm4, mm6); /* mm6 = v26 */ |
787 pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ | |
1 | 788 |
9852 | 789 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ |
790 movq_r2r (mm3, mm0); /* mm0 = x0 */ | |
1 | 791 |
9852 | 792 psubsw_r2r (mm5, mm3); /* mm3 = v04 */ |
793 paddsw_r2r (mm5, mm0); /* mm0 = u04 */ | |
1 | 794 |
9852 | 795 paddsw_r2r (mm3, mm4); /* mm4 = a1 */ |
796 movq_r2r (mm0, mm5); /* mm5 = u04 */ | |
1 | 797 |
9852 | 798 psubsw_r2r (mm6, mm3); /* mm3 = a2 */ |
799 paddsw_r2r (mm2, mm5); /* mm5 = a0 */ | |
1 | 800 |
9852 | 801 paddsw_r2r (mm1, mm1); /* mm1 = b1 */ |
802 psubsw_r2r (mm2, mm0); /* mm0 = a3 */ | |
1 | 803 |
9852 | 804 paddsw_r2r (mm7, mm7); /* mm7 = b2 */ |
805 movq_r2r (mm3, mm2); /* mm2 = a2 */ | |
1 | 806 |
9852 | 807 movq_r2r (mm4, mm6); /* mm6 = a1 */ |
808 paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ | |
1 | 809 |
/* Final butterflies: each output is (a_k + b_k) or (a_k - b_k), shifted
 * right by COL_SHIFT, interleaved with the stores back into the block. */
9852 | 810 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ |
811 paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ | |
1 | 812 |
9852 | 813 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ |
814 psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ | |
1 | 815 |
9852 | 816 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ |
817 psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ | |
1 | 818 |
9852 | 819 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ |
820 movq_r2r (mm5, mm7); /* mm7 = a0 */ | |
1 | 821 |
9852 | 822 movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ |
823 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ | |
1 | 824 |
9852 | 825 movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ |
826 paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ | |
1 | 827 |
9852 | 828 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ |
829 psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ | |
1 | 830 |
9852 | 831 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ |
832 movq_r2r (mm0, mm3); /* mm3 = a3 */ | |
1 | 833 |
9852 | 834 movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ |
835 psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ | |
1 | 836 |
9852 | 837 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ |
838 paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ | |
1 | 839 |
9852 | 840 movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ |
841 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ | |
1 | 842 |
9852 | 843 movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ |
844 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ | |
1 | 845 |
9852 | 846 movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ |
1 | 847 |
9852 | 848 movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ |
1 | 849 |
9852 | 850 movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ |
1 | 851 } |
852 | |
853 | |
/*
 * Per-row rounding constants for the MMX row pass.  declare_idct passes
 * rounderN to idct_row when it processes row N of the block.  Each array is
 * produced by the rounder() macro, which is defined outside this section.
 * rounder0 additionally carries the (1 << (COL_SHIFT - 1)) - 0.5 term --
 * presumably the rounding bias for the later COL_SHIFT right shift in the
 * column pass; confirm against the rounder() definition.
 */
9852 | 854 static const int32_t rounder0[] ATTR_ALIGN(8) = |
1 | 855 rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
9852 | 856 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); |
857 static const int32_t rounder1[] ATTR_ALIGN(8) = | |
36 | 858 rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
9852 | 859 static const int32_t rounder7[] ATTR_ALIGN(8) = |
36 | 860 rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
9852 | 861 static const int32_t rounder2[] ATTR_ALIGN(8) = |
36 | 862 rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
9852 | 863 static const int32_t rounder6[] ATTR_ALIGN(8) = |
36 | 864 rounder (-0.25); /* C2 * (C6-C2)/2 */ |
9852 | 865 static const int32_t rounder3[] ATTR_ALIGN(8) = |
36 | 866 rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
9852 | 867 static const int32_t rounder5[] ATTR_ALIGN(8) = |
36 | 868 rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
1 | 869 |
870 | |
/*
 * declare_idct: expands to a complete in-place 8x8 IDCT function
 *     static inline void idct (int16_t * const block)
 *
 * idct          - name of the function to generate.
 * table         - macro that builds a 16-byte-aligned int16_t coefficient
 *                 table from seven constants.
 * idct_row_head, idct_row, idct_row_mid, idct_row_tail
 *               - row-pass building blocks, supplied per instruction set.
 *
 * The generated function transforms rows in the order 0, 4, 1, 7, 2, 6, 3, 5.
 * Rows that share a table (04, 17, 26, 35) are processed back to back, and
 * each row gets its own rounderN constant.  idct_row_mid takes the offset of
 * the row just computed, the offset of the next row and the next row's
 * table.  The column pass then runs as two idct_col calls, at column
 * offsets 0 and 4.
 */
871 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ | |
9852 | 872 static inline void idct (int16_t * const block) \ |
1 | 873 { \ |
9852 | 874 static const int16_t table04[] ATTR_ALIGN(16) = \ |
1 | 875 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ |
9852 | 876 static const int16_t table17[] ATTR_ALIGN(16) = \ |
1 | 877 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ |
9852 | 878 static const int16_t table26[] ATTR_ALIGN(16) = \ |
1 | 879 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ |
9852 | 880 static const int16_t table35[] ATTR_ALIGN(16) = \ |
1 | 881 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ |
882 \ | |
883 idct_row_head (block, 0*8, table04); \ | |
884 idct_row (table04, rounder0); \ | |
885 idct_row_mid (block, 0*8, 4*8, table04); \ | |
886 idct_row (table04, rounder4); \ | |
887 idct_row_mid (block, 4*8, 1*8, table17); \ | |
888 idct_row (table17, rounder1); \ | |
889 idct_row_mid (block, 1*8, 7*8, table17); \ | |
890 idct_row (table17, rounder7); \ | |
891 idct_row_mid (block, 7*8, 2*8, table26); \ | |
892 idct_row (table26, rounder2); \ | |
893 idct_row_mid (block, 2*8, 6*8, table26); \ | |
894 idct_row (table26, rounder6); \ | |
895 idct_row_mid (block, 6*8, 3*8, table35); \ | |
896 idct_row (table35, rounder3); \ | |
897 idct_row_mid (block, 3*8, 5*8, table35); \ | |
898 idct_row (table35, rounder5); \ | |
899 idct_row_tail (block, 5*8); \ | |
900 \ | |
901 idct_col (block, 0); \ | |
902 idct_col (block, 4); \ | |
903 } | |
904 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_idct: SSE2 8x8 inverse DCT of `block` (row pass here, column pass
 * delegated to sse2_idct_col).
 *
 * block - 64 int16_t coefficients, 8 per row.  Rows are accessed with movdqa,
 *         so the buffer is presumably required to be 16-byte aligned.
 *
 * The row pass handles rows in pairs that share a coefficient table:
 * (0,4), (1,7), (2,6), (3,5).  Each pair goes through SSE2_IDCT_2ROW with its
 * table and the two matching rounderN_128 constants.
 *
 * Where the row results end up depends on the target:
 *   - x86_64: row k is loaded into xmm(8+k) and left there; nothing is
 *     stored back to `block` in this function.
 *   - otherwise: each pair is processed in xmm0/xmm4 and written back to
 *     `block` before the next pair is loaded.
 */
905 static inline void sse2_idct (int16_t * const block) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
906 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
907 static const int16_t table04[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
908 sse2_table (22725, 21407, 19266, 16384, 12873, 8867, 4520); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
909 static const int16_t table17[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
910 sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
911 static const int16_t table26[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
912 sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
913 static const int16_t table35[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
914 sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
915 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* 128-bit counterparts of the file-scope rounder0..rounder7 constants. */
916 static const int32_t rounder0_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
917 rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
918 static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
919 static const int32_t rounder1_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
920 rounder_sse2 (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
921 static const int32_t rounder7_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
922 rounder_sse2 (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
923 static const int32_t rounder2_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
924 rounder_sse2 (0.60355339059); /* C2 * (C6+C2)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
925 static const int32_t rounder6_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
926 rounder_sse2 (-0.25); /* C2 * (C6-C2)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
927 static const int32_t rounder3_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
928 rounder_sse2 (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
929 static const int32_t rounder5_128[] ATTR_ALIGN(16) = |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
930 rounder_sse2 (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
931 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* x86_64: keep all eight rows resident in xmm8..xmm15 (row k in xmm(8+k)). */
932 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
933 movdqa_m2r (block[0*8], xmm8); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
934 movdqa_m2r (block[4*8], xmm12); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
935 SSE2_IDCT_2ROW (table04, xmm8, xmm12, *rounder0_128, *rounder4_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
936 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
937 movdqa_m2r (block[1*8], xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
938 movdqa_m2r (block[7*8], xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
939 SSE2_IDCT_2ROW (table17, xmm9, xmm15, *rounder1_128, *rounder7_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
940 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
941 movdqa_m2r (block[2*8], xmm10); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
942 movdqa_m2r (block[6*8], xmm14); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
943 SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
944 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
945 movdqa_m2r (block[3*8], xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
946 movdqa_m2r (block[5*8], xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
947 SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
948 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
949 /* OUTPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
950 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* Other targets: reuse xmm0/xmm4 per pair and write results back to block. */
951 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
952 movdqa_m2r (block[0*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
953 movdqa_m2r (block[4*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
954 SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
955 movdqa_r2m (xmm0, block[0*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
956 movdqa_r2m (xmm4, block[4*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
957 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
958 movdqa_m2r (block[1*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
959 movdqa_m2r (block[7*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
960 SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
961 movdqa_r2m (xmm0, block[1*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
962 movdqa_r2m (xmm4, block[7*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
963 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
964 movdqa_m2r (block[2*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
965 movdqa_m2r (block[6*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
966 SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
967 movdqa_r2m (xmm0, block[2*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
968 movdqa_r2m (xmm4, block[6*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
969 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
970 movdqa_m2r (block[3*8], xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
971 movdqa_m2r (block[5*8], xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
972 SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
973 movdqa_r2m (xmm0, block[3*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
974 movdqa_r2m (xmm4, block[5*8]); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
975 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
976 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* Column pass; defined outside this section. */
977 sse2_idct_col (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
978 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
979 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_block_copy: convert an 8x8 block of int16_t samples to bytes and
 * write it to an 8-row destination.
 *
 * block  - source samples, 8 per row; read only in the non-x86_64 branch
 *          (movdqa, so presumably 16-byte aligned).
 * dest   - first destination row; 8 bytes are stored per row.
 * stride - distance in bytes between consecutive destination rows.
 *
 * Each row is packed with packuswb and the low 8 bytes of the register are
 * stored to dest + row*stride.  packuswb is the unsigned-saturating pack, so
 * values are presumably clamped to 0..255.
 *
 * NOTE(review): on x86_64 the samples are not read from `block` at all; the
 * code expects rows 0..7 to already be in xmm8..xmm15.  Nothing visible here
 * guarantees those registers survive from the preceding IDCT -- confirm
 * against the callers and the clobber lists of the asm macros.
 */
980 static void sse2_block_copy (int16_t * const block, uint8_t * dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
981 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
982 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
983 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
984 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
985 packuswb_r2r (xmm8, xmm8); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
986 packuswb_r2r (xmm9, xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
987 movq_r2m (xmm8, *(dest+0*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
988 packuswb_r2r (xmm10, xmm10); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
989 movq_r2m (xmm9, *(dest+1*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
990 packuswb_r2r (xmm11, xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
991 movq_r2m (xmm10, *(dest+2*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
992 packuswb_r2r (xmm12, xmm12); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
993 movq_r2m (xmm11, *(dest+3*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
994 packuswb_r2r (xmm13, xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
995 movq_r2m (xmm12, *(dest+4*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
996 packuswb_r2r (xmm14, xmm14); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
997 movq_r2m (xmm13, *(dest+5*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
998 packuswb_r2r (xmm15, xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
999 movq_r2m (xmm14, *(dest+6*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1000 movq_r2m (xmm15, *(dest+7*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* Other targets: load rows from block into xmm0..xmm7; loads, packs and
 * stores are interleaved. */
1001 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1002 movdqa_m2r (*(block+0*8), xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1003 movdqa_m2r (*(block+1*8), xmm1); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1004 movdqa_m2r (*(block+2*8), xmm2); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1005 packuswb_r2r (xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1006 movdqa_m2r (*(block+3*8), xmm3); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1007 packuswb_r2r (xmm1, xmm1); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1008 movdqa_m2r (*(block+4*8), xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1009 packuswb_r2r (xmm2, xmm2); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1010 movdqa_m2r (*(block+5*8), xmm5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1011 packuswb_r2r (xmm3, xmm3); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1012 movdqa_m2r (*(block+6*8), xmm6); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1013 packuswb_r2r (xmm4, xmm4); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1014 movdqa_m2r (*(block+7*8), xmm7); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1015 movq_r2m (xmm0, *(dest+0*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1016 packuswb_r2r (xmm5, xmm5); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1017 movq_r2m (xmm1, *(dest+1*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1018 packuswb_r2r (xmm6, xmm6); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1019 movq_r2m (xmm2, *(dest+2*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1020 packuswb_r2r (xmm7, xmm7); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1021 movq_r2m (xmm3, *(dest+3*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1022 movq_r2m (xmm4, *(dest+4*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1023 movq_r2m (xmm5, *(dest+5*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1024 movq_r2m (xmm6, *(dest+6*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1025 movq_r2m (xmm7, *(dest+7*stride)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1026 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1027 } |
1 | 1028 |
/*
 * COPY_MMX: one pipelined step of block_copy.
 *
 * offset - index into `block` of the row to load (row*8).
 * r0, r1 - MMX registers that receive the two halves of that row
 *          (block+offset and block+offset+4); they are packed into r0.
 * r2     - register holding the previously packed row, stored to *dest
 *          after dest has been advanced by stride.
 *
 * Uses and modifies the `block`, `dest` and `stride` names of the expansion
 * site; dest is advanced by one row per invocation.
 */
1029 #define COPY_MMX(offset,r0,r1,r2) \ | |
1030 do { \ | |
1031 movq_m2r (*(block+offset), r0); \ | |
1032 dest += stride; \ | |
1033 movq_m2r (*(block+offset+4), r1); \ | |
1034 movq_r2m (r2, *dest); \ | |
1035 packuswb_r2r (r1, r0); \ | |
1036 } while (0) | |
1037 | |
/*
 * block_copy: MMX version of the block store -- packs an 8x8 block of
 * int16_t samples to bytes (packuswb, presumably clamping to 0..255) and
 * writes 8 bytes per row to dest.
 *
 * block  - source samples, 8 per row, read as two 4-sample halves per row.
 * dest   - first destination row.
 * stride - distance in bytes between consecutive destination rows.
 *
 * Rows 0 and 1 are loaded and packed up front and row 0 is stored.  Each
 * COPY_MMX step then loads and packs the next row while storing the one
 * packed before it, alternating between the mm0/mm1 and mm2/mm3 register
 * pairs.  The local `dest` ends up pointing at row 6, so the final store to
 * dest+stride writes row 7.
 */
9852 | 1038 static inline void block_copy (int16_t * const block, uint8_t * dest, |
1039 const int stride) | |
1 | 1040 { |
1041 movq_m2r (*(block+0*8), mm0); | |
1042 movq_m2r (*(block+0*8+4), mm1); | |
1043 movq_m2r (*(block+1*8), mm2); | |
1044 packuswb_r2r (mm1, mm0); | |
1045 movq_m2r (*(block+1*8+4), mm3); | |
1046 movq_r2m (mm0, *dest); | |
1047 packuswb_r2r (mm3, mm2); | |
1048 COPY_MMX (2*8, mm0, mm1, mm2); | |
1049 COPY_MMX (3*8, mm2, mm3, mm0); | |
1050 COPY_MMX (4*8, mm0, mm1, mm2); | |
1051 COPY_MMX (5*8, mm2, mm3, mm0); | |
1052 COPY_MMX (6*8, mm0, mm1, mm2); | |
1053 COPY_MMX (7*8, mm2, mm3, mm0); | |
1054 movq_r2m (mm2, *(dest+stride)); | |
1055 } | |
1056 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * ADD_SSE2_2ROW: add two rows of int16_t residual to two rows of destination
 * bytes and store the result back in place.
 *
 * op     - operand-form suffix pasted onto paddsw_ (r2r when block0/block1
 *          are registers, m2r when they are memory operands).
 * block0 - residual for the row at dest.
 * block1 - residual for the row at dest+stride.
 *
 * Steps: load 8 bytes from each destination row into xmm1/xmm2, interleave
 * them with xmm0 (punpcklbw), add the residuals with paddsw, pack back with
 * packuswb and store 8 bytes per row.  xmm0 must be zero on entry for the
 * interleave to widen bytes to 16 bits; sse2_block_add clears it with pxor
 * before the first use.
 *
 * Uses the `dest` and `stride` names of the expansion site and advances dest
 * by 2*stride.  Clobbers xmm1 and xmm2.
 */
1057 #define ADD_SSE2_2ROW(op, block0, block1)\ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1058 do { \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1059 movq_m2r (*(dest), xmm1); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1060 movq_m2r (*(dest+stride), xmm2); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1061 punpcklbw_r2r (xmm0, xmm1); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1062 punpcklbw_r2r (xmm0, xmm2); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1063 paddsw_##op (block0, xmm1); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1064 paddsw_##op (block1, xmm2); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1065 packuswb_r2r (xmm1, xmm1); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1066 packuswb_r2r (xmm2, xmm2); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1067 movq_r2m (xmm1, *(dest)); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1068 movq_r2m (xmm2, *(dest+stride)); \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1069 dest += 2*stride; \ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1070 } while (0) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1071 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_block_add: add an 8x8 block of int16_t residuals to the 8x8 byte
 * region at dest, in place, two rows per ADD_SSE2_2ROW step.
 *
 * block  - residual samples, 8 per row; read only in the non-x86_64 branch.
 * dest   - first destination row (8 bytes are read and rewritten per row).
 * stride - distance in bytes between consecutive destination rows.
 *
 * xmm0 is cleared first because ADD_SSE2_2ROW uses it as the zero operand
 * when widening destination bytes to 16 bits.
 *
 * NOTE(review): on x86_64 the residuals are taken from xmm8..xmm15 rather
 * than from `block`; this relies on those registers still holding the IDCT
 * output -- confirm against the callers and the asm macro clobber lists.
 */
1072 static void sse2_block_add (int16_t * const block, uint8_t * dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1073 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1074 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1075 pxor_r2r(xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1076 #if defined(__x86_64__) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1077 /* INPUT: block in xmm8 ... xmm15 */ |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1078 ADD_SSE2_2ROW(r2r, xmm8, xmm9); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1079 ADD_SSE2_2ROW(r2r, xmm10, xmm11); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1080 ADD_SSE2_2ROW(r2r, xmm12, xmm13); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1081 ADD_SSE2_2ROW(r2r, xmm14, xmm15); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/* Other targets: residual rows are memory operands read from block. */
1082 #else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1083 ADD_SSE2_2ROW(m2r, *(block+0*8), *(block+1*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1084 ADD_SSE2_2ROW(m2r, *(block+2*8), *(block+3*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1085 ADD_SSE2_2ROW(m2r, *(block+4*8), *(block+5*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1086 ADD_SSE2_2ROW(m2r, *(block+6*8), *(block+7*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1087 #endif |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1088 } |
1 | 1089 |
/*
 * ADD_MMX: one pipelined step of block_add.
 *
 * offset - index into `block` of the residual row to add (row*8).
 * r1, r2 - registers that receive the low and high halves of the next
 *          destination row (loaded from dest+2*stride before dest advances),
 *          widened against mm0 and summed with block+offset and
 *          block+offset+4.
 * r3, r4 - registers holding the previous row's two sums; they are packed
 *          into r3 and stored to *dest after dest has advanced by stride.
 *
 * Requires mm0 to be zero (block_add clears it with pxor).  Uses and
 * modifies the `block`, `dest` and `stride` names of the expansion site;
 * dest is advanced by one row per invocation.
 */
1090 #define ADD_MMX(offset,r1,r2,r3,r4) \ | |
1091 do { \ | |
1092 movq_m2r (*(dest+2*stride), r1); \ | |
1093 packuswb_r2r (r4, r3); \ | |
1094 movq_r2r (r1, r2); \ | |
1095 dest += stride; \ | |
1096 movq_r2m (r3, *dest); \ | |
1097 punpcklbw_r2r (mm0, r1); \ | |
1098 paddsw_m2r (*(block+offset), r1); \ | |
1099 punpckhbw_r2r (mm0, r2); \ | |
1100 paddsw_m2r (*(block+offset+4), r2); \ | |
1101 } while (0) | |
1102 | |
/*
 * block_add: add the coefficient rows in `block` to eight rows of `dest`.
 *
 * block  - coefficients, read at offsets 0*8 .. 7*8 (+4 for the high half);
 *          not modified here.
 * dest   - first destination row; the local pointer is advanced by `stride`
 *          inside each ADD_MMX expansion.
 * stride - distance between consecutive destination rows.
 *
 * The code is software-pipelined: the prologue handles rows 0 and 1, each
 * ADD_MMX stores the previously computed row while loading the next one, and
 * the epilogue packs and stores the last row. Statement order matters.
 *
 * Assumes the helper macros emit the like-named MMX instructions (paddsw =
 * signed saturating add, packuswb = pack with unsigned saturation) -- TODO
 * confirm against the helper header.
 */
9852 | 1103 static inline void block_add (int16_t * const block, uint8_t * dest, |
1104 const int stride) | |
1 | 1105 { |
/* Prologue: mm0 is cleared and used as the unpack operand throughout. */
1106 movq_m2r (*dest, mm1); | |
1107 pxor_r2r (mm0, mm0); | |
1108 movq_m2r (*(dest+stride), mm3); | |
1109 movq_r2r (mm1, mm2); | |
1110 punpcklbw_r2r (mm0, mm1); | |
1111 movq_r2r (mm3, mm4); | |
1112 paddsw_m2r (*(block+0*8), mm1); | |
1113 punpckhbw_r2r (mm0, mm2); | |
1114 paddsw_m2r (*(block+0*8+4), mm2); | |
1115 punpcklbw_r2r (mm0, mm3); | |
1116 paddsw_m2r (*(block+1*8), mm3); | |
1117 packuswb_r2r (mm2, mm1); | |
1118 punpckhbw_r2r (mm0, mm4); | |
/* Row 0 is written back here; row 1 stays pending in mm3/mm4. */
1119 movq_r2m (mm1, *dest); | |
1120 paddsw_m2r (*(block+1*8+4), mm4); | |
/* Rows 2..7: the register pairs alternate so each step stores the prior row. */
1121 ADD_MMX (2*8, mm1, mm2, mm3, mm4); | |
1122 ADD_MMX (3*8, mm3, mm4, mm1, mm2); | |
1123 ADD_MMX (4*8, mm1, mm2, mm3, mm4); | |
1124 ADD_MMX (5*8, mm3, mm4, mm1, mm2); | |
1125 ADD_MMX (6*8, mm1, mm2, mm3, mm4); | |
1126 ADD_MMX (7*8, mm3, mm4, mm1, mm2); | |
/* Epilogue: dest was advanced six times, so dest+stride is the eighth row. */
1127 packuswb_r2r (mm4, mm3); | |
1128 movq_r2m (mm3, *(dest+stride)); | |
1129 } | |
1130 | |
1131 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * sse2_block_zero: clear the coefficient block through xmm0.
 *
 * xmm0 is XOR-ed with itself and then stored eight times, at block+0*8
 * through block+7*8 (int16_t element offsets).
 *
 * Assuming movdqa_r2m emits the aligned 16-byte MOVDQA store, this clears
 * 64 int16_t coefficients and requires `block` to be 16-byte aligned
 * -- TODO confirm against the helper header and the callers.
 */
1132 static inline void sse2_block_zero (int16_t * const block) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1133 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1134 pxor_r2r (xmm0, xmm0); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1135 movdqa_r2m (xmm0, *(block+0*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1136 movdqa_r2m (xmm0, *(block+1*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1137 movdqa_r2m (xmm0, *(block+2*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1138 movdqa_r2m (xmm0, *(block+3*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1139 movdqa_r2m (xmm0, *(block+4*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1140 movdqa_r2m (xmm0, *(block+5*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1141 movdqa_r2m (xmm0, *(block+6*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1142 movdqa_r2m (xmm0, *(block+7*8)); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1143 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1144 |
/*
 * block_zero: clear the coefficient block through mm0.
 *
 * mm0 is XOR-ed with itself and then stored sixteen times, at block+0*4
 * through block+15*4 (int16_t element offsets).
 *
 * Assuming movq_r2m emits the 8-byte MOVQ store, this clears 64 int16_t
 * coefficients -- TODO confirm against the helper header.
 */
9852 | 1145 static inline void block_zero (int16_t * const block) |
1146 { | |
1147 pxor_r2r (mm0, mm0); | |
1148 movq_r2m (mm0, *(block+0*4)); | |
1149 movq_r2m (mm0, *(block+1*4)); | |
1150 movq_r2m (mm0, *(block+2*4)); | |
1151 movq_r2m (mm0, *(block+3*4)); | |
1152 movq_r2m (mm0, *(block+4*4)); | |
1153 movq_r2m (mm0, *(block+5*4)); | |
1154 movq_r2m (mm0, *(block+6*4)); | |
1155 movq_r2m (mm0, *(block+7*4)); | |
1156 movq_r2m (mm0, *(block+8*4)); | |
1157 movq_r2m (mm0, *(block+9*4)); | |
1158 movq_r2m (mm0, *(block+10*4)); | |
1159 movq_r2m (mm0, *(block+11*4)); | |
1160 movq_r2m (mm0, *(block+12*4)); | |
1161 movq_r2m (mm0, *(block+13*4)); | |
1162 movq_r2m (mm0, *(block+14*4)); | |
1163 movq_r2m (mm0, *(block+15*4)); | |
1164 } | |
1165 | |
1166 | |
/*
 * Values for the `cpu` argument of block_add_DC(); dup4() compares `cpu`
 * against CPU_MMXEXT to choose between its two broadcast sequences.
 */
1167 #define CPU_MMXEXT 0 | |
1168 #define CPU_MMX 1 | |
1169 | |
/*
 * dup4(reg): rearrange `reg` in place, choosing the sequence from `cpu`.
 *
 * `cpu` is not a macro parameter; it must be in scope where dup4 is expanded.
 *   - cpu != CPU_MMXEXT: punpcklwd reg,reg followed by punpckldq reg,reg.
 *   - cpu == CPU_MMXEXT: a single pshufw reg,reg with selector 0x00.
 *
 * Presumably both paths replicate the low 16-bit word of `reg` into all four
 * words -- TODO confirm against the instruction reference.
 */
1170 #define dup4(reg) \ | |
1171 do { \ | |
1172 if (cpu != CPU_MMXEXT) { \ | |
1173 punpcklwd_r2r (reg, reg); \ | |
1174 punpckldq_r2r (reg, reg); \ | |
1175 } else \ | |
1176 pshufw_r2r (reg, reg, 0x00); \ | |
1177 } while (0) | |
1178 | |
/*
 * block_add_DC: apply a single offset derived from block[0] to eight rows
 * of `dest`, then clear block[0] and block[63].
 *
 * block  - only block[0] is read; block[0] and block[63] are set to 0.
 *          Why block[63] is cleared is not visible in this part of the file.
 * dest   - first destination row; the local pointer is advanced by `stride`.
 * stride - distance between consecutive destination rows.
 * cpu    - CPU_MMX or CPU_MMXEXT; only consulted by dup4().
 *
 * The offset is (block[0] + 64) >> 7. mm0 receives it and mm1 receives
 * 0 - mm0; both are packed with packuswb. Every row then gets paddusb mm0
 * followed by psubusb mm1.
 *
 * Assuming the helpers emit the like-named MMX instructions, one of mm0/mm1
 * packs to zero depending on the sign of the offset, so the add/sub pair
 * applies a signed offset with unsigned saturation -- TODO confirm.
 *
 * Loads for the next rows are interleaved with stores of finished rows, so
 * statement order matters.
 */
1179 static inline void block_add_DC (int16_t * const block, uint8_t * dest, | |
1180 const int stride, const int cpu) | |
1181 { | |
/* mm0 <- (block[0] + 64) >> 7, then set up mm0 / mm1 as described above. */
12932 | 1182 movd_v2r ((block[0] + 64) >> 7, mm0); |
9852 | 1183 pxor_r2r (mm1, mm1); |
1184 movq_m2r (*dest, mm2); | |
1185 dup4 (mm0); | |
1186 psubsw_r2r (mm0, mm1); | |
1187 packuswb_r2r (mm0, mm0); | |
1188 paddusb_r2r (mm0, mm2); | |
1189 packuswb_r2r (mm1, mm1); | |
1190 movq_m2r (*(dest + stride), mm3); | |
1191 psubusb_r2r (mm1, mm2); | |
1192 block[0] = 0; | |
1193 paddusb_r2r (mm0, mm3); | |
/* First row written back; rows alternate between mm2 and mm3 from here on. */
1194 movq_r2m (mm2, *dest); | |
1195 psubusb_r2r (mm1, mm3); | |
1196 movq_m2r (*(dest + 2*stride), mm2); | |
1197 dest += stride; | |
1198 movq_r2m (mm3, *dest); | |
1199 paddusb_r2r (mm0, mm2); | |
1200 movq_m2r (*(dest + 2*stride), mm3); | |
1201 psubusb_r2r (mm1, mm2); | |
1202 dest += stride; | |
1203 paddusb_r2r (mm0, mm3); | |
1204 movq_r2m (mm2, *dest); | |
1205 psubusb_r2r (mm1, mm3); | |
1206 movq_m2r (*(dest + 2*stride), mm2); | |
1207 dest += stride; | |
1208 movq_r2m (mm3, *dest); | |
1209 paddusb_r2r (mm0, mm2); | |
1210 movq_m2r (*(dest + 2*stride), mm3); | |
1211 psubusb_r2r (mm1, mm2); | |
1212 dest += stride; | |
1213 paddusb_r2r (mm0, mm3); | |
1214 movq_r2m (mm2, *dest); | |
1215 psubusb_r2r (mm1, mm3); | |
1216 movq_m2r (*(dest + 2*stride), mm2); | |
1217 dest += stride; | |
1218 movq_r2m (mm3, *dest); | |
1219 paddusb_r2r (mm0, mm2); | |
1220 movq_m2r (*(dest + 2*stride), mm3); | |
1221 psubusb_r2r (mm1, mm2); | |
1222 block[63] = 0; | |
1223 paddusb_r2r (mm0, mm3); | |
/* dest was advanced five times; the last two rows are stored relative to it. */
1224 movq_r2m (mm2, *(dest + stride)); | |
1225 psubusb_r2r (mm1, mm3); | |
1226 movq_r2m (mm3, *(dest + 2*stride)); | |
1227 } | |
1228 | |
26393
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * mpeg2_idct_copy_sse2: SSE2 "copy" entry point.
 *
 * Runs sse2_idct() on `block`, passes the result to sse2_block_copy() with
 * `dest` and `stride`, then clears `block` with sse2_block_zero().
 *
 * block  - coefficient block; transformed in place and zeroed on return.
 * dest   - destination handed to sse2_block_copy().
 * stride - destination row distance handed to sse2_block_copy().
 */
1229 void mpeg2_idct_copy_sse2 (int16_t * const block, uint8_t * const dest, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1230 const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1231 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1232 sse2_idct (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1233 sse2_block_copy (block, dest, stride); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1234 sse2_block_zero (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1235 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1236 |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
/*
 * mpeg2_idct_add_sse2: SSE2 "add" entry point.
 *
 * Full path, taken when last != 129 or when bits 4..6 of block[0] equal
 * binary 100: sse2_idct(), sse2_block_add(), then sse2_block_zero().
 * Otherwise the shortcut block_add_DC() is used, which only reads block[0].
 * It is called with CPU_MMXEXT, i.e. the pshufw branch of dup4().
 *
 * NOTE(review): `last == 129` presumably marks a block whose only non-zero
 * coefficient is the DC term, and the bit test presumably excludes values
 * where the shortcut would round differently from the full transform
 * -- confirm against the caller.
 *
 * last   - selector supplied by the caller (see note above).
 * block  - coefficient block; zeroed (full path) or has block[0] and
 *          block[63] cleared (DC path) on return.
 * dest   - destination rows that the result is added to.
 * stride - distance between consecutive destination rows.
 */
1237 void mpeg2_idct_add_sse2 (const int last, int16_t * const block, |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1238 uint8_t * const dest, const int stride) |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1239 { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1240 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1241 sse2_idct (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1242 sse2_block_add (block, dest, stride); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1243 sse2_block_zero (block); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1244 } else |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1245 block_add_DC (block, dest, stride, CPU_MMXEXT); |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1246 } |
2506f1b0bdbe
Backport SSE2-optimized IDCT routines from upstream libmpeg2.
diego
parents:
25998
diff
changeset
|
1247 |
9852 | 1248 |
/*
 * Instantiate the MMXEXT IDCT from the mmxext_* table and row helpers.
 * declare_idct is defined earlier in the file; it presumably expands to the
 * definition of mmxext_idct(), which the mmxext entry points below call
 * -- TODO confirm against the macro definition.
 */
1 | 1249 declare_idct (mmxext_idct, mmxext_table, |
1250 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) | |
1251 | |
/*
 * mpeg2_idct_copy_mmxext: MMXEXT "copy" entry point.
 *
 * Runs mmxext_idct() on `block`, passes the result to block_copy() with
 * `dest` and `stride`, then clears `block` with block_zero().
 *
 * block  - coefficient block; transformed in place and zeroed on return.
 * dest   - destination handed to block_copy().
 * stride - destination row distance handed to block_copy().
 */
9852 | 1252 void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest, |
1253 const int stride) | |
1 | 1254 { |
1255 mmxext_idct (block); | |
1256 block_copy (block, dest, stride); | |
9852 | 1257 block_zero (block); |
1 | 1258 } |
1259 | |
/*
 * mpeg2_idct_add_mmxext: MMXEXT "add" entry point.
 *
 * Full path, taken when last != 129 or when bits 4..6 of block[0] equal
 * binary 100: mmxext_idct(), block_add(), then block_zero().
 * Otherwise the shortcut block_add_DC() is used with CPU_MMXEXT; it only
 * reads block[0].
 *
 * NOTE(review): `last == 129` presumably marks a DC-only block, and the bit
 * test presumably guards a rounding corner case of the shortcut -- confirm
 * against the caller.
 *
 * last   - selector supplied by the caller (see note above).
 * block  - coefficient block; zeroed (full path) or has block[0] and
 *          block[63] cleared (DC path) on return.
 * dest   - destination rows that the result is added to.
 * stride - distance between consecutive destination rows.
 */
9852 | 1260 void mpeg2_idct_add_mmxext (const int last, int16_t * const block, |
1261 uint8_t * const dest, const int stride) | |
1 | 1262 { |
12932 | 1263 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
9852 | 1264 mmxext_idct (block); |
1265 block_add (block, dest, stride); | |
1266 block_zero (block); | |
1267 } else | |
1268 block_add_DC (block, dest, stride, CPU_MMXEXT); | |
1 | 1269 } |
1270 | |
1271 | |
/*
 * Instantiate the plain-MMX IDCT from the mmx_* table and row helpers.
 * declare_idct is defined earlier in the file; it presumably expands to the
 * definition of mmx_idct(), which the mmx entry points below call
 * -- TODO confirm against the macro definition.
 */
1272 declare_idct (mmx_idct, mmx_table, | |
1273 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) | |
1274 | |
/*
 * mpeg2_idct_copy_mmx: plain-MMX "copy" entry point.
 *
 * Runs mmx_idct() on `block`, passes the result to block_copy() with `dest`
 * and `stride`, then clears `block` with block_zero().
 *
 * block  - coefficient block; transformed in place and zeroed on return.
 * dest   - destination handed to block_copy().
 * stride - destination row distance handed to block_copy().
 */
9852 | 1275 void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest, |
1276 const int stride) | |
1 | 1277 { |
1278 mmx_idct (block); | |
1279 block_copy (block, dest, stride); | |
9852 | 1280 block_zero (block); |
1 | 1281 } |
1282 | |
/*
 * mpeg2_idct_add_mmx: plain-MMX "add" entry point.
 *
 * Full path, taken when last != 129 or when bits 4..6 of block[0] equal
 * binary 100: mmx_idct(), block_add(), then block_zero().
 * Otherwise the shortcut block_add_DC() is used with CPU_MMX, i.e. the
 * punpcklwd/punpckldq branch of dup4(); it only reads block[0].
 *
 * NOTE(review): `last == 129` presumably marks a DC-only block, and the bit
 * test presumably guards a rounding corner case of the shortcut -- confirm
 * against the caller.
 *
 * last   - selector supplied by the caller (see note above).
 * block  - coefficient block; zeroed (full path) or has block[0] and
 *          block[63] cleared (DC path) on return.
 * dest   - destination rows that the result is added to.
 * stride - distance between consecutive destination rows.
 */
9852 | 1283 void mpeg2_idct_add_mmx (const int last, int16_t * const block, |
1284 uint8_t * const dest, const int stride) | |
1 | 1285 { |
12932 | 1286 if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) { |
9852 | 1287 mmx_idct (block); |
1288 block_add (block, dest, stride); | |
1289 block_zero (block); | |
1290 } else | |
1291 block_add_DC (block, dest, stride, CPU_MMX); | |
1 | 1292 } |
1293 | |
1294 | |
/*
 * mpeg2_idct_mmx_init: rewrite the global scan tables mpeg2_scan_norm and
 * mpeg2_scan_alt in place for the input order used by the MMX/MMXEXT IDCT.
 *
 * Each of the 64 entries j is replaced by a value that keeps bits 3..5
 * (j & 0x38) and rotates the low three bits right by one position: bits 2..1
 * move to bits 1..0 and bit 0 moves to bit 2.
 *
 * NOTE(review): the remap mutates shared tables and is not its own inverse,
 * so a second call would permute the already-permuted entries. Presumably
 * the caller guarantees a single invocation -- confirm.
 */
9852 | 1295 void mpeg2_idct_mmx_init (void) |
1 | 1296 { |
9852 | 1297 extern uint8_t mpeg2_scan_norm[64]; |
1298 extern uint8_t mpeg2_scan_alt[64]; | |
1 | 1299 int i, j; |
1300 | |
36 | 1301 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ |
1 | 1302 |
1303 for (i = 0; i < 64; i++) { | |
9852 | 1304 j = mpeg2_scan_norm[i]; |
1305 mpeg2_scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |
1306 j = mpeg2_scan_alt[i]; | |
1307 mpeg2_scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |
1 | 1308 } |
1309 } | |
1310 | |
1311 #endif |