libavcodec: comparison x86/idct_mmx.c @ 12296:34d95bdfd38c

Translate libmpeg2 MMX IDCT to plain asm
author vitor
date Thu, 29 Jul 2010 22:19:54 +0000
parents 7be32921237f
children be9129c5503e
comparison
12295:bb7b7602b40e → 12296:34d95bdfd38c
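The pattern applied throughout this change is mechanical: every sequence of mmx.h helper macros (movq_m2r, pmaddwd_r2r, psrad_i2r, ...) becomes one __asm__ volatile block, with the pointers the macros used to dereference passed in through "r" input constraints. As a minimal sketch of the two styles (the copy8 helpers below are illustrations only, not part of the file):

    #include <stdint.h>

    /* old style: mmx.h macros, one C statement per instruction */
    #if 0
    static inline void copy8_macros(const int16_t *src, int16_t *dst)
    {
        movq_m2r (*src, mm0);          /* mm0 = src[3..0] */
        movq_r2m (mm0, *dst);          /* dst[3..0] = mm0 */
    }
    #endif

    /* new style: one extended-asm block, operands via "r" constraints */
    static inline void copy8_asm(const int16_t *src, int16_t *dst)
    {
        __asm__ volatile (
            "movq (%0), %%mm0 \n\t"    /* mm0 = src[3..0] */
            "movq %%mm0, (%1) \n\t"    /* dst[3..0] = mm0 */
            :: "r" (src), "r" (dst)
            : "memory");
    }

The "memory" clobber in the sketch is just defensive; the blocks in this change do not declare clobbers.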
21 */ 21 */
22 22
23 #include "libavutil/common.h" 23 #include "libavutil/common.h"
24 #include "libavcodec/dsputil.h" 24 #include "libavcodec/dsputil.h"
25 25
26 #include "libavutil/x86_cpu.h"
26 #include "dsputil_mmx.h" 27 #include "dsputil_mmx.h"
27 #include "mmx.h"
28 28
29 #define ROW_SHIFT 11 29 #define ROW_SHIFT 11
30 #define COL_SHIFT 6 30 #define COL_SHIFT 6
31 31
32 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) 32 #define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
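A quick worked example of the rounding macro above (which this commit leaves untouched): with ROW_SHIFT = 11, round(bias) = (int)((bias + 0.5) * 2048) = bias*2048 + 1024, i.e. the bias scaled to row fixed point plus the half unit that makes the later "psrad $11" round to nearest rather than truncate; round(0.0) is therefore 1024 and round(1.0) is 3072.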
85 c7, c3, c7, -c5 } 85 c7, c3, c7, -c5 }
86 86
87 static inline void mmxext_row_head (int16_t * const row, const int offset, 87 static inline void mmxext_row_head (int16_t * const row, const int offset,
88 const int16_t * const table) 88 const int16_t * const table)
89 { 89 {
90 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ 90 __asm__ volatile(
91 91 "movq (%0), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */
92 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 92
93 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ 93 "movq 8(%0), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */
94 94 "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */
95 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ 95
96 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ 96 "movq (%1), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */
97 97 "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */
98 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ 98
99 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ 99 "movq 8(%1), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */
100 100 "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
101 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ 101
102 "pshufw $0x4e, %%mm2, %%mm2 \n\t" /* mm2 = x2 x0 x6 x4 */
103 :: "r" ((row+offset)), "r" (table)
104 );
102 } 105 }
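The pshufw immediates are the only non-obvious constants in the new row code: destination word i is taken from source word (imm >> 2*i) & 3, so $0x4e swaps the two 32-bit halves of the register (x6 x4 x2 x0 -> x2 x0 x6 x4, as the comment says) and $0xb1, used in the row tail further down, swaps the words inside each half. A scalar reference model, purely for illustration:

    #include <stdint.h>

    /* Illustration only: what pshufw dst, src, imm computes on a 64-bit
     * value viewed as four 16-bit words. */
    static inline uint64_t pshufw_model(uint64_t src, unsigned imm)
    {
        uint64_t dst = 0;
        for (int i = 0; i < 4; i++) {
            unsigned sel = (imm >> (2 * i)) & 3;   /* source word index */
            dst |= ((src >> (16 * sel)) & 0xffff) << (16 * i);
        }
        return dst;
    }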
103 106
104 static inline void mmxext_row (const int16_t * const table, 107 static inline void mmxext_row (const int16_t * const table,
105 const int32_t * const rounder) 108 const int32_t * const rounder)
106 { 109 {
107 movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ 110 __asm__ volatile (
108 pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ 111 "movq 16(%0), %%mm1 \n\t" /* mm1 = -C5 -C1 C3 C1 */
109 112 "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
110 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ 113
111 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ 114 "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
112 115 "pshufw $0x4e, %%mm6, %%mm6 \n\t" /* mm6 = x3 x1 x7 x5 */
113 movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ 116
114 pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ 117 "movq 24(%0), %%mm7 \n\t" /* mm7 = -C7 C3 C7 C5 */
115 118 "pmaddwd %%mm5, %%mm1 \n\t" /* mm1= -C1*x5-C5*x7 C1*x1+C3*x3 */
116 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ 119
117 pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ 120 "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */
118 121 "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
119 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ 122
120 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ 123 "pmaddwd 40(%0), %%mm2 \n\t" /* mm2= C4*x0-C2*x2 -C4*x4+C2*x6 */
121 124 "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */
122 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ 125
123 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ 126 "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
124 127 "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */
125 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ 128
126 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ 129 "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
127 130 "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */
128 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ 131
129 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ 132 "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */
130 133 "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */
131 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ 134
132 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ 135 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */
133 136 "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */
134 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ 137
135 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ 138 "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */
136 139 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */
137 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ 140
138 movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ 141 "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */
139 142 "movq %%mm0, %%mm4 \n\t" /* mm4 = a3 a2 + rounder */
140 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ 143
141 psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ 144 "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */
145 "psubd %%mm5, %%mm4 \n\t" /* mm4 = a3-b3 a2-b2 + rounder */
146 : : "r" (table), "r" (rounder));
142 } 147 }
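The shift counts can no longer be passed the way psrad_i2r took them, so the new code splices them into the asm template with AV_STRINGIFY (available from libavutil through the headers already included above). The idiom is the usual two-level stringification, roughly:

    /* Sketch of the idiom; AV_STRINGIFY in libavutil behaves like STRINGIFY here. */
    #define TOSTRING(s)  #s
    #define STRINGIFY(s) TOSTRING(s)
    /* "psrad $" STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t"
     *   expands (with ROW_SHIFT 11) to  "psrad $11, %%mm3 \n\t"  */

The extra expansion level is what turns ROW_SHIFT into "11" rather than the literal string "ROW_SHIFT".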
143 148
144 static inline void mmxext_row_tail (int16_t * const row, const int store) 149 static inline void mmxext_row_tail (int16_t * const row, const int store)
145 { 150 {
146 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ 151 __asm__ volatile (
147 152 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */
148 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ 153
149 154 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */
150 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ 155
151 156 "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */
152 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ 157
153 158 "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */
154 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ 159
155 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ 160 "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */
156 161 "pshufw $0xb1, %%mm4, %%mm4 \n\t" /* mm4 = y7 y6 y5 y4 */
157 /* slot */ 162
158 163 /* slot */
159 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ 164
165 "movq %%mm4, 8(%0) \n\t" /* save y7 y6 y5 y4 */
166 :: "r" (row+store)
167 );
160 } 168 }
161 169
162 static inline void mmxext_row_mid (int16_t * const row, const int store, 170 static inline void mmxext_row_mid (int16_t * const row, const int store,
163 const int offset, 171 const int offset,
164 const int16_t * const table) 172 const int16_t * const table)
165 { 173 {
166 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ 174 __asm__ volatile (
167 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ 175 "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */
168 176 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */
169 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 177
170 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ 178 "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */
171 179 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */
172 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ 180
173 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ 181 "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */
174 182 "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */
175 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ 183
176 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ 184 "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */
177 185 "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */
178 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ 186
179 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ 187 "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */
180 188 "pshufw $0xb1, %%mm4, %%mm4\n\t" /* mm4 = y7 y6 y5 y4 */
181 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ 189
182 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ 190 "movq (%3), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */
183 191 "movq %%mm4, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */
184 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ 192
185 193 "pmaddwd %%mm0, %%mm3 \n\t" /* mm3= -C4*x4-C2*x6 C4*x0+C2*x2 */
186 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ 194
187 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ 195 "movq 8(%3), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */
196 "pshufw $0x4e, %%mm2, %%mm2\n\t" /* mm2 = x2 x0 x6 x4 */
197 :: "r" (row), "r" (2*offset), "r" (2*store), "r" (table)
198 );
188 } 199 }
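Unlike the head and tail helpers, which receive a precomputed row+offset pointer, this mid helper keeps the base pointer and the offsets as separate asm operands, so the element counts have to be turned into byte offsets by hand: row is int16_t, hence the 2*offset and 2*store values feeding the (%0,%1) and (%0,%2) addressing. The equivalence being relied on, as a one-line sketch:

    #include <stdint.h>
    #include <stddef.h>

    /* row + offset (int16_t elements) == row advanced by 2*offset bytes */
    static inline int16_t *row_at(int16_t *row, ptrdiff_t offset)
    {
        return (int16_t *)((char *)row + 2 * offset);
    }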
189 200
190 201
191 /* MMX row IDCT */ 202 /* MMX row IDCT */
192 203
200 c7, c3, c3, -c1 } 211 c7, c3, c3, -c1 }
201 212
202 static inline void mmx_row_head (int16_t * const row, const int offset, 213 static inline void mmx_row_head (int16_t * const row, const int offset,
203 const int16_t * const table) 214 const int16_t * const table)
204 { 215 {
205 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ 216 __asm__ volatile (
206 217 "movq (%0), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */
207 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 218
208 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ 219 "movq 8(%0), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */
209 220 "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */
210 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ 221
211 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ 222 "movq (%1), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */
212 223 "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */
213 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ 224
214 225 "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */
215 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ 226
216 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ 227 "movq 8(%1), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */
217 228 "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
218 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ 229
219 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ 230 "movq 16(%1), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */
231 "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */
232 :: "r" ((row+offset)), "r" (table)
233 );
220 } 234 }
221 235
222 static inline void mmx_row (const int16_t * const table, 236 static inline void mmx_row (const int16_t * const table,
223 const int32_t * const rounder) 237 const int32_t * const rounder)
224 { 238 {
225 pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ 239 __asm__ volatile (
226 punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ 240 "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
227 241 "punpckldq %%mm5, %%mm5 \n\t" /* mm5 = x3 x1 x3 x1 */
228 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ 242
229 punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ 243 "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
230 244 "punpckhdq %%mm6, %%mm6 \n\t" /* mm6 = x7 x5 x7 x5 */
231 movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ 245
232 pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ 246 "movq 24(%0), %%mm7 \n\t" /* mm7 = -C5 -C1 C7 C5 */
233 247 "pmaddwd %%mm5, %%mm1 \n\t" /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
234 paddd_m2r (*rounder, mm3); /* mm3 += rounder */ 248
235 pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ 249 "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */
236 250 "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
237 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ 251
238 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ 252 "pmaddwd 40(%0), %%mm2 \n\t" /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
239 253 "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */
240 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ 254
241 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ 255 "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
242 256 "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */
243 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ 257
244 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ 258 "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
245 259 "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */
246 paddd_m2r (*rounder, mm0); /* mm0 += rounder */ 260
247 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ 261 "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */
248 262 "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */
249 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ 263
250 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ 264 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */
251 265 "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */
252 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ 266
253 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ 267 "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */
254 268 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */
255 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ 269
256 movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ 270 "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */
257 271 "movq %%mm0, %%mm7 \n\t" /* mm7 = a3 a2 + rounder */
258 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ 272
259 psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ 273 "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */
274 "psubd %%mm5, %%mm7 \n\t" /* mm7 = a3-b3 a2-b2 + rounder */
275 :: "r" (table), "r" (rounder)
276 );
260 } 277 }
261 278
262 static inline void mmx_row_tail (int16_t * const row, const int store) 279 static inline void mmx_row_tail (int16_t * const row, const int store)
263 { 280 {
264 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ 281 __asm__ volatile (
265 282 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */
266 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ 283
267 284 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */
268 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ 285
269 286 "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */
270 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ 287
271 288 "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */
272 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ 289
273 movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ 290 "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */
274 291 "movq %%mm7, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */
275 pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ 292
276 293 "pslld $16, %%mm7 \n\t" /* mm7 = y7 0 y5 0 */
277 psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ 294
278 295 "psrld $16, %%mm4 \n\t" /* mm4 = 0 y6 0 y4 */
279 por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ 296
280 297 "por %%mm4, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */
281 /* slot */ 298
282 299 /* slot */
283 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ 300
301 "movq %%mm7, 8(%0) \n\t" /* save y7 y6 y5 y4 */
302 :: "r" (row+store)
303 );
284 } 304 }
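Plain MMX has no pshufw (that arrives with the MMXEXT/SSE integer extensions), which is why this tail reorders y6 y7 y4 y5 into y7 y6 y5 y4 with a copy, pslld/psrld by 16 and a por, where the MMXEXT tail above used a single pshufw $0xb1. A scalar model of that shift/or swap, for illustration only:

    #include <stdint.h>

    /* Swap the 16-bit words inside each 32-bit half of a 64-bit value,
     * i.e. what the pslld/psrld/por trio above computes in MMX registers. */
    static inline uint64_t swap_words_in_dwords(uint64_t x)
    {
        return ((x << 16) & 0xffff0000ffff0000ULL) |
               ((x >> 16) & 0x0000ffff0000ffffULL);
    }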
285 305
286 static inline void mmx_row_mid (int16_t * const row, const int store, 306 static inline void mmx_row_mid (int16_t * const row, const int store,
287 const int offset, const int16_t * const table) 307 const int offset, const int16_t * const table)
288 { 308 {
289 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ 309
290 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ 310 __asm__ volatile (
291 311 "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */
292 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ 312 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */
293 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ 313
294 314 "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */
295 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ 315 "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */
296 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ 316
297 317 "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */
298 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ 318 "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */
299 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ 319
300 320 "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */
301 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ 321 "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */
302 movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ 322
303 323 "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */
304 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ 324 "movq %%mm7, %%mm1 \n\t" /* mm1 = y6 y7 y4 y5 */
305 psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ 325
306 326 "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */
307 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ 327 "psrld $16, %%mm7 \n\t" /* mm7 = 0 y6 0 y4 */
308 pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ 328
309 329 "movq (%3), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */
310 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ 330 "pslld $16, %%mm1 \n\t" /* mm1 = y7 0 y5 0 */
311 por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ 331
312 332 "movq 8(%3), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */
313 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ 333 "por %%mm1, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */
314 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ 334
315 335 "movq 16(%3), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */
316 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ 336 "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */
317 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ 337
338 "movq %%mm7, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */
339 "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
340 : : "r" (row), "r"(2*offset), "r" (2*store), "r" (table)
341 );
318 } 342 }
319 343
320 344
321 #if 0 345 #if 0
322 /* C column IDCT - it is just here to document the MMXEXT and MMX versions */ 346 /* C column IDCT - it is just here to document the MMXEXT and MMX versions */
396 #define T1 13036 420 #define T1 13036
397 #define T2 27146 421 #define T2 27146
398 #define T3 43790 422 #define T3 43790
399 #define C4 23170 423 #define C4 23170
400 424
401 DECLARE_ALIGNED(8, static const short, t1_vector)[] = {T1,T1,T1,T1}; 425 DECLARE_ALIGNED(8, static const short, t1_vector)[] = {
402 DECLARE_ALIGNED(8, static const short, t2_vector)[] = {T2,T2,T2,T2}; 426 T1,T1,T1,T1,
403 DECLARE_ALIGNED(8, static const short, t3_vector)[] = {T3,T3,T3,T3}; 427 T2,T2,T2,T2,
404 DECLARE_ALIGNED(8, static const short, c4_vector)[] = {C4,C4,C4,C4}; 428 T3,T3,T3,T3,
429 C4,C4,C4,C4
430 };
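Folding the four 8-byte constant vectors into a single 32-byte table means the column asm below can reach every constant from one base register: T1 at 0(%0), T2 at 8(%0), T3 at 16(%0) and C4 at 24(%0), where the old code needed a separate symbol (t1_vector ... c4_vector) and a separate address for each load.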
405 431
406 /* column code adapted from Peter Gubanov */ 432 /* column code adapted from Peter Gubanov */
407 /* http://www.elecard.com/peter/idct.shtml */ 433 /* http://www.elecard.com/peter/idct.shtml */
408 434
409 movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ 435 __asm__ volatile (
410 436 "movq (%0), %%mm0 \n\t" /* mm0 = T1 */
411 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ 437
412 movq_r2r (mm0, mm2); /* mm2 = T1 */ 438 "movq 2*8(%1), %%mm1 \n\t" /* mm1 = x1 */
413 439 "movq %%mm0, %%mm2 \n\t" /* mm2 = T1 */
414 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ 440
415 pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ 441 "movq 7*2*8(%1), %%mm4 \n\t" /* mm4 = x7 */
416 442 "pmulhw %%mm1, %%mm0 \n\t" /* mm0 = T1*x1 */
417 movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ 443
418 pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ 444 "movq 16(%0), %%mm5 \n\t" /* mm5 = T3 */
419 445 "pmulhw %%mm4, %%mm2 \n\t" /* mm2 = T1*x7 */
420 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ 446
421 movq_r2r (mm5, mm7); /* mm7 = T3-1 */ 447 "movq 2*5*8(%1), %%mm6 \n\t" /* mm6 = x5 */
422 448 "movq %%mm5, %%mm7 \n\t" /* mm7 = T3-1 */
423 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ 449
424 psubsw_r2r (mm4, mm0); /* mm0 = v17 */ 450 "movq 3*8*2(%1), %%mm3 \n\t" /* mm3 = x3 */
425 451 "psubsw %%mm4, %%mm0 \n\t" /* mm0 = v17 */
426 movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ 452
427 pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ 453 "movq 8(%0), %%mm4 \n\t" /* mm4 = T2 */
428 454 "pmulhw %%mm3, %%mm5 \n\t" /* mm5 = (T3-1)*x3 */
429 paddsw_r2r (mm2, mm1); /* mm1 = u17 */ 455
430 pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ 456 "paddsw %%mm2, %%mm1 \n\t" /* mm1 = u17 */
431 457 "pmulhw %%mm6, %%mm7 \n\t" /* mm7 = (T3-1)*x5 */
432 /* slot */ 458
433 459 /* slot */
434 movq_r2r (mm4, mm2); /* mm2 = T2 */ 460
435 paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ 461 "movq %%mm4, %%mm2 \n\t" /* mm2 = T2 */
436 462 "paddsw %%mm3, %%mm5 \n\t" /* mm5 = T3*x3 */
437 pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ 463
438 paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ 464 "pmulhw 2*8*2(%1), %%mm4 \n\t" /* mm4 = T2*x2 */
439 465 "paddsw %%mm6, %%mm7 \n\t" /* mm7 = T3*x5 */
440 psubsw_r2r (mm6, mm5); /* mm5 = v35 */ 466
441 paddsw_r2r (mm3, mm7); /* mm7 = u35 */ 467 "psubsw %%mm6, %%mm5 \n\t" /* mm5 = v35 */
442 468 "paddsw %%mm3, %%mm7 \n\t" /* mm7 = u35 */
443 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ 469
444 movq_r2r (mm0, mm6); /* mm6 = v17 */ 470 "movq 6*8*2(%1), %%mm3 \n\t" /* mm3 = x6 */
445 471 "movq %%mm0, %%mm6 \n\t" /* mm6 = v17 */
446 pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ 472
447 psubsw_r2r (mm5, mm0); /* mm0 = b3 */ 473 "pmulhw %%mm3, %%mm2 \n\t" /* mm2 = T2*x6 */
448 474 "psubsw %%mm5, %%mm0 \n\t" /* mm0 = b3 */
449 psubsw_r2r (mm3, mm4); /* mm4 = v26 */ 475
450 paddsw_r2r (mm6, mm5); /* mm5 = v12 */ 476 "psubsw %%mm3, %%mm4 \n\t" /* mm4 = v26 */
451 477 "paddsw %%mm6, %%mm5 \n\t" /* mm5 = v12 */
452 movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ 478
453 movq_r2r (mm1, mm6); /* mm6 = u17 */ 479 "movq %%mm0, 3*8*2(%1)\n\t" /* save b3 in scratch0 */
454 480 "movq %%mm1, %%mm6 \n\t" /* mm6 = u17 */
455 paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ 481
456 paddsw_r2r (mm7, mm6); /* mm6 = b0 */ 482 "paddsw 2*8*2(%1), %%mm2 \n\t" /* mm2 = u26 */
457 483 "paddsw %%mm7, %%mm6 \n\t" /* mm6 = b0 */
458 psubsw_r2r (mm7, mm1); /* mm1 = u12 */ 484
459 movq_r2r (mm1, mm7); /* mm7 = u12 */ 485 "psubsw %%mm7, %%mm1 \n\t" /* mm1 = u12 */
460 486 "movq %%mm1, %%mm7 \n\t" /* mm7 = u12 */
461 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ 487
462 paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ 488 "movq 0*8(%1), %%mm3 \n\t" /* mm3 = x0 */
463 489 "paddsw %%mm5, %%mm1 \n\t" /* mm1 = u12+v12 */
464 movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */ 490
465 psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ 491 "movq 24(%0), %%mm0 \n\t" /* mm0 = C4/2 */
466 492 "psubsw %%mm5, %%mm7 \n\t" /* mm7 = u12-v12 */
467 movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ 493
468 pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ 494 "movq %%mm6, 5*8*2(%1)\n\t" /* save b0 in scratch1 */
469 495 "pmulhw %%mm0, %%mm1 \n\t" /* mm1 = b1/2 */
470 movq_r2r (mm4, mm6); /* mm6 = v26 */ 496
471 pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ 497 "movq %%mm4, %%mm6 \n\t" /* mm6 = v26 */
472 498 "pmulhw %%mm0, %%mm7 \n\t" /* mm7 = b2/2 */
473 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ 499
474 movq_r2r (mm3, mm0); /* mm0 = x0 */ 500 "movq 4*8*2(%1), %%mm5 \n\t" /* mm5 = x4 */
475 501 "movq %%mm3, %%mm0 \n\t" /* mm0 = x0 */
476 psubsw_r2r (mm5, mm3); /* mm3 = v04 */ 502
477 paddsw_r2r (mm5, mm0); /* mm0 = u04 */ 503 "psubsw %%mm5, %%mm3 \n\t" /* mm3 = v04 */
478 504 "paddsw %%mm5, %%mm0 \n\t" /* mm0 = u04 */
479 paddsw_r2r (mm3, mm4); /* mm4 = a1 */ 505
480 movq_r2r (mm0, mm5); /* mm5 = u04 */ 506 "paddsw %%mm3, %%mm4 \n\t" /* mm4 = a1 */
481 507 "movq %%mm0, %%mm5 \n\t" /* mm5 = u04 */
482 psubsw_r2r (mm6, mm3); /* mm3 = a2 */ 508
483 paddsw_r2r (mm2, mm5); /* mm5 = a0 */ 509 "psubsw %%mm6, %%mm3 \n\t" /* mm3 = a2 */
484 510 "paddsw %%mm2, %%mm5 \n\t" /* mm5 = a0 */
485 paddsw_r2r (mm1, mm1); /* mm1 = b1 */ 511
486 psubsw_r2r (mm2, mm0); /* mm0 = a3 */ 512 "paddsw %%mm1, %%mm1 \n\t" /* mm1 = b1 */
487 513 "psubsw %%mm2, %%mm0 \n\t" /* mm0 = a3 */
488 paddsw_r2r (mm7, mm7); /* mm7 = b2 */ 514
489 movq_r2r (mm3, mm2); /* mm2 = a2 */ 515 "paddsw %%mm7, %%mm7 \n\t" /* mm7 = b2 */
490 516 "movq %%mm3, %%mm2 \n\t" /* mm2 = a2 */
491 movq_r2r (mm4, mm6); /* mm6 = a1 */ 517
492 paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ 518 "movq %%mm4, %%mm6 \n\t" /* mm6 = a1 */
493 519 "paddsw %%mm7, %%mm3 \n\t" /* mm3 = a2+b2 */
494 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ 520
495 paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ 521 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y2 */
496 522 "paddsw %%mm1, %%mm4\n\t" /* mm4 = a1+b1 */
497 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ 523
498 psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ 524 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y1 */
499 525 "psubsw %%mm1, %%mm6 \n\t" /* mm6 = a1-b1 */
500 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ 526
501 psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ 527 "movq 5*8*2(%1), %%mm1 \n\t" /* mm1 = b0 */
502 528 "psubsw %%mm7, %%mm2 \n\t" /* mm2 = a2-b2 */
503 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ 529
504 movq_r2r (mm5, mm7); /* mm7 = a0 */ 530 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm6\n\t" /* mm6 = y6 */
505 531 "movq %%mm5, %%mm7 \n\t" /* mm7 = a0 */
506 movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ 532
507 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ 533 "movq %%mm4, 1*8*2(%1)\n\t" /* save y1 */
508 534 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm2\n\t" /* mm2 = y5 */
509 movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ 535
510 paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ 536 "movq %%mm3, 2*8*2(%1)\n\t" /* save y2 */
511 537 "paddsw %%mm1, %%mm5 \n\t" /* mm5 = a0+b0 */
512 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ 538
513 psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ 539 "movq 3*8*2(%1), %%mm4 \n\t" /* mm4 = b3 */
514 540 "psubsw %%mm1, %%mm7 \n\t" /* mm7 = a0-b0 */
515 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ 541
516 movq_r2r (mm0, mm3); /* mm3 = a3 */ 542 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm5\n\t" /* mm5 = y0 */
517 543 "movq %%mm0, %%mm3 \n\t" /* mm3 = a3 */
518 movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ 544
519 psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ 545 "movq %%mm2, 5*8*2(%1)\n\t" /* save y5 */
520 546 "psubsw %%mm4, %%mm3 \n\t" /* mm3 = a3-b3 */
521 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ 547
522 paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ 548 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm7\n\t" /* mm7 = y7 */
523 549 "paddsw %%mm0, %%mm4 \n\t" /* mm4 = a3+b3 */
524 movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ 550
525 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ 551 "movq %%mm5, 0*8*2(%1)\n\t" /* save y0 */
526 552 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y4 */
527 movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ 553
528 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ 554 "movq %%mm6, 6*8*2(%1)\n\t" /* save y6 */
529 555 "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y3 */
530 movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ 556
531 557 "movq %%mm7, 7*8*2(%1)\n\t" /* save y7 */
532 movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ 558
533 559 "movq %%mm3, 4*8*2(%1)\n\t" /* save y4 */
534 movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ 560
561 "movq %%mm4, 3*8*2(%1)\n\t" /* save y3 */
562 :: "r" (t1_vector), "r" (col+offset)
563 );
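The column pass walks down one column of the 8x8 block, so successive taps are 8 int16_t (16 bytes) apart; the assorted spellings 2*8, 7*2*8, 2*5*8 and 3*8*2 in the offsets above are all just row_index * 16 bytes relative to col+offset, replacing the old *(col+offset+n*8) element arithmetic. As a tiny sketch of the rule (illustration only):

    #include <stdint.h>

    /* Element n*8 of an int16_t column (n rows further down the 8x8 block)
     * lives n*8*sizeof(int16_t) = n*16 bytes past the base pointer. */
    static inline const int16_t *col_tap(const int16_t *base, int n)
    {
        return (const int16_t *)((const char *)base + n * 8 * sizeof(int16_t));
    }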
535 564
536 #undef T1 565 #undef T1
537 #undef T2 566 #undef T2
538 #undef T3 567 #undef T3
539 #undef C4 568 #undef C4