comparison i386/mpegvideo_mmx.c @ 2979:bfabfdf9ce55 libavcodec

COSMETICS: tabs --> spaces, some prettyprinting
author diego
date Thu, 22 Dec 2005 01:10:11 +0000
parents ef2149182f1c
children 0b546eab515d
comparison
equal deleted inserted replaced
2978:403183bbb505 2979:bfabfdf9ce55
55 nCoeffs=63; 55 nCoeffs=63;
56 else 56 else
57 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 57 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
58 //printf("%d %d ", qmul, qadd); 58 //printf("%d %d ", qmul, qadd);
59 asm volatile( 59 asm volatile(
60 "movd %1, %%mm6 \n\t" //qmul 60 "movd %1, %%mm6 \n\t" //qmul
61 "packssdw %%mm6, %%mm6 \n\t" 61 "packssdw %%mm6, %%mm6 \n\t"
62 "packssdw %%mm6, %%mm6 \n\t" 62 "packssdw %%mm6, %%mm6 \n\t"
63 "movd %2, %%mm5 \n\t" //qadd 63 "movd %2, %%mm5 \n\t" //qadd
64 "pxor %%mm7, %%mm7 \n\t" 64 "pxor %%mm7, %%mm7 \n\t"
65 "packssdw %%mm5, %%mm5 \n\t" 65 "packssdw %%mm5, %%mm5 \n\t"
66 "packssdw %%mm5, %%mm5 \n\t" 66 "packssdw %%mm5, %%mm5 \n\t"
67 "psubw %%mm5, %%mm7 \n\t" 67 "psubw %%mm5, %%mm7 \n\t"
68 "pxor %%mm4, %%mm4 \n\t" 68 "pxor %%mm4, %%mm4 \n\t"
69 ".balign 16\n\t" 69 ".balign 16 \n\t"
70 "1: \n\t" 70 "1: \n\t"
71 "movq (%0, %3), %%mm0 \n\t" 71 "movq (%0, %3), %%mm0 \n\t"
72 "movq 8(%0, %3), %%mm1 \n\t" 72 "movq 8(%0, %3), %%mm1 \n\t"
73 73
74 "pmullw %%mm6, %%mm0 \n\t" 74 "pmullw %%mm6, %%mm0 \n\t"
75 "pmullw %%mm6, %%mm1 \n\t" 75 "pmullw %%mm6, %%mm1 \n\t"
76 76
77 "movq (%0, %3), %%mm2 \n\t" 77 "movq (%0, %3), %%mm2 \n\t"
78 "movq 8(%0, %3), %%mm3 \n\t" 78 "movq 8(%0, %3), %%mm3 \n\t"
79 79
80 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 80 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0
81 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 81 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
82 82
83 "pxor %%mm2, %%mm0 \n\t" 83 "pxor %%mm2, %%mm0 \n\t"
84 "pxor %%mm3, %%mm1 \n\t" 84 "pxor %%mm3, %%mm1 \n\t"
85 85
86 "paddw %%mm7, %%mm0 \n\t" 86 "paddw %%mm7, %%mm0 \n\t"
87 "paddw %%mm7, %%mm1 \n\t" 87 "paddw %%mm7, %%mm1 \n\t"
88 88
89 "pxor %%mm0, %%mm2 \n\t" 89 "pxor %%mm0, %%mm2 \n\t"
90 "pxor %%mm1, %%mm3 \n\t" 90 "pxor %%mm1, %%mm3 \n\t"
91 91
92 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 92 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
93 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 93 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
94 94
95 "pandn %%mm2, %%mm0 \n\t" 95 "pandn %%mm2, %%mm0 \n\t"
96 "pandn %%mm3, %%mm1 \n\t" 96 "pandn %%mm3, %%mm1 \n\t"
97 97
98 "movq %%mm0, (%0, %3) \n\t" 98 "movq %%mm0, (%0, %3) \n\t"
99 "movq %%mm1, 8(%0, %3) \n\t" 99 "movq %%mm1, 8(%0, %3) \n\t"
100 100
101 "add $16, %3 \n\t" 101 "add $16, %3 \n\t"
102 "jng 1b \n\t" 102 "jng 1b \n\t"
103 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) 103 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
104 : "memory" 104 : "memory"
105 ); 105 );
106 block[0]= level; 106 block[0]= level;
107 } 107 }
108 108
109 109
110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
118 assert(s->block_last_index[n]>=0 || s->h263_aic); 118 assert(s->block_last_index[n]>=0 || s->h263_aic);
119 119
120 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 120 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
121 //printf("%d %d ", qmul, qadd); 121 //printf("%d %d ", qmul, qadd);
122 asm volatile( 122 asm volatile(
123 "movd %1, %%mm6 \n\t" //qmul 123 "movd %1, %%mm6 \n\t" //qmul
124 "packssdw %%mm6, %%mm6 \n\t" 124 "packssdw %%mm6, %%mm6 \n\t"
125 "packssdw %%mm6, %%mm6 \n\t" 125 "packssdw %%mm6, %%mm6 \n\t"
126 "movd %2, %%mm5 \n\t" //qadd 126 "movd %2, %%mm5 \n\t" //qadd
127 "pxor %%mm7, %%mm7 \n\t" 127 "pxor %%mm7, %%mm7 \n\t"
128 "packssdw %%mm5, %%mm5 \n\t" 128 "packssdw %%mm5, %%mm5 \n\t"
129 "packssdw %%mm5, %%mm5 \n\t" 129 "packssdw %%mm5, %%mm5 \n\t"
130 "psubw %%mm5, %%mm7 \n\t" 130 "psubw %%mm5, %%mm7 \n\t"
131 "pxor %%mm4, %%mm4 \n\t" 131 "pxor %%mm4, %%mm4 \n\t"
132 ".balign 16\n\t" 132 ".balign 16 \n\t"
133 "1: \n\t" 133 "1: \n\t"
134 "movq (%0, %3), %%mm0 \n\t" 134 "movq (%0, %3), %%mm0 \n\t"
135 "movq 8(%0, %3), %%mm1 \n\t" 135 "movq 8(%0, %3), %%mm1 \n\t"
136 136
137 "pmullw %%mm6, %%mm0 \n\t" 137 "pmullw %%mm6, %%mm0 \n\t"
138 "pmullw %%mm6, %%mm1 \n\t" 138 "pmullw %%mm6, %%mm1 \n\t"
139 139
140 "movq (%0, %3), %%mm2 \n\t" 140 "movq (%0, %3), %%mm2 \n\t"
141 "movq 8(%0, %3), %%mm3 \n\t" 141 "movq 8(%0, %3), %%mm3 \n\t"
142 142
143 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0 143 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0
144 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0 144 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
145 145
146 "pxor %%mm2, %%mm0 \n\t" 146 "pxor %%mm2, %%mm0 \n\t"
147 "pxor %%mm3, %%mm1 \n\t" 147 "pxor %%mm3, %%mm1 \n\t"
148 148
149 "paddw %%mm7, %%mm0 \n\t" 149 "paddw %%mm7, %%mm0 \n\t"
150 "paddw %%mm7, %%mm1 \n\t" 150 "paddw %%mm7, %%mm1 \n\t"
151 151
152 "pxor %%mm0, %%mm2 \n\t" 152 "pxor %%mm0, %%mm2 \n\t"
153 "pxor %%mm1, %%mm3 \n\t" 153 "pxor %%mm1, %%mm3 \n\t"
154 154
155 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 155 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
156 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 156 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
157 157
158 "pandn %%mm2, %%mm0 \n\t" 158 "pandn %%mm2, %%mm0 \n\t"
159 "pandn %%mm3, %%mm1 \n\t" 159 "pandn %%mm3, %%mm1 \n\t"
160 160
161 "movq %%mm0, (%0, %3) \n\t" 161 "movq %%mm0, (%0, %3) \n\t"
162 "movq %%mm1, 8(%0, %3) \n\t" 162 "movq %%mm1, 8(%0, %3) \n\t"
163 163
164 "add $16, %3 \n\t" 164 "add $16, %3 \n\t"
165 "jng 1b \n\t" 165 "jng 1b \n\t"
166 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) 166 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
167 : "memory" 167 : "memory"
168 ); 168 );
169 } 169 }
170 170
171 171
172 /* 172 /*
173 NK: 173 NK:
214 else 214 else
215 block0 = block[0] * s->c_dc_scale; 215 block0 = block[0] * s->c_dc_scale;
216 /* XXX: only mpeg1 */ 216 /* XXX: only mpeg1 */
217 quant_matrix = s->intra_matrix; 217 quant_matrix = s->intra_matrix;
218 asm volatile( 218 asm volatile(
219 "pcmpeqw %%mm7, %%mm7 \n\t" 219 "pcmpeqw %%mm7, %%mm7 \n\t"
220 "psrlw $15, %%mm7 \n\t" 220 "psrlw $15, %%mm7 \n\t"
221 "movd %2, %%mm6 \n\t" 221 "movd %2, %%mm6 \n\t"
222 "packssdw %%mm6, %%mm6 \n\t" 222 "packssdw %%mm6, %%mm6 \n\t"
223 "packssdw %%mm6, %%mm6 \n\t" 223 "packssdw %%mm6, %%mm6 \n\t"
224 "mov %3, %%"REG_a" \n\t" 224 "mov %3, %%"REG_a" \n\t"
225 ".balign 16\n\t" 225 ".balign 16 \n\t"
226 "1: \n\t" 226 "1: \n\t"
227 "movq (%0, %%"REG_a"), %%mm0 \n\t" 227 "movq (%0, %%"REG_a"), %%mm0 \n\t"
228 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 228 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
229 "movq (%1, %%"REG_a"), %%mm4 \n\t" 229 "movq (%1, %%"REG_a"), %%mm4 \n\t"
230 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 230 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
231 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 231 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
232 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 232 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
233 "pxor %%mm2, %%mm2 \n\t" 233 "pxor %%mm2, %%mm2 \n\t"
234 "pxor %%mm3, %%mm3 \n\t" 234 "pxor %%mm3, %%mm3 \n\t"
235 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 235 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
236 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 236 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
237 "pxor %%mm2, %%mm0 \n\t" 237 "pxor %%mm2, %%mm0 \n\t"
238 "pxor %%mm3, %%mm1 \n\t" 238 "pxor %%mm3, %%mm1 \n\t"
239 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 239 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
240 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 240 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
241 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 241 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
242 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 242 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
243 "pxor %%mm4, %%mm4 \n\t" 243 "pxor %%mm4, %%mm4 \n\t"
244 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 244 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
245 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 245 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
246 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 246 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
247 "psraw $3, %%mm0 \n\t" 247 "psraw $3, %%mm0 \n\t"
248 "psraw $3, %%mm1 \n\t" 248 "psraw $3, %%mm1 \n\t"
249 "psubw %%mm7, %%mm0 \n\t" 249 "psubw %%mm7, %%mm0 \n\t"
250 "psubw %%mm7, %%mm1 \n\t" 250 "psubw %%mm7, %%mm1 \n\t"
251 "por %%mm7, %%mm0 \n\t" 251 "por %%mm7, %%mm0 \n\t"
252 "por %%mm7, %%mm1 \n\t" 252 "por %%mm7, %%mm1 \n\t"
253 "pxor %%mm2, %%mm0 \n\t" 253 "pxor %%mm2, %%mm0 \n\t"
254 "pxor %%mm3, %%mm1 \n\t" 254 "pxor %%mm3, %%mm1 \n\t"
255 "psubw %%mm2, %%mm0 \n\t" 255 "psubw %%mm2, %%mm0 \n\t"
256 "psubw %%mm3, %%mm1 \n\t" 256 "psubw %%mm3, %%mm1 \n\t"
257 "pandn %%mm0, %%mm4 \n\t" 257 "pandn %%mm0, %%mm4 \n\t"
258 "pandn %%mm1, %%mm5 \n\t" 258 "pandn %%mm1, %%mm5 \n\t"
259 "movq %%mm4, (%0, %%"REG_a") \n\t" 259 "movq %%mm4, (%0, %%"REG_a") \n\t"
260 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 260 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
261 261
262 "add $16, %%"REG_a" \n\t" 262 "add $16, %%"REG_a" \n\t"
263 "js 1b \n\t" 263 "js 1b \n\t"
264 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) 264 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
265 : "%"REG_a, "memory" 265 : "%"REG_a, "memory"
266 ); 266 );
267 block[0]= block0; 267 block[0]= block0;
268 } 268 }
269 269
270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
271 DCTELEM *block, int n, int qscale) 271 DCTELEM *block, int n, int qscale)
277 277
278 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 278 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
279 279
280 quant_matrix = s->inter_matrix; 280 quant_matrix = s->inter_matrix;
281 asm volatile( 281 asm volatile(
282 "pcmpeqw %%mm7, %%mm7 \n\t" 282 "pcmpeqw %%mm7, %%mm7 \n\t"
283 "psrlw $15, %%mm7 \n\t" 283 "psrlw $15, %%mm7 \n\t"
284 "movd %2, %%mm6 \n\t" 284 "movd %2, %%mm6 \n\t"
285 "packssdw %%mm6, %%mm6 \n\t" 285 "packssdw %%mm6, %%mm6 \n\t"
286 "packssdw %%mm6, %%mm6 \n\t" 286 "packssdw %%mm6, %%mm6 \n\t"
287 "mov %3, %%"REG_a" \n\t" 287 "mov %3, %%"REG_a" \n\t"
288 ".balign 16\n\t" 288 ".balign 16 \n\t"
289 "1: \n\t" 289 "1: \n\t"
290 "movq (%0, %%"REG_a"), %%mm0 \n\t" 290 "movq (%0, %%"REG_a"), %%mm0 \n\t"
291 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 291 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
292 "movq (%1, %%"REG_a"), %%mm4 \n\t" 292 "movq (%1, %%"REG_a"), %%mm4 \n\t"
293 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 293 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
294 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 294 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
295 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 295 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
296 "pxor %%mm2, %%mm2 \n\t" 296 "pxor %%mm2, %%mm2 \n\t"
297 "pxor %%mm3, %%mm3 \n\t" 297 "pxor %%mm3, %%mm3 \n\t"
298 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 298 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
299 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 299 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
300 "pxor %%mm2, %%mm0 \n\t" 300 "pxor %%mm2, %%mm0 \n\t"
301 "pxor %%mm3, %%mm1 \n\t" 301 "pxor %%mm3, %%mm1 \n\t"
302 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 302 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
303 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 303 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
304 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 304 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
305 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 305 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
306 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 306 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
307 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 307 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
308 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 308 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
309 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 309 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
310 "pxor %%mm4, %%mm4 \n\t" 310 "pxor %%mm4, %%mm4 \n\t"
311 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 311 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
312 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 312 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
313 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 313 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
314 "psraw $4, %%mm0 \n\t" 314 "psraw $4, %%mm0 \n\t"
315 "psraw $4, %%mm1 \n\t" 315 "psraw $4, %%mm1 \n\t"
316 "psubw %%mm7, %%mm0 \n\t" 316 "psubw %%mm7, %%mm0 \n\t"
317 "psubw %%mm7, %%mm1 \n\t" 317 "psubw %%mm7, %%mm1 \n\t"
318 "por %%mm7, %%mm0 \n\t" 318 "por %%mm7, %%mm0 \n\t"
319 "por %%mm7, %%mm1 \n\t" 319 "por %%mm7, %%mm1 \n\t"
320 "pxor %%mm2, %%mm0 \n\t" 320 "pxor %%mm2, %%mm0 \n\t"
321 "pxor %%mm3, %%mm1 \n\t" 321 "pxor %%mm3, %%mm1 \n\t"
322 "psubw %%mm2, %%mm0 \n\t" 322 "psubw %%mm2, %%mm0 \n\t"
323 "psubw %%mm3, %%mm1 \n\t" 323 "psubw %%mm3, %%mm1 \n\t"
324 "pandn %%mm0, %%mm4 \n\t" 324 "pandn %%mm0, %%mm4 \n\t"
325 "pandn %%mm1, %%mm5 \n\t" 325 "pandn %%mm1, %%mm5 \n\t"
326 "movq %%mm4, (%0, %%"REG_a") \n\t" 326 "movq %%mm4, (%0, %%"REG_a") \n\t"
327 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 327 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
328 328
329 "add $16, %%"REG_a" \n\t" 329 "add $16, %%"REG_a" \n\t"
330 "js 1b \n\t" 330 "js 1b \n\t"
331 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) 331 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
332 : "%"REG_a, "memory" 332 : "%"REG_a, "memory"
333 ); 333 );
334 } 334 }
335 335
336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
337 DCTELEM *block, int n, int qscale) 337 DCTELEM *block, int n, int qscale)
338 { 338 {
349 block0 = block[0] * s->y_dc_scale; 349 block0 = block[0] * s->y_dc_scale;
350 else 350 else
351 block0 = block[0] * s->c_dc_scale; 351 block0 = block[0] * s->c_dc_scale;
352 quant_matrix = s->intra_matrix; 352 quant_matrix = s->intra_matrix;
353 asm volatile( 353 asm volatile(
354 "pcmpeqw %%mm7, %%mm7 \n\t" 354 "pcmpeqw %%mm7, %%mm7 \n\t"
355 "psrlw $15, %%mm7 \n\t" 355 "psrlw $15, %%mm7 \n\t"
356 "movd %2, %%mm6 \n\t" 356 "movd %2, %%mm6 \n\t"
357 "packssdw %%mm6, %%mm6 \n\t" 357 "packssdw %%mm6, %%mm6 \n\t"
358 "packssdw %%mm6, %%mm6 \n\t" 358 "packssdw %%mm6, %%mm6 \n\t"
359 "mov %3, %%"REG_a" \n\t" 359 "mov %3, %%"REG_a" \n\t"
360 ".balign 16\n\t" 360 ".balign 16 \n\t"
361 "1: \n\t" 361 "1: \n\t"
362 "movq (%0, %%"REG_a"), %%mm0 \n\t" 362 "movq (%0, %%"REG_a"), %%mm0 \n\t"
363 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 363 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
364 "movq (%1, %%"REG_a"), %%mm4 \n\t" 364 "movq (%1, %%"REG_a"), %%mm4 \n\t"
365 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 365 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
366 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 366 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
367 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 367 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
368 "pxor %%mm2, %%mm2 \n\t" 368 "pxor %%mm2, %%mm2 \n\t"
369 "pxor %%mm3, %%mm3 \n\t" 369 "pxor %%mm3, %%mm3 \n\t"
370 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 370 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
371 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 371 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
372 "pxor %%mm2, %%mm0 \n\t" 372 "pxor %%mm2, %%mm0 \n\t"
373 "pxor %%mm3, %%mm1 \n\t" 373 "pxor %%mm3, %%mm1 \n\t"
374 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 374 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
375 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 375 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
376 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 376 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
377 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 377 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
378 "pxor %%mm4, %%mm4 \n\t" 378 "pxor %%mm4, %%mm4 \n\t"
379 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 379 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
380 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 380 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
381 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 381 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
382 "psraw $3, %%mm0 \n\t" 382 "psraw $3, %%mm0 \n\t"
383 "psraw $3, %%mm1 \n\t" 383 "psraw $3, %%mm1 \n\t"
384 "pxor %%mm2, %%mm0 \n\t" 384 "pxor %%mm2, %%mm0 \n\t"
385 "pxor %%mm3, %%mm1 \n\t" 385 "pxor %%mm3, %%mm1 \n\t"
386 "psubw %%mm2, %%mm0 \n\t" 386 "psubw %%mm2, %%mm0 \n\t"
387 "psubw %%mm3, %%mm1 \n\t" 387 "psubw %%mm3, %%mm1 \n\t"
388 "pandn %%mm0, %%mm4 \n\t" 388 "pandn %%mm0, %%mm4 \n\t"
389 "pandn %%mm1, %%mm5 \n\t" 389 "pandn %%mm1, %%mm5 \n\t"
390 "movq %%mm4, (%0, %%"REG_a") \n\t" 390 "movq %%mm4, (%0, %%"REG_a") \n\t"
391 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 391 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
392 392
393 "add $16, %%"REG_a" \n\t" 393 "add $16, %%"REG_a" \n\t"
394 "jng 1b \n\t" 394 "jng 1b \n\t"
395 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) 395 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
396 : "%"REG_a, "memory" 396 : "%"REG_a, "memory"
397 ); 397 );
398 block[0]= block0; 398 block[0]= block0;
399 //Note, we don't do mismatch control for intra as errors cannot accumulate 399 //Note, we don't do mismatch control for intra as errors cannot accumulate
400 } 400 }
401 401
402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
410 if(s->alternate_scan) nCoeffs= 63; //FIXME 410 if(s->alternate_scan) nCoeffs= 63; //FIXME
411 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 411 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
412 412
413 quant_matrix = s->inter_matrix; 413 quant_matrix = s->inter_matrix;
414 asm volatile( 414 asm volatile(
415 "pcmpeqw %%mm7, %%mm7 \n\t" 415 "pcmpeqw %%mm7, %%mm7 \n\t"
416 "psrlq $48, %%mm7 \n\t" 416 "psrlq $48, %%mm7 \n\t"
417 "movd %2, %%mm6 \n\t" 417 "movd %2, %%mm6 \n\t"
418 "packssdw %%mm6, %%mm6 \n\t" 418 "packssdw %%mm6, %%mm6 \n\t"
419 "packssdw %%mm6, %%mm6 \n\t" 419 "packssdw %%mm6, %%mm6 \n\t"
420 "mov %3, %%"REG_a" \n\t" 420 "mov %3, %%"REG_a" \n\t"
421 ".balign 16\n\t" 421 ".balign 16 \n\t"
422 "1: \n\t" 422 "1: \n\t"
423 "movq (%0, %%"REG_a"), %%mm0 \n\t" 423 "movq (%0, %%"REG_a"), %%mm0 \n\t"
424 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 424 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
425 "movq (%1, %%"REG_a"), %%mm4 \n\t" 425 "movq (%1, %%"REG_a"), %%mm4 \n\t"
426 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 426 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
427 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 427 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
428 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 428 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
429 "pxor %%mm2, %%mm2 \n\t" 429 "pxor %%mm2, %%mm2 \n\t"
430 "pxor %%mm3, %%mm3 \n\t" 430 "pxor %%mm3, %%mm3 \n\t"
431 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 431 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
432 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 432 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
433 "pxor %%mm2, %%mm0 \n\t" 433 "pxor %%mm2, %%mm0 \n\t"
434 "pxor %%mm3, %%mm1 \n\t" 434 "pxor %%mm3, %%mm1 \n\t"
435 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 435 "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
436 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 436 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
437 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 437 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
438 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 438 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
439 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q 439 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
440 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 440 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
441 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 441 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
442 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 442 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
443 "pxor %%mm4, %%mm4 \n\t" 443 "pxor %%mm4, %%mm4 \n\t"
444 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 444 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
445 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 445 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
446 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 446 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
447 "psrlw $4, %%mm0 \n\t" 447 "psrlw $4, %%mm0 \n\t"
448 "psrlw $4, %%mm1 \n\t" 448 "psrlw $4, %%mm1 \n\t"
449 "pxor %%mm2, %%mm0 \n\t" 449 "pxor %%mm2, %%mm0 \n\t"
450 "pxor %%mm3, %%mm1 \n\t" 450 "pxor %%mm3, %%mm1 \n\t"
451 "psubw %%mm2, %%mm0 \n\t" 451 "psubw %%mm2, %%mm0 \n\t"
452 "psubw %%mm3, %%mm1 \n\t" 452 "psubw %%mm3, %%mm1 \n\t"
453 "pandn %%mm0, %%mm4 \n\t" 453 "pandn %%mm0, %%mm4 \n\t"
454 "pandn %%mm1, %%mm5 \n\t" 454 "pandn %%mm1, %%mm5 \n\t"
455 "pxor %%mm4, %%mm7 \n\t" 455 "pxor %%mm4, %%mm7 \n\t"
456 "pxor %%mm5, %%mm7 \n\t" 456 "pxor %%mm5, %%mm7 \n\t"
457 "movq %%mm4, (%0, %%"REG_a") \n\t" 457 "movq %%mm4, (%0, %%"REG_a") \n\t"
458 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 458 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
459 459
460 "add $16, %%"REG_a" \n\t" 460 "add $16, %%"REG_a" \n\t"
461 "jng 1b \n\t" 461 "jng 1b \n\t"
462 "movd 124(%0, %3), %%mm0 \n\t" 462 "movd 124(%0, %3), %%mm0 \n\t"
463 "movq %%mm7, %%mm6 \n\t" 463 "movq %%mm7, %%mm6 \n\t"
464 "psrlq $32, %%mm7 \n\t" 464 "psrlq $32, %%mm7 \n\t"
465 "pxor %%mm6, %%mm7 \n\t" 465 "pxor %%mm6, %%mm7 \n\t"
466 "movq %%mm7, %%mm6 \n\t" 466 "movq %%mm7, %%mm6 \n\t"
467 "psrlq $16, %%mm7 \n\t" 467 "psrlq $16, %%mm7 \n\t"
468 "pxor %%mm6, %%mm7 \n\t" 468 "pxor %%mm6, %%mm7 \n\t"
469 "pslld $31, %%mm7 \n\t" 469 "pslld $31, %%mm7 \n\t"
470 "psrlq $15, %%mm7 \n\t" 470 "psrlq $15, %%mm7 \n\t"
471 "pxor %%mm7, %%mm0 \n\t" 471 "pxor %%mm7, %%mm0 \n\t"
472 "movd %%mm0, 124(%0, %3) \n\t" 472 "movd %%mm0, 124(%0, %3) \n\t"
473 473
474 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) 474 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
475 : "%"REG_a, "memory" 475 : "%"REG_a, "memory"
476 ); 476 );
477 } 477 }
478 478
479 /* draw the edges of width 'w' of an image of size width, height 479 /* draw the edges of width 'w' of an image of size width, height
480 this mmx version can only handle w==8 || w==16 */ 480 this mmx version can only handle w==8 || w==16 */
481 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) 481 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
486 last_line = buf + (height - 1) * wrap; 486 last_line = buf + (height - 1) * wrap;
487 /* left and right */ 487 /* left and right */
488 ptr = buf; 488 ptr = buf;
489 if(w==8) 489 if(w==8)
490 { 490 {
491 asm volatile( 491 asm volatile(
492 "1: \n\t" 492 "1: \n\t"
493 "movd (%0), %%mm0 \n\t" 493 "movd (%0), %%mm0 \n\t"
494 "punpcklbw %%mm0, %%mm0 \n\t" 494 "punpcklbw %%mm0, %%mm0 \n\t"
495 "punpcklwd %%mm0, %%mm0 \n\t" 495 "punpcklwd %%mm0, %%mm0 \n\t"
496 "punpckldq %%mm0, %%mm0 \n\t" 496 "punpckldq %%mm0, %%mm0 \n\t"
497 "movq %%mm0, -8(%0) \n\t" 497 "movq %%mm0, -8(%0) \n\t"
498 "movq -8(%0, %2), %%mm1 \n\t" 498 "movq -8(%0, %2), %%mm1 \n\t"
499 "punpckhbw %%mm1, %%mm1 \n\t" 499 "punpckhbw %%mm1, %%mm1 \n\t"
500 "punpckhwd %%mm1, %%mm1 \n\t" 500 "punpckhwd %%mm1, %%mm1 \n\t"
501 "punpckhdq %%mm1, %%mm1 \n\t" 501 "punpckhdq %%mm1, %%mm1 \n\t"
502 "movq %%mm1, (%0, %2) \n\t" 502 "movq %%mm1, (%0, %2) \n\t"
503 "add %1, %0 \n\t" 503 "add %1, %0 \n\t"
504 "cmp %3, %0 \n\t" 504 "cmp %3, %0 \n\t"
505 " jb 1b \n\t" 505 " jb 1b \n\t"
506 : "+r" (ptr) 506 : "+r" (ptr)
507 : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) 507 : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
508 ); 508 );
509 } 509 }
510 else 510 else
511 { 511 {
512 asm volatile( 512 asm volatile(
513 "1: \n\t" 513 "1: \n\t"
514 "movd (%0), %%mm0 \n\t" 514 "movd (%0), %%mm0 \n\t"
515 "punpcklbw %%mm0, %%mm0 \n\t" 515 "punpcklbw %%mm0, %%mm0 \n\t"
516 "punpcklwd %%mm0, %%mm0 \n\t" 516 "punpcklwd %%mm0, %%mm0 \n\t"
517 "punpckldq %%mm0, %%mm0 \n\t" 517 "punpckldq %%mm0, %%mm0 \n\t"
518 "movq %%mm0, -8(%0) \n\t" 518 "movq %%mm0, -8(%0) \n\t"
519 "movq %%mm0, -16(%0) \n\t" 519 "movq %%mm0, -16(%0) \n\t"
520 "movq -8(%0, %2), %%mm1 \n\t" 520 "movq -8(%0, %2), %%mm1 \n\t"
521 "punpckhbw %%mm1, %%mm1 \n\t" 521 "punpckhbw %%mm1, %%mm1 \n\t"
522 "punpckhwd %%mm1, %%mm1 \n\t" 522 "punpckhwd %%mm1, %%mm1 \n\t"
523 "punpckhdq %%mm1, %%mm1 \n\t" 523 "punpckhdq %%mm1, %%mm1 \n\t"
524 "movq %%mm1, (%0, %2) \n\t" 524 "movq %%mm1, (%0, %2) \n\t"
525 "movq %%mm1, 8(%0, %2) \n\t" 525 "movq %%mm1, 8(%0, %2) \n\t"
526 "add %1, %0 \n\t" 526 "add %1, %0 \n\t"
527 "cmp %3, %0 \n\t" 527 "cmp %3, %0 \n\t"
528 " jb 1b \n\t" 528 " jb 1b \n\t"
529 : "+r" (ptr) 529 : "+r" (ptr)
530 : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) 530 : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
531 ); 531 );
532 } 532 }
533 533
534 for(i=0;i<w;i+=4) { 534 for(i=0;i<w;i+=4) {
535 /* top and bottom (and hopefully also the corners) */ 535 /* top and bottom (and hopefully also the corners) */
536 ptr= buf - (i + 1) * wrap - w; 536 ptr= buf - (i + 1) * wrap - w;
537 asm volatile( 537 asm volatile(
538 "1: \n\t" 538 "1: \n\t"
539 "movq (%1, %0), %%mm0 \n\t" 539 "movq (%1, %0), %%mm0 \n\t"
540 "movq %%mm0, (%0) \n\t" 540 "movq %%mm0, (%0) \n\t"
541 "movq %%mm0, (%0, %2) \n\t" 541 "movq %%mm0, (%0, %2) \n\t"
542 "movq %%mm0, (%0, %2, 2) \n\t" 542 "movq %%mm0, (%0, %2, 2) \n\t"
543 "movq %%mm0, (%0, %3) \n\t" 543 "movq %%mm0, (%0, %3) \n\t"
544 "add $8, %0 \n\t" 544 "add $8, %0 \n\t"
545 "cmp %4, %0 \n\t" 545 "cmp %4, %0 \n\t"
546 " jb 1b \n\t" 546 " jb 1b \n\t"
547 : "+r" (ptr) 547 : "+r" (ptr)
548 : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w) 548 : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
549 ); 549 );
550 ptr= last_line + (i + 1) * wrap - w; 550 ptr= last_line + (i + 1) * wrap - w;
551 asm volatile( 551 asm volatile(
552 "1: \n\t" 552 "1: \n\t"
553 "movq (%1, %0), %%mm0 \n\t" 553 "movq (%1, %0), %%mm0 \n\t"
554 "movq %%mm0, (%0) \n\t" 554 "movq %%mm0, (%0) \n\t"
555 "movq %%mm0, (%0, %2) \n\t" 555 "movq %%mm0, (%0, %2) \n\t"
556 "movq %%mm0, (%0, %2, 2) \n\t" 556 "movq %%mm0, (%0, %2, 2) \n\t"
557 "movq %%mm0, (%0, %3) \n\t" 557 "movq %%mm0, (%0, %3) \n\t"
558 "add $8, %0 \n\t" 558 "add $8, %0 \n\t"
559 "cmp %4, %0 \n\t" 559 "cmp %4, %0 \n\t"
560 " jb 1b \n\t" 560 " jb 1b \n\t"
561 : "+r" (ptr) 561 : "+r" (ptr)
562 : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w) 562 : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
563 ); 563 );
564 } 564 }
565 } 565 }
566 566
567 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ 567 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
568 const int intra= s->mb_intra; 568 const int intra= s->mb_intra;
570 uint16_t *offset= s->dct_offset[intra]; 570 uint16_t *offset= s->dct_offset[intra];
571 571
572 s->dct_count[intra]++; 572 s->dct_count[intra]++;
573 573
574 asm volatile( 574 asm volatile(
575 "pxor %%mm7, %%mm7 \n\t" 575 "pxor %%mm7, %%mm7 \n\t"
576 "1: \n\t" 576 "1: \n\t"
577 "pxor %%mm0, %%mm0 \n\t" 577 "pxor %%mm0, %%mm0 \n\t"
578 "pxor %%mm1, %%mm1 \n\t" 578 "pxor %%mm1, %%mm1 \n\t"
579 "movq (%0), %%mm2 \n\t" 579 "movq (%0), %%mm2 \n\t"
580 "movq 8(%0), %%mm3 \n\t" 580 "movq 8(%0), %%mm3 \n\t"
581 "pcmpgtw %%mm2, %%mm0 \n\t" 581 "pcmpgtw %%mm2, %%mm0 \n\t"
582 "pcmpgtw %%mm3, %%mm1 \n\t" 582 "pcmpgtw %%mm3, %%mm1 \n\t"
583 "pxor %%mm0, %%mm2 \n\t" 583 "pxor %%mm0, %%mm2 \n\t"
584 "pxor %%mm1, %%mm3 \n\t" 584 "pxor %%mm1, %%mm3 \n\t"
585 "psubw %%mm0, %%mm2 \n\t" 585 "psubw %%mm0, %%mm2 \n\t"
586 "psubw %%mm1, %%mm3 \n\t" 586 "psubw %%mm1, %%mm3 \n\t"
587 "movq %%mm2, %%mm4 \n\t" 587 "movq %%mm2, %%mm4 \n\t"
588 "movq %%mm3, %%mm5 \n\t" 588 "movq %%mm3, %%mm5 \n\t"
589 "psubusw (%2), %%mm2 \n\t" 589 "psubusw (%2), %%mm2 \n\t"
590 "psubusw 8(%2), %%mm3 \n\t" 590 "psubusw 8(%2), %%mm3 \n\t"
591 "pxor %%mm0, %%mm2 \n\t" 591 "pxor %%mm0, %%mm2 \n\t"
592 "pxor %%mm1, %%mm3 \n\t" 592 "pxor %%mm1, %%mm3 \n\t"
593 "psubw %%mm0, %%mm2 \n\t" 593 "psubw %%mm0, %%mm2 \n\t"
594 "psubw %%mm1, %%mm3 \n\t" 594 "psubw %%mm1, %%mm3 \n\t"
595 "movq %%mm2, (%0) \n\t" 595 "movq %%mm2, (%0) \n\t"
596 "movq %%mm3, 8(%0) \n\t" 596 "movq %%mm3, 8(%0) \n\t"
597 "movq %%mm4, %%mm2 \n\t" 597 "movq %%mm4, %%mm2 \n\t"
598 "movq %%mm5, %%mm3 \n\t" 598 "movq %%mm5, %%mm3 \n\t"
599 "punpcklwd %%mm7, %%mm4 \n\t" 599 "punpcklwd %%mm7, %%mm4 \n\t"
600 "punpckhwd %%mm7, %%mm2 \n\t" 600 "punpckhwd %%mm7, %%mm2 \n\t"
601 "punpcklwd %%mm7, %%mm5 \n\t" 601 "punpcklwd %%mm7, %%mm5 \n\t"
602 "punpckhwd %%mm7, %%mm3 \n\t" 602 "punpckhwd %%mm7, %%mm3 \n\t"
603 "paddd (%1), %%mm4 \n\t" 603 "paddd (%1), %%mm4 \n\t"
604 "paddd 8(%1), %%mm2 \n\t" 604 "paddd 8(%1), %%mm2 \n\t"
605 "paddd 16(%1), %%mm5 \n\t" 605 "paddd 16(%1), %%mm5 \n\t"
606 "paddd 24(%1), %%mm3 \n\t" 606 "paddd 24(%1), %%mm3 \n\t"
607 "movq %%mm4, (%1) \n\t" 607 "movq %%mm4, (%1) \n\t"
608 "movq %%mm2, 8(%1) \n\t" 608 "movq %%mm2, 8(%1) \n\t"
609 "movq %%mm5, 16(%1) \n\t" 609 "movq %%mm5, 16(%1) \n\t"
610 "movq %%mm3, 24(%1) \n\t" 610 "movq %%mm3, 24(%1) \n\t"
611 "add $16, %0 \n\t" 611 "add $16, %0 \n\t"
612 "add $32, %1 \n\t" 612 "add $32, %1 \n\t"
613 "add $16, %2 \n\t" 613 "add $16, %2 \n\t"
614 "cmp %3, %0 \n\t" 614 "cmp %3, %0 \n\t"
615 " jb 1b \n\t" 615 " jb 1b \n\t"
616 : "+r" (block), "+r" (sum), "+r" (offset) 616 : "+r" (block), "+r" (sum), "+r" (offset)
617 : "r"(block+64) 617 : "r"(block+64)
618 ); 618 );
619 } 619 }
620 620
624 uint16_t *offset= s->dct_offset[intra]; 624 uint16_t *offset= s->dct_offset[intra];
625 625
626 s->dct_count[intra]++; 626 s->dct_count[intra]++;
627 627
628 asm volatile( 628 asm volatile(
629 "pxor %%xmm7, %%xmm7 \n\t" 629 "pxor %%xmm7, %%xmm7 \n\t"
630 "1: \n\t" 630 "1: \n\t"
631 "pxor %%xmm0, %%xmm0 \n\t" 631 "pxor %%xmm0, %%xmm0 \n\t"
632 "pxor %%xmm1, %%xmm1 \n\t" 632 "pxor %%xmm1, %%xmm1 \n\t"
633 "movdqa (%0), %%xmm2 \n\t" 633 "movdqa (%0), %%xmm2 \n\t"
634 "movdqa 16(%0), %%xmm3 \n\t" 634 "movdqa 16(%0), %%xmm3 \n\t"
635 "pcmpgtw %%xmm2, %%xmm0 \n\t" 635 "pcmpgtw %%xmm2, %%xmm0 \n\t"
636 "pcmpgtw %%xmm3, %%xmm1 \n\t" 636 "pcmpgtw %%xmm3, %%xmm1 \n\t"
637 "pxor %%xmm0, %%xmm2 \n\t" 637 "pxor %%xmm0, %%xmm2 \n\t"
638 "pxor %%xmm1, %%xmm3 \n\t" 638 "pxor %%xmm1, %%xmm3 \n\t"
639 "psubw %%xmm0, %%xmm2 \n\t" 639 "psubw %%xmm0, %%xmm2 \n\t"
640 "psubw %%xmm1, %%xmm3 \n\t" 640 "psubw %%xmm1, %%xmm3 \n\t"
641 "movdqa %%xmm2, %%xmm4 \n\t" 641 "movdqa %%xmm2, %%xmm4 \n\t"
642 "movdqa %%xmm3, %%xmm5 \n\t" 642 "movdqa %%xmm3, %%xmm5 \n\t"
643 "psubusw (%2), %%xmm2 \n\t" 643 "psubusw (%2), %%xmm2 \n\t"
644 "psubusw 16(%2), %%xmm3 \n\t" 644 "psubusw 16(%2), %%xmm3 \n\t"
645 "pxor %%xmm0, %%xmm2 \n\t" 645 "pxor %%xmm0, %%xmm2 \n\t"
646 "pxor %%xmm1, %%xmm3 \n\t" 646 "pxor %%xmm1, %%xmm3 \n\t"
647 "psubw %%xmm0, %%xmm2 \n\t" 647 "psubw %%xmm0, %%xmm2 \n\t"
648 "psubw %%xmm1, %%xmm3 \n\t" 648 "psubw %%xmm1, %%xmm3 \n\t"
649 "movdqa %%xmm2, (%0) \n\t" 649 "movdqa %%xmm2, (%0) \n\t"
650 "movdqa %%xmm3, 16(%0) \n\t" 650 "movdqa %%xmm3, 16(%0) \n\t"
651 "movdqa %%xmm4, %%xmm6 \n\t" 651 "movdqa %%xmm4, %%xmm6 \n\t"
652 "movdqa %%xmm5, %%xmm0 \n\t" 652 "movdqa %%xmm5, %%xmm0 \n\t"
653 "punpcklwd %%xmm7, %%xmm4 \n\t" 653 "punpcklwd %%xmm7, %%xmm4 \n\t"
654 "punpckhwd %%xmm7, %%xmm6 \n\t" 654 "punpckhwd %%xmm7, %%xmm6 \n\t"
655 "punpcklwd %%xmm7, %%xmm5 \n\t" 655 "punpcklwd %%xmm7, %%xmm5 \n\t"
656 "punpckhwd %%xmm7, %%xmm0 \n\t" 656 "punpckhwd %%xmm7, %%xmm0 \n\t"
657 "paddd (%1), %%xmm4 \n\t" 657 "paddd (%1), %%xmm4 \n\t"
658 "paddd 16(%1), %%xmm6 \n\t" 658 "paddd 16(%1), %%xmm6 \n\t"
659 "paddd 32(%1), %%xmm5 \n\t" 659 "paddd 32(%1), %%xmm5 \n\t"
660 "paddd 48(%1), %%xmm0 \n\t" 660 "paddd 48(%1), %%xmm0 \n\t"
661 "movdqa %%xmm4, (%1) \n\t" 661 "movdqa %%xmm4, (%1) \n\t"
662 "movdqa %%xmm6, 16(%1) \n\t" 662 "movdqa %%xmm6, 16(%1) \n\t"
663 "movdqa %%xmm5, 32(%1) \n\t" 663 "movdqa %%xmm5, 32(%1) \n\t"
664 "movdqa %%xmm0, 48(%1) \n\t" 664 "movdqa %%xmm0, 48(%1) \n\t"
665 "add $32, %0 \n\t" 665 "add $32, %0 \n\t"
666 "add $64, %1 \n\t" 666 "add $64, %1 \n\t"
667 "add $32, %2 \n\t" 667 "add $32, %2 \n\t"
668 "cmp %3, %0 \n\t" 668 "cmp %3, %0 \n\t"
669 " jb 1b \n\t" 669 " jb 1b \n\t"
670 : "+r" (block), "+r" (sum), "+r" (offset) 670 : "+r" (block), "+r" (sum), "+r" (offset)
671 : "r"(block+64) 671 : "r"(block+64)
672 ); 672 );
673 } 673 }
674 674
703 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; 703 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
704 704
705 draw_edges = draw_edges_mmx; 705 draw_edges = draw_edges_mmx;
706 706
707 if (mm_flags & MM_SSE2) { 707 if (mm_flags & MM_SSE2) {
708 s->denoise_dct= denoise_dct_sse2; 708 s->denoise_dct= denoise_dct_sse2;
709 } else { 709 } else {
710 s->denoise_dct= denoise_dct_mmx; 710 s->denoise_dct= denoise_dct_mmx;
711 } 711 }
712 712
713 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ 713 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
714 if(mm_flags & MM_SSE2){ 714 if(mm_flags & MM_SSE2){
715 s->dct_quantize= dct_quantize_SSE2; 715 s->dct_quantize= dct_quantize_SSE2;
716 } else if(mm_flags & MM_MMXEXT){ 716 } else if(mm_flags & MM_MMXEXT){