comparison i386/mpegvideo_mmx.c @ 2293:15cfba1b97b5 libavcodec

Adapt the existing MMX/MMX2/SSE/3DNow! optimizations so they work on x86_64. Patch by Aurelien Jacobs <aurel at gnuage dot org>.
author michael
date Mon, 11 Oct 2004 02:19:29 +0000
parents f65d87bfdd5a
children ef2149182f1c
comparison
equal deleted inserted replaced
2292:8556f080fcc2 2293:15cfba1b97b5
21 */ 21 */
22 22
23 #include "../dsputil.h" 23 #include "../dsputil.h"
24 #include "../mpegvideo.h" 24 #include "../mpegvideo.h"
25 #include "../avcodec.h" 25 #include "../avcodec.h"
26 #include "mmx.h"
26 27
27 extern uint8_t zigzag_direct_noperm[64]; 28 extern uint8_t zigzag_direct_noperm[64];
28 extern uint16_t inv_zigzag_direct16[64]; 29 extern uint16_t inv_zigzag_direct16[64];
29 30
30 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; 31 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
32 33
33 34
34 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, 35 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
35 DCTELEM *block, int n, int qscale) 36 DCTELEM *block, int n, int qscale)
36 { 37 {
37 int level, qmul, qadd, nCoeffs; 38 long level, qmul, qadd, nCoeffs;
38 39
39 qmul = qscale << 1; 40 qmul = qscale << 1;
40 41
41 assert(s->block_last_index[n]>=0 || s->h263_aic); 42 assert(s->block_last_index[n]>=0 || s->h263_aic);
42 43
95 "pandn %%mm3, %%mm1 \n\t" 96 "pandn %%mm3, %%mm1 \n\t"
96 97
97 "movq %%mm0, (%0, %3) \n\t" 98 "movq %%mm0, (%0, %3) \n\t"
98 "movq %%mm1, 8(%0, %3) \n\t" 99 "movq %%mm1, 8(%0, %3) \n\t"
99 100
100 "addl $16, %3 \n\t" 101 "add $16, %3 \n\t"
101 "jng 1b \n\t" 102 "jng 1b \n\t"
102 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) 103 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
103 : "memory" 104 : "memory"
104 ); 105 );
105 block[0]= level; 106 block[0]= level;
107 108
108 109
109 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
110 DCTELEM *block, int n, int qscale) 111 DCTELEM *block, int n, int qscale)
111 { 112 {
112 int qmul, qadd, nCoeffs; 113 long qmul, qadd, nCoeffs;
113 114
114 qmul = qscale << 1; 115 qmul = qscale << 1;
115 qadd = (qscale - 1) | 1; 116 qadd = (qscale - 1) | 1;
116 117
117 assert(s->block_last_index[n]>=0 || s->h263_aic); 118 assert(s->block_last_index[n]>=0 || s->h263_aic);
158 "pandn %%mm3, %%mm1 \n\t" 159 "pandn %%mm3, %%mm1 \n\t"
159 160
160 "movq %%mm0, (%0, %3) \n\t" 161 "movq %%mm0, (%0, %3) \n\t"
161 "movq %%mm1, 8(%0, %3) \n\t" 162 "movq %%mm1, 8(%0, %3) \n\t"
162 163
163 "addl $16, %3 \n\t" 164 "add $16, %3 \n\t"
164 "jng 1b \n\t" 165 "jng 1b \n\t"
165 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) 166 ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
166 : "memory" 167 : "memory"
167 ); 168 );
168 } 169 }
198 high3 += tlow1 199 high3 += tlow1
199 */ 200 */
200 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, 201 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
201 DCTELEM *block, int n, int qscale) 202 DCTELEM *block, int n, int qscale)
202 { 203 {
203 int nCoeffs; 204 long nCoeffs;
204 const uint16_t *quant_matrix; 205 const uint16_t *quant_matrix;
205 int block0; 206 int block0;
206 207
207 assert(s->block_last_index[n]>=0); 208 assert(s->block_last_index[n]>=0);
208 209
218 "pcmpeqw %%mm7, %%mm7 \n\t" 219 "pcmpeqw %%mm7, %%mm7 \n\t"
219 "psrlw $15, %%mm7 \n\t" 220 "psrlw $15, %%mm7 \n\t"
220 "movd %2, %%mm6 \n\t" 221 "movd %2, %%mm6 \n\t"
221 "packssdw %%mm6, %%mm6 \n\t" 222 "packssdw %%mm6, %%mm6 \n\t"
222 "packssdw %%mm6, %%mm6 \n\t" 223 "packssdw %%mm6, %%mm6 \n\t"
223 "movl %3, %%eax \n\t" 224 "mov %3, %%"REG_a" \n\t"
224 ".balign 16\n\t" 225 ".balign 16\n\t"
225 "1: \n\t" 226 "1: \n\t"
226 "movq (%0, %%eax), %%mm0 \n\t" 227 "movq (%0, %%"REG_a"), %%mm0 \n\t"
227 "movq 8(%0, %%eax), %%mm1 \n\t" 228 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
228 "movq (%1, %%eax), %%mm4 \n\t" 229 "movq (%1, %%"REG_a"), %%mm4 \n\t"
229 "movq 8(%1, %%eax), %%mm5 \n\t" 230 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
230 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 231 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
231 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 232 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
232 "pxor %%mm2, %%mm2 \n\t" 233 "pxor %%mm2, %%mm2 \n\t"
233 "pxor %%mm3, %%mm3 \n\t" 234 "pxor %%mm3, %%mm3 \n\t"
234 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 235 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
239 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 240 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
240 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 241 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
241 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 242 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
242 "pxor %%mm4, %%mm4 \n\t" 243 "pxor %%mm4, %%mm4 \n\t"
243 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 244 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
244 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 245 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
245 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 246 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
246 "psraw $3, %%mm0 \n\t" 247 "psraw $3, %%mm0 \n\t"
247 "psraw $3, %%mm1 \n\t" 248 "psraw $3, %%mm1 \n\t"
248 "psubw %%mm7, %%mm0 \n\t" 249 "psubw %%mm7, %%mm0 \n\t"
249 "psubw %%mm7, %%mm1 \n\t" 250 "psubw %%mm7, %%mm1 \n\t"
250 "por %%mm7, %%mm0 \n\t" 251 "por %%mm7, %%mm0 \n\t"
253 "pxor %%mm3, %%mm1 \n\t" 254 "pxor %%mm3, %%mm1 \n\t"
254 "psubw %%mm2, %%mm0 \n\t" 255 "psubw %%mm2, %%mm0 \n\t"
255 "psubw %%mm3, %%mm1 \n\t" 256 "psubw %%mm3, %%mm1 \n\t"
256 "pandn %%mm0, %%mm4 \n\t" 257 "pandn %%mm0, %%mm4 \n\t"
257 "pandn %%mm1, %%mm5 \n\t" 258 "pandn %%mm1, %%mm5 \n\t"
258 "movq %%mm4, (%0, %%eax) \n\t" 259 "movq %%mm4, (%0, %%"REG_a") \n\t"
259 "movq %%mm5, 8(%0, %%eax) \n\t" 260 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
260 261
261 "addl $16, %%eax \n\t" 262 "add $16, %%"REG_a" \n\t"
262 "js 1b \n\t" 263 "js 1b \n\t"
263 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) 264 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
264 : "%eax", "memory" 265 : "%"REG_a, "memory"
265 ); 266 );
266 block[0]= block0; 267 block[0]= block0;
267 } 268 }
268 269
269 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
270 DCTELEM *block, int n, int qscale) 271 DCTELEM *block, int n, int qscale)
271 { 272 {
272 int nCoeffs; 273 long nCoeffs;
273 const uint16_t *quant_matrix; 274 const uint16_t *quant_matrix;
274 275
275 assert(s->block_last_index[n]>=0); 276 assert(s->block_last_index[n]>=0);
276 277
277 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 278 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
281 "pcmpeqw %%mm7, %%mm7 \n\t" 282 "pcmpeqw %%mm7, %%mm7 \n\t"
282 "psrlw $15, %%mm7 \n\t" 283 "psrlw $15, %%mm7 \n\t"
283 "movd %2, %%mm6 \n\t" 284 "movd %2, %%mm6 \n\t"
284 "packssdw %%mm6, %%mm6 \n\t" 285 "packssdw %%mm6, %%mm6 \n\t"
285 "packssdw %%mm6, %%mm6 \n\t" 286 "packssdw %%mm6, %%mm6 \n\t"
286 "movl %3, %%eax \n\t" 287 "mov %3, %%"REG_a" \n\t"
287 ".balign 16\n\t" 288 ".balign 16\n\t"
288 "1: \n\t" 289 "1: \n\t"
289 "movq (%0, %%eax), %%mm0 \n\t" 290 "movq (%0, %%"REG_a"), %%mm0 \n\t"
290 "movq 8(%0, %%eax), %%mm1 \n\t" 291 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
291 "movq (%1, %%eax), %%mm4 \n\t" 292 "movq (%1, %%"REG_a"), %%mm4 \n\t"
292 "movq 8(%1, %%eax), %%mm5 \n\t" 293 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
293 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 294 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
294 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 295 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
295 "pxor %%mm2, %%mm2 \n\t" 296 "pxor %%mm2, %%mm2 \n\t"
296 "pxor %%mm3, %%mm3 \n\t" 297 "pxor %%mm3, %%mm3 \n\t"
297 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 298 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
306 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 307 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
307 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 308 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
308 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 309 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
309 "pxor %%mm4, %%mm4 \n\t" 310 "pxor %%mm4, %%mm4 \n\t"
310 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 311 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
311 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 312 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
312 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 313 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
313 "psraw $4, %%mm0 \n\t" 314 "psraw $4, %%mm0 \n\t"
314 "psraw $4, %%mm1 \n\t" 315 "psraw $4, %%mm1 \n\t"
315 "psubw %%mm7, %%mm0 \n\t" 316 "psubw %%mm7, %%mm0 \n\t"
316 "psubw %%mm7, %%mm1 \n\t" 317 "psubw %%mm7, %%mm1 \n\t"
317 "por %%mm7, %%mm0 \n\t" 318 "por %%mm7, %%mm0 \n\t"
320 "pxor %%mm3, %%mm1 \n\t" 321 "pxor %%mm3, %%mm1 \n\t"
321 "psubw %%mm2, %%mm0 \n\t" 322 "psubw %%mm2, %%mm0 \n\t"
322 "psubw %%mm3, %%mm1 \n\t" 323 "psubw %%mm3, %%mm1 \n\t"
323 "pandn %%mm0, %%mm4 \n\t" 324 "pandn %%mm0, %%mm4 \n\t"
324 "pandn %%mm1, %%mm5 \n\t" 325 "pandn %%mm1, %%mm5 \n\t"
325 "movq %%mm4, (%0, %%eax) \n\t" 326 "movq %%mm4, (%0, %%"REG_a") \n\t"
326 "movq %%mm5, 8(%0, %%eax) \n\t" 327 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
327 328
328 "addl $16, %%eax \n\t" 329 "add $16, %%"REG_a" \n\t"
329 "js 1b \n\t" 330 "js 1b \n\t"
330 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) 331 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
331 : "%eax", "memory" 332 : "%"REG_a, "memory"
332 ); 333 );
333 } 334 }
334 335
335 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
336 DCTELEM *block, int n, int qscale) 337 DCTELEM *block, int n, int qscale)
337 { 338 {
338 int nCoeffs; 339 long nCoeffs;
339 const uint16_t *quant_matrix; 340 const uint16_t *quant_matrix;
340 int block0; 341 int block0;
341 342
342 assert(s->block_last_index[n]>=0); 343 assert(s->block_last_index[n]>=0);
343 344
353 "pcmpeqw %%mm7, %%mm7 \n\t" 354 "pcmpeqw %%mm7, %%mm7 \n\t"
354 "psrlw $15, %%mm7 \n\t" 355 "psrlw $15, %%mm7 \n\t"
355 "movd %2, %%mm6 \n\t" 356 "movd %2, %%mm6 \n\t"
356 "packssdw %%mm6, %%mm6 \n\t" 357 "packssdw %%mm6, %%mm6 \n\t"
357 "packssdw %%mm6, %%mm6 \n\t" 358 "packssdw %%mm6, %%mm6 \n\t"
358 "movl %3, %%eax \n\t" 359 "mov %3, %%"REG_a" \n\t"
359 ".balign 16\n\t" 360 ".balign 16\n\t"
360 "1: \n\t" 361 "1: \n\t"
361 "movq (%0, %%eax), %%mm0 \n\t" 362 "movq (%0, %%"REG_a"), %%mm0 \n\t"
362 "movq 8(%0, %%eax), %%mm1 \n\t" 363 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
363 "movq (%1, %%eax), %%mm4 \n\t" 364 "movq (%1, %%"REG_a"), %%mm4 \n\t"
364 "movq 8(%1, %%eax), %%mm5 \n\t" 365 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
365 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 366 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
366 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 367 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
367 "pxor %%mm2, %%mm2 \n\t" 368 "pxor %%mm2, %%mm2 \n\t"
368 "pxor %%mm3, %%mm3 \n\t" 369 "pxor %%mm3, %%mm3 \n\t"
369 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 370 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
374 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 375 "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
375 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 376 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
376 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 377 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
377 "pxor %%mm4, %%mm4 \n\t" 378 "pxor %%mm4, %%mm4 \n\t"
378 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 379 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
379 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 380 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
380 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 381 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
381 "psraw $3, %%mm0 \n\t" 382 "psraw $3, %%mm0 \n\t"
382 "psraw $3, %%mm1 \n\t" 383 "psraw $3, %%mm1 \n\t"
383 "pxor %%mm2, %%mm0 \n\t" 384 "pxor %%mm2, %%mm0 \n\t"
384 "pxor %%mm3, %%mm1 \n\t" 385 "pxor %%mm3, %%mm1 \n\t"
385 "psubw %%mm2, %%mm0 \n\t" 386 "psubw %%mm2, %%mm0 \n\t"
386 "psubw %%mm3, %%mm1 \n\t" 387 "psubw %%mm3, %%mm1 \n\t"
387 "pandn %%mm0, %%mm4 \n\t" 388 "pandn %%mm0, %%mm4 \n\t"
388 "pandn %%mm1, %%mm5 \n\t" 389 "pandn %%mm1, %%mm5 \n\t"
389 "movq %%mm4, (%0, %%eax) \n\t" 390 "movq %%mm4, (%0, %%"REG_a") \n\t"
390 "movq %%mm5, 8(%0, %%eax) \n\t" 391 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
391 392
392 "addl $16, %%eax \n\t" 393 "add $16, %%"REG_a" \n\t"
393 "jng 1b \n\t" 394 "jng 1b \n\t"
394 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) 395 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
395 : "%eax", "memory" 396 : "%"REG_a, "memory"
396 ); 397 );
397 block[0]= block0; 398 block[0]= block0;
398 //Note, we dont do mismatch control for intra as errors cannot accumulate 399 //Note, we dont do mismatch control for intra as errors cannot accumulate
399 } 400 }
400 401
401 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
402 DCTELEM *block, int n, int qscale) 403 DCTELEM *block, int n, int qscale)
403 { 404 {
404 int nCoeffs; 405 long nCoeffs;
405 const uint16_t *quant_matrix; 406 const uint16_t *quant_matrix;
406 407
407 assert(s->block_last_index[n]>=0); 408 assert(s->block_last_index[n]>=0);
408 409
409 if(s->alternate_scan) nCoeffs= 63; //FIXME 410 if(s->alternate_scan) nCoeffs= 63; //FIXME
414 "pcmpeqw %%mm7, %%mm7 \n\t" 415 "pcmpeqw %%mm7, %%mm7 \n\t"
415 "psrlq $48, %%mm7 \n\t" 416 "psrlq $48, %%mm7 \n\t"
416 "movd %2, %%mm6 \n\t" 417 "movd %2, %%mm6 \n\t"
417 "packssdw %%mm6, %%mm6 \n\t" 418 "packssdw %%mm6, %%mm6 \n\t"
418 "packssdw %%mm6, %%mm6 \n\t" 419 "packssdw %%mm6, %%mm6 \n\t"
419 "movl %3, %%eax \n\t" 420 "mov %3, %%"REG_a" \n\t"
420 ".balign 16\n\t" 421 ".balign 16\n\t"
421 "1: \n\t" 422 "1: \n\t"
422 "movq (%0, %%eax), %%mm0 \n\t" 423 "movq (%0, %%"REG_a"), %%mm0 \n\t"
423 "movq 8(%0, %%eax), %%mm1 \n\t" 424 "movq 8(%0, %%"REG_a"), %%mm1 \n\t"
424 "movq (%1, %%eax), %%mm4 \n\t" 425 "movq (%1, %%"REG_a"), %%mm4 \n\t"
425 "movq 8(%1, %%eax), %%mm5 \n\t" 426 "movq 8(%1, %%"REG_a"), %%mm5 \n\t"
426 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 427 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
427 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 428 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
428 "pxor %%mm2, %%mm2 \n\t" 429 "pxor %%mm2, %%mm2 \n\t"
429 "pxor %%mm3, %%mm3 \n\t" 430 "pxor %%mm3, %%mm3 \n\t"
430 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 431 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
439 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 440 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
440 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 441 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
441 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 442 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
442 "pxor %%mm4, %%mm4 \n\t" 443 "pxor %%mm4, %%mm4 \n\t"
443 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 444 "pxor %%mm5, %%mm5 \n\t" // FIXME slow
444 "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 445 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
445 "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 446 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
446 "psrlw $4, %%mm0 \n\t" 447 "psrlw $4, %%mm0 \n\t"
447 "psrlw $4, %%mm1 \n\t" 448 "psrlw $4, %%mm1 \n\t"
448 "pxor %%mm2, %%mm0 \n\t" 449 "pxor %%mm2, %%mm0 \n\t"
449 "pxor %%mm3, %%mm1 \n\t" 450 "pxor %%mm3, %%mm1 \n\t"
450 "psubw %%mm2, %%mm0 \n\t" 451 "psubw %%mm2, %%mm0 \n\t"
451 "psubw %%mm3, %%mm1 \n\t" 452 "psubw %%mm3, %%mm1 \n\t"
452 "pandn %%mm0, %%mm4 \n\t" 453 "pandn %%mm0, %%mm4 \n\t"
453 "pandn %%mm1, %%mm5 \n\t" 454 "pandn %%mm1, %%mm5 \n\t"
454 "pxor %%mm4, %%mm7 \n\t" 455 "pxor %%mm4, %%mm7 \n\t"
455 "pxor %%mm5, %%mm7 \n\t" 456 "pxor %%mm5, %%mm7 \n\t"
456 "movq %%mm4, (%0, %%eax) \n\t" 457 "movq %%mm4, (%0, %%"REG_a") \n\t"
457 "movq %%mm5, 8(%0, %%eax) \n\t" 458 "movq %%mm5, 8(%0, %%"REG_a") \n\t"
458 459
459 "addl $16, %%eax \n\t" 460 "add $16, %%"REG_a" \n\t"
460 "jng 1b \n\t" 461 "jng 1b \n\t"
461 "movd 124(%0, %3), %%mm0 \n\t" 462 "movd 124(%0, %3), %%mm0 \n\t"
462 "movq %%mm7, %%mm6 \n\t" 463 "movq %%mm7, %%mm6 \n\t"
463 "psrlq $32, %%mm7 \n\t" 464 "psrlq $32, %%mm7 \n\t"
464 "pxor %%mm6, %%mm7 \n\t" 465 "pxor %%mm6, %%mm7 \n\t"
469 "psrlq $15, %%mm7 \n\t" 470 "psrlq $15, %%mm7 \n\t"
470 "pxor %%mm7, %%mm0 \n\t" 471 "pxor %%mm7, %%mm0 \n\t"
471 "movd %%mm0, 124(%0, %3) \n\t" 472 "movd %%mm0, 124(%0, %3) \n\t"
472 473
473 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) 474 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
474 : "%eax", "memory" 475 : "%"REG_a, "memory"
475 ); 476 );
476 } 477 }
477 478
478 /* draw the edges of width 'w' of an image of size width, height 479 /* draw the edges of width 'w' of an image of size width, height
479 this mmx version can only handle w==8 || w==16 */ 480 this mmx version can only handle w==8 || w==16 */
497 "movq -8(%0, %2), %%mm1 \n\t" 498 "movq -8(%0, %2), %%mm1 \n\t"
498 "punpckhbw %%mm1, %%mm1 \n\t" 499 "punpckhbw %%mm1, %%mm1 \n\t"
499 "punpckhwd %%mm1, %%mm1 \n\t" 500 "punpckhwd %%mm1, %%mm1 \n\t"
500 "punpckhdq %%mm1, %%mm1 \n\t" 501 "punpckhdq %%mm1, %%mm1 \n\t"
501 "movq %%mm1, (%0, %2) \n\t" 502 "movq %%mm1, (%0, %2) \n\t"
502 "addl %1, %0 \n\t" 503 "add %1, %0 \n\t"
503 "cmpl %3, %0 \n\t" 504 "cmp %3, %0 \n\t"
504 " jb 1b \n\t" 505 " jb 1b \n\t"
505 : "+r" (ptr) 506 : "+r" (ptr)
506 : "r" (wrap), "r" (width), "r" (ptr + wrap*height) 507 : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
507 ); 508 );
508 } 509 }
509 else 510 else
510 { 511 {
511 asm volatile( 512 asm volatile(
520 "punpckhbw %%mm1, %%mm1 \n\t" 521 "punpckhbw %%mm1, %%mm1 \n\t"
521 "punpckhwd %%mm1, %%mm1 \n\t" 522 "punpckhwd %%mm1, %%mm1 \n\t"
522 "punpckhdq %%mm1, %%mm1 \n\t" 523 "punpckhdq %%mm1, %%mm1 \n\t"
523 "movq %%mm1, (%0, %2) \n\t" 524 "movq %%mm1, (%0, %2) \n\t"
524 "movq %%mm1, 8(%0, %2) \n\t" 525 "movq %%mm1, 8(%0, %2) \n\t"
525 "addl %1, %0 \n\t" 526 "add %1, %0 \n\t"
526 "cmpl %3, %0 \n\t" 527 "cmp %3, %0 \n\t"
527 " jb 1b \n\t" 528 " jb 1b \n\t"
528 : "+r" (ptr) 529 : "+r" (ptr)
529 : "r" (wrap), "r" (width), "r" (ptr + wrap*height) 530 : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
530 ); 531 );
531 } 532 }
532 533
533 for(i=0;i<w;i+=4) { 534 for(i=0;i<w;i+=4) {
534 /* top and bottom (and hopefully also the corners) */ 535 /* top and bottom (and hopefully also the corners) */
538 "movq (%1, %0), %%mm0 \n\t" 539 "movq (%1, %0), %%mm0 \n\t"
539 "movq %%mm0, (%0) \n\t" 540 "movq %%mm0, (%0) \n\t"
540 "movq %%mm0, (%0, %2) \n\t" 541 "movq %%mm0, (%0, %2) \n\t"
541 "movq %%mm0, (%0, %2, 2) \n\t" 542 "movq %%mm0, (%0, %2, 2) \n\t"
542 "movq %%mm0, (%0, %3) \n\t" 543 "movq %%mm0, (%0, %3) \n\t"
543 "addl $8, %0 \n\t" 544 "add $8, %0 \n\t"
544 "cmpl %4, %0 \n\t" 545 "cmp %4, %0 \n\t"
545 " jb 1b \n\t" 546 " jb 1b \n\t"
546 : "+r" (ptr) 547 : "+r" (ptr)
547 : "r" ((int)buf - (int)ptr - w), "r" (-wrap), "r" (-wrap*3), "r" (ptr+width+2*w) 548 : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
548 ); 549 );
549 ptr= last_line + (i + 1) * wrap - w; 550 ptr= last_line + (i + 1) * wrap - w;
550 asm volatile( 551 asm volatile(
551 "1: \n\t" 552 "1: \n\t"
552 "movq (%1, %0), %%mm0 \n\t" 553 "movq (%1, %0), %%mm0 \n\t"
553 "movq %%mm0, (%0) \n\t" 554 "movq %%mm0, (%0) \n\t"
554 "movq %%mm0, (%0, %2) \n\t" 555 "movq %%mm0, (%0, %2) \n\t"
555 "movq %%mm0, (%0, %2, 2) \n\t" 556 "movq %%mm0, (%0, %2, 2) \n\t"
556 "movq %%mm0, (%0, %3) \n\t" 557 "movq %%mm0, (%0, %3) \n\t"
557 "addl $8, %0 \n\t" 558 "add $8, %0 \n\t"
558 "cmpl %4, %0 \n\t" 559 "cmp %4, %0 \n\t"
559 " jb 1b \n\t" 560 " jb 1b \n\t"
560 : "+r" (ptr) 561 : "+r" (ptr)
561 : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w) 562 : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
562 ); 563 );
563 } 564 }
564 } 565 }
565 566
566 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ 567 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
605 "paddd 24(%1), %%mm3 \n\t" 606 "paddd 24(%1), %%mm3 \n\t"
606 "movq %%mm4, (%1) \n\t" 607 "movq %%mm4, (%1) \n\t"
607 "movq %%mm2, 8(%1) \n\t" 608 "movq %%mm2, 8(%1) \n\t"
608 "movq %%mm5, 16(%1) \n\t" 609 "movq %%mm5, 16(%1) \n\t"
609 "movq %%mm3, 24(%1) \n\t" 610 "movq %%mm3, 24(%1) \n\t"
610 "addl $16, %0 \n\t" 611 "add $16, %0 \n\t"
611 "addl $32, %1 \n\t" 612 "add $32, %1 \n\t"
612 "addl $16, %2 \n\t" 613 "add $16, %2 \n\t"
613 "cmpl %3, %0 \n\t" 614 "cmp %3, %0 \n\t"
614 " jb 1b \n\t" 615 " jb 1b \n\t"
615 : "+r" (block), "+r" (sum), "+r" (offset) 616 : "+r" (block), "+r" (sum), "+r" (offset)
616 : "r"(block+64) 617 : "r"(block+64)
617 ); 618 );
618 } 619 }
659 "paddd 48(%1), %%xmm0 \n\t" 660 "paddd 48(%1), %%xmm0 \n\t"
660 "movdqa %%xmm4, (%1) \n\t" 661 "movdqa %%xmm4, (%1) \n\t"
661 "movdqa %%xmm6, 16(%1) \n\t" 662 "movdqa %%xmm6, 16(%1) \n\t"
662 "movdqa %%xmm5, 32(%1) \n\t" 663 "movdqa %%xmm5, 32(%1) \n\t"
663 "movdqa %%xmm0, 48(%1) \n\t" 664 "movdqa %%xmm0, 48(%1) \n\t"
664 "addl $32, %0 \n\t" 665 "add $32, %0 \n\t"
665 "addl $64, %1 \n\t" 666 "add $64, %1 \n\t"
666 "addl $32, %2 \n\t" 667 "add $32, %2 \n\t"
667 "cmpl %3, %0 \n\t" 668 "cmp %3, %0 \n\t"
668 " jb 1b \n\t" 669 " jb 1b \n\t"
669 : "+r" (block), "+r" (sum), "+r" (offset) 670 : "+r" (block), "+r" (sum), "+r" (offset)
670 : "r"(block+64) 671 : "r"(block+64)
671 ); 672 );
672 } 673 }