comparison i386/snowdsp_mmx.c @ 5600:f302e395f552 libavcodec

Optimize the first vertical lifting step; this also prevents another overflow, which was the last known possible overflow.
author michael
date Sun, 26 Aug 2007 11:16:23 +0000
parents f81e45d5ebb1
children b26025b9586d
comparison
equal deleted inserted replaced
5599:f81e45d5ebb1 5600:f302e395f552
422 "paddw %%"s0", %%"t0" \n\t"\ 422 "paddw %%"s0", %%"t0" \n\t"\
423 "paddw %%"s1", %%"t1" \n\t"\ 423 "paddw %%"s1", %%"t1" \n\t"\
424 "paddw %%"s2", %%"t2" \n\t"\ 424 "paddw %%"s2", %%"t2" \n\t"\
425 "paddw %%"s3", %%"t3" \n\t" 425 "paddw %%"s3", %%"t3" \n\t"
426 426
427 #define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
428 "pmulhw %%"s0", %%"t0" \n\t"\
429 "pmulhw %%"s1", %%"t1" \n\t"\
430 "pmulhw %%"s2", %%"t2" \n\t"\
431 "pmulhw %%"s3", %%"t3" \n\t"
432
427 #define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ 433 #define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
428 "movdqa %%"s0", %%"t0" \n\t"\ 434 "movdqa %%"s0", %%"t0" \n\t"\
429 "movdqa %%"s1", %%"t1" \n\t"\ 435 "movdqa %%"s1", %%"t1" \n\t"\
430 "movdqa %%"s2", %%"t2" \n\t"\ 436 "movdqa %%"s2", %%"t2" \n\t"\
431 "movdqa %%"s3", %%"t3" \n\t" 437 "movdqa %%"s3", %%"t3" \n\t"
444 i+=i; 450 i+=i;
445 451
446 asm volatile ( 452 asm volatile (
447 "jmp 2f \n\t" 453 "jmp 2f \n\t"
448 "1: \n\t" 454 "1: \n\t"
449
450 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") 455 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
451 snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") 456 snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
452 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") 457
453 snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") 458
454 snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") 459 "pcmpeqw %%xmm0, %%xmm0 \n\t"
455 460 "pcmpeqw %%xmm2, %%xmm2 \n\t"
456 "pcmpeqd %%xmm1, %%xmm1 \n\t" 461 "paddw %%xmm2, %%xmm2 \n\t"
457 "psllw $15, %%xmm1 \n\t" 462 "paddw %%xmm0, %%xmm2 \n\t"
458 "psrlw $14, %%xmm1 \n\t" 463 "psllw $13, %%xmm2 \n\t"
459 464 snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
460 snow_vertical_compose_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") 465 snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
461 snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") 466 snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
462 snow_vertical_compose_sse2_load("%5","xmm1","xmm3","xmm5","xmm7")
463 snow_vertical_compose_r2r_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
464 snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") 467 snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
465 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") 468 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
466 snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") 469 snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
467 snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") 470 snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
468 snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") 471 snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
528 "movq %%"s0", %%"t0" \n\t"\ 531 "movq %%"s0", %%"t0" \n\t"\
529 "movq %%"s1", %%"t1" \n\t"\ 532 "movq %%"s1", %%"t1" \n\t"\
530 "movq %%"s2", %%"t2" \n\t"\ 533 "movq %%"s2", %%"t2" \n\t"\
531 "movq %%"s3", %%"t3" \n\t" 534 "movq %%"s3", %%"t3" \n\t"
532 535
536
533 void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ 537 void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
534 long i = width; 538 long i = width;
535 while(i & 15) 539 while(i & 15)
536 { 540 {
537 i--; 541 i--;
543 i+=i; 547 i+=i;
544 asm volatile( 548 asm volatile(
545 "jmp 2f \n\t" 549 "jmp 2f \n\t"
546 "1: \n\t" 550 "1: \n\t"
547 551
548 snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") 552 snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
549 snow_vertical_compose_mmx_add("%6","mm0","mm2","mm4","mm6") 553 snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
550 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") 554 "pcmpeqw %%mm0, %%mm0 \n\t"
551 snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") 555 "pcmpeqw %%mm2, %%mm2 \n\t"
552 snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") 556 "paddw %%mm2, %%mm2 \n\t"
553 557 "paddw %%mm0, %%mm2 \n\t"
554 "pcmpeqw %%mm1, %%mm1 \n\t" 558 "psllw $13, %%mm2 \n\t"
555 "psllw $15, %%mm1 \n\t" 559 snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
556 "psrlw $14, %%mm1 \n\t" 560 snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
557 561 snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
558 snow_vertical_compose_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6")
559 snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
560 snow_vertical_compose_mmx_load("%5","mm1","mm3","mm5","mm7")
561 snow_vertical_compose_r2r_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
562 snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") 562 snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
563 snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") 563 snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
564 snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") 564 snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
565 snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") 565 snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
566 snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") 566 snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")