Mercurial > libavcodec.hg
comparison i386/snowdsp_mmx.c @ 5600:f302e395f552 libavcodec
Optimize the first vertical lifting step; this also prevents another
overflow (the last known possible overflow).
author | michael |
---|---|
date | Sun, 26 Aug 2007 11:16:23 +0000 |
parents | f81e45d5ebb1 |
children | b26025b9586d |
comparison
equal
deleted
inserted
replaced
5599:f81e45d5ebb1 | 5600:f302e395f552 |
---|---|
422 "paddw %%"s0", %%"t0" \n\t"\ | 422 "paddw %%"s0", %%"t0" \n\t"\ |
423 "paddw %%"s1", %%"t1" \n\t"\ | 423 "paddw %%"s1", %%"t1" \n\t"\ |
424 "paddw %%"s2", %%"t2" \n\t"\ | 424 "paddw %%"s2", %%"t2" \n\t"\ |
425 "paddw %%"s3", %%"t3" \n\t" | 425 "paddw %%"s3", %%"t3" \n\t" |
426 | 426 |
427 #define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ | |
428 "pmulhw %%"s0", %%"t0" \n\t"\ | |
429 "pmulhw %%"s1", %%"t1" \n\t"\ | |
430 "pmulhw %%"s2", %%"t2" \n\t"\ | |
431 "pmulhw %%"s3", %%"t3" \n\t" | |
432 | |
427 #define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ | 433 #define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ |
428 "movdqa %%"s0", %%"t0" \n\t"\ | 434 "movdqa %%"s0", %%"t0" \n\t"\ |
429 "movdqa %%"s1", %%"t1" \n\t"\ | 435 "movdqa %%"s1", %%"t1" \n\t"\ |
430 "movdqa %%"s2", %%"t2" \n\t"\ | 436 "movdqa %%"s2", %%"t2" \n\t"\ |
431 "movdqa %%"s3", %%"t3" \n\t" | 437 "movdqa %%"s3", %%"t3" \n\t" |
444 i+=i; | 450 i+=i; |
445 | 451 |
446 asm volatile ( | 452 asm volatile ( |
447 "jmp 2f \n\t" | 453 "jmp 2f \n\t" |
448 "1: \n\t" | 454 "1: \n\t" |
449 | |
450 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") | 455 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") |
451 snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") | 456 snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") |
452 snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") | 457 |
453 snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") | 458 |
454 snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") | 459 "pcmpeqw %%xmm0, %%xmm0 \n\t" |
455 | 460 "pcmpeqw %%xmm2, %%xmm2 \n\t" |
456 "pcmpeqd %%xmm1, %%xmm1 \n\t" | 461 "paddw %%xmm2, %%xmm2 \n\t" |
457 "psllw $15, %%xmm1 \n\t" | 462 "paddw %%xmm0, %%xmm2 \n\t" |
458 "psrlw $14, %%xmm1 \n\t" | 463 "psllw $13, %%xmm2 \n\t" |
459 | 464 snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") |
460 snow_vertical_compose_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") | 465 snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") |
461 snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") | 466 snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") |
462 snow_vertical_compose_sse2_load("%5","xmm1","xmm3","xmm5","xmm7") | |
463 snow_vertical_compose_r2r_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") | |
464 snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") | 467 snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") |
465 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") | 468 snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") |
466 snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") | 469 snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") |
467 snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") | 470 snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
468 snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") | 471 snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") |
528 "movq %%"s0", %%"t0" \n\t"\ | 531 "movq %%"s0", %%"t0" \n\t"\ |
529 "movq %%"s1", %%"t1" \n\t"\ | 532 "movq %%"s1", %%"t1" \n\t"\ |
530 "movq %%"s2", %%"t2" \n\t"\ | 533 "movq %%"s2", %%"t2" \n\t"\ |
531 "movq %%"s3", %%"t3" \n\t" | 534 "movq %%"s3", %%"t3" \n\t" |
532 | 535 |
536 | |
533 void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ | 537 void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ |
534 long i = width; | 538 long i = width; |
535 while(i & 15) | 539 while(i & 15) |
536 { | 540 { |
537 i--; | 541 i--; |
543 i+=i; | 547 i+=i; |
544 asm volatile( | 548 asm volatile( |
545 "jmp 2f \n\t" | 549 "jmp 2f \n\t" |
546 "1: \n\t" | 550 "1: \n\t" |
547 | 551 |
548 snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") | 552 snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") |
549 snow_vertical_compose_mmx_add("%6","mm0","mm2","mm4","mm6") | 553 snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") |
550 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") | 554 "pcmpeqw %%mm0, %%mm0 \n\t" |
551 snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") | 555 "pcmpeqw %%mm2, %%mm2 \n\t" |
552 snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") | 556 "paddw %%mm2, %%mm2 \n\t" |
553 | 557 "paddw %%mm0, %%mm2 \n\t" |
554 "pcmpeqw %%mm1, %%mm1 \n\t" | 558 "psllw $13, %%mm2 \n\t" |
555 "psllw $15, %%mm1 \n\t" | 559 snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") |
556 "psrlw $14, %%mm1 \n\t" | 560 snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") |
557 | 561 snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") |
558 snow_vertical_compose_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6") | |
559 snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") | |
560 snow_vertical_compose_mmx_load("%5","mm1","mm3","mm5","mm7") | |
561 snow_vertical_compose_r2r_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") | |
562 snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") | 562 snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") |
563 snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") | 563 snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") |
564 snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") | 564 snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") |
565 snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") | 565 snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
566 snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") | 566 snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") |