Mercurial > libavcodec.hg
comparison i386/dsputil_mmx.c @ 997:4dfe15ae0078 libavcodec
sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
author | michaelni |
---|---|
date | Sat, 11 Jan 2003 22:32:56 +0000 |
parents | e162c09efbe7 |
children | bb5de8a59da8 |
comparison legend: equal | deleted | inserted | replaced
996:ad44196ea483 | 997:4dfe15ae0078 |
---|---|
481 : "+r" (i) | 481 : "+r" (i) |
482 : "r"(src), "r"(dst), "r"(w-15) | 482 : "r"(src), "r"(dst), "r"(w-15) |
483 ); | 483 ); |
484 for(; i<w; i++) | 484 for(; i<w; i++) |
485 dst[i+0] += src[i+0]; | 485 dst[i+0] += src[i+0]; |
486 } | |
487 | |
/**
 * Sum of squared pixel values over a 16x16 block (MMX).
 *
 * Processes 16 rows of 16 bytes: each row is widened from unsigned bytes
 * to 16-bit words, squared and horizontally summed with pmaddwd, and the
 * 32-bit partial sums are accumulated in mm7.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of pix[x]^2 over the 16x16 block as a 32-bit int
 *
 * NOTE(review): leaves the MMX state dirty; presumably the caller is
 * responsible for emms, as with the other MMX routines here -- confirm.
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"          /* ecx = 16 row counter */
        "pxor %%mm0,%%mm0\n"        /* mm0 = 0, zero source for byte->word unpack */
        "pxor %%mm7,%%mm7\n"        /* mm7 = running sum of squares */
        "1:\n"
        "movq (%0),%%mm2\n"         /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"        /* mm3 = pix[8-15] */

        "movq %%mm2,%%mm1\n"        /* mm1 = mm2 = pix[0-7] */

        "punpckhbw %%mm0,%%mm1\n"   /* mm1 = [pix4-7] as 16-bit words */
        "punpcklbw %%mm0,%%mm2\n"   /* mm2 = [pix0-3] as 16-bit words */

        "movq %%mm3,%%mm4\n"        /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n"   /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n"   /* mm4 = [pix8-11] */

        "pmaddwd %%mm1,%%mm1\n"     /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm2,%%mm2\n"     /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

        "pmaddwd %%mm3,%%mm3\n"     /* mm3 = (pix12^2+pix13^2,pix14^2+pix15^2) */
        "pmaddwd %%mm4,%%mm4\n"     /* mm4 = (pix8^2+pix9^2,pix10^2+pix11^2) */

        "paddd %%mm1,%%mm2\n"       /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                              pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"       /* mm4 = same for pix8-15 */
        "paddd %%mm2,%%mm7\n"       /* accumulate into mm7 */

        "addl %2, %0\n"             /* advance pix to the next row */
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"       /* fold the two 32-bit partial sums */
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
    return tmp;
}
530 | |
/**
 * Sum of squared differences (SSE/SSD) between two 16x16 blocks (MMX).
 *
 * For each of 16 rows, computes |pix1[x] - pix2[x]| per byte using the
 * saturating-subtract-both-ways-then-OR trick (plain MMX has no absolute
 * difference instruction), widens to 16-bit words, squares and horizontally
 * sums with pmaddwd, accumulating 32-bit partials in mm7.
 *
 * @param v         context pointer, unused here (presumably MpegEncContext,
 *                  kept for the c->sse[] function-pointer signature -- confirm)
 * @param pix1      pointer to the first block
 * @param pix2      pointer to the second block
 * @param line_size byte stride between rows (shared by both blocks)
 * @return sum of (pix1[x]-pix2[x])^2 over the 16x16 block as a 32-bit int
 *
 * NOTE(review): leaves the MMX state dirty; presumably the caller issues
 * emms -- confirm.
 */
static int sse16_mmx(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) {
    int tmp;
    asm volatile (
        "movl $16,%%ecx\n"          /* ecx = 16 row counter */
        "pxor %%mm0,%%mm0\n"        /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"        /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"         /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"         /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"        /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"        /* mm4 = pix2[8-15] */

        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with unsigned saturation and vice
           versa; one direction saturates to 0, the other holds the true
           difference */
        /* OR the results to get the absolute difference */
        "movq %%mm1,%%mm5\n"        /* save pix1[0-7] */
        "movq %%mm3,%%mm6\n"        /* save pix1[8-15] */
        "psubusb %%mm2,%%mm1\n"     /* mm1 = max(pix1-pix2, 0), bytes 0-7 */
        "psubusb %%mm4,%%mm3\n"     /* mm3 = max(pix1-pix2, 0), bytes 8-15 */
        "psubusb %%mm5,%%mm2\n"     /* mm2 = max(pix2-pix1, 0), bytes 0-7 */
        "psubusb %%mm6,%%mm4\n"     /* mm4 = max(pix2-pix1, 0), bytes 8-15 */

        "por %%mm1,%%mm2\n"         /* mm2 = |pix1-pix2|, bytes 0-7 */
        "por %%mm3,%%mm4\n"         /* mm4 = |pix1-pix2|, bytes 8-15 */

        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"   /* mm2 = high 4 diffs of bytes 0-7 */
        "punpckhbw %%mm0,%%mm4\n"   /* mm4 = high 4 diffs of bytes 8-15 */
        "punpcklbw %%mm0,%%mm1\n"   /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n"   /* mm4 now spread over (mm3,mm4) */

        "pmaddwd %%mm2,%%mm2\n"     /* square and pair-sum each register */
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "addl %3,%0\n"              /* advance pix1 to the next row */
        "addl %3,%1\n"              /* advance pix2 to the next row */

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"       /* accumulate into mm7 */
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"       /* fold the two 32-bit partial sums */
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
    return tmp;
}
487 | 588 |
488 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ | 589 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ |
489 int i=0; | 590 int i=0; |
490 asm volatile( | 591 asm volatile( |
1414 c->hadamard8_diff[0]= hadamard8_diff16_mmx; | 1515 c->hadamard8_diff[0]= hadamard8_diff16_mmx; |
1415 c->hadamard8_diff[1]= hadamard8_diff_mmx; | 1516 c->hadamard8_diff[1]= hadamard8_diff_mmx; |
1416 | 1517 |
1417 c->sad[0]= sad16x16_mmx; | 1518 c->sad[0]= sad16x16_mmx; |
1418 c->sad[1]= sad8x8_mmx; | 1519 c->sad[1]= sad8x8_mmx; |
1520 | |
1521 c->pix_norm1 = pix_norm1_mmx; | |
1522 c->sse[0] = sse16_mmx; | |
1419 | 1523 |
1420 if (mm_flags & MM_MMXEXT) { | 1524 if (mm_flags & MM_MMXEXT) { |
1421 c->pix_abs16x16 = pix_abs16x16_mmx2; | 1525 c->pix_abs16x16 = pix_abs16x16_mmx2; |
1422 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; | 1526 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; |
1423 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; | 1527 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; |