comparison i386/dsputil_mmx.c @ 997:4dfe15ae0078 libavcodec

sse16 & pix_norm1 optimization patch by (Felix von Leitner <felix-ffmpeg at fefe dot de>) (with some modifications)
author michaelni
date Sat, 11 Jan 2003 22:32:56 +0000
parents e162c09efbe7
children bb5de8a59da8
comparison
equal deleted inserted replaced
996:ad44196ea483 997:4dfe15ae0078
481 : "+r" (i) 481 : "+r" (i)
482 : "r"(src), "r"(dst), "r"(w-15) 482 : "r"(src), "r"(dst), "r"(w-15)
483 ); 483 );
484 for(; i<w; i++) 484 for(; i<w; i++)
485 dst[i+0] += src[i+0]; 485 dst[i+0] += src[i+0];
486 }
487
/**
 * Sum of the squared pixel values of a block 16 bytes wide and 16 rows
 * high, computed with MMX.
 *
 * @param pix       pointer to the first byte of the first row; 16 bytes are
 *                  read from each row
 * @param line_size value added to pix after each row (distance between the
 *                  starts of consecutive rows)
 * @return the two 32-bit partial sums of mm7 added together
 *
 * NOTE(review): only ecx is listed as clobbered; mm0-mm4 and mm7 are modified
 * and the asm reads memory through pix without a "memory" clobber - confirm
 * this is acceptable for the compilers in use.
 * NOTE(review): no emms is executed here; presumably the caller is expected
 * to clear the MMX state - confirm.
 * NOTE(review): "addl" applies a 32-bit add to the pointer operand, so this
 * assumes a 32-bit target - confirm before reusing elsewhere.
 */
488 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
489 int tmp;
490 asm volatile (
491 "movl $16,%%ecx\n" /* ecx = number of rows left to process */
492 "pxor %%mm0,%%mm0\n" /* mm0 = 0, used to zero-extend bytes to words */
493 "pxor %%mm7,%%mm7\n" /* mm7 = 0, accumulator of two 32-bit partial sums */
494 "1:\n"
495 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
496 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
497
498 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
499
500 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] as 16-bit words */
501 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] as 16-bit words */
502
503 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
504 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] as 16-bit words */
505 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] as 16-bit words */
506
507 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
508 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */
509
510 "pmaddwd %%mm3,%%mm3\n" /* mm3 = (pix12^2+pix13^2,pix14^2+pix15^2) */
511 "pmaddwd %%mm4,%%mm4\n" /* mm4 = (pix8^2+pix9^2,pix10^2+pix11^2) */
512
513 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
514 pix2^2+pix3^2+pix6^2+pix7^2) */
515 "paddd %%mm3,%%mm4\n" /* mm4 = the same pairing for pix8-15 */
516 "paddd %%mm2,%%mm7\n" /* accumulate the pix0-7 contribution */
517
518 "addl %2, %0\n" /* pix += line_size: advance to the next row */
519 "paddd %%mm4,%%mm7\n" /* accumulate the pix8-15 contribution */
520 "dec %%ecx\n"
521 "jnz 1b\n" /* loop until all 16 rows are done */
522
523 "movq %%mm7,%%mm1\n" /* horizontal add of the two 32-bit lanes of mm7 */
524 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
525 "paddd %%mm7,%%mm1\n" /* low dword of mm1 = lo + hi */
526 "movd %%mm1,%1\n" /* tmp = low dword of mm1 */
527 : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
528 return tmp;
529 }
530
/**
 * Sum of squared differences between two blocks 16 bytes wide and 16 rows
 * high, computed with MMX.
 *
 * @param v         not used by this function
 * @param pix1      pointer to the first row of the first block
 * @param pix2      pointer to the first row of the second block
 * @param line_size value added to both pix1 and pix2 after each row
 * @return the two 32-bit partial sums of mm7 added together
 *
 * NOTE(review): only ecx is listed as clobbered; mm0-mm7 are all modified and
 * the asm reads memory through pix1/pix2 without a "memory" clobber - confirm
 * this is acceptable for the compilers in use.
 * NOTE(review): no emms is executed here; presumably the caller is expected
 * to clear the MMX state - confirm.
 * NOTE(review): "addl" applies a 32-bit add to the pointer operands, so this
 * assumes a 32-bit target - confirm before reusing elsewhere.
 */
531 static int sse16_mmx(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) {
532 int tmp;
533 asm volatile (
534 "movl $16,%%ecx\n" /* ecx = number of rows left to process */
535 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
536 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
537 "1:\n"
538 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
539 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
540 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
541 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
542
543 /* compute the per-byte absolute differences |mm1-mm2| and |mm3-mm4| */
544 /* algo: subtract mm1 from mm2 with saturation and vice versa */
545 /* OR the results to get absolute difference */
546 "movq %%mm1,%%mm5\n" /* mm5 = copy of pix1[0-7] */
547 "movq %%mm3,%%mm6\n" /* mm6 = copy of pix1[8-15] */
548 "psubusb %%mm2,%%mm1\n" /* mm1 = saturate(pix1 - pix2)[0-7] */
549 "psubusb %%mm4,%%mm3\n" /* mm3 = saturate(pix1 - pix2)[8-15] */
550 "psubusb %%mm5,%%mm2\n" /* mm2 = saturate(pix2 - pix1)[0-7] */
551 "psubusb %%mm6,%%mm4\n" /* mm4 = saturate(pix2 - pix1)[8-15] */
552
553 "por %%mm1,%%mm2\n" /* mm2 = |pix1 - pix2|[0-7] */
554 "por %%mm3,%%mm4\n" /* mm4 = |pix1 - pix2|[8-15] */
555
556 /* now convert to 16-bit vectors so we can square them */
557 "movq %%mm2,%%mm1\n"
558 "movq %%mm4,%%mm3\n"
559
560 "punpckhbw %%mm0,%%mm2\n" /* mm2 = differences 4-7 as words */
561 "punpckhbw %%mm0,%%mm4\n" /* mm4 = differences 12-15 as words */
562 "punpcklbw %%mm0,%%mm1\n" /* differences 0-7 now spread over (mm1,mm2) */
563 "punpcklbw %%mm0,%%mm3\n" /* differences 8-15 now spread over (mm3,mm4) */
564
565 "pmaddwd %%mm2,%%mm2\n" /* square each word and add adjacent pairs */
566 "pmaddwd %%mm4,%%mm4\n"
567 "pmaddwd %%mm1,%%mm1\n"
568 "pmaddwd %%mm3,%%mm3\n"
569
570 "addl %3,%0\n" /* pix1 += line_size */
571 "addl %3,%1\n" /* pix2 += line_size */
572
573 "paddd %%mm2,%%mm1\n" /* mm1 = squared differences 0-7, two partial sums */
574 "paddd %%mm4,%%mm3\n" /* mm3 = squared differences 8-15, two partial sums */
575 "paddd %%mm1,%%mm7\n" /* accumulate both halves of the row into mm7 */
576 "paddd %%mm3,%%mm7\n"
577
578 "decl %%ecx\n"
579 "jnz 1b\n" /* loop until all 16 rows are done */
580
581 "movq %%mm7,%%mm1\n" /* horizontal add of the two 32-bit lanes of mm7 */
582 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
583 "paddd %%mm7,%%mm1\n" /* low dword of mm1 = lo + hi */
584 "movd %%mm1,%2\n" /* tmp = low dword of mm1 */
585 : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
586 return tmp;
486 } 587 }
487 588
488 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 589 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
489 int i=0; 590 int i=0;
490 asm volatile( 591 asm volatile(
1414 c->hadamard8_diff[0]= hadamard8_diff16_mmx; 1515 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1415 c->hadamard8_diff[1]= hadamard8_diff_mmx; 1516 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1416 1517
1417 c->sad[0]= sad16x16_mmx; 1518 c->sad[0]= sad16x16_mmx;
1418 c->sad[1]= sad8x8_mmx; 1519 c->sad[1]= sad8x8_mmx;
1520
1521 c->pix_norm1 = pix_norm1_mmx;
1522 c->sse[0] = sse16_mmx;
1419 1523
1420 if (mm_flags & MM_MMXEXT) { 1524 if (mm_flags & MM_MMXEXT) {
1421 c->pix_abs16x16 = pix_abs16x16_mmx2; 1525 c->pix_abs16x16 = pix_abs16x16_mmx2;
1422 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; 1526 c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
1423 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; 1527 c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;