Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 126:55f57883bbf5 libavcodec
more speed
author | michael |
---|---|
date | Wed, 24 Oct 2001 00:05:30 +0000 |
parents | 3ecf2a90c65e |
children | e5266b8e79be |
comparison
equal
deleted
inserted
replaced
125:42a36eb08c8d | 126:55f57883bbf5 |
---|---|
2601 int x,y; | 2601 int x,y; |
2602 /* we need 64bit here otherwise we're going to have a problem | 2602 /* we need 64bit here otherwise we're going to have a problem |
2603 after watching a black picture for 5 hours*/ | 2603 after watching a black picture for 5 hours*/ |
2604 static uint64_t *yHistogram= NULL; | 2604 static uint64_t *yHistogram= NULL; |
2605 int black=0, white=255; // blackest black and whitest white in the picture | 2605 int black=0, white=255; // blackest black and whitest white in the picture |
2606 int QPCorrecture= 256; | |
2606 | 2607 |
2607 /* Temporary buffers for handling the last row(s) */ | 2608 /* Temporary buffers for handling the last row(s) */ |
2608 static uint8_t *tempDst= NULL; | 2609 static uint8_t *tempDst= NULL; |
2609 static uint8_t *tempSrc= NULL; | 2610 static uint8_t *tempSrc= NULL; |
2610 | 2611 |
2691 { | 2692 { |
2692 packedYScale= 0x0100010001000100LL; | 2693 packedYScale= 0x0100010001000100LL; |
2693 packedYOffset= 0; | 2694 packedYOffset= 0; |
2694 } | 2695 } |
2695 | 2696 |
2697 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF; | |
2698 else QPCorrecture= 256; | |
2699 | |
2696 /* copy first row of 8x8 blocks */ | 2700 /* copy first row of 8x8 blocks */ |
2697 for(x=0; x<width; x+=BLOCK_SIZE) | 2701 for(x=0; x<width; x+=BLOCK_SIZE) |
2698 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); | 2702 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); |
2699 | 2703 |
2700 for(y=0; y<height; y+=BLOCK_SIZE) | 2704 for(y=0; y<height; y+=BLOCK_SIZE) |
2701 { | 2705 { |
2702 //1% speedup if these are here instead of the inner loop | 2706 //1% speedup if these are here instead of the inner loop |
2703 uint8_t *srcBlock= &(src[y*srcStride]); | 2707 uint8_t *srcBlock= &(src[y*srcStride]); |
2704 uint8_t *dstBlock= &(dst[y*dstStride]); | 2708 uint8_t *dstBlock= &(dst[y*dstStride]); |
2705 | 2709 #ifdef ARCH_X86 |
2710 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | |
2711 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); | |
2712 int QPFrac= QPDelta; | |
2713 #endif | |
2706 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not | 2714 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not |
2707 than use a temporary buffer */ | 2715 than use a temporary buffer */ |
2708 if(y+15 >= height) | 2716 if(y+15 >= height) |
2709 { | 2717 { |
2710 /* copy from line 5 to 12 of src, these will be copied with | 2718 /* copy from line 5 to 12 of src, these will be copied with |
2732 // finish 1 block before the next otherwise we might have a problem | 2740 // finish 1 block before the next otherwise we might have a problem |
2733 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | 2741 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
2734 for(x=0; x<width; x+=BLOCK_SIZE) | 2742 for(x=0; x<width; x+=BLOCK_SIZE) |
2735 { | 2743 { |
2736 const int stride= dstStride; | 2744 const int stride= dstStride; |
2737 int QP; | 2745 #ifdef ARCH_X86 |
2738 if(isColor) | 2746 int QP= *QPptr; |
2747 asm volatile( | |
2748 "addl %2, %1 \n\t" | |
2749 "sbbl %%eax, %%eax \n\t" | |
2750 "shll $2, %%eax \n\t" | |
2751 "subl %%eax, %0 \n\t" | |
2752 : "+r" (QPptr), "+m" (QPFrac) | |
2753 : "r" (QPDelta) | |
2754 : "%eax" | |
2755 ); | |
2756 #else | |
2757 int QP= isColor ? | |
2758 QPs[(y>>3)*QPStride + (x>>3)]: | |
2759 QPs[(y>>4)*QPStride + (x>>4)]; | |
2760 #endif | |
2761 if(!isColor) | |
2739 { | 2762 { |
2740 QP=QPs[(y>>3)*QPStride + (x>>3)]; | 2763 QP= (QP* QPCorrecture)>>8; |
2741 } | 2764 yHistogram[ srcBlock[srcStride*4 + 4] ]++; |
2742 else | |
2743 { | |
2744 QP= QPs[(y>>4)*QPStride + (x>>4)]; | |
2745 if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8; | |
2746 yHistogram[ srcBlock[srcStride*5] ]++; | |
2747 } | 2765 } |
2748 #ifdef HAVE_MMX | 2766 #ifdef HAVE_MMX |
2749 asm volatile( | 2767 asm volatile( |
2750 "movd %0, %%mm7 \n\t" | 2768 "movd %0, %%mm7 \n\t" |
2751 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | 2769 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
2759 #ifdef MORE_TIMING | 2777 #ifdef MORE_TIMING |
2760 T0= rdtsc(); | 2778 T0= rdtsc(); |
2761 #endif | 2779 #endif |
2762 | 2780 |
2763 #ifdef HAVE_MMX2 | 2781 #ifdef HAVE_MMX2 |
2782 /* | |
2764 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | 2783 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
2765 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | 2784 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
2766 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 2785 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
2767 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 2786 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
2787 */ | |
2788 /* | |
2789 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
2790 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
2791 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
2792 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
2793 */ | |
2794 | |
2795 asm( | |
2796 "movl %4, %%eax \n\t" | |
2797 "shrl $2, %%eax \n\t" | |
2798 "andl $6, %%eax \n\t" | |
2799 "addl $5, %%eax \n\t" | |
2800 "movl %%eax, %%ebx \n\t" | |
2801 "imul %1, %%eax \n\t" | |
2802 "imul %3, %%ebx \n\t" | |
2803 "prefetchnta 32(%%eax, %0) \n\t" | |
2804 "prefetcht0 32(%%ebx, %2) \n\t" | |
2805 "addl %1, %%eax \n\t" | |
2806 "addl %3, %%ebx \n\t" | |
2807 "prefetchnta 32(%%eax, %0) \n\t" | |
2808 "prefetcht0 32(%%ebx, %2) \n\t" | |
2809 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | |
2810 "m" (x) | |
2811 : "%eax", "%ebx" | |
2812 ); | |
2813 | |
2768 #elif defined(HAVE_3DNOW) | 2814 #elif defined(HAVE_3DNOW) |
2769 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | 2815 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
2770 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | 2816 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
2771 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | 2817 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
2772 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 2818 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |