comparison libpostproc/postprocess_template.c @ 126:55f57883bbf5 libavcodec

more speed
author michael
date Wed, 24 Oct 2001 00:05:30 +0000
parents 3ecf2a90c65e
children e5266b8e79be
comparison
equal deleted inserted replaced
125:42a36eb08c8d 126:55f57883bbf5
2601 int x,y; 2601 int x,y;
2602 /* we need 64bit here otherwise we're going to have a problem 2602 /* we need 64bit here otherwise we're going to have a problem
2603 after watching a black picture for 5 hours*/ 2603 after watching a black picture for 5 hours*/
2604 static uint64_t *yHistogram= NULL; 2604 static uint64_t *yHistogram= NULL;
2605 int black=0, white=255; // blackest black and whitest white in the picture 2605 int black=0, white=255; // blackest black and whitest white in the picture
2606 int QPCorrecture= 256;
2606 2607
2607 /* Temporary buffers for handling the last row(s) */ 2608 /* Temporary buffers for handling the last row(s) */
2608 static uint8_t *tempDst= NULL; 2609 static uint8_t *tempDst= NULL;
2609 static uint8_t *tempSrc= NULL; 2610 static uint8_t *tempSrc= NULL;
2610 2611
2691 { 2692 {
2692 packedYScale= 0x0100010001000100LL; 2693 packedYScale= 0x0100010001000100LL;
2693 packedYOffset= 0; 2694 packedYOffset= 0;
2694 } 2695 }
2695 2696
2697 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF;
2698 else QPCorrecture= 256;
2699
2696 /* copy first row of 8x8 blocks */ 2700 /* copy first row of 8x8 blocks */
2697 for(x=0; x<width; x+=BLOCK_SIZE) 2701 for(x=0; x<width; x+=BLOCK_SIZE)
2698 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); 2702 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX);
2699 2703
2700 for(y=0; y<height; y+=BLOCK_SIZE) 2704 for(y=0; y<height; y+=BLOCK_SIZE)
2701 { 2705 {
2702 //1% speedup if these are here instead of the inner loop 2706 //1% speedup if these are here instead of the inner loop
2703 uint8_t *srcBlock= &(src[y*srcStride]); 2707 uint8_t *srcBlock= &(src[y*srcStride]);
2704 uint8_t *dstBlock= &(dst[y*dstStride]); 2708 uint8_t *dstBlock= &(dst[y*dstStride]);
2705 2709 #ifdef ARCH_X86
2710 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
2711 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
2712 int QPFrac= QPDelta;
2713 #endif
2706 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards, if not, 2714 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards, if not,
2707 then use a temporary buffer */ 2715 then use a temporary buffer */
2708 if(y+15 >= height) 2716 if(y+15 >= height)
2709 { 2717 {
2710 /* copy from line 5 to 12 of src, these will be copied with 2718 /* copy from line 5 to 12 of src, these will be copied with
2732 // finish 1 block before the next otherwise we might have a problem 2740 // finish 1 block before the next otherwise we might have a problem
2733 // with the L1 Cache of the P4 ... or only a few blocks at a time or something 2741 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2734 for(x=0; x<width; x+=BLOCK_SIZE) 2742 for(x=0; x<width; x+=BLOCK_SIZE)
2735 { 2743 {
2736 const int stride= dstStride; 2744 const int stride= dstStride;
2737 int QP; 2745 #ifdef ARCH_X86
2738 if(isColor) 2746 int QP= *QPptr;
2747 asm volatile(
2748 "addl %2, %1 \n\t"
2749 "sbbl %%eax, %%eax \n\t"
2750 "shll $2, %%eax \n\t"
2751 "subl %%eax, %0 \n\t"
2752 : "+r" (QPptr), "+m" (QPFrac)
2753 : "r" (QPDelta)
2754 : "%eax"
2755 );
2756 #else
2757 int QP= isColor ?
2758 QPs[(y>>3)*QPStride + (x>>3)]:
2759 QPs[(y>>4)*QPStride + (x>>4)];
2760 #endif
2761 if(!isColor)
2739 { 2762 {
2740 QP=QPs[(y>>3)*QPStride + (x>>3)]; 2763 QP= (QP* QPCorrecture)>>8;
2741 } 2764 yHistogram[ srcBlock[srcStride*4 + 4] ]++;
2742 else
2743 {
2744 QP= QPs[(y>>4)*QPStride + (x>>4)];
2745 if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
2746 yHistogram[ srcBlock[srcStride*5] ]++;
2747 } 2765 }
2748 #ifdef HAVE_MMX 2766 #ifdef HAVE_MMX
2749 asm volatile( 2767 asm volatile(
2750 "movd %0, %%mm7 \n\t" 2768 "movd %0, %%mm7 \n\t"
2751 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 2769 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2759 #ifdef MORE_TIMING 2777 #ifdef MORE_TIMING
2760 T0= rdtsc(); 2778 T0= rdtsc();
2761 #endif 2779 #endif
2762 2780
2763 #ifdef HAVE_MMX2 2781 #ifdef HAVE_MMX2
2782 /*
2764 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 2783 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2765 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 2784 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2766 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 2785 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2767 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 2786 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2787 */
2788 /*
2789 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2790 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2791 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2792 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2793 */
2794
2795 asm(
2796 "movl %4, %%eax \n\t"
2797 "shrl $2, %%eax \n\t"
2798 "andl $6, %%eax \n\t"
2799 "addl $5, %%eax \n\t"
2800 "movl %%eax, %%ebx \n\t"
2801 "imul %1, %%eax \n\t"
2802 "imul %3, %%ebx \n\t"
2803 "prefetchnta 32(%%eax, %0) \n\t"
2804 "prefetcht0 32(%%ebx, %2) \n\t"
2805 "addl %1, %%eax \n\t"
2806 "addl %3, %%ebx \n\t"
2807 "prefetchnta 32(%%eax, %0) \n\t"
2808 "prefetcht0 32(%%ebx, %2) \n\t"
2809 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2810 "m" (x)
2811 : "%eax", "%ebx"
2812 );
2813
2768 #elif defined(HAVE_3DNOW) 2814 #elif defined(HAVE_3DNOW)
2769 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... 2815 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2770 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 2816 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2771 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 2817 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2772 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 2818 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);