comparison libpostproc/postprocess_template.c @ 2040:5de466b3360e libavcodec

per line lowpass filter in mmx
author michael
date Fri, 28 May 2004 13:23:53 +0000
parents f25e485a7850
children b996fbe0a7e7
comparison
equal deleted inserted replaced
2039:f25e485a7850 2040:5de466b3360e
2615 #ifdef HAVE_MMX 2615 #ifdef HAVE_MMX
2616 /** 2616 /**
2617 * accurate deblock filter 2617 * accurate deblock filter
2618 */ 2618 */
2619 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ 2619 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
2620 int y;
2621 const int QP= c->QP;
2622 int64_t dc_mask, eq_mask; 2620 int64_t dc_mask, eq_mask;
2621 int64_t sums[10*8*2];
2623 src+= step*3; // src points to begin of the 8x8 Block 2622 src+= step*3; // src points to begin of the 8x8 Block
2624 //START_TIMER 2623 //START_TIMER
2625 asm volatile( 2624 asm volatile(
2626 "movq %0, %%mm7 \n\t" 2625 "movq %0, %%mm7 \n\t"
2627 "movq %1, %%mm6 \n\t" 2626 "movq %1, %%mm6 \n\t"
2723 : "=m" (eq_mask), "=m" (dc_mask) 2722 : "=m" (eq_mask), "=m" (dc_mask)
2724 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) 2723 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2725 : "%eax" 2724 : "%eax"
2726 ); 2725 );
2727 2726
2728 src+= step; // src points to begin of the 8x8 Block 2727 if(dc_mask & eq_mask){
2728 int offset= -8*step;
2729 int64_t *temp_sums= sums;
2730
2731 asm volatile(
2732 "movq %2, %%mm0 \n\t" // QP,..., QP
2733 "pxor %%mm4, %%mm4 \n\t"
2734
2735 "movq (%0), %%mm6 \n\t"
2736 "movq (%0, %1), %%mm5 \n\t"
2737 "movq %%mm5, %%mm1 \n\t"
2738 "movq %%mm6, %%mm2 \n\t"
2739 "psubusb %%mm6, %%mm5 \n\t"
2740 "psubusb %%mm1, %%mm2 \n\t"
2741 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2742 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2743 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2744
2745 "pxor %%mm6, %%mm1 \n\t"
2746 "pand %%mm0, %%mm1 \n\t"
2747 "pxor %%mm1, %%mm6 \n\t"
2748 // 0:QP 6:First
2749
2750 "movq (%0, %1, 8), %%mm5 \n\t"
2751 "addl %1, %0 \n\t" // %0 points to line 1 not 0
2752 "movq (%0, %1, 8), %%mm7 \n\t"
2753 "movq %%mm5, %%mm1 \n\t"
2754 "movq %%mm7, %%mm2 \n\t"
2755 "psubusb %%mm7, %%mm5 \n\t"
2756 "psubusb %%mm1, %%mm2 \n\t"
2757 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2758 "movq %2, %%mm0 \n\t" // QP,..., QP
2759 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2760 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2761
2762 "pxor %%mm7, %%mm1 \n\t"
2763 "pand %%mm0, %%mm1 \n\t"
2764 "pxor %%mm1, %%mm7 \n\t"
2765
2766 "movq %%mm6, %%mm5 \n\t"
2767 "punpckhbw %%mm4, %%mm6 \n\t"
2768 "punpcklbw %%mm4, %%mm5 \n\t"
2769 // 4:0 5/6:First 7:Last
2770
2771 "movq %%mm5, %%mm0 \n\t"
2772 "movq %%mm6, %%mm1 \n\t"
2773 "psllw $2, %%mm0 \n\t"
2774 "psllw $2, %%mm1 \n\t"
2775 "paddw "MANGLE(w04)", %%mm0 \n\t"
2776 "paddw "MANGLE(w04)", %%mm1 \n\t"
2777
2778 #define NEXT\
2779 "movq (%0), %%mm2 \n\t"\
2780 "movq (%0), %%mm3 \n\t"\
2781 "addl %1, %0 \n\t"\
2782 "punpcklbw %%mm4, %%mm2 \n\t"\
2783 "punpckhbw %%mm4, %%mm3 \n\t"\
2784 "paddw %%mm2, %%mm0 \n\t"\
2785 "paddw %%mm3, %%mm1 \n\t"
2786
2787 #define PREV\
2788 "movq (%0), %%mm2 \n\t"\
2789 "movq (%0), %%mm3 \n\t"\
2790 "addl %1, %0 \n\t"\
2791 "punpcklbw %%mm4, %%mm2 \n\t"\
2792 "punpckhbw %%mm4, %%mm3 \n\t"\
2793 "psubw %%mm2, %%mm0 \n\t"\
2794 "psubw %%mm3, %%mm1 \n\t"
2795
2796
2797 NEXT //0
2798 NEXT //1
2799 NEXT //2
2800 "movq %%mm0, (%3) \n\t"
2801 "movq %%mm1, 8(%3) \n\t"
2802
2803 NEXT //3
2804 "psubw %%mm5, %%mm0 \n\t"
2805 "psubw %%mm6, %%mm1 \n\t"
2806 "movq %%mm0, 16(%3) \n\t"
2807 "movq %%mm1, 24(%3) \n\t"
2808
2809 NEXT //4
2810 "psubw %%mm5, %%mm0 \n\t"
2811 "psubw %%mm6, %%mm1 \n\t"
2812 "movq %%mm0, 32(%3) \n\t"
2813 "movq %%mm1, 40(%3) \n\t"
2814
2815 NEXT //5
2816 "psubw %%mm5, %%mm0 \n\t"
2817 "psubw %%mm6, %%mm1 \n\t"
2818 "movq %%mm0, 48(%3) \n\t"
2819 "movq %%mm1, 56(%3) \n\t"
2820
2821 NEXT //6
2822 "psubw %%mm5, %%mm0 \n\t"
2823 "psubw %%mm6, %%mm1 \n\t"
2824 "movq %%mm0, 64(%3) \n\t"
2825 "movq %%mm1, 72(%3) \n\t"
2826
2827 "movq %%mm7, %%mm6 \n\t"
2828 "punpckhbw %%mm4, %%mm7 \n\t"
2829 "punpcklbw %%mm4, %%mm6 \n\t"
2830
2831 NEXT //7
2832 "movl %4, %0 \n\t"
2833 "addl %1, %0 \n\t"
2834 PREV //0
2835 "movq %%mm0, 80(%3) \n\t"
2836 "movq %%mm1, 88(%3) \n\t"
2837
2838 PREV //1
2839 "paddw %%mm6, %%mm0 \n\t"
2840 "paddw %%mm7, %%mm1 \n\t"
2841 "movq %%mm0, 96(%3) \n\t"
2842 "movq %%mm1, 104(%3) \n\t"
2843
2844 PREV //2
2845 "paddw %%mm6, %%mm0 \n\t"
2846 "paddw %%mm7, %%mm1 \n\t"
2847 "movq %%mm0, 112(%3) \n\t"
2848 "movq %%mm1, 120(%3) \n\t"
2849
2850 PREV //3
2851 "paddw %%mm6, %%mm0 \n\t"
2852 "paddw %%mm7, %%mm1 \n\t"
2853 "movq %%mm0, 128(%3) \n\t"
2854 "movq %%mm1, 136(%3) \n\t"
2855
2856 PREV //4
2857 "paddw %%mm6, %%mm0 \n\t"
2858 "paddw %%mm7, %%mm1 \n\t"
2859 "movq %%mm0, 144(%3) \n\t"
2860 "movq %%mm1, 152(%3) \n\t"
2861
2862 "movl %4, %0 \n\t" //FIXME
2863
2864 : "+&r"(src)
2865 : "r" (step), "m" (c->pQPb), "r"(sums), "g"(src)
2866 );
2867
2868 src+= step; // src points to begin of the 8x8 Block
2869
2870 asm volatile(
2871 "movq %4, %%mm6 \n\t"
2872 "pcmpeqb %%mm5, %%mm5 \n\t"
2873 "pxor %%mm6, %%mm5 \n\t"
2874 "pxor %%mm7, %%mm7 \n\t"
2875
2876 "1: \n\t"
2877 "movq (%1), %%mm0 \n\t"
2878 "movq 8(%1), %%mm1 \n\t"
2879 "paddw 32(%1), %%mm0 \n\t"
2880 "paddw 40(%1), %%mm1 \n\t"
2881 "movq (%0, %3), %%mm2 \n\t"
2882 "movq %%mm2, %%mm3 \n\t"
2883 "movq %%mm2, %%mm4 \n\t"
2884 "punpcklbw %%mm7, %%mm2 \n\t"
2885 "punpckhbw %%mm7, %%mm3 \n\t"
2886 "paddw %%mm2, %%mm0 \n\t"
2887 "paddw %%mm3, %%mm1 \n\t"
2888 "paddw %%mm2, %%mm0 \n\t"
2889 "paddw %%mm3, %%mm1 \n\t"
2890 "psrlw $4, %%mm0 \n\t"
2891 "psrlw $4, %%mm1 \n\t"
2892 "packuswb %%mm1, %%mm0 \n\t"
2893 "pand %%mm6, %%mm0 \n\t"
2894 "pand %%mm5, %%mm4 \n\t"
2895 "por %%mm4, %%mm0 \n\t"
2896 "movq %%mm0, (%0, %3) \n\t"
2897 "addl $16, %1 \n\t"
2898 "addl %2, %0 \n\t"
2899 " js 1b \n\t"
2900
2901 : "+r"(offset), "+r"(temp_sums)
2902 : "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask)
2903 );
2904 }else
2905 src+= step; // src points to begin of the 8x8 Block
2729 2906
2730 if(eq_mask != -1LL){ 2907 if(eq_mask != -1LL){
2908 uint8_t *temp_src= src;
2731 asm volatile( 2909 asm volatile(
2732 "pxor %%mm7, %%mm7 \n\t" 2910 "pxor %%mm7, %%mm7 \n\t"
2733 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars 2911 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars
2734 "andl $0xFFFFFFF8, %%ecx \n\t" // align 2912 "andl $0xFFFFFFF8, %%ecx \n\t" // align
2735 // 0 1 2 3 4 5 6 7 8 9 2913 // 0 1 2 3 4 5 6 7 8 9
2953 "movq %%mm0, (%0) \n\t" 3131 "movq %%mm0, (%0) \n\t"
2954 "movq (%0, %1), %%mm0 \n\t" 3132 "movq (%0, %1), %%mm0 \n\t"
2955 "psubb %%mm1, %%mm0 \n\t" 3133 "psubb %%mm1, %%mm0 \n\t"
2956 "movq %%mm0, (%0, %1) \n\t" 3134 "movq %%mm0, (%0, %1) \n\t"
2957 3135
2958 : "+r" (src) 3136 : "+r" (temp_src)
2959 : "r" (step), "m" (c->pQPb), "m"(eq_mask) 3137 : "r" (step), "m" (c->pQPb), "m"(eq_mask)
2960 : "%eax", "%ecx" 3138 : "%eax", "%ecx"
2961 ); 3139 );
2962 src-= 3*step; //reverse src change from asm
2963 }
2964
2965 for(y=0; y<8; y++){
2966 if((eq_mask>>(y*8))&1){
2967 if((dc_mask>>(y*8))&1){
2968 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
2969 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
2970
2971 int sums[10];
2972 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
2973 sums[1] = sums[0] - first + src[3*step];
2974 sums[2] = sums[1] - first + src[4*step];
2975 sums[3] = sums[2] - first + src[5*step];
2976 sums[4] = sums[3] - first + src[6*step];
2977 sums[5] = sums[4] - src[0*step] + src[7*step];
2978 sums[6] = sums[5] - src[1*step] + last;
2979 sums[7] = sums[6] - src[2*step] + last;
2980 sums[8] = sums[7] - src[3*step] + last;
2981 sums[9] = sums[8] - src[4*step] + last;
2982
2983 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
2984 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
2985 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
2986 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
2987 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
2988 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
2989 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
2990 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
2991 }
2992 }
2993
2994 src += stride;
2995 } 3140 }
2996 /*if(step==16){ 3141 /*if(step==16){
2997 STOP_TIMER("step16") 3142 STOP_TIMER("step16")
2998 }else{ 3143 }else{
2999 STOP_TIMER("stepX") 3144 STOP_TIMER("stepX")