Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 2040:5de466b3360e libavcodec
per line lowpass filter in mmx
author | michael |
---|---|
date | Fri, 28 May 2004 13:23:53 +0000 |
parents | f25e485a7850 |
children | b996fbe0a7e7 |
comparison
equal
deleted
inserted
replaced
2039:f25e485a7850 | 2040:5de466b3360e |
---|---|
2615 #ifdef HAVE_MMX | 2615 #ifdef HAVE_MMX |
2616 /** | 2616 /** |
2617 * accurate deblock filter | 2617 * accurate deblock filter |
2618 */ | 2618 */ |
2619 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ | 2619 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ |
2620 int y; | |
2621 const int QP= c->QP; | |
2622 int64_t dc_mask, eq_mask; | 2620 int64_t dc_mask, eq_mask; |
2621 int64_t sums[10*8*2]; | |
2623 src+= step*3; // src points to begin of the 8x8 Block | 2622 src+= step*3; // src points to begin of the 8x8 Block |
2624 //START_TIMER | 2623 //START_TIMER |
2625 asm volatile( | 2624 asm volatile( |
2626 "movq %0, %%mm7 \n\t" | 2625 "movq %0, %%mm7 \n\t" |
2627 "movq %1, %%mm6 \n\t" | 2626 "movq %1, %%mm6 \n\t" |
2723 : "=m" (eq_mask), "=m" (dc_mask) | 2722 : "=m" (eq_mask), "=m" (dc_mask) |
2724 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) | 2723 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) |
2725 : "%eax" | 2724 : "%eax" |
2726 ); | 2725 ); |
2727 | 2726 |
2728 src+= step; // src points to begin of the 8x8 Block | 2727 if(dc_mask & eq_mask){ |
2728 int offset= -8*step; | |
2729 int64_t *temp_sums= sums; | |
2730 | |
2731 asm volatile( | |
2732 "movq %2, %%mm0 \n\t" // QP,..., QP | |
2733 "pxor %%mm4, %%mm4 \n\t" | |
2734 | |
2735 "movq (%0), %%mm6 \n\t" | |
2736 "movq (%0, %1), %%mm5 \n\t" | |
2737 "movq %%mm5, %%mm1 \n\t" | |
2738 "movq %%mm6, %%mm2 \n\t" | |
2739 "psubusb %%mm6, %%mm5 \n\t" | |
2740 "psubusb %%mm1, %%mm2 \n\t" | |
2741 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
2742 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
2743 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
2744 | |
2745 "pxor %%mm6, %%mm1 \n\t" | |
2746 "pand %%mm0, %%mm1 \n\t" | |
2747 "pxor %%mm1, %%mm6 \n\t" | |
2748 // 0:QP 6:First | |
2749 | |
2750 "movq (%0, %1, 8), %%mm5 \n\t" | |
2751 "addl %1, %0 \n\t" // %0 points to line 1 not 0 | |
2752 "movq (%0, %1, 8), %%mm7 \n\t" | |
2753 "movq %%mm5, %%mm1 \n\t" | |
2754 "movq %%mm7, %%mm2 \n\t" | |
2755 "psubusb %%mm7, %%mm5 \n\t" | |
2756 "psubusb %%mm1, %%mm2 \n\t" | |
2757 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
2758 "movq %2, %%mm0 \n\t" // QP,..., QP | |
2759 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
2760 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
2761 | |
2762 "pxor %%mm7, %%mm1 \n\t" | |
2763 "pand %%mm0, %%mm1 \n\t" | |
2764 "pxor %%mm1, %%mm7 \n\t" | |
2765 | |
2766 "movq %%mm6, %%mm5 \n\t" | |
2767 "punpckhbw %%mm4, %%mm6 \n\t" | |
2768 "punpcklbw %%mm4, %%mm5 \n\t" | |
2769 // 4:0 5/6:First 7:Last | |
2770 | |
2771 "movq %%mm5, %%mm0 \n\t" | |
2772 "movq %%mm6, %%mm1 \n\t" | |
2773 "psllw $2, %%mm0 \n\t" | |
2774 "psllw $2, %%mm1 \n\t" | |
2775 "paddw "MANGLE(w04)", %%mm0 \n\t" | |
2776 "paddw "MANGLE(w04)", %%mm1 \n\t" | |
2777 | |
2778 #define NEXT\ | |
2779 "movq (%0), %%mm2 \n\t"\ | |
2780 "movq (%0), %%mm3 \n\t"\ | |
2781 "addl %1, %0 \n\t"\ | |
2782 "punpcklbw %%mm4, %%mm2 \n\t"\ | |
2783 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
2784 "paddw %%mm2, %%mm0 \n\t"\ | |
2785 "paddw %%mm3, %%mm1 \n\t" | |
2786 | |
2787 #define PREV\ | |
2788 "movq (%0), %%mm2 \n\t"\ | |
2789 "movq (%0), %%mm3 \n\t"\ | |
2790 "addl %1, %0 \n\t"\ | |
2791 "punpcklbw %%mm4, %%mm2 \n\t"\ | |
2792 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
2793 "psubw %%mm2, %%mm0 \n\t"\ | |
2794 "psubw %%mm3, %%mm1 \n\t" | |
2795 | |
2796 | |
2797 NEXT //0 | |
2798 NEXT //1 | |
2799 NEXT //2 | |
2800 "movq %%mm0, (%3) \n\t" | |
2801 "movq %%mm1, 8(%3) \n\t" | |
2802 | |
2803 NEXT //3 | |
2804 "psubw %%mm5, %%mm0 \n\t" | |
2805 "psubw %%mm6, %%mm1 \n\t" | |
2806 "movq %%mm0, 16(%3) \n\t" | |
2807 "movq %%mm1, 24(%3) \n\t" | |
2808 | |
2809 NEXT //4 | |
2810 "psubw %%mm5, %%mm0 \n\t" | |
2811 "psubw %%mm6, %%mm1 \n\t" | |
2812 "movq %%mm0, 32(%3) \n\t" | |
2813 "movq %%mm1, 40(%3) \n\t" | |
2814 | |
2815 NEXT //5 | |
2816 "psubw %%mm5, %%mm0 \n\t" | |
2817 "psubw %%mm6, %%mm1 \n\t" | |
2818 "movq %%mm0, 48(%3) \n\t" | |
2819 "movq %%mm1, 56(%3) \n\t" | |
2820 | |
2821 NEXT //6 | |
2822 "psubw %%mm5, %%mm0 \n\t" | |
2823 "psubw %%mm6, %%mm1 \n\t" | |
2824 "movq %%mm0, 64(%3) \n\t" | |
2825 "movq %%mm1, 72(%3) \n\t" | |
2826 | |
2827 "movq %%mm7, %%mm6 \n\t" | |
2828 "punpckhbw %%mm4, %%mm7 \n\t" | |
2829 "punpcklbw %%mm4, %%mm6 \n\t" | |
2830 | |
2831 NEXT //7 | |
2832 "movl %4, %0 \n\t" | |
2833 "addl %1, %0 \n\t" | |
2834 PREV //0 | |
2835 "movq %%mm0, 80(%3) \n\t" | |
2836 "movq %%mm1, 88(%3) \n\t" | |
2837 | |
2838 PREV //1 | |
2839 "paddw %%mm6, %%mm0 \n\t" | |
2840 "paddw %%mm7, %%mm1 \n\t" | |
2841 "movq %%mm0, 96(%3) \n\t" | |
2842 "movq %%mm1, 104(%3) \n\t" | |
2843 | |
2844 PREV //2 | |
2845 "paddw %%mm6, %%mm0 \n\t" | |
2846 "paddw %%mm7, %%mm1 \n\t" | |
2847 "movq %%mm0, 112(%3) \n\t" | |
2848 "movq %%mm1, 120(%3) \n\t" | |
2849 | |
2850 PREV //3 | |
2851 "paddw %%mm6, %%mm0 \n\t" | |
2852 "paddw %%mm7, %%mm1 \n\t" | |
2853 "movq %%mm0, 128(%3) \n\t" | |
2854 "movq %%mm1, 136(%3) \n\t" | |
2855 | |
2856 PREV //4 | |
2857 "paddw %%mm6, %%mm0 \n\t" | |
2858 "paddw %%mm7, %%mm1 \n\t" | |
2859 "movq %%mm0, 144(%3) \n\t" | |
2860 "movq %%mm1, 152(%3) \n\t" | |
2861 | |
2862 "movl %4, %0 \n\t" //FIXME | |
2863 | |
2864 : "+&r"(src) | |
2865 : "r" (step), "m" (c->pQPb), "r"(sums), "g"(src) | |
2866 ); | |
2867 | |
2868 src+= step; // src points to begin of the 8x8 Block | |
2869 | |
2870 asm volatile( | |
2871 "movq %4, %%mm6 \n\t" | |
2872 "pcmpeqb %%mm5, %%mm5 \n\t" | |
2873 "pxor %%mm6, %%mm5 \n\t" | |
2874 "pxor %%mm7, %%mm7 \n\t" | |
2875 | |
2876 "1: \n\t" | |
2877 "movq (%1), %%mm0 \n\t" | |
2878 "movq 8(%1), %%mm1 \n\t" | |
2879 "paddw 32(%1), %%mm0 \n\t" | |
2880 "paddw 40(%1), %%mm1 \n\t" | |
2881 "movq (%0, %3), %%mm2 \n\t" | |
2882 "movq %%mm2, %%mm3 \n\t" | |
2883 "movq %%mm2, %%mm4 \n\t" | |
2884 "punpcklbw %%mm7, %%mm2 \n\t" | |
2885 "punpckhbw %%mm7, %%mm3 \n\t" | |
2886 "paddw %%mm2, %%mm0 \n\t" | |
2887 "paddw %%mm3, %%mm1 \n\t" | |
2888 "paddw %%mm2, %%mm0 \n\t" | |
2889 "paddw %%mm3, %%mm1 \n\t" | |
2890 "psrlw $4, %%mm0 \n\t" | |
2891 "psrlw $4, %%mm1 \n\t" | |
2892 "packuswb %%mm1, %%mm0 \n\t" | |
2893 "pand %%mm6, %%mm0 \n\t" | |
2894 "pand %%mm5, %%mm4 \n\t" | |
2895 "por %%mm4, %%mm0 \n\t" | |
2896 "movq %%mm0, (%0, %3) \n\t" | |
2897 "addl $16, %1 \n\t" | |
2898 "addl %2, %0 \n\t" | |
2899 " js 1b \n\t" | |
2900 | |
2901 : "+r"(offset), "+r"(temp_sums) | |
2902 : "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask) | |
2903 ); | |
2904 }else | |
2905 src+= step; // src points to begin of the 8x8 Block | |
2729 | 2906 |
2730 if(eq_mask != -1LL){ | 2907 if(eq_mask != -1LL){ |
2908 uint8_t *temp_src= src; | |
2731 asm volatile( | 2909 asm volatile( |
2732 "pxor %%mm7, %%mm7 \n\t" | 2910 "pxor %%mm7, %%mm7 \n\t" |
2733 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars | 2911 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
2734 "andl $0xFFFFFFF8, %%ecx \n\t" // align | 2912 "andl $0xFFFFFFF8, %%ecx \n\t" // align |
2735 // 0 1 2 3 4 5 6 7 8 9 | 2913 // 0 1 2 3 4 5 6 7 8 9 |
2953 "movq %%mm0, (%0) \n\t" | 3131 "movq %%mm0, (%0) \n\t" |
2954 "movq (%0, %1), %%mm0 \n\t" | 3132 "movq (%0, %1), %%mm0 \n\t" |
2955 "psubb %%mm1, %%mm0 \n\t" | 3133 "psubb %%mm1, %%mm0 \n\t" |
2956 "movq %%mm0, (%0, %1) \n\t" | 3134 "movq %%mm0, (%0, %1) \n\t" |
2957 | 3135 |
2958 : "+r" (src) | 3136 : "+r" (temp_src) |
2959 : "r" (step), "m" (c->pQPb), "m"(eq_mask) | 3137 : "r" (step), "m" (c->pQPb), "m"(eq_mask) |
2960 : "%eax", "%ecx" | 3138 : "%eax", "%ecx" |
2961 ); | 3139 ); |
2962 src-= 3*step; //reverse src change from asm | |
2963 } | |
2964 | |
2965 for(y=0; y<8; y++){ | |
2966 if((eq_mask>>(y*8))&1){ | |
2967 if((dc_mask>>(y*8))&1){ | |
2968 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; | |
2969 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; | |
2970 | |
2971 int sums[10]; | |
2972 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; | |
2973 sums[1] = sums[0] - first + src[3*step]; | |
2974 sums[2] = sums[1] - first + src[4*step]; | |
2975 sums[3] = sums[2] - first + src[5*step]; | |
2976 sums[4] = sums[3] - first + src[6*step]; | |
2977 sums[5] = sums[4] - src[0*step] + src[7*step]; | |
2978 sums[6] = sums[5] - src[1*step] + last; | |
2979 sums[7] = sums[6] - src[2*step] + last; | |
2980 sums[8] = sums[7] - src[3*step] + last; | |
2981 sums[9] = sums[8] - src[4*step] + last; | |
2982 | |
2983 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; | |
2984 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; | |
2985 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; | |
2986 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; | |
2987 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; | |
2988 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; | |
2989 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; | |
2990 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; | |
2991 } | |
2992 } | |
2993 | |
2994 src += stride; | |
2995 } | 3140 } |
2996 /*if(step==16){ | 3141 /*if(step==16){ |
2997 STOP_TIMER("step16") | 3142 STOP_TIMER("step16") |
2998 }else{ | 3143 }else{ |
2999 STOP_TIMER("stepX") | 3144 STOP_TIMER("stepX") |