comparison x86/dsputil_yasm.asm @ 8760:31138c296ac6 libavcodec

ff_add_hfyu_median_prediction_mmx2: overall ffvhuff decoding speedup of 28% on core2, 25% on k8.
author lorenm
date Sun, 08 Feb 2009 17:45:30 +0000
parents 7768bdfd4f7b
children 12c8175d6db5
comparison
equal deleted inserted replaced
8759:4cea2f47219a 8760:31138c296ac6
88 FLOAT_TO_INT16_INTERLEAVE6 3dnow 88 FLOAT_TO_INT16_INTERLEAVE6 3dnow
89 %undef pswapd 89 %undef pswapd
90 FLOAT_TO_INT16_INTERLEAVE6 3dn2 90 FLOAT_TO_INT16_INTERLEAVE6 3dn2
91 %undef cvtps2pi 91 %undef cvtps2pi
92 92
93
94
95 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top) -- per byte: dst[i] = median(l, t, l+t-tl) + diff[i], 8 bytes per loop pass; *left / *left_top are read on entry and rewritten on exit
96 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top ; args are used below as dstq/topq/diffq/wq/leftq/left_topq; NOTE(review): 6,6,0 is presumably arg count, GPR count, XMM count per x86inc - confirm
97 movq mm0, [topq] ; mm0 = first 8 bytes of the top row (t)
98 movq mm2, mm0 ; mm2 = copy of t, used to build tl
99 movd mm4, [left_topq] ; mm4 = 32-bit *left_top, seeds tl for byte 0
100 psllq mm2, 8 ; move t up one byte lane: lane k holds t[k-1], lane 0 is zero
101 movq mm1, mm0 ; mm1 = t, kept for the median
102 por mm4, mm2 ; mm4 = tl per lane; NOTE(review): assumes *left_top fits in one byte, higher bits would be ORed into lanes 1-3 - confirm against caller
103 movd mm3, [leftq] ; mm3 = 32-bit *left; only lane 0 (l) ends up in the output
104 psubb mm0, mm4 ; mm0 = t-tl per byte (wraps mod 256)
105 add dstq, wq ; point dst at its end so [dstq+wq] with negative wq walks forward
106 add topq, wq ; same for top
107 add diffq, wq ; same for diff
108 neg wq ; wq = -w, counts up toward zero; NOTE(review): prototype declares w as int but the full-width wq is used - confirm it arrives sign-extended on 64-bit targets
109 jmp .skip ; t and t-tl for the first chunk are already set up, so enter past the reload
110 .loop: ; reload path for every chunk after the first
111 movq mm4, [topq+wq] ; mm4 = next 8 top bytes
112 movq mm0, mm4 ; mm0 = copy of the new t
113 psllq mm4, 8 ; lane k holds t[k-1], lane 0 is zero
114 por mm4, mm1 ; lane 0 = last top byte of the previous chunk (mm1 was shifted down 7 lanes by the unrolled body)
115 movq mm1, mm0 ; mm1 = t for this chunk
116 psubb mm0, mm4 ; mm0 = t-tl per byte
117 .skip:
118 movq mm2, [diffq+wq] ; mm2 = 8 residual bytes
119 %assign i 0
120 %rep 8
121 movq mm4, mm0 ; each repetition produces one output byte in lane 0
122 paddb mm4, mm3 ; mm4 = t-tl+l (gradient predictor)
123 movq mm5, mm3 ; mm5 = l
124 pmaxub mm3, mm1 ; mm3 = max(l, t)
125 pminub mm5, mm1 ; mm5 = min(l, t)
126 pminub mm3, mm4 ; mm3 = min(max(l, t), t-tl+l)
127 pmaxub mm3, mm5 ; mm3 = median of l, t, t-tl+l
128 paddb mm3, mm2 ; add residual: lane 0 is the output byte and the l for the next repetition
129 %if i==0
130 movq mm7, mm3 ; first byte of the chunk starts the output accumulator
131 psllq mm7, 56 ; keep only lane 0, parked in the top lane
132 %else
133 movq mm6, mm3 ; later bytes: take a copy of the result
134 psrlq mm7, 8 ; slide earlier output bytes down one lane
135 psllq mm6, 56 ; isolate the new byte in the top lane
136 por mm7, mm6 ; insert it; after 8 repetitions byte 0 sits in lane 0
137 %endif
138 %if i<7
139 psrlq mm0, 8 ; bring the next byte's t-tl into lane 0
140 psrlq mm1, 8 ; bring the next byte's t into lane 0
141 psrlq mm2, 8 ; bring the next byte's residual into lane 0
142 %endif
143 %assign i i+1
144 %endrep
145 movq [dstq+wq], mm7 ; store 8 reconstructed bytes
146 add wq, 8 ; advance to the next 8-byte chunk
147 jl .loop ; repeat while wq is still negative; NOTE(review): always processes at least one whole 8-byte chunk, so w <= 0 or w not a multiple of 8 touches bytes past w - confirm caller guarantees
148 movzx r2d, byte [dstq-1] ; r2d = dst[w-1] (dstq was advanced by w); NOTE(review): r2 is presumably the diff argument register, no longer needed here - confirm x86inc numbering
149 mov [leftq], r2d ; *left = that byte, zero-extended to 32 bits
150 movzx r2d, byte [topq-1] ; r2d = top[w-1]
151 mov [left_topq], r2d ; *left_top = that byte, zero-extended to 32 bits
152 RET ; NOTE(review): no emms is issued in this routine, MMX state is still live at return - presumably handled by the caller, confirm