Mercurial > libavcodec.hg
comparison x86/dsputil_yasm.asm @ 8760:31138c296ac6 libavcodec
ff_add_hfyu_median_prediction_mmx2
overall ffvhuff decoding speedup: 28% on core2, 25% on k8.
author | lorenm |
---|---|
date | Sun, 08 Feb 2009 17:45:30 +0000 |
parents | 7768bdfd4f7b |
children | 12c8175d6db5 |
comparison
equal
deleted
inserted
replaced
8759:4cea2f47219a | 8760:31138c296ac6 |
---|---|
88 FLOAT_TO_INT16_INTERLEAVE6 3dnow | 88 FLOAT_TO_INT16_INTERLEAVE6 3dnow |
89 %undef pswapd | 89 %undef pswapd |
90 FLOAT_TO_INT16_INTERLEAVE6 3dn2 | 90 FLOAT_TO_INT16_INTERLEAVE6 3dn2 |
91 %undef cvtps2pi | 91 %undef cvtps2pi |
92 | 92 |
93 | |
94 | |
;-----------------------------------------------------------------------------
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *top, uint8_t *diff, int w, int *left, int *left_top)
;
; Median-prediction reconstruction, 8 bytes per loop iteration:
;     dst[i] = median(l, t, l + t - tl) + diff[i]          (byte arithmetic)
; with t  = top[i],
;      tl = top[i-1]  (low byte of *left_top for i == 0),
;      l  = dst[i-1]  (low byte of *left     for i == 0).
;
; In:   dst, top, diff = byte buffers; w = byte count; left, left_top = in/out
; Out:  *left = dst[w-1], *left_top = top[w-1] (both zero-extended to int)
; Notes:
;   - Data is read and written in whole 8-byte groups and at least one group
;     is always processed, so the buffers are accessed up to the next
;     multiple of 8 past w.
;   - *left_top is OR-ed with (top << 8), so its bits above the low byte must
;     be zero on entry (this function itself only ever stores 0..255 there).
;   - No emms is issued in the code below.
;     NOTE(review): presumably the caller clears MMX state - confirm.
; Register roles inside the loop:
;   mm0 = t - tl (per byte)     mm1 = t              mm2 = residual (diff)
;   mm3 = running left pixel in byte 0               mm7 = output accumulator
;   mm4, mm5, mm6 = scratch
;-----------------------------------------------------------------------------
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
%ifdef ARCH_X86_64
    ; w is a C int: the upper half of its 64-bit register is undefined by the
    ; ABI, but wq is used below as a full-width offset. Sign-extend it first.
    ; (wd = 32-bit alias of the w argument as set up by cglobal.)
    movsxd  wq, wd
%endif
    movq    mm0, [topq]                 ; first 8 top bytes
    movq    mm2, mm0
    movd    mm4, [left_topq]            ; tl for byte 0
    psllq   mm2, 8                      ; top shifted up one byte = tl for bytes 1..7
    movq    mm1, mm0                    ; t
    por     mm4, mm2                    ; tl vector
    movd    mm3, [leftq]                ; l for byte 0
    psubb   mm0, mm4                    ; t-tl

    ; Point at the end of each buffer and count a negative offset up to zero.
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp     .skip                       ; first group is already set up above

.loop:
    movq    mm4, [topq+wq]              ; next 8 top bytes
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1                    ; byte 0 of mm1 = last top byte of previous group
    movq    mm1, mm0                    ; t
    psubb   mm0, mm4                    ; t-tl
.skip:
    movq    mm2, [diffq+wq]             ; residuals for this group

    ; Each output byte depends on the previous one, so the 8 bytes are produced
    ; serially: byte 0 of every register is the "current" pixel and the
    ; registers are shifted down one byte after each step.
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3                    ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1                    ; max(l, t)
    pminub  mm5, mm1                    ; min(l, t)
    pminub  mm3, mm4                    ; min(max(l, t), t-tl+l)
    pmaxub  mm3, mm5                    ; median
    paddb   mm3, mm2                    ; +residual -> new left pixel in byte 0
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56                     ; result byte enters at the top of mm7
%else
    movq    mm6, mm3
    psrlq   mm7, 8                      ; make room; earlier results move down
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8                      ; advance to the next pixel
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep

    movq    [dstq+wq], mm7              ; store 8 reconstructed bytes
    add     wq, 8
    jl      .loop

    ; Hand the last pixel of this row and of the top row back to the caller.
    ; r2 (diff) is no longer needed and is reused as scratch.
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET