Mercurial > libavcodec.hg
comparison i386/mpegvideo_mmx_template.c @ 4989:0b1e761135cd libavcodec
sse2 & ssse3 versions of dct_quantize.
core2: mmx2=154 sse2=73 ssse3=66 (cycles)
k8: mmx2=179 sse2=149
p4: mmx2=284 sse2=194
author | lorenm |
---|---|
date | Sat, 12 May 2007 05:55:09 +0000 |
parents | bbe0bc387a19 |
children | 4dbe6578f811 |
comparison
legend: equal | deleted | inserted | replaced
4988:689490842cf5 | 4989:0b1e761135cd |
---|---|
17 * | 17 * |
18 * You should have received a copy of the GNU Lesser General Public | 18 * You should have received a copy of the GNU Lesser General Public |
19 * License along with FFmpeg; if not, write to the Free Software | 19 * License along with FFmpeg; if not, write to the Free Software |
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
21 */ | 21 */ |
22 | |
23 #undef MMREG_WIDTH | |
24 #undef MM | |
25 #undef MOVQ | |
22 #undef SPREADW | 26 #undef SPREADW |
23 #undef PMAXW | 27 #undef PMAXW |
24 #undef PMAX | 28 #undef PMAX |
25 #ifdef HAVE_MMX2 | 29 #undef SAVE_SIGN |
26 #define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t" | 30 #undef RESTORE_SIGN |
27 #define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t" | 31 |
32 #if defined(HAVE_SSE2) | |
33 #define MMREG_WIDTH "16" | |
34 #define MM "%%xmm" | |
35 #define MOVQ "movdqa" | |
36 #define SPREADW(a) \ | |
37 "pshuflw $0, "a", "a" \n\t"\ | |
38 "punpcklwd "a", "a" \n\t" | |
39 #define PMAXW(a,b) "pmaxsw "a", "b" \n\t" | |
28 #define PMAX(a,b) \ | 40 #define PMAX(a,b) \ |
29 "pshufw $0x0E," #a ", " #b " \n\t"\ | 41 "movhlps "a", "b" \n\t"\ |
30 PMAXW(b, a)\ | 42 PMAXW(b, a)\ |
31 "pshufw $0x01," #a ", " #b " \n\t"\ | 43 "pshuflw $0x0E, "a", "b" \n\t"\ |
44 PMAXW(b, a)\ | |
45 "pshuflw $0x01, "a", "b" \n\t"\ | |
46 PMAXW(b, a) | |
47 #else | |
48 #define MMREG_WIDTH "8" | |
49 #define MM "%%mm" | |
50 #define MOVQ "movq" | |
51 #if defined(HAVE_MMX2) | |
52 #define SPREADW(a) "pshufw $0, "a", "a" \n\t" | |
53 #define PMAXW(a,b) "pmaxsw "a", "b" \n\t" | |
54 #define PMAX(a,b) \ | |
55 "pshufw $0x0E, "a", "b" \n\t"\ | |
56 PMAXW(b, a)\ | |
57 "pshufw $0x01, "a", "b" \n\t"\ | |
32 PMAXW(b, a) | 58 PMAXW(b, a) |
33 #else | 59 #else |
34 #define SPREADW(a) \ | 60 #define SPREADW(a) \ |
35 "punpcklwd " #a ", " #a " \n\t"\ | 61 "punpcklwd "a", "a" \n\t"\ |
36 "punpcklwd " #a ", " #a " \n\t" | 62 "punpcklwd "a", "a" \n\t" |
37 #define PMAXW(a,b) \ | 63 #define PMAXW(a,b) \ |
38 "psubusw " #a ", " #b " \n\t"\ | 64 "psubusw "a", "b" \n\t"\ |
39 "paddw " #a ", " #b " \n\t" | 65 "paddw "a", "b" \n\t" |
40 #define PMAX(a,b) \ | 66 #define PMAX(a,b) \ |
41 "movq " #a ", " #b " \n\t"\ | 67 "movq "a", "b" \n\t"\ |
42 "psrlq $32, " #a " \n\t"\ | 68 "psrlq $32, "a" \n\t"\ |
43 PMAXW(b, a)\ | 69 PMAXW(b, a)\ |
44 "movq " #a ", " #b " \n\t"\ | 70 "movq "a", "b" \n\t"\ |
45 "psrlq $16, " #a " \n\t"\ | 71 "psrlq $16, "a" \n\t"\ |
46 PMAXW(b, a) | 72 PMAXW(b, a) |
47 | 73 |
74 #endif | |
75 #endif | |
76 | |
77 #ifdef HAVE_SSSE3 | |
78 #define SAVE_SIGN(a,b) \ | |
79 "movdqa "b", "a" \n\t"\ | |
80 "pabsw "b", "b" \n\t" | |
81 #define RESTORE_SIGN(a,b) \ | |
82 "psignw "a", "b" \n\t" | |
83 #else | |
84 #define SAVE_SIGN(a,b) \ | |
85 "pxor "a", "a" \n\t"\ | |
86 "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\ | |
87 "pxor "a", "b" \n\t"\ | |
88 "psubw "a", "b" \n\t" /* ABS(block[i]) */ | |
89 #define RESTORE_SIGN(a,b) \ | |
90 "pxor "a", "b" \n\t"\ | |
91 "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) | |
48 #endif | 92 #endif |
49 | 93 |
50 static int RENAME(dct_quantize)(MpegEncContext *s, | 94 static int RENAME(dct_quantize)(MpegEncContext *s, |
51 DCTELEM *block, int n, | 95 DCTELEM *block, int n, |
52 int qscale, int *overflow) | 96 int qscale, int *overflow) |
53 { | 97 { |
54 long last_non_zero_p1; | 98 long last_non_zero_p1; |
55 int level=0, q; //=0 is cuz gcc says uninitalized ... | 99 int level=0, q; //=0 is cuz gcc says uninitalized ... |
56 const uint16_t *qmat, *bias; | 100 const uint16_t *qmat, *bias; |
57 DECLARE_ALIGNED_8(int16_t, temp_block[64]); | 101 DECLARE_ALIGNED_16(int16_t, temp_block[64]); |
58 | 102 |
59 assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? | 103 assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? |
60 | 104 |
61 //s->fdct (block); | 105 //s->fdct (block); |
62 RENAMEl(ff_fdct) (block); //cant be anything else ... | 106 RENAMEl(ff_fdct) (block); //cant be anything else ... |
104 } | 148 } |
105 | 149 |
106 if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ | 150 if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ |
107 | 151 |
108 asm volatile( | 152 asm volatile( |
109 "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1 | 153 "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 |
110 SPREADW(%%mm3) | 154 SPREADW(MM"3") |
111 "pxor %%mm7, %%mm7 \n\t" // 0 | 155 "pxor "MM"7, "MM"7 \n\t" // 0 |
112 "pxor %%mm4, %%mm4 \n\t" // 0 | 156 "pxor "MM"4, "MM"4 \n\t" // 0 |
113 "movq (%2), %%mm5 \n\t" // qmat[0] | 157 MOVQ" (%2), "MM"5 \n\t" // qmat[0] |
114 "pxor %%mm6, %%mm6 \n\t" | 158 "pxor "MM"6, "MM"6 \n\t" |
115 "psubw (%3), %%mm6 \n\t" // -bias[0] | 159 "psubw (%3), "MM"6 \n\t" // -bias[0] |
116 "mov $-128, %%"REG_a" \n\t" | 160 "mov $-128, %%"REG_a" \n\t" |
117 ASMALIGN(4) | 161 ASMALIGN(4) |
118 "1: \n\t" | 162 "1: \n\t" |
119 "pxor %%mm1, %%mm1 \n\t" // 0 | 163 MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] |
120 "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] | 164 SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) |
121 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 | 165 "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] |
122 "pxor %%mm1, %%mm0 \n\t" | 166 "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 |
123 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) | 167 "por "MM"0, "MM"4 \n\t" |
124 "psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] | 168 RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) |
125 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 | 169 MOVQ" "MM"0, (%5, %%"REG_a") \n\t" |
126 "por %%mm0, %%mm4 \n\t" | 170 "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 |
127 "pxor %%mm1, %%mm0 \n\t" | 171 MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" |
128 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) | 172 MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 |
129 "movq %%mm0, (%5, %%"REG_a") \n\t" | 173 "pandn "MM"1, "MM"0 \n\t" |
130 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 | 174 PMAXW(MM"0", MM"3") |
131 "movq (%4, %%"REG_a"), %%mm1 \n\t" | 175 "add $"MMREG_WIDTH", %%"REG_a" \n\t" |
132 "movq %%mm7, (%1, %%"REG_a") \n\t" // 0 | |
133 "pandn %%mm1, %%mm0 \n\t" | |
134 PMAXW(%%mm0, %%mm3) | |
135 "add $8, %%"REG_a" \n\t" | |
136 " js 1b \n\t" | 176 " js 1b \n\t" |
137 PMAX(%%mm3, %%mm0) | 177 PMAX(MM"3", MM"0") |
138 "movd %%mm3, %%"REG_a" \n\t" | 178 "movd "MM"3, %%"REG_a" \n\t" |
139 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 | 179 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 |
140 : "+a" (last_non_zero_p1) | 180 : "+a" (last_non_zero_p1) |
141 : "r" (block+64), "r" (qmat), "r" (bias), | 181 : "r" (block+64), "r" (qmat), "r" (bias), |
142 "r" (inv_zigzag_direct16+64), "r" (temp_block+64) | 182 "r" (inv_zigzag_direct16+64), "r" (temp_block+64) |
143 ); | 183 ); |
144 // note the asm is split cuz gcc doesnt like that many operands ... | |
145 asm volatile( | |
146 "movd %1, %%mm1 \n\t" // max_qcoeff | |
147 SPREADW(%%mm1) | |
148 "psubusw %%mm1, %%mm4 \n\t" | |
149 "packuswb %%mm4, %%mm4 \n\t" | |
150 "movd %%mm4, %0 \n\t" // *overflow | |
151 : "=g" (*overflow) | |
152 : "g" (s->max_qcoeff) | |
153 ); | |
154 }else{ // FMT_H263 | 184 }else{ // FMT_H263 |
155 asm volatile( | 185 asm volatile( |
156 "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1 | 186 "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 |
157 SPREADW(%%mm3) | 187 SPREADW(MM"3") |
158 "pxor %%mm7, %%mm7 \n\t" // 0 | 188 "pxor "MM"7, "MM"7 \n\t" // 0 |
159 "pxor %%mm4, %%mm4 \n\t" // 0 | 189 "pxor "MM"4, "MM"4 \n\t" // 0 |
160 "mov $-128, %%"REG_a" \n\t" | 190 "mov $-128, %%"REG_a" \n\t" |
161 ASMALIGN(4) | 191 ASMALIGN(4) |
162 "1: \n\t" | 192 "1: \n\t" |
163 "pxor %%mm1, %%mm1 \n\t" // 0 | 193 MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] |
164 "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] | 194 SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) |
165 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 | 195 MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0] |
166 "pxor %%mm1, %%mm0 \n\t" | 196 "paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] |
167 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) | 197 MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i] |
168 "movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0] | 198 "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 |
169 "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] | 199 "por "MM"0, "MM"4 \n\t" |
170 "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] | 200 RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) |
171 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 | 201 MOVQ" "MM"0, (%5, %%"REG_a") \n\t" |
172 "por %%mm0, %%mm4 \n\t" | 202 "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 |
173 "pxor %%mm1, %%mm0 \n\t" | 203 MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" |
174 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) | 204 MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 |
175 "movq %%mm0, (%5, %%"REG_a") \n\t" | 205 "pandn "MM"1, "MM"0 \n\t" |
176 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 | 206 PMAXW(MM"0", MM"3") |
177 "movq (%4, %%"REG_a"), %%mm1 \n\t" | 207 "add $"MMREG_WIDTH", %%"REG_a" \n\t" |
178 "movq %%mm7, (%1, %%"REG_a") \n\t" // 0 | |
179 "pandn %%mm1, %%mm0 \n\t" | |
180 PMAXW(%%mm0, %%mm3) | |
181 "add $8, %%"REG_a" \n\t" | |
182 " js 1b \n\t" | 208 " js 1b \n\t" |
183 PMAX(%%mm3, %%mm0) | 209 PMAX(MM"3", MM"0") |
184 "movd %%mm3, %%"REG_a" \n\t" | 210 "movd "MM"3, %%"REG_a" \n\t" |
185 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 | 211 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 |
186 : "+a" (last_non_zero_p1) | 212 : "+a" (last_non_zero_p1) |
187 : "r" (block+64), "r" (qmat+64), "r" (bias+64), | 213 : "r" (block+64), "r" (qmat+64), "r" (bias+64), |
188 "r" (inv_zigzag_direct16+64), "r" (temp_block+64) | 214 "r" (inv_zigzag_direct16+64), "r" (temp_block+64) |
189 ); | 215 ); |
190 // note the asm is split cuz gcc doesnt like that many operands ... | 216 } |
191 asm volatile( | 217 asm volatile( |
192 "movd %1, %%mm1 \n\t" // max_qcoeff | 218 "movd %1, "MM"1 \n\t" // max_qcoeff |
193 SPREADW(%%mm1) | 219 SPREADW(MM"1") |
194 "psubusw %%mm1, %%mm4 \n\t" | 220 "psubusw "MM"1, "MM"4 \n\t" |
195 "packuswb %%mm4, %%mm4 \n\t" | 221 "packuswb "MM"4, "MM"4 \n\t" |
196 "movd %%mm4, %0 \n\t" // *overflow | 222 #ifdef HAVE_SSE2 |
223 "packuswb "MM"4, "MM"4 \n\t" | |
224 #endif | |
225 "movd "MM"4, %0 \n\t" // *overflow | |
197 : "=g" (*overflow) | 226 : "=g" (*overflow) |
198 : "g" (s->max_qcoeff) | 227 : "g" (s->max_qcoeff) |
199 ); | 228 ); |
200 } | |
201 | 229 |
202 if(s->mb_intra) block[0]= level; | 230 if(s->mb_intra) block[0]= level; |
203 else block[0]= temp_block[0]; | 231 else block[0]= temp_block[0]; |
204 | 232 |
205 if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ | 233 if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ |