comparison i386/mpegvideo_mmx_template.c @ 4989:0b1e761135cd libavcodec

sse2 & ssse3 versions of dct_quantize. core2: mmx2=154 sse2=73 ssse3=66 (cycles) k8: mmx2=179 sse2=149 p4: mmx2=284 sse2=194
author lorenm
date Sat, 12 May 2007 05:55:09 +0000
parents bbe0bc387a19
children 4dbe6578f811
comparison
equal deleted inserted replaced
4988:689490842cf5 4989:0b1e761135cd
17 * 17 *
18 * You should have received a copy of the GNU Lesser General Public 18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software 19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */ 21 */
22
23 #undef MMREG_WIDTH
24 #undef MM
25 #undef MOVQ
22 #undef SPREADW 26 #undef SPREADW
23 #undef PMAXW 27 #undef PMAXW
24 #undef PMAX 28 #undef PMAX
25 #ifdef HAVE_MMX2 29 #undef SAVE_SIGN
26 #define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t" 30 #undef RESTORE_SIGN
27 #define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t" 31
32 #if defined(HAVE_SSE2)
33 #define MMREG_WIDTH "16"
34 #define MM "%%xmm"
35 #define MOVQ "movdqa"
36 #define SPREADW(a) \
37 "pshuflw $0, "a", "a" \n\t"\
38 "punpcklwd "a", "a" \n\t"
39 #define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
28 #define PMAX(a,b) \ 40 #define PMAX(a,b) \
29 "pshufw $0x0E," #a ", " #b " \n\t"\ 41 "movhlps "a", "b" \n\t"\
30 PMAXW(b, a)\ 42 PMAXW(b, a)\
31 "pshufw $0x01," #a ", " #b " \n\t"\ 43 "pshuflw $0x0E, "a", "b" \n\t"\
44 PMAXW(b, a)\
45 "pshuflw $0x01, "a", "b" \n\t"\
46 PMAXW(b, a)
47 #else
48 #define MMREG_WIDTH "8"
49 #define MM "%%mm"
50 #define MOVQ "movq"
51 #if defined(HAVE_MMX2)
52 #define SPREADW(a) "pshufw $0, "a", "a" \n\t"
53 #define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
54 #define PMAX(a,b) \
55 "pshufw $0x0E, "a", "b" \n\t"\
56 PMAXW(b, a)\
57 "pshufw $0x01, "a", "b" \n\t"\
32 PMAXW(b, a) 58 PMAXW(b, a)
33 #else 59 #else
34 #define SPREADW(a) \ 60 #define SPREADW(a) \
35 "punpcklwd " #a ", " #a " \n\t"\ 61 "punpcklwd "a", "a" \n\t"\
36 "punpcklwd " #a ", " #a " \n\t" 62 "punpcklwd "a", "a" \n\t"
37 #define PMAXW(a,b) \ 63 #define PMAXW(a,b) \
38 "psubusw " #a ", " #b " \n\t"\ 64 "psubusw "a", "b" \n\t"\
39 "paddw " #a ", " #b " \n\t" 65 "paddw "a", "b" \n\t"
40 #define PMAX(a,b) \ 66 #define PMAX(a,b) \
41 "movq " #a ", " #b " \n\t"\ 67 "movq "a", "b" \n\t"\
42 "psrlq $32, " #a " \n\t"\ 68 "psrlq $32, "a" \n\t"\
43 PMAXW(b, a)\ 69 PMAXW(b, a)\
44 "movq " #a ", " #b " \n\t"\ 70 "movq "a", "b" \n\t"\
45 "psrlq $16, " #a " \n\t"\ 71 "psrlq $16, "a" \n\t"\
46 PMAXW(b, a) 72 PMAXW(b, a)
47 73
74 #endif
75 #endif
76
77 #ifdef HAVE_SSSE3
78 #define SAVE_SIGN(a,b) \
79 "movdqa "b", "a" \n\t"\
80 "pabsw "b", "b" \n\t"
81 #define RESTORE_SIGN(a,b) \
82 "psignw "a", "b" \n\t"
83 #else
84 #define SAVE_SIGN(a,b) \
85 "pxor "a", "a" \n\t"\
86 "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
87 "pxor "a", "b" \n\t"\
88 "psubw "a", "b" \n\t" /* ABS(block[i]) */
89 #define RESTORE_SIGN(a,b) \
90 "pxor "a", "b" \n\t"\
91 "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
48 #endif 92 #endif
49 93
50 static int RENAME(dct_quantize)(MpegEncContext *s, 94 static int RENAME(dct_quantize)(MpegEncContext *s,
51 DCTELEM *block, int n, 95 DCTELEM *block, int n,
52 int qscale, int *overflow) 96 int qscale, int *overflow)
53 { 97 {
54 long last_non_zero_p1; 98 long last_non_zero_p1;
55 int level=0, q; //=0 is cuz gcc says uninitalized ... 99 int level=0, q; //=0 is cuz gcc says uninitalized ...
56 const uint16_t *qmat, *bias; 100 const uint16_t *qmat, *bias;
57 DECLARE_ALIGNED_8(int16_t, temp_block[64]); 101 DECLARE_ALIGNED_16(int16_t, temp_block[64]);
58 102
59 assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? 103 assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
60 104
61 //s->fdct (block); 105 //s->fdct (block);
62 RENAMEl(ff_fdct) (block); //cant be anything else ... 106 RENAMEl(ff_fdct) (block); //cant be anything else ...
104 } 148 }
105 149
106 if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ 150 if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
107 151
108 asm volatile( 152 asm volatile(
109 "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1 153 "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
110 SPREADW(%%mm3) 154 SPREADW(MM"3")
111 "pxor %%mm7, %%mm7 \n\t" // 0 155 "pxor "MM"7, "MM"7 \n\t" // 0
112 "pxor %%mm4, %%mm4 \n\t" // 0 156 "pxor "MM"4, "MM"4 \n\t" // 0
113 "movq (%2), %%mm5 \n\t" // qmat[0] 157 MOVQ" (%2), "MM"5 \n\t" // qmat[0]
114 "pxor %%mm6, %%mm6 \n\t" 158 "pxor "MM"6, "MM"6 \n\t"
115 "psubw (%3), %%mm6 \n\t" // -bias[0] 159 "psubw (%3), "MM"6 \n\t" // -bias[0]
116 "mov $-128, %%"REG_a" \n\t" 160 "mov $-128, %%"REG_a" \n\t"
117 ASMALIGN(4) 161 ASMALIGN(4)
118 "1: \n\t" 162 "1: \n\t"
119 "pxor %%mm1, %%mm1 \n\t" // 0 163 MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
120 "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] 164 SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
121 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 165 "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
122 "pxor %%mm1, %%mm0 \n\t" 166 "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
123 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) 167 "por "MM"0, "MM"4 \n\t"
124 "psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] 168 RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
125 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 169 MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
126 "por %%mm0, %%mm4 \n\t" 170 "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
127 "pxor %%mm1, %%mm0 \n\t" 171 MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
128 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) 172 MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
129 "movq %%mm0, (%5, %%"REG_a") \n\t" 173 "pandn "MM"1, "MM"0 \n\t"
130 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 174 PMAXW(MM"0", MM"3")
131 "movq (%4, %%"REG_a"), %%mm1 \n\t" 175 "add $"MMREG_WIDTH", %%"REG_a" \n\t"
132 "movq %%mm7, (%1, %%"REG_a") \n\t" // 0
133 "pandn %%mm1, %%mm0 \n\t"
134 PMAXW(%%mm0, %%mm3)
135 "add $8, %%"REG_a" \n\t"
136 " js 1b \n\t" 176 " js 1b \n\t"
137 PMAX(%%mm3, %%mm0) 177 PMAX(MM"3", MM"0")
138 "movd %%mm3, %%"REG_a" \n\t" 178 "movd "MM"3, %%"REG_a" \n\t"
139 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 179 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
140 : "+a" (last_non_zero_p1) 180 : "+a" (last_non_zero_p1)
141 : "r" (block+64), "r" (qmat), "r" (bias), 181 : "r" (block+64), "r" (qmat), "r" (bias),
142 "r" (inv_zigzag_direct16+64), "r" (temp_block+64) 182 "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
143 ); 183 );
144 // note the asm is split cuz gcc doesnt like that many operands ...
145 asm volatile(
146 "movd %1, %%mm1 \n\t" // max_qcoeff
147 SPREADW(%%mm1)
148 "psubusw %%mm1, %%mm4 \n\t"
149 "packuswb %%mm4, %%mm4 \n\t"
150 "movd %%mm4, %0 \n\t" // *overflow
151 : "=g" (*overflow)
152 : "g" (s->max_qcoeff)
153 );
154 }else{ // FMT_H263 184 }else{ // FMT_H263
155 asm volatile( 185 asm volatile(
156 "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1 186 "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
157 SPREADW(%%mm3) 187 SPREADW(MM"3")
158 "pxor %%mm7, %%mm7 \n\t" // 0 188 "pxor "MM"7, "MM"7 \n\t" // 0
159 "pxor %%mm4, %%mm4 \n\t" // 0 189 "pxor "MM"4, "MM"4 \n\t" // 0
160 "mov $-128, %%"REG_a" \n\t" 190 "mov $-128, %%"REG_a" \n\t"
161 ASMALIGN(4) 191 ASMALIGN(4)
162 "1: \n\t" 192 "1: \n\t"
163 "pxor %%mm1, %%mm1 \n\t" // 0 193 MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
164 "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] 194 SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
165 "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 195 MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0]
166 "pxor %%mm1, %%mm0 \n\t" 196 "paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
167 "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) 197 MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i]
168 "movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0] 198 "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
169 "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] 199 "por "MM"0, "MM"4 \n\t"
170 "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] 200 RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
171 "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 201 MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
172 "por %%mm0, %%mm4 \n\t" 202 "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
173 "pxor %%mm1, %%mm0 \n\t" 203 MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
174 "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) 204 MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
175 "movq %%mm0, (%5, %%"REG_a") \n\t" 205 "pandn "MM"1, "MM"0 \n\t"
176 "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 206 PMAXW(MM"0", MM"3")
177 "movq (%4, %%"REG_a"), %%mm1 \n\t" 207 "add $"MMREG_WIDTH", %%"REG_a" \n\t"
178 "movq %%mm7, (%1, %%"REG_a") \n\t" // 0
179 "pandn %%mm1, %%mm0 \n\t"
180 PMAXW(%%mm0, %%mm3)
181 "add $8, %%"REG_a" \n\t"
182 " js 1b \n\t" 208 " js 1b \n\t"
183 PMAX(%%mm3, %%mm0) 209 PMAX(MM"3", MM"0")
184 "movd %%mm3, %%"REG_a" \n\t" 210 "movd "MM"3, %%"REG_a" \n\t"
185 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 211 "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
186 : "+a" (last_non_zero_p1) 212 : "+a" (last_non_zero_p1)
187 : "r" (block+64), "r" (qmat+64), "r" (bias+64), 213 : "r" (block+64), "r" (qmat+64), "r" (bias+64),
188 "r" (inv_zigzag_direct16+64), "r" (temp_block+64) 214 "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
189 ); 215 );
190 // note the asm is split cuz gcc doesnt like that many operands ... 216 }
191 asm volatile( 217 asm volatile(
192 "movd %1, %%mm1 \n\t" // max_qcoeff 218 "movd %1, "MM"1 \n\t" // max_qcoeff
193 SPREADW(%%mm1) 219 SPREADW(MM"1")
194 "psubusw %%mm1, %%mm4 \n\t" 220 "psubusw "MM"1, "MM"4 \n\t"
195 "packuswb %%mm4, %%mm4 \n\t" 221 "packuswb "MM"4, "MM"4 \n\t"
196 "movd %%mm4, %0 \n\t" // *overflow 222 #ifdef HAVE_SSE2
223 "packuswb "MM"4, "MM"4 \n\t"
224 #endif
225 "movd "MM"4, %0 \n\t" // *overflow
197 : "=g" (*overflow) 226 : "=g" (*overflow)
198 : "g" (s->max_qcoeff) 227 : "g" (s->max_qcoeff)
199 ); 228 );
200 }
201 229
202 if(s->mb_intra) block[0]= level; 230 if(s->mb_intra) block[0]= level;
203 else block[0]= temp_block[0]; 231 else block[0]= temp_block[0];
204 232
205 if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ 233 if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){