comparison x86/h264dsp_mmx.c @ 12492:58a960d6e34c libavcodec

Rename h264_idct_sse2.asm to h264_idct.asm; move inline IDCT asm from h264dsp_mmx.c to h264_idct.asm (as yasm code). Because the loops are now coded in asm instead of C, this is (depending on the function) up to 50% faster for cases where gcc didn't do a great job at looping. Since h264_idct_add8() is now faster than the manual loop setup in h264.c, in-asm idct calling can now be enabled for chroma as well (see r16207). For MMX, this is 5% faster. For SSE2 (which isn't done for chroma if h264.c does the looping), this makes it up to 50% faster. Speed gain overall is ~0.5-1.0%.
author rbultje
date Tue, 14 Sep 2010 13:36:26 +0000
parents 14896fa76003
children 6bc14239edfb
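
The C dispatch loops this commit moves into assembly have the shape of the deleted ff_h264_idct_add16_mmx()/ff_h264_idct_add8_mmx() further down: walk the 4x4 blocks, test the non-zero-coefficient table, and branch to a full IDCT-add, a DC-only add, or nothing. The sketch below restates that pattern for orientation only; the helper name idct_add16_c_loop is hypothetical, and it leans on the file's own scan8 table, DCTELEM type and ff_h264_idct_add_mmx() rather than being a standalone program.

    /* Illustrative sketch, not committed code: the per-4x4-block dispatch loop
     * that gcc used to compile from C and that now lives in h264_idct.asm.
     * Blocks with no nonzero coefficients are skipped; the rest get the SIMD
     * add-IDCT written straight into the destination plane. */
    static void idct_add16_c_loop(uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride,
                                  const uint8_t nnzc[6*8])
    {
        int i;
        for (i = 0; i < 16; i++) {
            if (nnzc[scan8[i]])               /* any coefficients in block i? */
                ff_h264_idct_add_mmx(dst + block_offset[i],
                                     block + i * 16, stride);
        }
    }

The chroma variant (ff_h264_idct_add8_*) is the same loop over i = 16..23 with dest[(i&4)>>2] selecting the U or V plane; having that loop in asm is what makes it cheaper than the manual per-block setup in h264.c and lets chroma IDCT be handed over as well (the r16207 reference above).
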
comparison
12491:990f8a5fc8af (old)   12492:58a960d6e34c (new)
27    27   DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
28    28
29    29   /***********************************/
30    30   /* IDCT */
31    31

inserted (new lines 32-62): prototypes for the yasm implementations in h264_idct.asm
32   void ff_h264_idct_add_mmx     (uint8_t *dst, int16_t *block, int stride);
33   void ff_h264_idct8_add_mmx    (uint8_t *dst, int16_t *block, int stride);
34   void ff_h264_idct8_add_sse2   (uint8_t *dst, int16_t *block, int stride);
35   void ff_h264_idct_dc_add_mmx2 (uint8_t *dst, int16_t *block, int stride);
36   void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride);
37
38   void ff_h264_idct_add16_mmx      (uint8_t *dst, const int *block_offset,
39                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
40   void ff_h264_idct8_add4_mmx      (uint8_t *dst, const int *block_offset,
41                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
42   void ff_h264_idct_add16_mmx2     (uint8_t *dst, const int *block_offset,
43                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
44   void ff_h264_idct_add16intra_mmx (uint8_t *dst, const int *block_offset,
45                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
46   void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
47                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
48   void ff_h264_idct8_add4_mmx2     (uint8_t *dst, const int *block_offset,
49                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
50   void ff_h264_idct8_add4_sse2     (uint8_t *dst, const int *block_offset,
51                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
52   void ff_h264_idct_add8_mmx       (uint8_t **dest, const int *block_offset,
53                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
54   void ff_h264_idct_add8_mmx2      (uint8_t **dest, const int *block_offset,
55                                     DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
56
57   void ff_h264_idct_add16_sse2     (uint8_t *dst, const int *block_offset, DCTELEM *block,
58                                     int stride, const uint8_t nnzc[6*8]);
59   void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block,
60                                     int stride, const uint8_t nnzc[6*8]);
61   void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset, DCTELEM *block,
62                                     int stride, const uint8_t nnzc[6*8]);

deleted (old lines 32-548): the inline-asm IDCT code; old lines 32-62 follow here, lines 63-548 continue below
32   #define SUMSUB_BADC( a, b, c, d ) \
33       "paddw "#b", "#a" \n\t"\
34       "paddw "#d", "#c" \n\t"\
35       "paddw "#b", "#b" \n\t"\
36       "paddw "#d", "#d" \n\t"\
37       "psubw "#a", "#b" \n\t"\
38       "psubw "#c", "#d" \n\t"
39
40   #define SUMSUBD2_AB( a, b, t ) \
41       "movq  "#b", "#t" \n\t"\
42       "psraw  $1 , "#b" \n\t"\
43       "paddw "#a", "#b" \n\t"\
44       "psraw  $1 , "#a" \n\t"\
45       "psubw "#t", "#a" \n\t"
46
47   #define IDCT4_1D( s02, s13, d02, d13, t ) \
48       SUMSUB_BA  ( s02, d02 )\
49       SUMSUBD2_AB( s13, d13, t )\
50       SUMSUB_BADC( d13, s02, s13, d02 )
51
52   #define STORE_DIFF_4P( p, t, z ) \
53       "psraw     $6,    "#p" \n\t"\
54       "movd      (%0),  "#t" \n\t"\
55       "punpcklbw "#z",  "#t" \n\t"\
56       "paddsw    "#t",  "#p" \n\t"\
57       "packuswb  "#z",  "#p" \n\t"\
58       "movd      "#p",  (%0) \n\t"
59
60   static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
61   {
62       /* Load dct coeffs */
63 __asm__ volatile(
64 "movq (%0), %%mm0 \n\t"
65 "movq 8(%0), %%mm1 \n\t"
66 "movq 16(%0), %%mm2 \n\t"
67 "movq 24(%0), %%mm3 \n\t"
68 :: "r"(block) );
69
70 __asm__ volatile(
71 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
72 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
73
74 "movq %0, %%mm6 \n\t"
75 /* in: 1,4,0,2 out: 1,2,3,0 */
76 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
77
78 "paddw %%mm6, %%mm3 \n\t"
79
80 /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
81 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
82
83 "pxor %%mm7, %%mm7 \n\t"
84 :: "m"(ff_pw_32));
85
86 __asm__ volatile(
87 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
88 "add %1, %0 \n\t"
89 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
90 "add %1, %0 \n\t"
91 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
92 "add %1, %0 \n\t"
93 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
94 : "+r"(dst)
95 : "r" ((x86_reg)stride)
96 );
97 }
98
99 static inline void h264_idct8_1d(int16_t *block)
100 {
101 __asm__ volatile(
102 "movq 112(%0), %%mm7 \n\t"
103 "movq 80(%0), %%mm0 \n\t"
104 "movq 48(%0), %%mm3 \n\t"
105 "movq 16(%0), %%mm5 \n\t"
106
107 "movq %%mm0, %%mm4 \n\t"
108 "movq %%mm5, %%mm1 \n\t"
109 "psraw $1, %%mm4 \n\t"
110 "psraw $1, %%mm1 \n\t"
111 "paddw %%mm0, %%mm4 \n\t"
112 "paddw %%mm5, %%mm1 \n\t"
113 "paddw %%mm7, %%mm4 \n\t"
114 "paddw %%mm0, %%mm1 \n\t"
115 "psubw %%mm5, %%mm4 \n\t"
116 "paddw %%mm3, %%mm1 \n\t"
117
118 "psubw %%mm3, %%mm5 \n\t"
119 "psubw %%mm3, %%mm0 \n\t"
120 "paddw %%mm7, %%mm5 \n\t"
121 "psubw %%mm7, %%mm0 \n\t"
122 "psraw $1, %%mm3 \n\t"
123 "psraw $1, %%mm7 \n\t"
124 "psubw %%mm3, %%mm5 \n\t"
125 "psubw %%mm7, %%mm0 \n\t"
126
127 "movq %%mm4, %%mm3 \n\t"
128 "movq %%mm1, %%mm7 \n\t"
129 "psraw $2, %%mm1 \n\t"
130 "psraw $2, %%mm3 \n\t"
131 "paddw %%mm5, %%mm3 \n\t"
132 "psraw $2, %%mm5 \n\t"
133 "paddw %%mm0, %%mm1 \n\t"
134 "psraw $2, %%mm0 \n\t"
135 "psubw %%mm4, %%mm5 \n\t"
136 "psubw %%mm0, %%mm7 \n\t"
137
138 "movq 32(%0), %%mm2 \n\t"
139 "movq 96(%0), %%mm6 \n\t"
140 "movq %%mm2, %%mm4 \n\t"
141 "movq %%mm6, %%mm0 \n\t"
142 "psraw $1, %%mm4 \n\t"
143 "psraw $1, %%mm6 \n\t"
144 "psubw %%mm0, %%mm4 \n\t"
145 "paddw %%mm2, %%mm6 \n\t"
146
147 "movq (%0), %%mm2 \n\t"
148 "movq 64(%0), %%mm0 \n\t"
149 SUMSUB_BA( %%mm0, %%mm2 )
150 SUMSUB_BA( %%mm6, %%mm0 )
151 SUMSUB_BA( %%mm4, %%mm2 )
152 SUMSUB_BA( %%mm7, %%mm6 )
153 SUMSUB_BA( %%mm5, %%mm4 )
154 SUMSUB_BA( %%mm3, %%mm2 )
155 SUMSUB_BA( %%mm1, %%mm0 )
156 :: "r"(block)
157 );
158 }
159
160 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
161 {
162 int i;
163 DECLARE_ALIGNED(8, int16_t, b2)[64];
164
165 block[0] += 32;
166
167 for(i=0; i<2; i++){
168 DECLARE_ALIGNED(8, uint64_t, tmp);
169
170 h264_idct8_1d(block+4*i);
171
172 __asm__ volatile(
173 "movq %%mm7, %0 \n\t"
174 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
175 "movq %%mm0, 8(%1) \n\t"
176 "movq %%mm6, 24(%1) \n\t"
177 "movq %%mm7, 40(%1) \n\t"
178 "movq %%mm4, 56(%1) \n\t"
179 "movq %0, %%mm7 \n\t"
180 TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
181 "movq %%mm7, (%1) \n\t"
182 "movq %%mm1, 16(%1) \n\t"
183 "movq %%mm0, 32(%1) \n\t"
184 "movq %%mm3, 48(%1) \n\t"
185 : "=m"(tmp)
186 : "r"(b2+32*i)
187 : "memory"
188 );
189 }
190
191 for(i=0; i<2; i++){
192 h264_idct8_1d(b2+4*i);
193
194 __asm__ volatile(
195 "psraw $6, %%mm7 \n\t"
196 "psraw $6, %%mm6 \n\t"
197 "psraw $6, %%mm5 \n\t"
198 "psraw $6, %%mm4 \n\t"
199 "psraw $6, %%mm3 \n\t"
200 "psraw $6, %%mm2 \n\t"
201 "psraw $6, %%mm1 \n\t"
202 "psraw $6, %%mm0 \n\t"
203
204 "movq %%mm7, (%0) \n\t"
205 "movq %%mm5, 16(%0) \n\t"
206 "movq %%mm3, 32(%0) \n\t"
207 "movq %%mm1, 48(%0) \n\t"
208 "movq %%mm0, 64(%0) \n\t"
209 "movq %%mm2, 80(%0) \n\t"
210 "movq %%mm4, 96(%0) \n\t"
211 "movq %%mm6, 112(%0) \n\t"
212 :: "r"(b2+4*i)
213 : "memory"
214 );
215 }
216
217 ff_add_pixels_clamped_mmx(b2, dst, stride);
218 }
219
220 #define STORE_DIFF_8P( p, d, t, z )\
221 "movq "#d", "#t" \n"\
222 "psraw $6, "#p" \n"\
223 "punpcklbw "#z", "#t" \n"\
224 "paddsw "#t", "#p" \n"\
225 "packuswb "#p", "#p" \n"\
226 "movq "#p", "#d" \n"
227
228 #define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
229 "movdqa "#c", "#a" \n"\
230 "movdqa "#g", "#e" \n"\
231 "psraw $1, "#c" \n"\
232 "psraw $1, "#g" \n"\
233 "psubw "#e", "#c" \n"\
234 "paddw "#a", "#g" \n"\
235 "movdqa "#b", "#e" \n"\
236 "psraw $1, "#e" \n"\
237 "paddw "#b", "#e" \n"\
238 "paddw "#d", "#e" \n"\
239 "paddw "#f", "#e" \n"\
240 "movdqa "#f", "#a" \n"\
241 "psraw $1, "#a" \n"\
242 "paddw "#f", "#a" \n"\
243 "paddw "#h", "#a" \n"\
244 "psubw "#b", "#a" \n"\
245 "psubw "#d", "#b" \n"\
246 "psubw "#d", "#f" \n"\
247 "paddw "#h", "#b" \n"\
248 "psubw "#h", "#f" \n"\
249 "psraw $1, "#d" \n"\
250 "psraw $1, "#h" \n"\
251 "psubw "#d", "#b" \n"\
252 "psubw "#h", "#f" \n"\
253 "movdqa "#e", "#d" \n"\
254 "movdqa "#a", "#h" \n"\
255 "psraw $2, "#d" \n"\
256 "psraw $2, "#h" \n"\
257 "paddw "#f", "#d" \n"\
258 "paddw "#b", "#h" \n"\
259 "psraw $2, "#f" \n"\
260 "psraw $2, "#b" \n"\
261 "psubw "#f", "#e" \n"\
262 "psubw "#a", "#b" \n"\
263 "movdqa 0x00(%1), "#a" \n"\
264 "movdqa 0x40(%1), "#f" \n"\
265 SUMSUB_BA(f, a)\
266 SUMSUB_BA(g, f)\
267 SUMSUB_BA(c, a)\
268 SUMSUB_BA(e, g)\
269 SUMSUB_BA(b, c)\
270 SUMSUB_BA(h, a)\
271 SUMSUB_BA(d, f)
272
273 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
274 {
275 __asm__ volatile(
276 "movdqa 0x10(%1), %%xmm1 \n"
277 "movdqa 0x20(%1), %%xmm2 \n"
278 "movdqa 0x30(%1), %%xmm3 \n"
279 "movdqa 0x50(%1), %%xmm5 \n"
280 "movdqa 0x60(%1), %%xmm6 \n"
281 "movdqa 0x70(%1), %%xmm7 \n"
282 H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
283 TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
284 "paddw %4, %%xmm4 \n"
285 "movdqa %%xmm4, 0x00(%1) \n"
286 "movdqa %%xmm2, 0x40(%1) \n"
287 H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
288 "movdqa %%xmm6, 0x60(%1) \n"
289 "movdqa %%xmm7, 0x70(%1) \n"
290 "pxor %%xmm7, %%xmm7 \n"
291 STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
292 STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
293 STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
294 STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
295 "lea (%0,%2,4), %0 \n"
296 STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
297 STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
298 "movdqa 0x60(%1), %%xmm0 \n"
299 "movdqa 0x70(%1), %%xmm1 \n"
300 STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
301 STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
302 :"+r"(dst)
303 :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
304 );
305 }
306
307 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
308 {
309 int dc = (block[0] + 32) >> 6;
310 __asm__ volatile(
311 "movd %0, %%mm0 \n\t"
312 "pshufw $0, %%mm0, %%mm0 \n\t"
313 "pxor %%mm1, %%mm1 \n\t"
314 "psubw %%mm0, %%mm1 \n\t"
315 "packuswb %%mm0, %%mm0 \n\t"
316 "packuswb %%mm1, %%mm1 \n\t"
317 ::"r"(dc)
318 );
319 __asm__ volatile(
320 "movd %0, %%mm2 \n\t"
321 "movd %1, %%mm3 \n\t"
322 "movd %2, %%mm4 \n\t"
323 "movd %3, %%mm5 \n\t"
324 "paddusb %%mm0, %%mm2 \n\t"
325 "paddusb %%mm0, %%mm3 \n\t"
326 "paddusb %%mm0, %%mm4 \n\t"
327 "paddusb %%mm0, %%mm5 \n\t"
328 "psubusb %%mm1, %%mm2 \n\t"
329 "psubusb %%mm1, %%mm3 \n\t"
330 "psubusb %%mm1, %%mm4 \n\t"
331 "psubusb %%mm1, %%mm5 \n\t"
332 "movd %%mm2, %0 \n\t"
333 "movd %%mm3, %1 \n\t"
334 "movd %%mm4, %2 \n\t"
335 "movd %%mm5, %3 \n\t"
336 :"+m"(*(uint32_t*)(dst+0*stride)),
337 "+m"(*(uint32_t*)(dst+1*stride)),
338 "+m"(*(uint32_t*)(dst+2*stride)),
339 "+m"(*(uint32_t*)(dst+3*stride))
340 );
341 }
342
343 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
344 {
345 int dc = (block[0] + 32) >> 6;
346 int y;
347 __asm__ volatile(
348 "movd %0, %%mm0 \n\t"
349 "pshufw $0, %%mm0, %%mm0 \n\t"
350 "pxor %%mm1, %%mm1 \n\t"
351 "psubw %%mm0, %%mm1 \n\t"
352 "packuswb %%mm0, %%mm0 \n\t"
353 "packuswb %%mm1, %%mm1 \n\t"
354 ::"r"(dc)
355 );
356 for(y=2; y--; dst += 4*stride){
357 __asm__ volatile(
358 "movq %0, %%mm2 \n\t"
359 "movq %1, %%mm3 \n\t"
360 "movq %2, %%mm4 \n\t"
361 "movq %3, %%mm5 \n\t"
362 "paddusb %%mm0, %%mm2 \n\t"
363 "paddusb %%mm0, %%mm3 \n\t"
364 "paddusb %%mm0, %%mm4 \n\t"
365 "paddusb %%mm0, %%mm5 \n\t"
366 "psubusb %%mm1, %%mm2 \n\t"
367 "psubusb %%mm1, %%mm3 \n\t"
368 "psubusb %%mm1, %%mm4 \n\t"
369 "psubusb %%mm1, %%mm5 \n\t"
370 "movq %%mm2, %0 \n\t"
371 "movq %%mm3, %1 \n\t"
372 "movq %%mm4, %2 \n\t"
373 "movq %%mm5, %3 \n\t"
374 :"+m"(*(uint64_t*)(dst+0*stride)),
375 "+m"(*(uint64_t*)(dst+1*stride)),
376 "+m"(*(uint64_t*)(dst+2*stride)),
377 "+m"(*(uint64_t*)(dst+3*stride))
378 );
379 }
380 }
381
382 //FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
383 static const uint8_t scan8[16 + 2*4]={
384 4+1*8, 5+1*8, 4+2*8, 5+2*8,
385 6+1*8, 7+1*8, 6+2*8, 7+2*8,
386 4+3*8, 5+3*8, 4+4*8, 5+4*8,
387 6+3*8, 7+3*8, 6+4*8, 7+4*8,
388 1+1*8, 2+1*8,
389 1+2*8, 2+2*8,
390 1+4*8, 2+4*8,
391 1+5*8, 2+5*8,
392 };
393
394 static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
395 int i;
396 for(i=0; i<16; i++){
397 if(nnzc[ scan8[i] ])
398 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
399 }
400 }
401
402 static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
403 int i;
404 for(i=0; i<16; i+=4){
405 if(nnzc[ scan8[i] ])
406 ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
407 }
408 }
409
410
411 static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
412 int i;
413 for(i=0; i<16; i++){
414 int nnz = nnzc[ scan8[i] ];
415 if(nnz){
416 if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
417 else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
418 }
419 }
420 }
421
422 static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
423 int i;
424 for(i=0; i<16; i++){
425 if(nnzc[ scan8[i] ] || block[i*16])
426 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
427 }
428 }
429
430 static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
431 int i;
432 for(i=0; i<16; i++){
433 if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
434 else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
435 }
436 }
437
438 static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
439 int i;
440 for(i=0; i<16; i+=4){
441 int nnz = nnzc[ scan8[i] ];
442 if(nnz){
443 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
444 else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
445 }
446 }
447 }
448
449 static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
450 int i;
451 for(i=0; i<16; i+=4){
452 int nnz = nnzc[ scan8[i] ];
453 if(nnz){
454 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
455 else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
456 }
457 }
458 }
459
460 static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
461 int i;
462 for(i=16; i<16+8; i++){
463 if(nnzc[ scan8[i] ] || block[i*16])
464 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
465 }
466 }
467
468 static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
469 int i;
470 for(i=16; i<16+8; i++){
471 if(nnzc[ scan8[i] ])
472 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
473 else if(block[i*16])
474 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
475 }
476 }
477
478 #if HAVE_YASM
479 static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
480 {
481 __asm__ volatile(
482 "movd %0, %%mm0 \n\t" // 0 0 X D
483 "punpcklwd %1, %%mm0 \n\t" // x X d D
484 "paddsw %2, %%mm0 \n\t"
485 "psraw $6, %%mm0 \n\t"
486 "punpcklwd %%mm0, %%mm0 \n\t" // d d D D
487 "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0
488 "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D
489 "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D
490 "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D
491 "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D
492 ::"m"(block[ 0]),
493 "m"(block[16]),
494 "m"(ff_pw_32)
495 );
496 __asm__ volatile(
497 "movq %0, %%mm2 \n\t"
498 "movq %1, %%mm3 \n\t"
499 "movq %2, %%mm4 \n\t"
500 "movq %3, %%mm5 \n\t"
501 "paddusb %%mm0, %%mm2 \n\t"
502 "paddusb %%mm0, %%mm3 \n\t"
503 "paddusb %%mm0, %%mm4 \n\t"
504 "paddusb %%mm0, %%mm5 \n\t"
505 "psubusb %%mm1, %%mm2 \n\t"
506 "psubusb %%mm1, %%mm3 \n\t"
507 "psubusb %%mm1, %%mm4 \n\t"
508 "psubusb %%mm1, %%mm5 \n\t"
509 "movq %%mm2, %0 \n\t"
510 "movq %%mm3, %1 \n\t"
511 "movq %%mm4, %2 \n\t"
512 "movq %%mm5, %3 \n\t"
513 :"+m"(*(uint64_t*)(dst+0*stride)),
514 "+m"(*(uint64_t*)(dst+1*stride)),
515 "+m"(*(uint64_t*)(dst+2*stride)),
516 "+m"(*(uint64_t*)(dst+3*stride))
517 );
518 }
519
520 extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);
521
522 static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
523 int i;
524 for(i=0; i<16; i+=2)
525 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
526 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
527 }
528
529 static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
530 int i;
531 for(i=0; i<16; i+=2){
532 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
533 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
534 else if(block[i*16]|block[i*16+16])
535 ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
536 }
537 }
538
539 static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
540 int i;
541 for(i=16; i<16+8; i+=2){
542 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
543 ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
544 else if(block[i*16]|block[i*16+16])
545 ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
546 }
547 }
548 #endif
549   63
550   64   /***********************************/
551   65   /* deblocking */
552   66
553   67   static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
743  257
744  258   void ff_h264dsp_init_x86(H264DSPContext *c)
745  259   {
746  260       int mm_flags = av_get_cpu_flags();
747  261

inserted (new lines 262-265):
262       if (mm_flags & AV_CPU_FLAG_MMX2) {
263           c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
264       }
265   #if HAVE_YASM

748  266       if (mm_flags & AV_CPU_FLAG_MMX) {
749  267           c->h264_idct_dc_add=
750  268           c->h264_idct_add= ff_h264_idct_add_mmx;
751  269           c->h264_idct8_dc_add=
752  270           c->h264_idct8_add= ff_h264_idct8_add_mmx;
762  280           c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
763  281           c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
764  282           c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
765  283           c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
766  284

deleted (old lines 767-775):
767           c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
768       }
769       if(mm_flags & AV_CPU_FLAG_SSE2){
770           c->h264_idct8_add = ff_h264_idct8_add_sse2;
771           c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
772       }
773
774   #if HAVE_YASM
775       if (mm_flags & AV_CPU_FLAG_MMX2){

776  285       c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
777  286       c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
778  287       c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
779  288       c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
780  289   #if ARCH_X86_32
800  309           c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
801  310           c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
802  311           c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
803  312
804  313           if (mm_flags&AV_CPU_FLAG_SSE2) {

inserted (new lines 314-316):
314               c->h264_idct8_add = ff_h264_idct8_add_sse2;
315               c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
316

805  317               c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
806  318               c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
807  319               c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
808  320               c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
809  321               c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
830  342               c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
831  343               c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
832  344               c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
833  345           }
834  346       }

inserted (new line 347):
347   }

835  348   #endif

deleted (old line 836):
836   }

837  349   }