comparison x86/vp8dsp-init.c @ 12210:baf13deed97e libavcodec

Various VP8 x86 deblocking speedups: add SSSE3 versions and improve the SSE2 versions a bit. The SSE2/SSSE3 mbedge h functions are currently broken, so explicitly disable them.
author darkshikari
date Wed, 21 Jul 2010 22:11:03 +0000
parents 9eef00a43280
children 657d353cd515
comparison
equal deleted inserted replaced
12209:9eef00a43280 12210:baf13deed97e
221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); 221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); 222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
223 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); 223 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
224 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); 224 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
225 225
226 extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); 226 #define DECLARE_LOOP_FILTER(NAME)\
227 extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); 227 extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
228 extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); 228 extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
229 extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); 229 extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
230 extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); 230 int e, int i, int hvt);\
231 extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); 231 extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
232 232 int e, int i, int hvt);\
233 extern void ff_vp8_v_loop_filter16y_inner_mmx (uint8_t *dst, int stride, 233 extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
234 int e, int i, int hvt); 234 int s, int e, int i, int hvt);\
235 extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, 235 extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
236 int e, int i, int hvt); 236 int s, int e, int i, int hvt);\
237 extern void ff_vp8_v_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, 237 extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
238 int e, int i, int hvt); 238 int e, int i, int hvt);\
239 extern void ff_vp8_h_loop_filter16y_inner_mmx (uint8_t *dst, int stride, 239 extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
240 int e, int i, int hvt); 240 int e, int i, int hvt);\
241 extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, 241 extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
242 int e, int i, int hvt); 242 int s, int e, int i, int hvt);\
243 extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, 243 extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
244 int e, int i, int hvt); 244 int s, int e, int i, int hvt);
245 245
246 extern void ff_vp8_v_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV, 246 DECLARE_LOOP_FILTER(mmx)
247 int s, int e, int i, int hvt); 247 DECLARE_LOOP_FILTER(mmxext)
248 extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, 248 DECLARE_LOOP_FILTER(sse2)
249 int s, int e, int i, int hvt); 249 DECLARE_LOOP_FILTER(ssse3)
250 extern void ff_vp8_v_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV, 250
251 int s, int e, int i, int hvt);
252 extern void ff_vp8_h_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV,
253 int s, int e, int i, int hvt);
254 extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
255 int s, int e, int i, int hvt);
256 extern void ff_vp8_h_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV,
257 int s, int e, int i, int hvt);
258
259 extern void ff_vp8_v_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
260 int e, int i, int hvt);
261 extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
262 int e, int i, int hvt);
263 extern void ff_vp8_v_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
264 int e, int i, int hvt);
265 extern void ff_vp8_h_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
266 int e, int i, int hvt);
267 extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
268 int e, int i, int hvt);
269 extern void ff_vp8_h_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
270 int e, int i, int hvt);
271
272 extern void ff_vp8_v_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
273 int s, int e, int i, int hvt);
274 extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
275 int s, int e, int i, int hvt);
276 extern void ff_vp8_v_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
277 int s, int e, int i, int hvt);
278 extern void ff_vp8_h_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
279 int s, int e, int i, int hvt);
280 extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
281 int s, int e, int i, int hvt);
282 extern void ff_vp8_h_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
283 int s, int e, int i, int hvt);
284 #endif 251 #endif
285 252
286 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ 253 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
287 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ 254 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
288 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ 255 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
382 349
383 if (mm_flags & FF_MM_SSE2) { 350 if (mm_flags & FF_MM_SSE2) {
384 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; 351 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
385 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; 352 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
386 353
387 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; 354 //c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
388 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; 355 //c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
389 } 356 }
390 357
391 if (mm_flags & FF_MM_SSSE3) { 358 if (mm_flags & FF_MM_SSSE3) {
392 VP8_LUMA_MC_FUNC(0, 16, ssse3); 359 VP8_LUMA_MC_FUNC(0, 16, ssse3);
393 VP8_MC_FUNC(1, 8, ssse3); 360 VP8_MC_FUNC(1, 8, ssse3);
394 VP8_MC_FUNC(2, 4, ssse3); 361 VP8_MC_FUNC(2, 4, ssse3);
395 VP8_BILINEAR_MC_FUNC(0, 16, ssse3); 362 VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
396 VP8_BILINEAR_MC_FUNC(1, 8, ssse3); 363 VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
397 VP8_BILINEAR_MC_FUNC(2, 4, ssse3); 364 VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
365
366 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
367 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
368
369 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
370 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
371 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
372 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
373
374 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
375 //c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
376 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
377 //c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
398 } 378 }
399 379
400 if (mm_flags & FF_MM_SSE4) { 380 if (mm_flags & FF_MM_SSE4) {
401 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; 381 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
402 } 382 }