Mercurial > libavcodec.hg
comparison x86/vp8dsp-init.c @ 12210:baf13deed97e libavcodec
Various VP8 x86 deblocking speedups
Add SSSE3 versions and improve the SSE2 versions a bit.
The SSE2/SSSE3 mbedge h functions are currently broken, so they are explicitly disabled.
author | darkshikari |
---|---|
date | Wed, 21 Jul 2010 22:11:03 +0000 |
parents | 9eef00a43280 |
children | 657d353cd515 |
comparison
equal
deleted
inserted
replaced
12209:9eef00a43280 | 12210:baf13deed97e |
---|---|
221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | 221 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); |
222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); | 222 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride); |
223 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | 223 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); |
224 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | 224 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); |
225 | 225 |
226 extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); | 226 #define DECLARE_LOOP_FILTER(NAME)\ |
227 extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); | 227 extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ |
228 extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); | 228 extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ |
229 extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); | 229 extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\ |
230 extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); | 230 int e, int i, int hvt);\ |
231 extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); | 231 extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\ |
232 | 232 int e, int i, int hvt);\ |
233 extern void ff_vp8_v_loop_filter16y_inner_mmx (uint8_t *dst, int stride, | 233 extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\ |
234 int e, int i, int hvt); | 234 int s, int e, int i, int hvt);\ |
235 extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, | 235 extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\ |
236 int e, int i, int hvt); | 236 int s, int e, int i, int hvt);\ |
237 extern void ff_vp8_v_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, | 237 extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\ |
238 int e, int i, int hvt); | 238 int e, int i, int hvt);\ |
239 extern void ff_vp8_h_loop_filter16y_inner_mmx (uint8_t *dst, int stride, | 239 extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\ |
240 int e, int i, int hvt); | 240 int e, int i, int hvt);\ |
241 extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, | 241 extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\ |
242 int e, int i, int hvt); | 242 int s, int e, int i, int hvt);\ |
243 extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride, | 243 extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\ |
244 int e, int i, int hvt); | 244 int s, int e, int i, int hvt); |
245 | 245 |
246 extern void ff_vp8_v_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV, | 246 DECLARE_LOOP_FILTER(mmx) |
247 int s, int e, int i, int hvt); | 247 DECLARE_LOOP_FILTER(mmxext) |
248 extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, | 248 DECLARE_LOOP_FILTER(sse2) |
249 int s, int e, int i, int hvt); | 249 DECLARE_LOOP_FILTER(ssse3) |
250 extern void ff_vp8_v_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV, | 250 |
251 int s, int e, int i, int hvt); | |
252 extern void ff_vp8_h_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV, | |
253 int s, int e, int i, int hvt); | |
254 extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, | |
255 int s, int e, int i, int hvt); | |
256 extern void ff_vp8_h_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV, | |
257 int s, int e, int i, int hvt); | |
258 | |
259 extern void ff_vp8_v_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride, | |
260 int e, int i, int hvt); | |
261 extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride, | |
262 int e, int i, int hvt); | |
263 extern void ff_vp8_v_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride, | |
264 int e, int i, int hvt); | |
265 extern void ff_vp8_h_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride, | |
266 int e, int i, int hvt); | |
267 extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride, | |
268 int e, int i, int hvt); | |
269 extern void ff_vp8_h_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride, | |
270 int e, int i, int hvt); | |
271 | |
272 extern void ff_vp8_v_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV, | |
273 int s, int e, int i, int hvt); | |
274 extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV, | |
275 int s, int e, int i, int hvt); | |
276 extern void ff_vp8_v_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV, | |
277 int s, int e, int i, int hvt); | |
278 extern void ff_vp8_h_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV, | |
279 int s, int e, int i, int hvt); | |
280 extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV, | |
281 int s, int e, int i, int hvt); | |
282 extern void ff_vp8_h_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV, | |
283 int s, int e, int i, int hvt); | |
284 #endif | 251 #endif |
285 | 252 |
286 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ | 253 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ |
287 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ | 254 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ |
288 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ | 255 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ |
382 | 349 |
383 if (mm_flags & FF_MM_SSE2) { | 350 if (mm_flags & FF_MM_SSE2) { |
384 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; | 351 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; |
385 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; | 352 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; |
386 | 353 |
387 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; | 354 //c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; |
388 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; | 355 //c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; |
389 } | 356 } |
390 | 357 |
391 if (mm_flags & FF_MM_SSSE3) { | 358 if (mm_flags & FF_MM_SSSE3) { |
392 VP8_LUMA_MC_FUNC(0, 16, ssse3); | 359 VP8_LUMA_MC_FUNC(0, 16, ssse3); |
393 VP8_MC_FUNC(1, 8, ssse3); | 360 VP8_MC_FUNC(1, 8, ssse3); |
394 VP8_MC_FUNC(2, 4, ssse3); | 361 VP8_MC_FUNC(2, 4, ssse3); |
395 VP8_BILINEAR_MC_FUNC(0, 16, ssse3); | 362 VP8_BILINEAR_MC_FUNC(0, 16, ssse3); |
396 VP8_BILINEAR_MC_FUNC(1, 8, ssse3); | 363 VP8_BILINEAR_MC_FUNC(1, 8, ssse3); |
397 VP8_BILINEAR_MC_FUNC(2, 4, ssse3); | 364 VP8_BILINEAR_MC_FUNC(2, 4, ssse3); |
365 | |
366 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; | |
367 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; | |
368 | |
369 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; | |
370 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; | |
371 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; | |
372 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3; | |
373 | |
374 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3; | |
375 //c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; | |
376 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; | |
377 //c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; | |
398 } | 378 } |
399 | 379 |
400 if (mm_flags & FF_MM_SSE4) { | 380 if (mm_flags & FF_MM_SSE4) { |
401 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; | 381 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; |
402 } | 382 } |