comparison x86/vp8dsp-init.c @ 12168:b246b214c2e9 libavcodec

VP8 horizontal/vertical (H/V) inner loop filter: MMX/MMXEXT/SSE2 optimizations.
author rbultje
date Thu, 15 Jul 2010 23:02:34 +0000
parents d780ae746855
children 80b142c2e9f7
comparison
equal deleted inserted replaced
12167:69bbfd8f2ba5 12168:b246b214c2e9
227 extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); 227 extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
228 extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); 228 extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
229 extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); 229 extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
230 extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); 230 extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
231 extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); 231 extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
232
233 extern void ff_vp8_v_loop_filter16_inner_mmx (uint8_t *dst, int stride,
234 int e, int i, int hvt);
235 extern void ff_vp8_v_loop_filter16_inner_mmxext(uint8_t *dst, int stride,
236 int e, int i, int hvt);
237 extern void ff_vp8_v_loop_filter16_inner_sse2 (uint8_t *dst, int stride,
238 int e, int i, int hvt);
239 extern void ff_vp8_h_loop_filter16_inner_mmx (uint8_t *dst, int stride,
240 int e, int i, int hvt);
241 extern void ff_vp8_h_loop_filter16_inner_mmxext(uint8_t *dst, int stride,
242 int e, int i, int hvt);
243 extern void ff_vp8_h_loop_filter16_inner_sse2 (uint8_t *dst, int stride,
244 int e, int i, int hvt);
232 #endif 245 #endif
233 246
234 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ 247 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
235 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ 248 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
236 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ 249 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
268 c->put_vp8_epel_pixels_tab[1][0][0] = 281 c->put_vp8_epel_pixels_tab[1][0][0] =
269 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; 282 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
270 283
271 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx; 284 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
272 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx; 285 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
286
287 c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_mmx;
288 c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_mmx;
273 } 289 }
274 290
275 /* note that 4-tap width=16 functions are missing because w=16 291 /* note that 4-tap width=16 functions are missing because w=16
276 * is only used for luma, and luma is always a copy or sixtap. */ 292 * is only used for luma, and luma is always a copy or sixtap. */
277 if (mm_flags & FF_MM_MMX2) { 293 if (mm_flags & FF_MM_MMX2) {
283 VP8_BILINEAR_MC_FUNC(1, 8, mmxext); 299 VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
284 VP8_BILINEAR_MC_FUNC(2, 4, mmxext); 300 VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
285 301
286 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext; 302 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
287 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext; 303 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
304
305 c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_mmxext;
306 c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_mmxext;
288 } 307 }
289 308
290 if (mm_flags & FF_MM_SSE) { 309 if (mm_flags & FF_MM_SSE) {
291 c->put_vp8_epel_pixels_tab[0][0][0] = 310 c->put_vp8_epel_pixels_tab[0][0][0] =
292 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; 311 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
298 VP8_BILINEAR_MC_FUNC(0, 16, sse2); 317 VP8_BILINEAR_MC_FUNC(0, 16, sse2);
299 VP8_BILINEAR_MC_FUNC(1, 8, sse2); 318 VP8_BILINEAR_MC_FUNC(1, 8, sse2);
300 319
301 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; 320 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
302 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; 321 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
322
323 c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_sse2;
324 c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_sse2;
303 } 325 }
304 326
305 if (mm_flags & FF_MM_SSSE3) { 327 if (mm_flags & FF_MM_SSSE3) {
306 VP8_LUMA_MC_FUNC(0, 16, ssse3); 328 VP8_LUMA_MC_FUNC(0, 16, ssse3);
307 VP8_MC_FUNC(1, 8, ssse3); 329 VP8_MC_FUNC(1, 8, ssse3);