comparison x86/vp8dsp-init.c @ 12194:80b142c2e9f7 libavcodec

Change function prototypes for width=8 inner and mbedge loopfilter functions so that they do both U and V planes at the same time. This will have speed advantages when using SSE2 (or higher) optimizations, since we can do both the U and V rows together in a single xmm register. This also renames filter16 to filter16y and filter8 to filter8uv so that it's more obvious what each function is used for.
author rbultje
date Mon, 19 Jul 2010 21:18:04 +0000
parents b246b214c2e9
children 552c7c10bc73
comparison
equal deleted inserted replaced
12193:0a63bed2a00e 12194:80b142c2e9f7
228 extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); 228 extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
229 extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim); 229 extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
230 extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); 230 extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
231 extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim); 231 extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
232 232
233 extern void ff_vp8_v_loop_filter16_inner_mmx (uint8_t *dst, int stride, 233 extern void ff_vp8_v_loop_filter16y_inner_mmx (uint8_t *dst, int stride,
234 int e, int i, int hvt); 234 int e, int i, int hvt);
235 extern void ff_vp8_v_loop_filter16_inner_mmxext(uint8_t *dst, int stride, 235 extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
236 int e, int i, int hvt); 236 int e, int i, int hvt);
237 extern void ff_vp8_v_loop_filter16_inner_sse2 (uint8_t *dst, int stride, 237 extern void ff_vp8_v_loop_filter16y_inner_sse2 (uint8_t *dst, int stride,
238 int e, int i, int hvt); 238 int e, int i, int hvt);
239 extern void ff_vp8_h_loop_filter16_inner_mmx (uint8_t *dst, int stride, 239 extern void ff_vp8_h_loop_filter16y_inner_mmx (uint8_t *dst, int stride,
240 int e, int i, int hvt); 240 int e, int i, int hvt);
241 extern void ff_vp8_h_loop_filter16_inner_mmxext(uint8_t *dst, int stride, 241 extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
242 int e, int i, int hvt); 242 int e, int i, int hvt);
243 extern void ff_vp8_h_loop_filter16_inner_sse2 (uint8_t *dst, int stride, 243 extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride,
244 int e, int i, int hvt); 244 int e, int i, int hvt);
245 #endif 245 #endif
246 246
247 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ 247 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
248 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ 248 c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
249 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ 249 c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
282 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; 282 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
283 283
284 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx; 284 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
285 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx; 285 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
286 286
287 c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_mmx; 287 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
288 c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_mmx; 288 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
289 } 289 }
290 290
291 /* note that 4-tap width=16 functions are missing because w=16 291 /* note that 4-tap width=16 functions are missing because w=16
292 * is only used for luma, and luma is always a copy or sixtap. */ 292 * is only used for luma, and luma is always a copy or sixtap. */
293 if (mm_flags & FF_MM_MMX2) { 293 if (mm_flags & FF_MM_MMX2) {
300 VP8_BILINEAR_MC_FUNC(2, 4, mmxext); 300 VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
301 301
302 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext; 302 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
303 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext; 303 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
304 304
305 c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_mmxext; 305 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
306 c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_mmxext; 306 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
307 } 307 }
308 308
309 if (mm_flags & FF_MM_SSE) { 309 if (mm_flags & FF_MM_SSE) {
310 c->put_vp8_epel_pixels_tab[0][0][0] = 310 c->put_vp8_epel_pixels_tab[0][0][0] =
311 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; 311 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
318 VP8_BILINEAR_MC_FUNC(1, 8, sse2); 318 VP8_BILINEAR_MC_FUNC(1, 8, sse2);
319 319
320 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; 320 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
321 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; 321 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
322 322
323 c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_sse2; 323 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
324 c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_sse2; 324 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
325 } 325 }
326 326
327 if (mm_flags & FF_MM_SSSE3) { 327 if (mm_flags & FF_MM_SSSE3) {
328 VP8_LUMA_MC_FUNC(0, 16, ssse3); 328 VP8_LUMA_MC_FUNC(0, 16, ssse3);
329 VP8_MC_FUNC(1, 8, ssse3); 329 VP8_MC_FUNC(1, 8, ssse3);