diff vp8.c @ 11989:176c5deb6756 libavcodec

Optimize split MC so that we don't always process 4x4 blocks of 4x4 pixels each, but instead apply it as 16x8/8x16/8x8 subblocks where possible. Since this allows us to use width=8/16 instead of width=4 MC functions, we can now take more advantage of SSE2/SSSE3 optimizations, leading to a total speedup for the splitMV filter of about 10%.
author rbultje
date Mon, 28 Jun 2010 13:50:55 +0000
parents 356b20a6566d
children 3c51d7ac41c9
line wrap: on
line diff
--- a/vp8.c	Mon Jun 28 10:56:16 2010 +0000
+++ b/vp8.c	Mon Jun 28 13:50:55 2010 +0000
@@ -943,6 +943,39 @@
     mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
 }
 
+static inline void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
+                               AVFrame *ref_frame, int x_off, int y_off,
+                               int bx_off, int by_off,
+                               int block_w, int block_h,
+                               int width, int height, VP56mv *mv)
+{
+    VP56mv uvmv = *mv;
+
+    /* Y: luma prediction at full resolution; put_pixels_tab index is 1 when block_w == 8, else 0 */
+    vp8_mc(s, 1, dst[0] + by_off * s->linesize + bx_off,
+           ref_frame->data[0], mv, x_off + bx_off, y_off + by_off,
+           block_w, block_h, width, height, s->linesize,
+           s->put_pixels_tab[block_w == 8]);
+
+    /* U/V: profile 3 clears the low 3 MV bits on a local copy; all offsets and sizes are then halved */
+    if (s->profile == 3) {
+        uvmv.x &= ~7;
+        uvmv.y &= ~7;
+    }
+    x_off   >>= 1; y_off   >>= 1;
+    bx_off  >>= 1; by_off  >>= 1;
+    width   >>= 1; height  >>= 1;
+    block_w >>= 1; block_h >>= 1;
+    vp8_mc(s, 0, dst[1] + by_off * s->uvlinesize + bx_off,
+           ref_frame->data[1], &uvmv, x_off + bx_off, y_off + by_off,
+           block_w, block_h, width, height, s->uvlinesize,
+           s->put_pixels_tab[1 + (block_w == 4)]);
+    vp8_mc(s, 0, dst[2] + by_off * s->uvlinesize + bx_off,
+           ref_frame->data[2], &uvmv, x_off + bx_off, y_off + by_off,
+           block_w, block_h, width, height, s->uvlinesize,
+           s->put_pixels_tab[1 + (block_w == 4)]);
+}
+
 /**
  * Apply motion vectors to prediction buffer, chapter 18.
  */
@@ -951,29 +984,14 @@
 {
     int x_off = mb_x << 4, y_off = mb_y << 4;
     int width = 16*s->mb_width, height = 16*s->mb_height;
-    VP56mv uvmv;
 
     if (mb->mode < VP8_MVMODE_SPLIT) {
-        /* Y */
-        vp8_mc(s, 1, dst[0], s->framep[mb->ref_frame]->data[0], &mb->mv,
-               x_off, y_off, 16, 16, width, height, s->linesize,
-               s->put_pixels_tab[0]);
-
-        /* U/V */
-        uvmv = mb->mv;
-        if (s->profile == 3) {
-            uvmv.x &= ~7;
-            uvmv.y &= ~7;
-        }
-        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
-        vp8_mc(s, 0, dst[1], s->framep[mb->ref_frame]->data[1], &uvmv,
-               x_off, y_off, 8, 8, width, height, s->uvlinesize,
-               s->put_pixels_tab[1]);
-        vp8_mc(s, 0, dst[2], s->framep[mb->ref_frame]->data[2], &uvmv,
-               x_off, y_off, 8, 8, width, height, s->uvlinesize,
-               s->put_pixels_tab[1]);
-    } else {
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    0, 0, 16, 16, width, height, &mb->mv);
+    } else switch (mb->partitioning) {
+    case VP8_SPLITMVMODE_4x4: {
         int x, y;
+        VP56mv uvmv;
 
         /* Y */
         for (y = 0; y < 4; y++) {
@@ -1016,6 +1034,30 @@
                        s->put_pixels_tab[2]);
             }
         }
+        break;
+    }
+    case VP8_SPLITMVMODE_16x8:
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    0, 0, 16, 8, width, height, &mb->bmv[0]);
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    0, 8, 16, 8, width, height, &mb->bmv[8]);
+        break;
+    case VP8_SPLITMVMODE_8x16:
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    0, 0, 8, 16, width, height, &mb->bmv[0]);
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    8, 0, 8, 16, width, height, &mb->bmv[2]);
+        break;
+    case VP8_SPLITMVMODE_8x8:
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    0, 0, 8, 8, width, height, &mb->bmv[0]);
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    8, 0, 8, 8, width, height, &mb->bmv[2]);
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    0, 8, 8, 8, width, height, &mb->bmv[8]);
+        vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+                    8, 8, 8, 8, width, height, &mb->bmv[10]);
+        break;
     }
 }