# HG changeset patch # User upsuper # Date 1355451396 0 # Node ID 7d8f561558fc17e7b9571d51df0d100987dee63d # Parent fb92c6de7f704520c25e4d47aa6b6ac8d809cf65 Reduce register usage in an asm block. Reduce the asm block of render_frame_yuv422_sse4 to 4 general-purpose registers by loading src_y, src_u and src_v through a single scratch register (REG_S) instead of keeping each of them in its own register. After this modification, the function is only ~3.4x faster than render_frame_yuv422. diff -r fb92c6de7f70 -r 7d8f561558fc libmpcodecs/vf_ass.c --- a/libmpcodecs/vf_ass.c Fri Dec 14 02:16:30 2012 +0000 +++ b/libmpcodecs/vf_ass.c Fri Dec 14 02:16:36 2012 +0000 @@ -274,9 +274,12 @@ "psrlw $8, %%xmm3 \n\t" "packuswb %%xmm7, %%xmm1 \n\t" "packuswb %%xmm7, %%xmm3 \n\t" - "movq (%[src_y], %[j], 1), %%xmm4 \n\t" - "movq (%[src_u], %[j], 1), %%xmm5 \n\t" - "movq (%[src_v], %[j], 1), %%xmm6 \n\t" + "mov %[src_y], %%"REG_S" \n\t" + "movq (%%"REG_S", %[j], 1), %%xmm4 \n\t" + "mov %[src_u], %%"REG_S" \n\t" + "movq (%%"REG_S", %[j], 1), %%xmm5 \n\t" + "mov %[src_v], %%"REG_S" \n\t" + "movq (%%"REG_S", %[j], 1), %%xmm6 \n\t" "packuswb %%xmm7, %%xmm5 \n\t" "packuswb %%xmm7, %%xmm6 \n\t" "punpcklbw %%xmm6, %%xmm5 \n\t" @@ -302,12 +305,13 @@ : : [dst] "r" (dst + i * stride), [alpha] "r" (alpha + i * outw), - [src_y] "r" (src_y + i * outw), - [src_u] "r" (src_u + i * outw), - [src_v] "r" (src_v + i * outw), + [src_y] "g" (src_y + i * outw), + [src_u] "g" (src_u + i * outw), + [src_v] "g" (src_v + i * outw), [j] "r" (xmin), [xmax] "g" (xmax), [f] "g" (is_uyvy) + : REG_S ); } }