# HG changeset patch # User diego # Date 1199301882 0 # Node ID ed7190bd3530e5e91171d3e5f3bfa54e42635cae # Parent 89140b93ae09bb7964a2a98b24e4d0604d9e4f4c Fix issue #301: summary of changes: - Use MANGLE when loading some constants into MMX registers. - Convert those constants to non-static and thus add ff_ prefix. - Remove last parameter of MSPEL_FILTER13_CORE (was constant). - Use of "+r" instead of stricter but unnecessary "+g". - Use of REG_c and direct loading of some of the above. patch by Christophe GISQUET, christophe.gisquet free fr Subject: [FFmpeg-devel] [PATCH] Roundup issue #301 Date: Fri, 28 Dec 2007 19:22:18 +0100 diff -r 89140b93ae09 -r ed7190bd3530 i386/vc1dsp_mmx.c --- a/i386/vc1dsp_mmx.c Mon Dec 31 07:12:50 2007 +0000 +++ b/i386/vc1dsp_mmx.c Wed Jan 02 19:24:42 2008 +0000 @@ -55,34 +55,33 @@ #define SHIFT2_LINE(OFF, R0,R1,R2,R3) \ "paddw %%mm"#R2", %%mm"#R1" \n\t" \ - "movd (%1,%4), %%mm"#R0" \n\t" \ + "movd (%0,%3), %%mm"#R0" \n\t" \ "pmullw %%mm6, %%mm"#R1" \n\t" \ "punpcklbw %%mm0, %%mm"#R0" \n\t" \ - "movd (%1,%3), %%mm"#R3" \n\t" \ + "movd (%0,%2), %%mm"#R3" \n\t" \ "psubw %%mm"#R0", %%mm"#R1" \n\t" \ "punpcklbw %%mm0, %%mm"#R3" \n\t" \ "paddw %%mm7, %%mm"#R1" \n\t" \ "psubw %%mm"#R3", %%mm"#R1" \n\t" \ - "psraw %5, %%mm"#R1" \n\t" \ - "movq %%mm"#R1", "#OFF"(%2) \n\t" \ - "add %3, %1 \n\t" + "psraw %4, %%mm"#R1" \n\t" \ + "movq %%mm"#R1", "#OFF"(%1) \n\t" \ + "add %2, %0 \n\t" -DECLARE_ALIGNED_16(static const uint64_t, fact_9) = 0x0009000900090009ULL; +DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL; /** Sacrifying mm6 allows to pipeline loads from src */ static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src, long int stride, int rnd, int64_t shift) { - int w = 3; - asm volatile( - LOAD_ROUNDER_MMX("%6") - "movq %7, %%mm6 \n\t" + "mov $3, %%"REG_c" \n\t" + LOAD_ROUNDER_MMX("%5") + "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" "1: \n\t" - "movd (%1), %%mm2 \n\t" - "add %3, %1 \n\t" - "movd (%1), %%mm3 \n\t" + "movd (%0), %%mm2 \n\t" + "add %2, %0 \n\t" + "movd (%0), %%mm3 \n\t" "punpcklbw %%mm0, %%mm2 \n\t" "punpcklbw %%mm0, %%mm3 \n\t" SHIFT2_LINE( 0, 1, 2, 3, 4) @@ -93,14 +92,14 @@ SHIFT2_LINE(120, 2, 3, 4, 1) SHIFT2_LINE(144, 3, 4, 1, 2) SHIFT2_LINE(168, 4, 1, 2, 3) - "sub %8, %1 \n\t" - "add $8, %2 \n\t" - "decl %0 \n\t" + "sub %6, %0 \n\t" + "add $8, %1 \n\t" + "dec %%"REG_c" \n\t" "jnz 1b \n\t" - : "+g"(w), "+r"(src), "+r"(dst) - : "r"(stride), "r"(-2*stride), "m"(shift), - "m"(rnd), "m"(fact_9), "g"(9*stride-4) - : "memory" + : "+r"(src), "+r"(dst) + : "r"(stride), "r"(-2*stride), + "m"(shift), "m"(rnd), "r"(9*stride-4) + : "%"REG_c, "memory" ); } @@ -117,8 +116,8 @@ rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ asm volatile( LOAD_ROUNDER_MMX("%4") - "movq %6, %%mm6 \n\t" - "movq %5, %%mm5 \n\t" + "movq "MANGLE(ff_pw_128)", %%mm6\n\t" + "movq "MANGLE(ff_pw_9)", %%mm5 \n\t" "1: \n\t" "movq 2*0+0(%1), %%mm1 \n\t" "movq 2*0+8(%1), %%mm2 \n\t" @@ -141,8 +140,8 @@ "add %3, %2 \n\t" "decl %0 \n\t" "jnz 1b \n\t" - : "+g"(h), "+r" (src), "+r" (dst) - : "g"(stride), "m"(rnd), "m"(fact_9), "m"(ff_pw_128) + : "+r"(h), "+r" (src), "+r" (dst) + : "r"(stride), "m"(rnd) : "memory" ); } @@ -155,48 +154,48 @@ static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src, long int stride, int rnd, long int offset) { - int h = 8; - rnd = 8-rnd; asm volatile( - LOAD_ROUNDER_MMX("%6") - "movq %8, %%mm6 \n\t" + "mov $8, %%"REG_c" \n\t" + LOAD_ROUNDER_MMX("%5") + "movq "MANGLE(ff_pw_9)", %%mm6\n\t" "1: \n\t" - "movd 0(%1 ), %%mm3 \n\t" - "movd 4(%1 ), %%mm4 \n\t" - "movd 0(%1,%3), %%mm1 \n\t" - "movd 4(%1,%3), %%mm2 \n\t" - "add %3, %1 \n\t" + "movd 0(%0 ), %%mm3 \n\t" + "movd 4(%0 ), %%mm4 \n\t" + "movd 0(%0,%2), %%mm1 \n\t" + "movd 4(%0,%2), %%mm2 \n\t" + "add %2, %0 \n\t" "punpcklbw %%mm0, %%mm3 \n\t" "punpcklbw %%mm0, %%mm4 \n\t" "punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm2 \n\t" "paddw %%mm1, %%mm3 \n\t" "paddw %%mm2, %%mm4 \n\t" - "movd 0(%1,%4), %%mm1 \n\t" - "movd 4(%1,%4), %%mm2 \n\t" + "movd 0(%0,%3), %%mm1 \n\t" + "movd 4(%0,%3), %%mm2 \n\t" "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/ "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/ "punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm2 \n\t" "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/ "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/ - "movd 0(%1,%3), %%mm1 \n\t" - "movd 4(%1,%3), %%mm2 \n\t" + "movd 0(%0,%2), %%mm1 \n\t" + "movd 4(%0,%2), %%mm2 \n\t" "punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm2 \n\t" "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/ "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/ NORMALIZE_MMX("$4") - TRANSFER_DO_PACK - "add %7, %1 \n\t" - "add %5, %2 \n\t" - "decl %0 \n\t" + "packuswb %%mm4, %%mm3 \n\t" + "movq %%mm3, (%1) \n\t" + "add %6, %0 \n\t" + "add %4, %1 \n\t" + "dec %%"REG_c" \n\t" "jnz 1b \n\t" - : "+g"(h), "+r"(src), "+r"(dst) + : "+r"(src), "+r"(dst) : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd), - "g"(stride-offset), "m"(fact_9) - : "memory" + "g"(stride-offset) + : "%"REG_c, "memory" ); } @@ -204,8 +203,8 @@ * Filter coefficients made global to allow access by all 1 or 3 quarter shift * interpolation functions. */ -DECLARE_ALIGNED_16(static const uint64_t, fact_53) = 0x0035003500350035ULL; -DECLARE_ALIGNED_16(static const uint64_t, fact_18) = 0x0012001200120012ULL; +DECLARE_ALIGNED_16(const uint64_t, ff_pw_53) = 0x0035003500350035ULL; +DECLARE_ALIGNED_16(const uint64_t, ff_pw_18) = 0x0012001200120012ULL; /** * Core of the 1/4 and 3/4 shift bicubic interpolation. @@ -217,13 +216,13 @@ * @param A3 Address of 3rd tap * @param A4 Address of 4th tap */ -#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4, POS) \ +#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \ MOVQ "*0+"A1", %%mm1 \n\t" \ MOVQ "*4+"A1", %%mm2 \n\t" \ UNPACK("%%mm1") \ UNPACK("%%mm2") \ - "pmullw "POS", %%mm1 \n\t" \ - "pmullw "POS", %%mm2 \n\t" \ + "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \ + "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \ MOVQ "*0+"A2", %%mm3 \n\t" \ MOVQ "*4+"A2", %%mm4 \n\t" \ UNPACK("%%mm3") \ @@ -267,11 +266,11 @@ src -= src_stride; \ asm volatile( \ LOAD_ROUNDER_MMX("%5") \ - "movq %7, %%mm5 \n\t" \ - "movq %8, %%mm6 \n\t" \ + "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \ + "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ ASMALIGN(3) \ "1: \n\t" \ - MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4, "%9") \ + MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ NORMALIZE_MMX("%6") \ TRANSFER_DONT_PACK \ /* Last 3 (in fact 4) bytes on the line */ \ @@ -299,10 +298,9 @@ "add $24, %2 \n\t" \ "decl %0 \n\t" \ "jnz 1b \n\t" \ - : "+g"(h), "+r" (src), "+r" (dst) \ + : "+r"(h), "+r" (src), "+r" (dst) \ : "r"(src_stride), "r"(3*src_stride), \ - "m"(rnd), "m"(shift), \ - "m"(fact_53), "m"(fact_18), "m"(ff_pw_3) \ + "m"(rnd), "m"(shift) \ : "memory" \ ); \ } @@ -324,23 +322,22 @@ rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ asm volatile( \ LOAD_ROUNDER_MMX("%4") \ - "movq %6, %%mm6 \n\t" \ - "movq %5, %%mm5 \n\t" \ + "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ + "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ ASMALIGN(3) \ "1: \n\t" \ - MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4, "%8")\ + MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ NORMALIZE_MMX("$7") \ /* Remove bias */ \ - "paddw %7, %%mm3 \n\t" \ - "paddw %7, %%mm4 \n\t" \ + "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ + "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ TRANSFER_DO_PACK \ "add $24, %1 \n\t" \ "add %3, %2 \n\t" \ "decl %0 \n\t" \ "jnz 1b \n\t" \ - : "+g"(h), "+r" (src), "+r" (dst) \ - : "g"(stride), "m"(rnd), "m"(fact_53), "m"(fact_18), \ - "m"(ff_pw_128), "m"(ff_pw_3) \ + : "+r"(h), "+r" (src), "+r" (dst) \ + : "r"(stride), "m"(rnd) \ : "memory" \ ); \ } @@ -363,20 +360,19 @@ rnd = 32-rnd; \ asm volatile ( \ LOAD_ROUNDER_MMX("%6") \ - "movq %7, %%mm5 \n\t" \ - "movq %8, %%mm6 \n\t" \ + "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ + "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ ASMALIGN(3) \ "1: \n\t" \ - MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4, "%9")\ + MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ NORMALIZE_MMX("$6") \ TRANSFER_DO_PACK \ "add %5, %1 \n\t" \ "add %5, %2 \n\t" \ "decl %0 \n\t" \ "jnz 1b \n\t" \ - : "+g"(h), "+r" (src), "+r" (dst) \ - : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd), \ - "m"(fact_53), "m"(fact_18), "m"(ff_pw_3) \ + : "+r"(h), "+r" (src), "+r" (dst) \ + : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ : "memory" \ ); \ }