changeset 8430:7768bdfd4f7b libavcodec
Rename libavcodec/i386/ --> libavcodec/x86/.
The directory contains optimizations that are not specific to i386, and
libavutil already uses this naming scheme.
author | diego |
---|---|
date | Mon, 22 Dec 2008 09:12:42 +0000 |
parents | b3ecaba81501 |
children | 405a44077331 |
files | Makefile dct-test.c h264.c i386/cavsdsp_mmx.c i386/cpuid.c i386/dnxhd_mmx.c i386/dsputil_h264_template_mmx.c i386/dsputil_h264_template_ssse3.c i386/dsputil_mmx.c i386/dsputil_mmx.h i386/dsputil_mmx_avg_template.c i386/dsputil_mmx_qns_template.c i386/dsputil_mmx_rnd_template.c i386/dsputil_yasm.asm i386/dsputilenc_mmx.c i386/fdct_mmx.c i386/fft_3dn.c i386/fft_3dn2.c i386/fft_mmx.asm i386/fft_sse.c i386/flacdsp_mmx.c i386/h264_deblock_sse2.asm i386/h264_i386.h i386/h264dsp_mmx.c i386/idct_mmx.c i386/idct_mmx_xvid.c i386/idct_sse2_xvid.c i386/idct_xvid.h i386/mathops.h i386/mmx.h i386/motion_est_mmx.c i386/mpegvideo_mmx.c i386/mpegvideo_mmx_template.c i386/simple_idct_mmx.c i386/snowdsp_mmx.c i386/vc1dsp_mmx.c i386/vp3dsp_mmx.c i386/vp3dsp_mmx.h i386/vp3dsp_sse2.c i386/vp3dsp_sse2.h i386/x86inc.asm imgconvert.c imgresample.c mathops.h x86/cavsdsp_mmx.c x86/cpuid.c x86/dnxhd_mmx.c x86/dsputil_h264_template_mmx.c x86/dsputil_h264_template_ssse3.c x86/dsputil_mmx.c x86/dsputil_mmx.h x86/dsputil_mmx_avg_template.c x86/dsputil_mmx_qns_template.c x86/dsputil_mmx_rnd_template.c x86/dsputil_yasm.asm x86/dsputilenc_mmx.c x86/fdct_mmx.c x86/fft_3dn.c x86/fft_3dn2.c x86/fft_mmx.asm x86/fft_sse.c x86/flacdsp_mmx.c x86/h264_deblock_sse2.asm x86/h264_i386.h x86/h264dsp_mmx.c x86/idct_mmx.c x86/idct_mmx_xvid.c x86/idct_sse2_xvid.c x86/idct_xvid.h x86/mathops.h x86/mmx.h x86/motion_est_mmx.c x86/mpegvideo_mmx.c x86/mpegvideo_mmx_template.c x86/simple_idct_mmx.c x86/snowdsp_mmx.c x86/vc1dsp_mmx.c x86/vp3dsp_mmx.c x86/vp3dsp_mmx.h x86/vp3dsp_sse2.c x86/vp3dsp_sse2.h x86/x86inc.asm |
diffstat | 82 files changed, 19390 insertions(+), 19390 deletions(-) |
--- a/Makefile  Mon Dec 22 06:50:18 2008 +0000
+++ b/Makefile  Mon Dec 22 09:12:42 2008 +0000
@@ -392,37 +392,37 @@
 OBJS-$(HAVE_W32THREADS)                += w32thread.o

 # processor-specific code
-MMX-OBJS-$(CONFIG_CAVS_DECODER)        += i386/cavsdsp_mmx.o
-MMX-OBJS-$(CONFIG_ENCODERS)            += i386/dsputilenc_mmx.o
-MMX-OBJS-$(CONFIG_FLAC_ENCODER)        += i386/flacdsp_mmx.o
-MMX-OBJS-$(CONFIG_GPL)                 += i386/idct_mmx.o
-MMX-OBJS-$(CONFIG_SNOW_DECODER)        += i386/snowdsp_mmx.o
-MMX-OBJS-$(CONFIG_THEORA_DECODER)      += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VC1_DECODER)         += i386/vc1dsp_mmx.o
-MMX-OBJS-$(CONFIG_VP3_DECODER)         += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VP5_DECODER)         += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VP6_DECODER)         += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VP6A_DECODER)        += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_VP6F_DECODER)        += i386/vp3dsp_mmx.o i386/vp3dsp_sse2.o
-MMX-OBJS-$(CONFIG_WMV3_DECODER)        += i386/vc1dsp_mmx.o
-MMX-OBJS-$(HAVE_YASM)                  += i386/dsputil_yasm.o \
-                                          i386/h264_deblock_sse2.o
+MMX-OBJS-$(CONFIG_CAVS_DECODER)        += x86/cavsdsp_mmx.o
+MMX-OBJS-$(CONFIG_ENCODERS)            += x86/dsputilenc_mmx.o
+MMX-OBJS-$(CONFIG_FLAC_ENCODER)        += x86/flacdsp_mmx.o
+MMX-OBJS-$(CONFIG_GPL)                 += x86/idct_mmx.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp_mmx.o
+MMX-OBJS-$(CONFIG_THEORA_DECODER)      += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
+MMX-OBJS-$(CONFIG_VP3_DECODER)         += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP5_DECODER)         += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6_DECODER)         += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6A_DECODER)        += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_VP6F_DECODER)        += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
+MMX-OBJS-$(CONFIG_WMV3_DECODER)        += x86/vc1dsp_mmx.o
+MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o \
+                                          x86/h264_deblock_sse2.o \

-OBJS-$(HAVE_MMX)                       += i386/cpuid.o \
-                                          i386/dnxhd_mmx.o \
-                                          i386/dsputil_mmx.o \
-                                          i386/fdct_mmx.o \
-                                          i386/idct_mmx_xvid.o \
-                                          i386/idct_sse2_xvid.o \
-                                          i386/motion_est_mmx.o \
-                                          i386/mpegvideo_mmx.o \
-                                          i386/simple_idct_mmx.o \
+OBJS-$(HAVE_MMX)                       += x86/cpuid.o \
+                                          x86/dnxhd_mmx.o \
+                                          x86/dsputil_mmx.o \
+                                          x86/fdct_mmx.o \
+                                          x86/idct_mmx_xvid.o \
+                                          x86/idct_sse2_xvid.o \
+                                          x86/motion_est_mmx.o \
+                                          x86/mpegvideo_mmx.o \
+                                          x86/simple_idct_mmx.o \
                                           $(MMX-OBJS-yes)

-OBJS-$(CONFIG_FFT_MMX)                 += i386/fft_3dn.o \
-                                          i386/fft_3dn2.o \
-                                          i386/fft_mmx.o \
-                                          i386/fft_sse.o \
+OBJS-$(CONFIG_FFT_MMX)                 += x86/fft_3dn.o \
+                                          x86/fft_3dn2.o \
+                                          x86/fft_mmx.o \
+                                          x86/fft_sse.o \

 OBJS-$(ARCH_ALPHA)                     += alpha/dsputil_alpha.o \
                                           alpha/dsputil_alpha_asm.o \
@@ -498,10 +498,10 @@
 TESTS = $(addsuffix -test$(EXESUF), cabac dct eval fft h264 rangecoder snow)
 TESTS-$(CONFIG_OLDSCALER) += imgresample-test$(EXESUF)
-TESTS-$(ARCH_X86) += i386/cpuid-test$(EXESUF) motion-test$(EXESUF)
+TESTS-$(ARCH_X86) += x86/cpuid-test$(EXESUF) motion-test$(EXESUF)

 CLEANFILES = apiexample$(EXESUF)
-DIRS = alpha arm bfin i386 mlib ppc ps2 sh4 sparc
+DIRS = alpha arm bfin mlib ppc ps2 sh4 sparc x86

 include $(SUBDIR)../subdir.mak
--- a/dct-test.c  Mon Dec 22 06:50:18 2008 +0000
+++ b/dct-test.c  Mon Dec 22 09:12:42 2008 +0000
@@ -38,7 +38,7 @@
 #include "aandcttab.h"
 #include "faandct.h"
 #include "faanidct.h"
-#include "i386/idct_xvid.h"
+#include "x86/idct_xvid.h"

 #undef printf
 #undef random
--- a/h264.c  Mon Dec 22 06:50:18 2008 +0000
+++ b/h264.c  Mon Dec 22 09:12:42 2008 +0000
@@ -36,7 +36,7 @@
 #include "cabac.h"

 #ifdef ARCH_X86
-#include "i386/h264_i386.h"
+#include "x86/h264_i386.h"
 #endif

 //#undef NDEBUG
--- a/i386/cavsdsp_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,497 +0,0 @@ -/* - * Chinese AVS video (AVS1-P2, JiZhun profile) decoder. - * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de> - * - * MMX-optimized DSP functions, based on H.264 optimizations by - * Michael Niedermayer and Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/common.h" -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" - -/***************************************************************************** - * - * inverse transform - * - ****************************************************************************/ - -static inline void cavs_idct8_1d(int16_t *block, uint64_t bias) -{ - __asm__ volatile( - "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */ - "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */ - "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */ - "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */ - "movq %%mm4, %%mm0 \n\t" - "movq %%mm5, %%mm3 \n\t" - "movq %%mm2, %%mm6 \n\t" - "movq %%mm7, %%mm1 \n\t" - - "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */ - "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */ - "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */ - "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */ - "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */ - "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */ - "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */ - "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */ - "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */ - "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */ - "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */ - "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */ - - "movq %%mm5, %%mm4 \n\t" - "movq %%mm7, %%mm6 \n\t" - "movq %%mm3, %%mm0 \n\t" - "movq %%mm1, %%mm2 \n\t" - SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */ - "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */ - "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */ - "paddw %%mm7, %%mm7 \n\t" - "paddw %%mm5, %%mm5 \n\t" - "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */ - "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */ - - SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */ - "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */ - "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */ - "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */ - "paddw %%mm1, %%mm1 \n\t" - "paddw %%mm3, %%mm3 \n\t" - "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */ - "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */ - - "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */ - "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */ - "movq %%mm2, %%mm4 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */ - "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */ - "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */ - "paddw %%mm6, %%mm0 \n\t" /* mm0 = 
5*src6 */ - "paddw %%mm2, %%mm2 \n\t" - "paddw %%mm0, %%mm0 \n\t" - "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */ - "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */ - - "movq (%0), %%mm2 \n\t" /* mm2 = src0 */ - "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */ - SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */ - "psllw $3, %%mm0 \n\t" - "psllw $3, %%mm2 \n\t" - "paddw %1, %%mm0 \n\t" /* add rounding bias */ - "paddw %1, %%mm2 \n\t" /* add rounding bias */ - - SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */ - SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */ - SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */ - SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */ - SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */ - SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */ - :: "r"(block), "m"(bias) - ); -} - -static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - int i; - DECLARE_ALIGNED_8(int16_t, b2[64]); - - for(i=0; i<2; i++){ - DECLARE_ALIGNED_8(uint64_t, tmp); - - cavs_idct8_1d(block+4*i, ff_pw_4); - - __asm__ volatile( - "psraw $3, %%mm7 \n\t" - "psraw $3, %%mm6 \n\t" - "psraw $3, %%mm5 \n\t" - "psraw $3, %%mm4 \n\t" - "psraw $3, %%mm3 \n\t" - "psraw $3, %%mm2 \n\t" - "psraw $3, %%mm1 \n\t" - "psraw $3, %%mm0 \n\t" - "movq %%mm7, %0 \n\t" - TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) - "movq %%mm0, 8(%1) \n\t" - "movq %%mm6, 24(%1) \n\t" - "movq %%mm7, 40(%1) \n\t" - "movq %%mm4, 56(%1) \n\t" - "movq %0, %%mm7 \n\t" - TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) - "movq %%mm7, (%1) \n\t" - "movq %%mm1, 16(%1) \n\t" - "movq %%mm0, 32(%1) \n\t" - "movq %%mm3, 48(%1) \n\t" - : "=m"(tmp) - : "r"(b2+32*i) - : "memory" - ); - } - - for(i=0; i<2; i++){ - cavs_idct8_1d(b2+4*i, ff_pw_64); - - __asm__ volatile( - "psraw $7, %%mm7 \n\t" - "psraw $7, %%mm6 \n\t" - "psraw $7, %%mm5 \n\t" - "psraw $7, %%mm4 \n\t" - "psraw $7, %%mm3 \n\t" - "psraw $7, %%mm2 \n\t" - "psraw $7, %%mm1 \n\t" - "psraw $7, %%mm0 \n\t" - "movq %%mm7, (%0) \n\t" - "movq %%mm5, 16(%0) \n\t" - "movq %%mm3, 32(%0) \n\t" - "movq %%mm1, 48(%0) \n\t" - "movq %%mm0, 64(%0) \n\t" - "movq %%mm2, 80(%0) \n\t" - "movq %%mm4, 96(%0) \n\t" - "movq %%mm6, 112(%0) \n\t" - :: "r"(b2+4*i) - : "memory" - ); - } - - add_pixels_clamped_mmx(b2, dst, stride); - - /* clear block */ - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movq %%mm7, (%0) \n\t" - "movq %%mm7, 8(%0) \n\t" - "movq %%mm7, 16(%0) \n\t" - "movq %%mm7, 24(%0) \n\t" - "movq %%mm7, 32(%0) \n\t" - "movq %%mm7, 40(%0) \n\t" - "movq %%mm7, 48(%0) \n\t" - "movq %%mm7, 56(%0) \n\t" - "movq %%mm7, 64(%0) \n\t" - "movq %%mm7, 72(%0) \n\t" - "movq %%mm7, 80(%0) \n\t" - "movq %%mm7, 88(%0) \n\t" - "movq %%mm7, 96(%0) \n\t" - "movq %%mm7, 104(%0) \n\t" - "movq %%mm7, 112(%0) \n\t" - "movq %%mm7, 120(%0) \n\t" - :: "r" (block) - ); -} - -/***************************************************************************** - * - * motion compensation - * - ****************************************************************************/ - -/* vertical filter [-1 -2 96 42 -7 0] */ -#define QPEL_CAVSV1(A,B,C,D,E,F,OP) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "pmullw %5, %%mm6 \n\t"\ - "movq "#D", %%mm7 \n\t"\ - "pmullw %6, %%mm7 \n\t"\ - "psllw $3, "#E" \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "psraw $3, "#E" \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw "#E", %%mm6 \n\t"\ - "paddw "#B", "#B" \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#B", %%mm6 
\n\t"\ - "psraw $1, "#B" \n\t"\ - "psubw "#A", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $7, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - -/* vertical filter [ 0 -1 5 5 -1 0] */ -#define QPEL_CAVSV2(A,B,C,D,E,F,OP) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "paddw "#D", %%mm6 \n\t"\ - "pmullw %5, %%mm6 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $3, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - -/* vertical filter [ 0 -7 42 96 -2 -1] */ -#define QPEL_CAVSV3(A,B,C,D,E,F,OP) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "pmullw %6, %%mm6 \n\t"\ - "movq "#D", %%mm7 \n\t"\ - "pmullw %5, %%mm7 \n\t"\ - "psllw $3, "#B" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psraw $3, "#B" \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw "#B", %%mm6 \n\t"\ - "paddw "#E", "#E" \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "psraw $1, "#E" \n\t"\ - "psubw "#F", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $7, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - - -#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ - int w= 2;\ - src -= 2*srcStride;\ - \ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\ - : "memory"\ - );\ - }\ - src += 4-(h+5)*srcStride;\ - dst += 4-h*dstStride;\ - } - -#define QPEL_CAVS(OPNAME, OP, MMX)\ -static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %5, %%mm6 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - 
"paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "movq %6, %%mm5 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm5, %%mm1 \n\t"\ - "psraw $3, %%mm0 \n\t"\ - "psraw $3, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q) \ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+m"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ - : "memory"\ - );\ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ -}\ -\ -static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ - -#define CAVS_MC(OPNAME, SIZE, MMX) \ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ 
## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\ -}\ - -#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMX2_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -QPEL_CAVS(put_, PUT_OP, 3dnow) -QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow) -QPEL_CAVS(put_, PUT_OP, mmx2) -QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2) - -CAVS_MC(put_, 8, 3dnow) -CAVS_MC(put_, 16,3dnow) -CAVS_MC(avg_, 8, 3dnow) -CAVS_MC(avg_, 16,3dnow) -CAVS_MC(put_, 8, mmx2) -CAVS_MC(put_, 16,mmx2) -CAVS_MC(avg_, 8, mmx2) -CAVS_MC(avg_, 16,mmx2) - -void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); -void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); -void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); -void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); - -void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) { -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ - c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \ - c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \ - c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \ - c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \ - - dspfunc(put_cavs_qpel, 0, 16); - dspfunc(put_cavs_qpel, 1, 8); - dspfunc(avg_cavs_qpel, 0, 16); - dspfunc(avg_cavs_qpel, 1, 8); -#undef dspfunc - c->cavs_idct8_add = cavs_idct8_add_mmx; -} - -void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) { -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ - c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ - c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ - c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \ - c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \ - - dspfunc(put_cavs_qpel, 0, 16); - dspfunc(put_cavs_qpel, 1, 8); - dspfunc(avg_cavs_qpel, 0, 16); - dspfunc(avg_cavs_qpel, 1, 8); -#undef dspfunc - c->cavs_idct8_add = cavs_idct8_add_mmx; -}
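For reference, the horizontal filter being deleted here (cavs_qpel8_h) and the half-pel vertical macro QPEL_CAVSV2 both apply the CAVS tap set [-1 5 5 -1] with a rounding bias of 4 and a right shift by 3, exactly as the ff_pw_5/ff_pw_4 operands suggest. A plain-C sketch of the horizontal case, assuming a hypothetical clip helper (neither function name below exists in the tree):

    #include <stdint.h>

    /* dst[x] = clip((5*(src[x]+src[x+1]) - (src[x-1]+src[x+2]) + 4) >> 3) */
    static uint8_t clip_uint8_sketch(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    static void cavs_qpel8_h_scalar(uint8_t *dst, const uint8_t *src,
                                    int dstStride, int srcStride)
    {
        for (int i = 0; i < 8; i++) {
            for (int x = 0; x < 8; x++)
                dst[x] = clip_uint8_sketch((5 * (src[x] + src[x + 1])
                                            - (src[x - 1] + src[x + 2]) + 4) >> 3);
            src += srcStride;
            dst += dstStride;
        }
    }

The MMX code reaches the same result eight pixels per iteration and uses packuswb for the final clamp.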
--- a/i386/cpuid.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,134 +0,0 @@ -/* - * CPU detection code, extracted from mmx.h - * (c)1997-99 by H. Dietz and R. Fisher - * Converted to C and improved by Fabrice Bellard. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdlib.h> -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" - -#undef printf - -/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ -#define cpuid(index,eax,ebx,ecx,edx)\ - __asm__ volatile\ - ("mov %%"REG_b", %%"REG_S"\n\t"\ - "cpuid\n\t"\ - "xchg %%"REG_b", %%"REG_S\ - : "=a" (eax), "=S" (ebx),\ - "=c" (ecx), "=d" (edx)\ - : "0" (index)); - -/* Function to test if multimedia instructions are supported... */ -int mm_support(void) -{ - int rval = 0; - int eax, ebx, ecx, edx; - int max_std_level, max_ext_level, std_caps=0, ext_caps=0; - x86_reg a, c; - -#ifdef ARCH_X86_64 -#define PUSHF "pushfq\n\t" -#define POPF "popfq\n\t" -#else -#define PUSHF "pushfl\n\t" -#define POPF "popfl\n\t" -#endif - __asm__ volatile ( - /* See if CPUID instruction is supported ... */ - /* ... Get copies of EFLAGS into eax and ecx */ - PUSHF - "pop %0\n\t" - "mov %0, %1\n\t" - - /* ... Toggle the ID bit in one copy and store */ - /* to the EFLAGS reg */ - "xor $0x200000, %0\n\t" - "push %0\n\t" - POPF - - /* ... Get the (hopefully modified) EFLAGS */ - PUSHF - "pop %0\n\t" - : "=a" (a), "=c" (c) - : - : "cc" - ); - - if (a == c) - return 0; /* CPUID not supported */ - - cpuid(0, max_std_level, ebx, ecx, edx); - - if(max_std_level >= 1){ - cpuid(1, eax, ebx, ecx, std_caps); - if (std_caps & (1<<23)) - rval |= FF_MM_MMX; - if (std_caps & (1<<25)) - rval |= FF_MM_MMXEXT -#if !defined(__GNUC__) || __GNUC__ > 2 - | FF_MM_SSE; - if (std_caps & (1<<26)) - rval |= FF_MM_SSE2; - if (ecx & 1) - rval |= FF_MM_SSE3; - if (ecx & 0x00000200 ) - rval |= FF_MM_SSSE3 -#endif - ; - } - - cpuid(0x80000000, max_ext_level, ebx, ecx, edx); - - if(max_ext_level >= 0x80000001){ - cpuid(0x80000001, eax, ebx, ecx, ext_caps); - if (ext_caps & (1<<31)) - rval |= FF_MM_3DNOW; - if (ext_caps & (1<<30)) - rval |= FF_MM_3DNOWEXT; - if (ext_caps & (1<<23)) - rval |= FF_MM_MMX; - if (ext_caps & (1<<22)) - rval |= FF_MM_MMXEXT; - } - -#if 0 - av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s\n", - (rval&FF_MM_MMX) ? "MMX ":"", - (rval&FF_MM_MMXEXT) ? "MMX2 ":"", - (rval&FF_MM_SSE) ? "SSE ":"", - (rval&FF_MM_SSE2) ? "SSE2 ":"", - (rval&FF_MM_SSE3) ? "SSE3 ":"", - (rval&FF_MM_SSSE3) ? "SSSE3 ":"", - (rval&FF_MM_3DNOW) ? "3DNow ":"", - (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":""); -#endif - return rval; -} - -#ifdef TEST -int main ( void ) -{ - int mm_flags; - mm_flags = mm_support(); - printf("mm_support = 0x%08X\n",mm_flags); - return 0; -} -#endif
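The detector removed above toggles the EFLAGS ID bit to test for CPUID, then reads leaf 1 (EDX bit 23 = MMX, 25 = SSE, 26 = SSE2; ECX bit 0 = SSE3, bit 9 = SSSE3) and extended leaf 0x80000001 for the AMD/3DNow bits. A minimal sketch of the basic-leaf probe using GCC's <cpuid.h> helper, which performs the CPUID-availability check internally; the extended-leaf checks are omitted and the function name is illustrative:

    #include <cpuid.h>
    #include "libavcodec/dsputil.h"   /* FF_MM_* flags, as in the deleted file */

    static int mm_support_sketch(void)
    {
        unsigned eax, ebx, ecx, edx;
        int flags = 0;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;                            /* CPUID leaf 1 not available */
        if (edx & (1 << 23)) flags |= FF_MM_MMX;
        if (edx & (1 << 25)) flags |= FF_MM_MMXEXT | FF_MM_SSE;
        if (edx & (1 << 26)) flags |= FF_MM_SSE2;
        if (ecx & (1 <<  0)) flags |= FF_MM_SSE3;
        if (ecx & (1 <<  9)) flags |= FF_MM_SSSE3;
        return flags;
    }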
--- a/i386/dnxhd_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,58 +0,0 @@ -/* - * VC3/DNxHD SIMD functions - * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com> - * - * VC-3 encoder funded by the British Broadcasting Corporation - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dnxhdenc.h" - -static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int line_size) -{ - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "movq (%0), %%xmm0 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm1 \n\t" - "movq (%0, %2), %%xmm2 \n\t" - "movq (%0, %2,2), %%xmm3 \n\t" - "punpcklbw %%xmm7, %%xmm0 \n\t" - "punpcklbw %%xmm7, %%xmm1 \n\t" - "punpcklbw %%xmm7, %%xmm2 \n\t" - "punpcklbw %%xmm7, %%xmm3 \n\t" - "movdqa %%xmm0, (%1) \n\t" - "movdqa %%xmm1, 16(%1) \n\t" - "movdqa %%xmm2, 32(%1) \n\t" - "movdqa %%xmm3, 48(%1) \n\t" - "movdqa %%xmm3 , 64(%1) \n\t" - "movdqa %%xmm2 , 80(%1) \n\t" - "movdqa %%xmm1 , 96(%1) \n\t" - "movdqa %%xmm0, 112(%1) \n\t" - : "+r" (pixels) - : "r" (block), "r" ((x86_reg)line_size) - ); -} - -void ff_dnxhd_init_mmx(DNXHDEncContext *ctx) -{ - if (mm_flags & FF_MM_SSE2) { - ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2; - } -}
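The SSE2 routine above loads four 8-pixel rows, zero-extends them to 16-bit coefficients, and stores them twice, the second time in reverse row order (the movdqa stores at offsets 64/80/96/112), producing a vertically mirrored 8x8 block. A scalar sketch of the same layout (the name is illustrative; int16_t stands in for libavcodec's DCTELEM):

    #include <stdint.h>

    static void get_pixels_8x4_sym_scalar(int16_t *block, const uint8_t *pixels,
                                          int line_size)
    {
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 8; j++) {
                int v = pixels[i * line_size + j];
                block[i * 8 + j]       = v;   /* rows 0..3: plain copy    */
                block[(7 - i) * 8 + j] = v;   /* rows 7..4: mirrored copy */
            }
    }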
--- a/i386/dsputil_h264_template_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,308 +0,0 @@ -/* - * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, - * Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * MMX optimized version of (put|avg)_h264_chroma_mc8. - * H264_CHROMA_MC8_TMPL must be defined to the desired function name - * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg - * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function - */ -static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) -{ - const uint64_t *rnd_reg; - DECLARE_ALIGNED_8(uint64_t, AA); - DECLARE_ALIGNED_8(uint64_t, DD); - int i; - - if(y==0 && x==0) { - /* no filter needed */ - H264_CHROMA_MC8_MV0(dst, src, stride, h); - return; - } - - assert(x<8 && y<8 && x>=0 && y>=0); - - if(y==0 || x==0) - { - /* 1 dimensional filter only */ - const int dxy = x ? 1 : stride; - - rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3; - - __asm__ volatile( - "movd %0, %%mm5\n\t" - "movq %1, %%mm4\n\t" - "movq %2, %%mm6\n\t" /* mm6 = rnd */ - "punpcklwd %%mm5, %%mm5\n\t" - "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ - "pxor %%mm7, %%mm7\n\t" - "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */ - :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg)); - - for(i=0; i<h; i++) { - __asm__ volatile( - /* mm0 = src[0..7], mm1 = src[1..8] */ - "movq %0, %%mm0\n\t" - "movq %1, %%mm2\n\t" - :: "m"(src[0]), "m"(src[dxy])); - - __asm__ volatile( - /* [mm0,mm1] = A * src[0..7] */ - /* [mm2,mm3] = B * src[1..8] */ - "movq %%mm0, %%mm1\n\t" - "movq %%mm2, %%mm3\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "pmullw %%mm4, %%mm0\n\t" - "pmullw %%mm4, %%mm1\n\t" - "pmullw %%mm5, %%mm2\n\t" - "pmullw %%mm5, %%mm3\n\t" - - /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */ - "paddw %%mm6, %%mm0\n\t" - "paddw %%mm6, %%mm1\n\t" - "paddw %%mm2, %%mm0\n\t" - "paddw %%mm3, %%mm1\n\t" - "psrlw $3, %%mm0\n\t" - "psrlw $3, %%mm1\n\t" - "packuswb %%mm1, %%mm0\n\t" - H264_CHROMA_OP(%0, %%mm0) - "movq %%mm0, %0\n\t" - : "=m" (dst[0])); - - src += stride; - dst += stride; - } - return; - } - - /* general case, bilinear */ - rnd_reg = rnd ? 
&ff_pw_32.a : &ff_pw_28.a; - __asm__ volatile("movd %2, %%mm4\n\t" - "movd %3, %%mm6\n\t" - "punpcklwd %%mm4, %%mm4\n\t" - "punpcklwd %%mm6, %%mm6\n\t" - "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ - "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ - "movq %%mm4, %%mm5\n\t" - "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ - "psllw $3, %%mm5\n\t" - "psllw $3, %%mm6\n\t" - "movq %%mm5, %%mm7\n\t" - "paddw %%mm6, %%mm7\n\t" - "movq %%mm4, %1\n\t" /* DD = x * y */ - "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ - "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ - "paddw %4, %%mm4\n\t" - "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ - "pxor %%mm7, %%mm7\n\t" - "movq %%mm4, %0\n\t" - : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); - - __asm__ volatile( - /* mm0 = src[0..7], mm1 = src[1..8] */ - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - : : "m" (src[0]), "m" (src[1])); - - for(i=0; i<h; i++) { - src += stride; - - __asm__ volatile( - /* mm2 = A * src[0..3] + B * src[1..4] */ - /* mm3 = A * src[4..7] + B * src[5..8] */ - "movq %%mm0, %%mm2\n\t" - "movq %%mm1, %%mm3\n\t" - "punpckhbw %%mm7, %%mm0\n\t" - "punpcklbw %%mm7, %%mm1\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "punpckhbw %%mm7, %%mm3\n\t" - "pmullw %0, %%mm0\n\t" - "pmullw %0, %%mm2\n\t" - "pmullw %%mm5, %%mm1\n\t" - "pmullw %%mm5, %%mm3\n\t" - "paddw %%mm1, %%mm2\n\t" - "paddw %%mm0, %%mm3\n\t" - : : "m" (AA)); - - __asm__ volatile( - /* [mm2,mm3] += C * src[0..7] */ - "movq %0, %%mm0\n\t" - "movq %%mm0, %%mm1\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm1\n\t" - "pmullw %%mm6, %%mm0\n\t" - "pmullw %%mm6, %%mm1\n\t" - "paddw %%mm0, %%mm2\n\t" - "paddw %%mm1, %%mm3\n\t" - : : "m" (src[0])); - - __asm__ volatile( - /* [mm2,mm3] += D * src[1..8] */ - "movq %1, %%mm1\n\t" - "movq %%mm1, %%mm0\n\t" - "movq %%mm1, %%mm4\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "punpckhbw %%mm7, %%mm4\n\t" - "pmullw %2, %%mm0\n\t" - "pmullw %2, %%mm4\n\t" - "paddw %%mm0, %%mm2\n\t" - "paddw %%mm4, %%mm3\n\t" - "movq %0, %%mm0\n\t" - : : "m" (src[0]), "m" (src[1]), "m" (DD)); - - __asm__ volatile( - /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */ - "paddw %1, %%mm2\n\t" - "paddw %1, %%mm3\n\t" - "psrlw $6, %%mm2\n\t" - "psrlw $6, %%mm3\n\t" - "packuswb %%mm3, %%mm2\n\t" - H264_CHROMA_OP(%0, %%mm2) - "movq %%mm2, %0\n\t" - : "=m" (dst[0]) : "m" (*rnd_reg)); - dst+= stride; - } -} - -static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movd %5, %%mm2 \n\t" - "movd %6, %%mm3 \n\t" - "movq "MANGLE(ff_pw_8)", %%mm4\n\t" - "movq "MANGLE(ff_pw_8)", %%mm5\n\t" - "punpcklwd %%mm2, %%mm2 \n\t" - "punpcklwd %%mm3, %%mm3 \n\t" - "punpcklwd %%mm2, %%mm2 \n\t" - "punpcklwd %%mm3, %%mm3 \n\t" - "psubw %%mm2, %%mm4 \n\t" - "psubw %%mm3, %%mm5 \n\t" - - "movd (%1), %%mm0 \n\t" - "movd 1(%1), %%mm6 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "pmullw %%mm4, %%mm0 \n\t" - "pmullw %%mm2, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "movd 1(%1), %%mm1 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pmullw %%mm4, %%mm0 \n\t" - "pmullw %%mm2, %%mm1 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "movq %%mm1, %%mm0 \n\t" - "pmullw %%mm5, %%mm6 \n\t" - "pmullw %%mm3, %%mm1 \n\t" - "paddw %4, %%mm6 \n\t" - "paddw %%mm6, %%mm1 \n\t" - "psrlw $6, %%mm1 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm6) - 
"movd %%mm1, (%0) \n\t" - "add %3, %0 \n\t" - "movd (%1), %%mm6 \n\t" - "movd 1(%1), %%mm1 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pmullw %%mm4, %%mm6 \n\t" - "pmullw %%mm2, %%mm1 \n\t" - "paddw %%mm6, %%mm1 \n\t" - "movq %%mm1, %%mm6 \n\t" - "pmullw %%mm5, %%mm0 \n\t" - "pmullw %%mm3, %%mm1 \n\t" - "paddw %4, %%mm0 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psrlw $6, %%mm1 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm0) - "movd %%mm1, (%0) \n\t" - "add %3, %0 \n\t" - "sub $2, %2 \n\t" - "jnz 1b \n\t" - : "+r"(dst), "+r"(src), "+r"(h) - : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y) - ); -} - -#ifdef H264_CHROMA_MC2_TMPL -static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - int tmp = ((1<<16)-1)*x + 8; - int CD= tmp*y; - int AB= (tmp<<3) - CD; - __asm__ volatile( - /* mm5 = {A,B,A,B} */ - /* mm6 = {C,D,C,D} */ - "movd %0, %%mm5\n\t" - "movd %1, %%mm6\n\t" - "punpckldq %%mm5, %%mm5\n\t" - "punpckldq %%mm6, %%mm6\n\t" - "pxor %%mm7, %%mm7\n\t" - /* mm0 = src[0,1,1,2] */ - "movd %2, %%mm2\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "pshufw $0x94, %%mm2, %%mm2\n\t" - :: "r"(AB), "r"(CD), "m"(src[0])); - - - __asm__ volatile( - "1:\n\t" - "add %4, %1\n\t" - /* mm1 = A * src[0,1] + B * src[1,2] */ - "movq %%mm2, %%mm1\n\t" - "pmaddwd %%mm5, %%mm1\n\t" - /* mm0 = src[0,1,1,2] */ - "movd (%1), %%mm0\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "pshufw $0x94, %%mm0, %%mm0\n\t" - /* mm1 += C * src[0,1] + D * src[1,2] */ - "movq %%mm0, %%mm2\n\t" - "pmaddwd %%mm6, %%mm0\n\t" - "paddw %3, %%mm1\n\t" - "paddw %%mm0, %%mm1\n\t" - /* dst[0,1] = pack((mm1 + 32) >> 6) */ - "psrlw $6, %%mm1\n\t" - "packssdw %%mm7, %%mm1\n\t" - "packuswb %%mm7, %%mm1\n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm3) - "movd %%mm1, %%esi\n\t" - "movw %%si, (%0)\n\t" - "add %4, %0\n\t" - "sub $1, %2\n\t" - "jnz 1b\n\t" - : "+r" (dst), "+r"(src), "+r"(h) - : "m" (ff_pw_32), "r"((x86_reg)stride) - : "%esi"); - -} -#endif -
--- a/i386/dsputil_h264_template_ssse3.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2008 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. - * H264_CHROMA_MC8_TMPL must be defined to the desired function name - * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function - * AVG_OP must be defined to empty for put and the identify for avg - */ -static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) -{ - if(y==0 && x==0) { - /* no filter needed */ - H264_CHROMA_MC8_MV0(dst, src, stride, h); - return; - } - - assert(x<8 && y<8 && x>=0 && y>=0); - - if(y==0 || x==0) - { - /* 1 dimensional filter only */ - __asm__ volatile( - "movd %0, %%xmm7 \n\t" - "movq %1, %%xmm6 \n\t" - "pshuflw $0, %%xmm7, %%xmm7 \n\t" - "movlhps %%xmm6, %%xmm6 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3)) - ); - - if(x) { - __asm__ volatile( - "1: \n\t" - "movq (%1), %%xmm0 \n\t" - "movq 1(%1), %%xmm1 \n\t" - "movq (%1,%3), %%xmm2 \n\t" - "movq 1(%1,%3), %%xmm3 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "punpcklbw %%xmm3, %%xmm2 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - AVG_OP("movq (%0), %%xmm4 \n\t") - AVG_OP("movhps (%0,%3), %%xmm4 \n\t") - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm2 \n\t" - "psrlw $3, %%xmm0 \n\t" - "psrlw $3, %%xmm2 \n\t" - "packuswb %%xmm2, %%xmm0 \n\t" - AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") - "movq %%xmm0, (%0) \n\t" - "movhps %%xmm0, (%0,%3) \n\t" - "sub $2, %2 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); - } else { - __asm__ volatile( - "1: \n\t" - "movq (%1), %%xmm0 \n\t" - "movq (%1,%3), %%xmm1 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - "movq (%1,%3,2), %%xmm3 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "punpcklbw %%xmm3, %%xmm2 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - AVG_OP("movq (%0), %%xmm4 \n\t") - AVG_OP("movhps (%0,%3), %%xmm4 \n\t") - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm2 \n\t" - "psrlw $3, %%xmm0 \n\t" - "psrlw $3, %%xmm2 \n\t" - "packuswb %%xmm2, %%xmm0 \n\t" - AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") - "movq %%xmm0, (%0) \n\t" - "movhps %%xmm0, (%0,%3) \n\t" - "sub $2, %2 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); - } - return; - } - - /* general case, bilinear */ - __asm__ volatile( - "movd %0, %%xmm7 \n\t" - "movd %1, %%xmm6 \n\t" - "movdqa %2, %%xmm5 \n\t" - "pshuflw $0, %%xmm7, %%xmm7 \n\t" - "pshuflw $0, %%xmm6, %%xmm6 \n\t" - 
"movlhps %%xmm7, %%xmm7 \n\t" - "movlhps %%xmm6, %%xmm6 \n\t" - :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) - ); - - __asm__ volatile( - "movq (%1), %%xmm0 \n\t" - "movq 1(%1), %%xmm1 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "add %3, %1 \n\t" - "1: \n\t" - "movq (%1), %%xmm1 \n\t" - "movq 1(%1), %%xmm2 \n\t" - "movq (%1,%3), %%xmm3 \n\t" - "movq 1(%1,%3), %%xmm4 \n\t" - "lea (%1,%3,2), %1 \n\t" - "punpcklbw %%xmm2, %%xmm1 \n\t" - "punpcklbw %%xmm4, %%xmm3 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - "movdqa %%xmm3, %%xmm4 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm6, %%xmm1 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - "pmaddubsw %%xmm6, %%xmm3 \n\t" - "paddw %%xmm5, %%xmm0 \n\t" - "paddw %%xmm5, %%xmm2 \n\t" - "paddw %%xmm0, %%xmm1 \n\t" - "paddw %%xmm2, %%xmm3 \n\t" - "movdqa %%xmm4, %%xmm0 \n\t" - "psrlw $6, %%xmm1 \n\t" - "psrlw $6, %%xmm3 \n\t" - AVG_OP("movq (%0), %%xmm2 \n\t") - AVG_OP("movhps (%0,%3), %%xmm2 \n\t") - "packuswb %%xmm3, %%xmm1 \n\t" - AVG_OP("pavgb %%xmm2, %%xmm1 \n\t") - "movq %%xmm1, (%0)\n\t" - "movhps %%xmm1, (%0,%3)\n\t" - "sub $2, %2 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); -} - -static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - __asm__ volatile( - "movd %0, %%mm7 \n\t" - "movd %1, %%mm6 \n\t" - "movq %2, %%mm5 \n\t" - "pshufw $0, %%mm7, %%mm7 \n\t" - "pshufw $0, %%mm6, %%mm6 \n\t" - :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) - ); - - __asm__ volatile( - "movd (%1), %%mm0 \n\t" - "punpcklbw 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - "1: \n\t" - "movd (%1), %%mm1 \n\t" - "movd (%1,%3), %%mm3 \n\t" - "punpcklbw 1(%1), %%mm1 \n\t" - "punpcklbw 1(%1,%3), %%mm3 \n\t" - "lea (%1,%3,2), %1 \n\t" - "movq %%mm1, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pmaddubsw %%mm7, %%mm0 \n\t" - "pmaddubsw %%mm6, %%mm1 \n\t" - "pmaddubsw %%mm7, %%mm2 \n\t" - "pmaddubsw %%mm6, %%mm3 \n\t" - "paddw %%mm5, %%mm0 \n\t" - "paddw %%mm5, %%mm2 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm0 \n\t" - "psrlw $6, %%mm1 \n\t" - "psrlw $6, %%mm3 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - "packuswb %%mm3, %%mm3 \n\t" - AVG_OP("pavgb (%0), %%mm1 \n\t") - AVG_OP("pavgb (%0,%3), %%mm3 \n\t") - "movd %%mm1, (%0)\n\t" - "movd %%mm3, (%0,%3)\n\t" - "sub $2, %2 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); -} -
--- a/i386/dsputil_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2976 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard. - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/h263.h" -#include "libavcodec/mpegvideo.h" -#include "libavcodec/simple_idct.h" -#include "dsputil_mmx.h" -#include "mmx.h" -#include "vp3dsp_mmx.h" -#include "vp3dsp_sse2.h" -#include "idct_xvid.h" - -//#undef NDEBUG -//#include <assert.h> - -int mm_flags; /* multimedia extension flags */ - -/* pixel operations */ -DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL; - -DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) = -{0x8000000080000000ULL, 0x8000000080000000ULL}; - -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; -DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; - -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; -DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; - -DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 }; -DECLARE_ALIGNED_16(const 
double, ff_pd_2[2]) = { 2.0, 2.0 }; - -#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) -#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) - -#define MOVQ_BFE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ - "paddb %%" #regd ", %%" #regd " \n\t" ::) - -#ifndef PIC -#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) -#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) -#else -// for shared library it's better to use this way for accessing constants -// pcmpeqd -> -1 -#define MOVQ_BONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd " \n\t" \ - "packuswb %%" #regd ", %%" #regd " \n\t" ::) - -#define MOVQ_WTWO(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd " \n\t" \ - "psllw $1, %%" #regd " \n\t"::) - -#endif - -// using regr as temporary and for the output result -// first argument is unmodifed and second is trashed -// regfe is supposed to contain 0xfefefefefefefefe -#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ - "movq " #rega ", " #regr " \n\t"\ - "pand " #regb ", " #regr " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pand " #regfe "," #regb " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "paddb " #regb ", " #regr " \n\t" - -#define PAVGB_MMX(rega, regb, regr, regfe) \ - "movq " #rega ", " #regr " \n\t"\ - "por " #regb ", " #regr " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pand " #regfe "," #regb " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psubb " #regb ", " #regr " \n\t" - -// mm6 is supposed to contain 0xfefefefefefefefe -#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ - "movq " #rega ", " #regr " \n\t"\ - "movq " #regc ", " #regp " \n\t"\ - "pand " #regb ", " #regr " \n\t"\ - "pand " #regd ", " #regp " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pxor " #regc ", " #regd " \n\t"\ - "pand %%mm6, " #regb " \n\t"\ - "pand %%mm6, " #regd " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psrlq $1, " #regd " \n\t"\ - "paddb " #regb ", " #regr " \n\t"\ - "paddb " #regd ", " #regp " \n\t" - -#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ - "movq " #rega ", " #regr " \n\t"\ - "movq " #regc ", " #regp " \n\t"\ - "por " #regb ", " #regr " \n\t"\ - "por " #regd ", " #regp " \n\t"\ - "pxor " #rega ", " #regb " \n\t"\ - "pxor " #regc ", " #regd " \n\t"\ - "pand %%mm6, " #regb " \n\t"\ - "pand %%mm6, " #regd " \n\t"\ - "psrlq $1, " #regd " \n\t"\ - "psrlq $1, " #regb " \n\t"\ - "psubb " #regb ", " #regr " \n\t"\ - "psubb " #regd ", " #regp " \n\t" - -/***********************************/ -/* MMX no rounding */ -#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx -#define SET_RND MOVQ_WONE -#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) -#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) - -#include "dsputil_mmx_rnd_template.c" - -#undef DEF -#undef SET_RND -#undef PAVGBP -#undef PAVGB -/***********************************/ -/* MMX rounding */ - -#define DEF(x, y) x ## _ ## y ##_mmx -#define SET_RND MOVQ_WTWO -#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) -#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) - -#include "dsputil_mmx_rnd_template.c" - -#undef DEF -#undef SET_RND -#undef PAVGBP -#undef PAVGB - -/***********************************/ -/* 3Dnow specific */ - -#define DEF(x) x ## _3dnow -#define PAVGB "pavgusb" - -#include "dsputil_mmx_avg_template.c" - -#undef DEF -#undef PAVGB - -/***********************************/ 
-/* MMX2 specific */ - -#define DEF(x) x ## _mmx2 - -/* Introduced only in MMX2 set */ -#define PAVGB "pavgb" - -#include "dsputil_mmx_avg_template.c" - -#undef DEF -#undef PAVGB - -#define put_no_rnd_pixels16_mmx put_pixels16_mmx -#define put_no_rnd_pixels8_mmx put_pixels8_mmx -#define put_pixels16_mmx2 put_pixels16_mmx -#define put_pixels8_mmx2 put_pixels8_mmx -#define put_pixels4_mmx2 put_pixels4_mmx -#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx -#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx -#define put_pixels16_3dnow put_pixels16_mmx -#define put_pixels8_3dnow put_pixels8_mmx -#define put_pixels4_3dnow put_pixels4_mmx -#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx -#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx - -/***********************************/ -/* standard MMX */ - -void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - const DCTELEM *p; - uint8_t *pix; - - /* read the pixels */ - p = block; - pix = pixels; - /* unrolled loop */ - __asm__ volatile( - "movq %3, %%mm0 \n\t" - "movq 8%3, %%mm1 \n\t" - "movq 16%3, %%mm2 \n\t" - "movq 24%3, %%mm3 \n\t" - "movq 32%3, %%mm4 \n\t" - "movq 40%3, %%mm5 \n\t" - "movq 48%3, %%mm6 \n\t" - "movq 56%3, %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) - :"memory"); - pix += line_size*4; - p += 32; - - // if here would be an exact copy of the code above - // compiler would generate some very strange code - // thus using "r" - __asm__ volatile( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) - :"memory"); -} - -static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; - -void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - int i; - - movq_m2r(*vector128, mm1); - for (i = 0; i < 8; i++) { - movq_m2r(*(block), mm0); - packsswb_m2r(*(block + 4), mm0); - block += 8; - paddb_r2r(mm1, mm0); - movq_r2m(mm0, *pixels); - pixels += line_size; - } -} - -void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - const DCTELEM *p; - uint8_t *pix; - int i; - - /* read the pixels */ - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - i = 4; - do { - __asm__ volatile( - "movq (%2), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "movq %0, %%mm4 \n\t" - "movq %1, %%mm6 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm4, %%mm0 \n\t" - "paddsw %%mm5, %%mm1 \n\t" - "movq %%mm6, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm6, %%mm2 \n\t" - "paddsw %%mm5, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %0 \n\t" - "movq 
%%mm2, %1 \n\t" - :"+m"(*pix), "+m"(*(pix+line_size)) - :"r"(p) - :"memory"); - pix += line_size*2; - p += 16; - } while (--i); -} - -static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ASMALIGN(3) - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%1, %3), %%mm1 \n\t" - "movd %%mm0, (%2) \n\t" - "movd %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%1, %3), %%mm1 \n\t" - "movd %%mm0, (%2) \n\t" - "movd %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm4 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1, %3), %%mm5 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm4, 8(%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size) - : "%"REG_a, "memory" - ); -} - -static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1,%3), %%xmm1 \n\t" - "movdqu (%1,%3,2), %%xmm2 \n\t" - "movdqu (%1,%4), %%xmm3 \n\t" - "movdqa %%xmm0, (%2) \n\t" - "movdqa %%xmm1, (%2,%3) \n\t" - "movdqa %%xmm2, (%2,%3,2) \n\t" - "movdqa %%xmm3, (%2,%4) \n\t" - "subl $4, %0 \n\t" - "lea (%1,%3,4), %1 \n\t" - "lea (%2,%3,4), %2 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) - : "memory" - ); -} - -static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1,%3), %%xmm1 \n\t" - "movdqu (%1,%3,2), %%xmm2 \n\t" - "movdqu (%1,%4), %%xmm3 \n\t" - "pavgb (%2), %%xmm0 \n\t" - "pavgb (%2,%3), %%xmm1 \n\t" - "pavgb (%2,%3,2), %%xmm2 \n\t" - "pavgb (%2,%4), %%xmm3 \n\t" - "movdqa %%xmm0, (%2) \n\t" - "movdqa %%xmm1, (%2,%3) \n\t" - "movdqa %%xmm2, (%2,%3,2) \n\t" - "movdqa %%xmm3, (%2,%4) \n\t" - "subl $4, %0 \n\t" - "lea (%1,%3,4), %1 \n\t" - "lea (%2,%3,4), %2 \n\t" - "jnz 1b \n\t" - : "+g"(h), "+r" (pixels), "+r" (block) - : "r"((x86_reg)line_size), 
"r"((x86_reg)3L*line_size) - : "memory" - ); -} - -#define CLEAR_BLOCKS(name,n) \ -static void name(DCTELEM *blocks)\ -{\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "mov %1, %%"REG_a" \n\t"\ - "1: \n\t"\ - "movq %%mm7, (%0, %%"REG_a") \n\t"\ - "movq %%mm7, 8(%0, %%"REG_a") \n\t"\ - "movq %%mm7, 16(%0, %%"REG_a") \n\t"\ - "movq %%mm7, 24(%0, %%"REG_a") \n\t"\ - "add $32, %%"REG_a" \n\t"\ - " js 1b \n\t"\ - : : "r" (((uint8_t *)blocks)+128*n),\ - "i" (-128*n)\ - : "%"REG_a\ - );\ -} -CLEAR_BLOCKS(clear_blocks_mmx, 6) -CLEAR_BLOCKS(clear_block_mmx, 1) - -static void clear_block_sse(DCTELEM *block) -{ - __asm__ volatile( - "xorps %%xmm0, %%xmm0 \n" - "movaps %%xmm0, (%0) \n" - "movaps %%xmm0, 16(%0) \n" - "movaps %%xmm0, 32(%0) \n" - "movaps %%xmm0, 48(%0) \n" - "movaps %%xmm0, 64(%0) \n" - "movaps %%xmm0, 80(%0) \n" - "movaps %%xmm0, 96(%0) \n" - "movaps %%xmm0, 112(%0) \n" - :: "r"(block) - : "memory" - ); -} - -static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ - x86_reg i=0; - __asm__ volatile( - "jmp 2f \n\t" - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq (%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%2, %0) \n\t" - "movq 8(%1, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "2: \n\t" - "cmp %3, %0 \n\t" - " js 1b \n\t" - : "+r" (i) - : "r"(src), "r"(dst), "r"((x86_reg)w-15) - ); - for(; i<w; i++) - dst[i+0] += src[i+0]; -} - -static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ - x86_reg i=0; - __asm__ volatile( - "jmp 2f \n\t" - "1: \n\t" - "movq (%2, %0), %%mm0 \n\t" - "movq 8(%2, %0), %%mm1 \n\t" - "paddb (%3, %0), %%mm0 \n\t" - "paddb 8(%3, %0), %%mm1 \n\t" - "movq %%mm0, (%1, %0) \n\t" - "movq %%mm1, 8(%1, %0) \n\t" - "add $16, %0 \n\t" - "2: \n\t" - "cmp %4, %0 \n\t" - " js 1b \n\t" - : "+r" (i) - : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) - ); - for(; i<w; i++) - dst[i] = src1[i] + src2[i]; -} - -#define H263_LOOP_FILTER \ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm0 \n\t"\ - "movq %0, %%mm1 \n\t"\ - "movq %3, %%mm2 \n\t"\ - "movq %3, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm3, %%mm1 \n\t"\ - "movq %1, %%mm2 \n\t"\ - "movq %1, %%mm3 \n\t"\ - "movq %2, %%mm4 \n\t"\ - "movq %2, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "psubw %%mm2, %%mm4 \n\t"\ - "psubw %%mm3, %%mm5 \n\t"\ - "psllw $2, %%mm4 \n\t"\ - "psllw $2, %%mm5 \n\t"\ - "paddw %%mm0, %%mm4 \n\t"\ - "paddw %%mm1, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pcmpgtw %%mm4, %%mm6 \n\t"\ - "pcmpgtw %%mm5, %%mm7 \n\t"\ - "pxor %%mm6, %%mm4 \n\t"\ - "pxor %%mm7, %%mm5 \n\t"\ - "psubw %%mm6, %%mm4 \n\t"\ - "psubw %%mm7, %%mm5 \n\t"\ - "psrlw $3, %%mm4 \n\t"\ - "psrlw $3, %%mm5 \n\t"\ - "packuswb %%mm5, %%mm4 \n\t"\ - "packsswb %%mm7, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd %4, %%mm2 \n\t"\ - "punpcklbw %%mm2, %%mm2 \n\t"\ - "punpcklbw %%mm2, %%mm2 \n\t"\ - "punpcklbw %%mm2, %%mm2 \n\t"\ - "psubusb %%mm4, %%mm2 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "psubusb %%mm4, %%mm3 \n\t"\ - "psubb %%mm3, %%mm2 \n\t"\ - "movq %1, %%mm3 \n\t"\ - "movq %2, %%mm4 \n\t"\ - "pxor %%mm6, %%mm3 \n\t"\ - "pxor %%mm6, %%mm4 \n\t"\ - "paddusb %%mm2, %%mm3 \n\t"\ - "psubusb %%mm2, %%mm4 \n\t"\ - "pxor %%mm6, %%mm3 \n\t"\ - "pxor %%mm6, %%mm4 
\n\t"\ - "paddusb %%mm2, %%mm2 \n\t"\ - "packsswb %%mm1, %%mm0 \n\t"\ - "pcmpgtb %%mm0, %%mm7 \n\t"\ - "pxor %%mm7, %%mm0 \n\t"\ - "psubb %%mm7, %%mm0 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "psubusb %%mm2, %%mm0 \n\t"\ - "psubb %%mm0, %%mm1 \n\t"\ - "pand %5, %%mm1 \n\t"\ - "psrlw $2, %%mm1 \n\t"\ - "pxor %%mm7, %%mm1 \n\t"\ - "psubb %%mm7, %%mm1 \n\t"\ - "movq %0, %%mm5 \n\t"\ - "movq %3, %%mm6 \n\t"\ - "psubb %%mm1, %%mm5 \n\t"\ - "paddb %%mm1, %%mm6 \n\t" - -static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ - if(ENABLE_ANY_H263) { - const int strength= ff_h263_loop_filter_strength[qscale]; - - __asm__ volatile( - - H263_LOOP_FILTER - - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %0 \n\t" - "movq %%mm6, %3 \n\t" - : "+m" (*(uint64_t*)(src - 2*stride)), - "+m" (*(uint64_t*)(src - 1*stride)), - "+m" (*(uint64_t*)(src + 0*stride)), - "+m" (*(uint64_t*)(src + 1*stride)) - : "g" (2*strength), "m"(ff_pb_FC) - ); - } -} - -static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ - __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... - "movd %4, %%mm0 \n\t" - "movd %5, %%mm1 \n\t" - "movd %6, %%mm2 \n\t" - "movd %7, %%mm3 \n\t" - "punpcklbw %%mm1, %%mm0 \n\t" - "punpcklbw %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "punpcklwd %%mm2, %%mm0 \n\t" - "punpckhwd %%mm2, %%mm1 \n\t" - "movd %%mm0, %0 \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, %1 \n\t" - "movd %%mm1, %2 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movd %%mm1, %3 \n\t" - - : "=m" (*(uint32_t*)(dst + 0*dst_stride)), - "=m" (*(uint32_t*)(dst + 1*dst_stride)), - "=m" (*(uint32_t*)(dst + 2*dst_stride)), - "=m" (*(uint32_t*)(dst + 3*dst_stride)) - : "m" (*(uint32_t*)(src + 0*src_stride)), - "m" (*(uint32_t*)(src + 1*src_stride)), - "m" (*(uint32_t*)(src + 2*src_stride)), - "m" (*(uint32_t*)(src + 3*src_stride)) - ); -} - -static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ - if(ENABLE_ANY_H263) { - const int strength= ff_h263_loop_filter_strength[qscale]; - DECLARE_ALIGNED(8, uint64_t, temp[4]); - uint8_t *btemp= (uint8_t*)temp; - - src -= 2; - - transpose4x4(btemp , src , 8, stride); - transpose4x4(btemp+4, src + 4*stride, 8, stride); - __asm__ volatile( - H263_LOOP_FILTER // 5 3 4 6 - - : "+m" (temp[0]), - "+m" (temp[1]), - "+m" (temp[2]), - "+m" (temp[3]) - : "g" (2*strength), "m"(ff_pb_FC) - ); - - __asm__ volatile( - "movq %%mm5, %%mm1 \n\t" - "movq %%mm4, %%mm0 \n\t" - "punpcklbw %%mm3, %%mm5 \n\t" - "punpcklbw %%mm6, %%mm4 \n\t" - "punpckhbw %%mm3, %%mm1 \n\t" - "punpckhbw %%mm6, %%mm0 \n\t" - "movq %%mm5, %%mm3 \n\t" - "movq %%mm1, %%mm6 \n\t" - "punpcklwd %%mm4, %%mm5 \n\t" - "punpcklwd %%mm0, %%mm1 \n\t" - "punpckhwd %%mm4, %%mm3 \n\t" - "punpckhwd %%mm0, %%mm6 \n\t" - "movd %%mm5, (%0) \n\t" - "punpckhdq %%mm5, %%mm5 \n\t" - "movd %%mm5, (%0,%2) \n\t" - "movd %%mm3, (%0,%2,2) \n\t" - "punpckhdq %%mm3, %%mm3 \n\t" - "movd %%mm3, (%0,%3) \n\t" - "movd %%mm1, (%1) \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movd %%mm1, (%1,%2) \n\t" - "movd %%mm6, (%1,%2,2) \n\t" - "punpckhdq %%mm6, %%mm6 \n\t" - "movd %%mm6, (%1,%3) \n\t" - :: "r" (src), - "r" (src + 4*stride), - "r" ((x86_reg) stride ), - "r" ((x86_reg)(3*stride)) - ); - } -} - -/* draw the edges of width 'w' of an image of size width, height - this mmx version can only handle w==8 || w==16 */ -static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) -{ - uint8_t *ptr, *last_line; - int i; - - last_line = buf + (height - 1) * wrap; - /* 
left and right */ - ptr = buf; - if(w==8) - { - __asm__ volatile( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "punpckldq %%mm0, %%mm0 \n\t" - "movq %%mm0, -8(%0) \n\t" - "movq -8(%0, %2), %%mm1 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm1, (%0, %2) \n\t" - "add %1, %0 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) - ); - } - else - { - __asm__ volatile( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "punpckldq %%mm0, %%mm0 \n\t" - "movq %%mm0, -8(%0) \n\t" - "movq %%mm0, -16(%0) \n\t" - "movq -8(%0, %2), %%mm1 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm1, (%0, %2) \n\t" - "movq %%mm1, 8(%0, %2) \n\t" - "add %1, %0 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) - ); - } - - for(i=0;i<w;i+=4) { - /* top and bottom (and hopefully also the corners) */ - ptr= buf - (i + 1) * wrap - w; - __asm__ volatile( - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm0, (%0, %2) \n\t" - "movq %%mm0, (%0, %2, 2) \n\t" - "movq %%mm0, (%0, %3) \n\t" - "add $8, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) - ); - ptr= last_line + (i + 1) * wrap - w; - __asm__ volatile( - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm0, (%0, %2) \n\t" - "movq %%mm0, (%0, %2, 2) \n\t" - "movq %%mm0, (%0, %3) \n\t" - "add $8, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) - ); - } -} - -#define PAETH(cpu, abs3)\ -static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ -{\ - x86_reg i = -bpp;\ - x86_reg end = w-3;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n"\ - "movd (%1,%0), %%mm0 \n"\ - "movd (%2,%0), %%mm1 \n"\ - "punpcklbw %%mm7, %%mm0 \n"\ - "punpcklbw %%mm7, %%mm1 \n"\ - "add %4, %0 \n"\ - "1: \n"\ - "movq %%mm1, %%mm2 \n"\ - "movd (%2,%0), %%mm1 \n"\ - "movq %%mm2, %%mm3 \n"\ - "punpcklbw %%mm7, %%mm1 \n"\ - "movq %%mm2, %%mm4 \n"\ - "psubw %%mm1, %%mm3 \n"\ - "psubw %%mm0, %%mm4 \n"\ - "movq %%mm3, %%mm5 \n"\ - "paddw %%mm4, %%mm5 \n"\ - abs3\ - "movq %%mm4, %%mm6 \n"\ - "pminsw %%mm5, %%mm6 \n"\ - "pcmpgtw %%mm6, %%mm3 \n"\ - "pcmpgtw %%mm5, %%mm4 \n"\ - "movq %%mm4, %%mm6 \n"\ - "pand %%mm3, %%mm4 \n"\ - "pandn %%mm3, %%mm6 \n"\ - "pandn %%mm0, %%mm3 \n"\ - "movd (%3,%0), %%mm0 \n"\ - "pand %%mm1, %%mm6 \n"\ - "pand %%mm4, %%mm2 \n"\ - "punpcklbw %%mm7, %%mm0 \n"\ - "movq %6, %%mm5 \n"\ - "paddw %%mm6, %%mm0 \n"\ - "paddw %%mm2, %%mm3 \n"\ - "paddw %%mm3, %%mm0 \n"\ - "pand %%mm5, %%mm0 \n"\ - "movq %%mm0, %%mm3 \n"\ - "packuswb %%mm3, %%mm3 \n"\ - "movd %%mm3, (%1,%0) \n"\ - "add %4, %0 \n"\ - "cmp %5, %0 \n"\ - "jle 1b \n"\ - :"+r"(i)\ - :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ - "m"(ff_pw_255)\ - :"memory"\ - );\ -} - -#define ABS3_MMX2\ - "psubw %%mm5, %%mm7 \n"\ - "pmaxsw %%mm7, %%mm5 \n"\ - "pxor %%mm6, %%mm6 \n"\ - "pxor %%mm7, %%mm7 \n"\ - "psubw %%mm3, %%mm6 \n"\ - "psubw %%mm4, %%mm7 \n"\ - "pmaxsw %%mm6, %%mm3 \n"\ - "pmaxsw 
%%mm7, %%mm4 \n"\ - "pxor %%mm7, %%mm7 \n" - -#define ABS3_SSSE3\ - "pabsw %%mm3, %%mm3 \n"\ - "pabsw %%mm4, %%mm4 \n"\ - "pabsw %%mm5, %%mm5 \n" - -PAETH(mmx2, ABS3_MMX2) -#ifdef HAVE_SSSE3 -PAETH(ssse3, ABS3_SSSE3) -#endif - -#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ - "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ - "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ - "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ - "movq "#in7", " #m3 " \n\t" /* d */\ - "movq "#in0", %%mm5 \n\t" /* D */\ - "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ - "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ - "movq "#in1", %%mm5 \n\t" /* C */\ - "movq "#in2", %%mm6 \n\t" /* B */\ - "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ - "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ - "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ - "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ - "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ - "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ - "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ - "psraw $5, %%mm5 \n\t"\ - "packuswb %%mm5, %%mm5 \n\t"\ - OP(%%mm5, out, %%mm7, d) - -#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ -static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - uint64_t temp;\ -\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ - "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ - "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ - "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ - "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ - "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ - "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ - "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ - "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ - "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ - "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ - "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ - "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ - "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ - "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ - "paddw %%mm3, %%mm5 \n\t" /* b */\ - "paddw %%mm2, %%mm6 \n\t" /* c */\ - "paddw %%mm5, %%mm5 \n\t" /* 2b */\ - "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ - "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ - "paddw %%mm4, %%mm0 \n\t" /* a */\ - "paddw %%mm1, %%mm5 \n\t" /* d */\ - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ - "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ - "paddw %6, %%mm6 \n\t"\ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm0 \n\t"\ - "movq %%mm0, %5 \n\t"\ - /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ - \ - "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ - "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ - "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ - "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ - "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ - "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ - "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ - "paddw %%mm0, %%mm2 \n\t" /* b */\ - "paddw %%mm5, %%mm3 \n\t" /* c */\ - "paddw %%mm2, %%mm2 \n\t" /* 2b */\ - "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ - "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ - "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ - "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ - "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ - "paddw %%mm2, %%mm1 \n\t" /* a */\ - "paddw %%mm6, %%mm4 \n\t" /* d */\ - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ - "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ - "paddw %6, %%mm1 \n\t"\ - 
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ - "psraw $5, %%mm3 \n\t"\ - "movq %5, %%mm1 \n\t"\ - "packuswb %%mm3, %%mm1 \n\t"\ - OP_MMX2(%%mm1, (%1),%%mm4, q)\ - /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ - \ - "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ - "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ - "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ - "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ - "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ - "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ - "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ - "paddw %%mm1, %%mm5 \n\t" /* b */\ - "paddw %%mm4, %%mm0 \n\t" /* c */\ - "paddw %%mm5, %%mm5 \n\t" /* 2b */\ - "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ - "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ - "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ - "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ - "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ - "paddw %%mm3, %%mm2 \n\t" /* d */\ - "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ - "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ - "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ - "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ - "paddw %%mm2, %%mm6 \n\t" /* a */\ - "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ - "paddw %6, %%mm0 \n\t"\ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm0 \n\t"\ - /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ - \ - "paddw %%mm5, %%mm3 \n\t" /* a */\ - "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ - "paddw %%mm4, %%mm6 \n\t" /* b */\ - "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ - "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ - "paddw %%mm1, %%mm4 \n\t" /* c */\ - "paddw %%mm2, %%mm5 \n\t" /* d */\ - "paddw %%mm6, %%mm6 \n\t" /* 2b */\ - "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ - "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ - "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ - "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ - "paddw %6, %%mm4 \n\t"\ - "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm4 \n\t"\ - "packuswb %%mm4, %%mm0 \n\t"\ - OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ - \ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+D"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ - : "memory"\ - );\ -}\ -\ -static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - int i;\ - int16_t temp[16];\ - /* quick HACK, XXX FIXME MUST be optimized */\ - for(i=0; i<h; i++)\ - {\ - temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ - temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ - temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ - temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ - temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ - temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ - temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ - temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ - temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ - temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ - 
temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ - temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ - temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ - temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ - temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ - temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ - __asm__ volatile(\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm1 \n\t"\ - "paddw %2, %%mm0 \n\t"\ - "paddw %2, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP_3DNOW(%%mm0, (%1), %%mm1, q)\ - "movq 16(%0), %%mm0 \n\t"\ - "movq 24(%0), %%mm1 \n\t"\ - "paddw %2, %%mm0 \n\t"\ - "paddw %2, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ - :: "r"(temp), "r"(dst), "m"(ROUNDER)\ - : "memory"\ - );\ - dst+=dstStride;\ - src+=srcStride;\ - }\ -}\ -\ -static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ - "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ - "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ - "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ - "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ - "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ - "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ - "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ - "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ - "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ - "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ - "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ - "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ - "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ - "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ - "paddw %%mm3, %%mm5 \n\t" /* b */\ - "paddw %%mm2, %%mm6 \n\t" /* c */\ - "paddw %%mm5, %%mm5 \n\t" /* 2b */\ - "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ - "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ - "paddw %%mm4, %%mm0 \n\t" /* a */\ - "paddw %%mm1, %%mm5 \n\t" /* d */\ - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ - "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ - "paddw %5, %%mm6 \n\t"\ - "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm0 \n\t"\ - /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ - \ - "movd 5(%0), %%mm5 \n\t" /* FGHI */\ - "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ - "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ - "paddw %%mm5, %%mm1 \n\t" /* a */\ - "paddw %%mm6, %%mm2 \n\t" /* b */\ - "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ - "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ - "paddw %%mm6, %%mm3 \n\t" /* c */\ - "paddw %%mm5, %%mm4 \n\t" /* d */\ - "paddw %%mm2, %%mm2 \n\t" /* 2b */\ - "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ - "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ - "paddw %5, %%mm1 \n\t"\ - "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ - "psraw $5, %%mm3 \n\t"\ - "packuswb %%mm3, %%mm0 \n\t"\ - OP_MMX2(%%mm0, (%1), %%mm4, q)\ - \ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(h)\ - 
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\ - : "memory"\ - );\ -}\ -\ -static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - int i;\ - int16_t temp[8];\ - /* quick HACK, XXX FIXME MUST be optimized */\ - for(i=0; i<h; i++)\ - {\ - temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ - temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ - temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ - temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ - temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ - temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ - temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ - temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ - __asm__ volatile(\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm1 \n\t"\ - "paddw %2, %%mm0 \n\t"\ - "paddw %2, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP_3DNOW(%%mm0, (%1), %%mm1, q)\ - :: "r"(temp), "r"(dst), "m"(ROUNDER)\ - :"memory"\ - );\ - dst+=dstStride;\ - src+=srcStride;\ - }\ -} - -#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ -\ -static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - uint64_t temp[17*4];\ - uint64_t *temp_ptr= temp;\ - int count= 17;\ -\ - /*FIXME unroll */\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq (%0), %%mm1 \n\t"\ - "movq 8(%0), %%mm2 \n\t"\ - "movq 8(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "movq %%mm0, (%1) \n\t"\ - "movq %%mm1, 17*8(%1) \n\t"\ - "movq %%mm2, 2*17*8(%1) \n\t"\ - "movq %%mm3, 3*17*8(%1) \n\t"\ - "add $8, %1 \n\t"\ - "add %3, %0 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+r" (src), "+r" (temp_ptr), "+r"(count)\ - : "r" ((x86_reg)srcStride)\ - : "memory"\ - );\ - \ - temp_ptr= temp;\ - count=4;\ - \ -/*FIXME reorder for speed */\ - __asm__ volatile(\ - /*"pxor %%mm7, %%mm7 \n\t"*/\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm1 \n\t"\ - "movq 16(%0), %%mm2 \n\t"\ - "movq 24(%0), %%mm3 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ - \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), 
OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ - \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ - "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ - \ - "add $136, %0 \n\t"\ - "add %6, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - \ - : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ - : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\ - :"memory"\ - );\ -}\ -\ -static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - uint64_t temp[9*2];\ - uint64_t *temp_ptr= temp;\ - int count= 9;\ -\ - /*FIXME unroll */\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq (%0), %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "movq %%mm0, (%1) \n\t"\ - "movq %%mm1, 9*8(%1) \n\t"\ - "add $8, %1 \n\t"\ - "add %3, %0 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+r" (src), "+r" (temp_ptr), "+r"(count)\ - : "r" ((x86_reg)srcStride)\ - : "memory"\ - );\ - \ - temp_ptr= temp;\ - count=2;\ - \ -/*FIXME reorder for speed */\ - __asm__ volatile(\ - /*"pxor %%mm7, %%mm7 \n\t"*/\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm1 \n\t"\ - "movq 16(%0), %%mm2 \n\t"\ - "movq 24(%0), %%mm3 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ - \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ - \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ - "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ - \ - "add $72, %0 \n\t"\ - "add %6, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - \ - : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ - : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[8];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, 
int stride){\ - OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[8];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[8];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ - OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ -}\ -\ -static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[8];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ - OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ -}\ -static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc23_ ## 
MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half) + 64;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ - OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ -}\ -static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ -}\ -static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[8 + 9];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ -}\ -static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[9];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ - OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ -}\ -static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[32];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[32];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[32];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ - OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ -}\ -\ -static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[32];\ - uint8_t * const half= (uint8_t*)temp;\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ - OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ -}\ -static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## 
MMX(dst, halfH, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[16*2 + 17*2];\ - uint8_t * const halfH= ((uint8_t*)half) + 256;\ - uint8_t * const halfHV= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ - OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ -}\ -static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[17*2];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ -}\ -static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[17*2];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ -}\ -static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t half[17*2];\ - uint8_t * const halfH= ((uint8_t*)half);\ - put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ -} - -#define PUT_OP(a,b,temp, size) "mov" #size " " 
#a ", " #b " \n\t" -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMX2_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) -QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) -QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) -QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) -QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) -QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) -QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) -QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) -QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) - -/***********************************/ -/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */ - -#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\ -static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\ -} -#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\ -static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\ -} - -#define QPEL_2TAP(OPNAME, SIZE, MMX)\ -QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\ -QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\ -QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\ -static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\ - OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\ -static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\ - OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\ -static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\ - OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\ -static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\ -}\ -static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\ -}\ -QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\ -QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\ -QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\ -QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\ -QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\ -QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\ -QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\ -QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\ - -QPEL_2TAP(put_, 16, mmx2) -QPEL_2TAP(avg_, 16, mmx2) -QPEL_2TAP(put_, 8, mmx2) -QPEL_2TAP(avg_, 8, mmx2) -QPEL_2TAP(put_, 16, 3dnow) -QPEL_2TAP(avg_, 16, 3dnow) -QPEL_2TAP(put_, 8, 3dnow) -QPEL_2TAP(avg_, 8, 3dnow) - - -#if 0 -static void just_return() { return; } -#endif - -static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){ - const int w = 8; - const int ix = ox>>(16+shift); - const int iy = oy>>(16+shift); - const int oxs = ox>>4; - const int oys = oy>>4; - const int dxxs = dxx>>4; - const int dxys = dxy>>4; - const int dyxs = dyx>>4; - const int dyys = dyy>>4; - const uint16_t r4[4] = {r,r,r,r}; - const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; - const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; - const 
uint64_t shift2 = 2*shift; - uint8_t edge_buf[(h+1)*stride]; - int x, y; - - const int dxw = (dxx-(1<<(16+shift)))*(w-1); - const int dyh = (dyy-(1<<(16+shift)))*(h-1); - const int dxh = dxy*(h-1); - const int dyw = dyx*(w-1); - if( // non-constant fullpel offset (3% of blocks) - ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) | - (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift) - // uses more than 16 bits of subpel mv (only at huge resolution) - || (dxx|dxy|dyx|dyy)&15 ) - { - //FIXME could still use mmx for some of the rows - ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); - return; - } - - src += ix + iy*stride; - if( (unsigned)ix >= width-w || - (unsigned)iy >= height-h ) - { - ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); - src = edge_buf; - } - - __asm__ volatile( - "movd %0, %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - :: "r"(1<<shift) - ); - - for(x=0; x<w; x+=4){ - uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0), - oxs - dxys + dxxs*(x+1), - oxs - dxys + dxxs*(x+2), - oxs - dxys + dxxs*(x+3) }; - uint16_t dy4[4] = { oys - dyys + dyxs*(x+0), - oys - dyys + dyxs*(x+1), - oys - dyys + dyxs*(x+2), - oys - dyys + dyxs*(x+3) }; - - for(y=0; y<h; y++){ - __asm__ volatile( - "movq %0, %%mm4 \n\t" - "movq %1, %%mm5 \n\t" - "paddw %2, %%mm4 \n\t" - "paddw %3, %%mm5 \n\t" - "movq %%mm4, %0 \n\t" - "movq %%mm5, %1 \n\t" - "psrlw $12, %%mm4 \n\t" - "psrlw $12, %%mm5 \n\t" - : "+m"(*dx4), "+m"(*dy4) - : "m"(*dxy4), "m"(*dyy4) - ); - - __asm__ volatile( - "movq %%mm6, %%mm2 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubw %%mm4, %%mm2 \n\t" - "psubw %%mm5, %%mm1 \n\t" - "movq %%mm2, %%mm0 \n\t" - "movq %%mm4, %%mm3 \n\t" - "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy) - "pmullw %%mm5, %%mm3 \n\t" // dx*dy - "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy - "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy) - - "movd %4, %%mm5 \n\t" - "movd %3, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy - "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy - - "movd %2, %%mm5 \n\t" - "movd %1, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy) - "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy) - "paddw %5, %%mm1 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm2, %%mm0 \n\t" - - "psrlw %6, %%mm0 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "movd %%mm0, %0 \n\t" - - : "=m"(dst[x+y*stride]) - : "m"(src[0]), "m"(src[1]), - "m"(src[stride]), "m"(src[stride+1]), - "m"(*r4), "m"(shift2) - ); - src += stride; - } - src += 4-h*stride; - } -} - -#define PREFETCH(name, op) \ -static void name(void *mem, int stride, int h){\ - const uint8_t *p= mem;\ - do{\ - __asm__ volatile(#op" %0" :: "m"(*p));\ - p+= stride;\ - }while(--h);\ -} -PREFETCH(prefetch_mmx2, prefetcht0) -PREFETCH(prefetch_3dnow, prefetch) -#undef PREFETCH - -#include "h264dsp_mmx.c" - -/* CAVS specific */ -void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx); -void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx); - -void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { - put_pixels8_mmx(dst, src, stride, 8); -} -void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { - avg_pixels8_mmx(dst, src, stride, 8); -} -void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { - 
put_pixels16_mmx(dst, src, stride, 16); -} -void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { - avg_pixels16_mmx(dst, src, stride, 16); -} - -/* VC1 specific */ -void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); - -void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { - put_pixels8_mmx(dst, src, stride, 8); -} - -/* external functions, from idct_mmx.c */ -void ff_mmx_idct(DCTELEM *block); -void ff_mmxext_idct(DCTELEM *block); - -/* XXX: those functions should be suppressed ASAP when all IDCTs are - converted */ -#ifdef CONFIG_GPL -static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_mmx_idct (block); - put_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_mmx_idct (block); - add_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_mmxext_idct (block); - put_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_mmxext_idct (block); - add_pixels_clamped_mmx(block, dest, line_size); -} -#endif -static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_idct_xvid_mmx (block); - put_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_idct_xvid_mmx (block); - add_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_idct_xvid_mmx2 (block); - put_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_idct_xvid_mmx2 (block); - add_pixels_clamped_mmx(block, dest, line_size); -} - -static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) -{ - int i; - __asm__ volatile("pxor %%mm7, %%mm7":); - for(i=0; i<blocksize; i+=2) { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "movq %%mm1, %%mm3 \n\t" - "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 - "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 - "pslld $31, %%mm2 \n\t" // keep only the sign bit - "pxor %%mm2, %%mm1 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pand %%mm1, %%mm3 \n\t" - "pandn %%mm1, %%mm4 \n\t" - "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) - "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) - "movq %%mm3, %1 \n\t" - "movq %%mm0, %0 \n\t" - :"+m"(mag[i]), "+m"(ang[i]) - ::"memory" - ); - } - __asm__ volatile("femms"); -} -static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) -{ - int i; - - __asm__ volatile( - "movaps %0, %%xmm5 \n\t" - ::"m"(ff_pdw_80000000[0]) - ); - for(i=0; i<blocksize; i+=4) { - __asm__ volatile( - "movaps %0, %%xmm0 \n\t" - "movaps %1, %%xmm1 \n\t" - "xorps %%xmm2, %%xmm2 \n\t" - "xorps %%xmm3, %%xmm3 \n\t" - "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 - "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 - "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit - "xorps %%xmm2, %%xmm1 \n\t" - "movaps %%xmm3, %%xmm4 \n\t" - "andps %%xmm1, %%xmm3 \n\t" - "andnps %%xmm1, %%xmm4 \n\t" - "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) - "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) - "movaps %%xmm3, %1 \n\t" - "movaps %%xmm0, %0 \n\t" - :"+m"(mag[i]), "+m"(ang[i]) - 
::"memory" - ); - } -} - -#define IF1(x) x -#define IF0(x) - -#define MIX5(mono,stereo)\ - __asm__ volatile(\ - "movss 0(%2), %%xmm5 \n"\ - "movss 8(%2), %%xmm6 \n"\ - "movss 24(%2), %%xmm7 \n"\ - "shufps $0, %%xmm5, %%xmm5 \n"\ - "shufps $0, %%xmm6, %%xmm6 \n"\ - "shufps $0, %%xmm7, %%xmm7 \n"\ - "1: \n"\ - "movaps (%0,%1), %%xmm0 \n"\ - "movaps 0x400(%0,%1), %%xmm1 \n"\ - "movaps 0x800(%0,%1), %%xmm2 \n"\ - "movaps 0xc00(%0,%1), %%xmm3 \n"\ - "movaps 0x1000(%0,%1), %%xmm4 \n"\ - "mulps %%xmm5, %%xmm0 \n"\ - "mulps %%xmm6, %%xmm1 \n"\ - "mulps %%xmm5, %%xmm2 \n"\ - "mulps %%xmm7, %%xmm3 \n"\ - "mulps %%xmm7, %%xmm4 \n"\ - stereo("addps %%xmm1, %%xmm0 \n")\ - "addps %%xmm1, %%xmm2 \n"\ - "addps %%xmm3, %%xmm0 \n"\ - "addps %%xmm4, %%xmm2 \n"\ - mono("addps %%xmm2, %%xmm0 \n")\ - "movaps %%xmm0, (%0,%1) \n"\ - stereo("movaps %%xmm2, 0x400(%0,%1) \n")\ - "add $16, %0 \n"\ - "jl 1b \n"\ - :"+&r"(i)\ - :"r"(samples[0]+len), "r"(matrix)\ - :"memory"\ - ); - -#define MIX_MISC(stereo)\ - __asm__ volatile(\ - "1: \n"\ - "movaps (%3,%0), %%xmm0 \n"\ - stereo("movaps %%xmm0, %%xmm1 \n")\ - "mulps %%xmm6, %%xmm0 \n"\ - stereo("mulps %%xmm7, %%xmm1 \n")\ - "lea 1024(%3,%0), %1 \n"\ - "mov %5, %2 \n"\ - "2: \n"\ - "movaps (%1), %%xmm2 \n"\ - stereo("movaps %%xmm2, %%xmm3 \n")\ - "mulps (%4,%2), %%xmm2 \n"\ - stereo("mulps 16(%4,%2), %%xmm3 \n")\ - "addps %%xmm2, %%xmm0 \n"\ - stereo("addps %%xmm3, %%xmm1 \n")\ - "add $1024, %1 \n"\ - "add $32, %2 \n"\ - "jl 2b \n"\ - "movaps %%xmm0, (%3,%0) \n"\ - stereo("movaps %%xmm1, 1024(%3,%0) \n")\ - "add $16, %0 \n"\ - "jl 1b \n"\ - :"+&r"(i), "=&r"(j), "=&r"(k)\ - :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\ - :"memory"\ - ); - -static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len) -{ - int (*matrix_cmp)[2] = (int(*)[2])matrix; - intptr_t i,j,k; - - i = -len*sizeof(float); - if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) { - MIX5(IF0,IF1); - } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { - MIX5(IF1,IF0); - } else { - DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]); - j = 2*in_ch*sizeof(float); - __asm__ volatile( - "1: \n" - "sub $8, %0 \n" - "movss (%2,%0), %%xmm6 \n" - "movss 4(%2,%0), %%xmm7 \n" - "shufps $0, %%xmm6, %%xmm6 \n" - "shufps $0, %%xmm7, %%xmm7 \n" - "movaps %%xmm6, (%1,%0,4) \n" - "movaps %%xmm7, 16(%1,%0,4) \n" - "jg 1b \n" - :"+&r"(j) - :"r"(matrix_simd), "r"(matrix) - :"memory" - ); - if(out_ch == 2) { - MIX_MISC(IF1); - } else { - MIX_MISC(IF0); - } - } -} - -static void vector_fmul_3dnow(float *dst, const float *src, int len){ - x86_reg i = (len-4)*4; - __asm__ volatile( - "1: \n\t" - "movq (%1,%0), %%mm0 \n\t" - "movq 8(%1,%0), %%mm1 \n\t" - "pfmul (%2,%0), %%mm0 \n\t" - "pfmul 8(%2,%0), %%mm1 \n\t" - "movq %%mm0, (%1,%0) \n\t" - "movq %%mm1, 8(%1,%0) \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - "femms \n\t" - :"+r"(i) - :"r"(dst), "r"(src) - :"memory" - ); -} -static void vector_fmul_sse(float *dst, const float *src, int len){ - x86_reg i = (len-8)*4; - __asm__ volatile( - "1: \n\t" - "movaps (%1,%0), %%xmm0 \n\t" - "movaps 16(%1,%0), %%xmm1 \n\t" - "mulps (%2,%0), %%xmm0 \n\t" - "mulps 16(%2,%0), %%xmm1 \n\t" - "movaps %%xmm0, (%1,%0) \n\t" - "movaps %%xmm1, 16(%1,%0) \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i) - :"r"(dst), "r"(src) - :"memory" - ); -} 
- -static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ - x86_reg i = len*4-16; - __asm__ volatile( - "1: \n\t" - "pswapd 8(%1), %%mm0 \n\t" - "pswapd (%1), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "movq %%mm0, (%2,%0) \n\t" - "movq %%mm1, 8(%2,%0) \n\t" - "add $16, %1 \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(src1) - :"r"(dst), "r"(src0) - ); - __asm__ volatile("femms"); -} -static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){ - x86_reg i = len*4-32; - __asm__ volatile( - "1: \n\t" - "movaps 16(%1), %%xmm0 \n\t" - "movaps (%1), %%xmm1 \n\t" - "shufps $0x1b, %%xmm0, %%xmm0 \n\t" - "shufps $0x1b, %%xmm1, %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "movaps %%xmm0, (%2,%0) \n\t" - "movaps %%xmm1, 16(%2,%0) \n\t" - "add $32, %1 \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(src1) - :"r"(dst), "r"(src0) - ); -} - -static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, - const float *src2, int src3, int len, int step){ - x86_reg i = (len-4)*4; - if(step == 2 && src3 == 0){ - dst += (len-4)*2; - __asm__ volatile( - "1: \n\t" - "movq (%2,%0), %%mm0 \n\t" - "movq 8(%2,%0), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "pfadd (%4,%0), %%mm0 \n\t" - "pfadd 8(%4,%0), %%mm1 \n\t" - "movd %%mm0, (%1) \n\t" - "movd %%mm1, 16(%1) \n\t" - "psrlq $32, %%mm0 \n\t" - "psrlq $32, %%mm1 \n\t" - "movd %%mm0, 8(%1) \n\t" - "movd %%mm1, 24(%1) \n\t" - "sub $32, %1 \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(dst) - :"r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else if(step == 1 && src3 == 0){ - __asm__ volatile( - "1: \n\t" - "movq (%2,%0), %%mm0 \n\t" - "movq 8(%2,%0), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "pfadd (%4,%0), %%mm0 \n\t" - "pfadd 8(%4,%0), %%mm1 \n\t" - "movq %%mm0, (%1,%0) \n\t" - "movq %%mm1, 8(%1,%0) \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - :"+r"(i) - :"r"(dst), "r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); - __asm__ volatile("femms"); -} -static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1, - const float *src2, int src3, int len, int step){ - x86_reg i = (len-8)*4; - if(step == 2 && src3 == 0){ - dst += (len-8)*2; - __asm__ volatile( - "1: \n\t" - "movaps (%2,%0), %%xmm0 \n\t" - "movaps 16(%2,%0), %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "addps (%4,%0), %%xmm0 \n\t" - "addps 16(%4,%0), %%xmm1 \n\t" - "movss %%xmm0, (%1) \n\t" - "movss %%xmm1, 32(%1) \n\t" - "movhlps %%xmm0, %%xmm2 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movss %%xmm2, 16(%1) \n\t" - "movss %%xmm3, 48(%1) \n\t" - "shufps $0xb1, %%xmm0, %%xmm0 \n\t" - "shufps $0xb1, %%xmm1, %%xmm1 \n\t" - "movss %%xmm0, 8(%1) \n\t" - "movss %%xmm1, 40(%1) \n\t" - "movhlps %%xmm0, %%xmm2 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movss %%xmm2, 24(%1) \n\t" - "movss %%xmm3, 56(%1) \n\t" - "sub $64, %1 \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(dst) - :"r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else if(step == 1 && src3 == 0){ - __asm__ volatile( - "1: \n\t" - "movaps (%2,%0), %%xmm0 \n\t" - "movaps 16(%2,%0), %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "addps (%4,%0), %%xmm0 \n\t" - "addps 16(%4,%0), %%xmm1 \n\t" - "movaps 
%%xmm0, (%1,%0) \n\t" - "movaps %%xmm1, 16(%1,%0) \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i) - :"r"(dst), "r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); -} - -static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, - const float *win, float add_bias, int len){ -#ifdef HAVE_6REGS - if(add_bias == 0){ - x86_reg i = -len*4; - x86_reg j = len*4-8; - __asm__ volatile( - "1: \n" - "pswapd (%5,%1), %%mm1 \n" - "movq (%5,%0), %%mm0 \n" - "pswapd (%4,%1), %%mm5 \n" - "movq (%3,%0), %%mm4 \n" - "movq %%mm0, %%mm2 \n" - "movq %%mm1, %%mm3 \n" - "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] - "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] - "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] - "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] - "pfadd %%mm3, %%mm2 \n" - "pfsub %%mm0, %%mm1 \n" - "pswapd %%mm2, %%mm2 \n" - "movq %%mm1, (%2,%0) \n" - "movq %%mm2, (%2,%1) \n" - "sub $8, %1 \n" - "add $8, %0 \n" - "jl 1b \n" - "femms \n" - :"+r"(i), "+r"(j) - :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) - ); - }else -#endif - ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); -} - -static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, - const float *win, float add_bias, int len){ -#ifdef HAVE_6REGS - if(add_bias == 0){ - x86_reg i = -len*4; - x86_reg j = len*4-16; - __asm__ volatile( - "1: \n" - "movaps (%5,%1), %%xmm1 \n" - "movaps (%5,%0), %%xmm0 \n" - "movaps (%4,%1), %%xmm5 \n" - "movaps (%3,%0), %%xmm4 \n" - "shufps $0x1b, %%xmm1, %%xmm1 \n" - "shufps $0x1b, %%xmm5, %%xmm5 \n" - "movaps %%xmm0, %%xmm2 \n" - "movaps %%xmm1, %%xmm3 \n" - "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] - "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] - "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] - "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] - "addps %%xmm3, %%xmm2 \n" - "subps %%xmm0, %%xmm1 \n" - "shufps $0x1b, %%xmm2, %%xmm2 \n" - "movaps %%xmm1, (%2,%0) \n" - "movaps %%xmm2, (%2,%1) \n" - "sub $16, %1 \n" - "add $16, %0 \n" - "jl 1b \n" - :"+r"(i), "+r"(j) - :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) - ); - }else -#endif - ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); -} - -static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) -{ - x86_reg i = -4*len; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "1: \n" - "cvtpi2ps (%2,%0), %%xmm0 \n" - "cvtpi2ps 8(%2,%0), %%xmm1 \n" - "cvtpi2ps 16(%2,%0), %%xmm2 \n" - "cvtpi2ps 24(%2,%0), %%xmm3 \n" - "movlhps %%xmm1, %%xmm0 \n" - "movlhps %%xmm3, %%xmm2 \n" - "mulps %%xmm4, %%xmm0 \n" - "mulps %%xmm4, %%xmm2 \n" - "movaps %%xmm0, (%1,%0) \n" - "movaps %%xmm2, 16(%1,%0) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(dst+len), "r"(src+len), "m"(mul) - ); -} - -static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) -{ - x86_reg i = -4*len; - __asm__ volatile( - "movss %3, %%xmm4 \n" - "shufps $0, %%xmm4, %%xmm4 \n" - "1: \n" - "cvtdq2ps (%2,%0), %%xmm0 \n" - "cvtdq2ps 16(%2,%0), %%xmm1 \n" - "mulps %%xmm4, %%xmm0 \n" - "mulps %%xmm4, %%xmm1 \n" - "movaps %%xmm0, (%1,%0) \n" - "movaps %%xmm1, 16(%1,%0) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(dst+len), "r"(src+len), "m"(mul) - ); -} - -static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - // not bit-exact: pf2id uses different rounding than C and 
SSE - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "pf2id (%2,%0,2) , %%mm0 \n\t" - "pf2id 8(%2,%0,2) , %%mm1 \n\t" - "pf2id 16(%2,%0,2) , %%mm2 \n\t" - "pf2id 24(%2,%0,2) , %%mm3 \n\t" - "packssdw %%mm1 , %%mm0 \n\t" - "packssdw %%mm3 , %%mm2 \n\t" - "movq %%mm0 , (%1,%0) \n\t" - "movq %%mm2 , 8(%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - "femms \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} -static void float_to_int16_sse(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "cvtps2pi (%2,%0,2) , %%mm0 \n\t" - "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" - "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" - "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" - "packssdw %%mm1 , %%mm0 \n\t" - "packssdw %%mm3 , %%mm2 \n\t" - "movq %%mm0 , (%1,%0) \n\t" - "movq %%mm2 , 8(%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - "emms \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} - -static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ - x86_reg reglen = len; - __asm__ volatile( - "add %0 , %0 \n\t" - "lea (%2,%0,2) , %2 \n\t" - "add %0 , %1 \n\t" - "neg %0 \n\t" - "1: \n\t" - "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" - "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" - "packssdw %%xmm1 , %%xmm0 \n\t" - "movdqa %%xmm0 , (%1,%0) \n\t" - "add $16 , %0 \n\t" - " js 1b \n\t" - :"+r"(reglen), "+r"(dst), "+r"(src) - ); -} - -#ifdef HAVE_YASM -void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); -void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); -void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); -void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); -void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); -static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) -{ - ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); - ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); -} -void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); -void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); -#else -#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) -#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) -#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) -#endif -#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse - -#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ -/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ -static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ - DECLARE_ALIGNED_16(int16_t, tmp[len]);\ - int i,j,c;\ - for(c=0; c<channels; c++){\ - float_to_int16_##cpu(tmp, src[c], len);\ - for(i=0, j=c; i<len; i++, j+=channels)\ - dst[j] = tmp[i];\ - }\ -}\ -\ -static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ - 
if(channels==1)\ - float_to_int16_##cpu(dst, src[0], len);\ - else if(channels==2){\ - x86_reg reglen = len; \ - const float *src0 = src[0];\ - const float *src1 = src[1];\ - __asm__ volatile(\ - "shl $2, %0 \n"\ - "add %0, %1 \n"\ - "add %0, %2 \n"\ - "add %0, %3 \n"\ - "neg %0 \n"\ - body\ - :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ - );\ - }else if(channels==6){\ - ff_float_to_int16_interleave6_##cpu(dst, src, len);\ - }else\ - float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ -} - -FLOAT_TO_INT16_INTERLEAVE(3dnow, - "1: \n" - "pf2id (%2,%0), %%mm0 \n" - "pf2id 8(%2,%0), %%mm1 \n" - "pf2id (%3,%0), %%mm2 \n" - "pf2id 8(%3,%0), %%mm3 \n" - "packssdw %%mm1, %%mm0 \n" - "packssdw %%mm3, %%mm2 \n" - "movq %%mm0, %%mm1 \n" - "punpcklwd %%mm2, %%mm0 \n" - "punpckhwd %%mm2, %%mm1 \n" - "movq %%mm0, (%1,%0)\n" - "movq %%mm1, 8(%1,%0)\n" - "add $16, %0 \n" - "js 1b \n" - "femms \n" -) - -FLOAT_TO_INT16_INTERLEAVE(sse, - "1: \n" - "cvtps2pi (%2,%0), %%mm0 \n" - "cvtps2pi 8(%2,%0), %%mm1 \n" - "cvtps2pi (%3,%0), %%mm2 \n" - "cvtps2pi 8(%3,%0), %%mm3 \n" - "packssdw %%mm1, %%mm0 \n" - "packssdw %%mm3, %%mm2 \n" - "movq %%mm0, %%mm1 \n" - "punpcklwd %%mm2, %%mm0 \n" - "punpckhwd %%mm2, %%mm1 \n" - "movq %%mm0, (%1,%0)\n" - "movq %%mm1, 8(%1,%0)\n" - "add $16, %0 \n" - "js 1b \n" - "emms \n" -) - -FLOAT_TO_INT16_INTERLEAVE(sse2, - "1: \n" - "cvtps2dq (%2,%0), %%xmm0 \n" - "cvtps2dq (%3,%0), %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "movhlps %%xmm0, %%xmm1 \n" - "punpcklwd %%xmm1, %%xmm0 \n" - "movdqa %%xmm0, (%1,%0) \n" - "add $16, %0 \n" - "js 1b \n" -) - -static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ - if(channels==6) - ff_float_to_int16_interleave6_3dn2(dst, src, len); - else - float_to_int16_interleave_3dnow(dst, src, len, channels); -} - - -void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width); -void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width); -void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); -void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); -void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); -void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); - - -static void add_int16_sse2(int16_t * v1, int16_t * v2, int order) -{ - x86_reg o = -(order << 1); - v1 += order; - v2 += order; - __asm__ volatile( - "1: \n\t" - "movdqu (%1,%2), %%xmm0 \n\t" - "movdqu 16(%1,%2), %%xmm1 \n\t" - "paddw (%0,%2), %%xmm0 \n\t" - "paddw 16(%0,%2), %%xmm1 \n\t" - "movdqa %%xmm0, (%0,%2) \n\t" - "movdqa %%xmm1, 16(%0,%2) \n\t" - "add $32, %2 \n\t" - "js 1b \n\t" - : "+r"(v1), "+r"(v2), "+r"(o) - ); -} - -static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order) -{ - x86_reg o = -(order << 1); - v1 += order; - v2 += order; - __asm__ volatile( - "1: \n\t" - "movdqa (%0,%2), %%xmm0 \n\t" - "movdqa 16(%0,%2), %%xmm2 \n\t" - "movdqu (%1,%2), %%xmm1 \n\t" - "movdqu 16(%1,%2), %%xmm3 \n\t" - "psubw %%xmm1, %%xmm0 \n\t" - "psubw %%xmm3, %%xmm2 \n\t" - "movdqa %%xmm0, (%0,%2) \n\t" - "movdqa %%xmm2, 16(%0,%2) \n\t" - "add $32, %2 \n\t" - "js 1b \n\t" - : "+r"(v1), 
"+r"(v2), "+r"(o) - ); -} - -static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift) -{ - int res = 0; - DECLARE_ALIGNED_16(int64_t, sh); - x86_reg o = -(order << 1); - - v1 += order; - v2 += order; - sh = shift; - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "1: \n\t" - "movdqu (%0,%3), %%xmm0 \n\t" - "movdqu 16(%0,%3), %%xmm1 \n\t" - "pmaddwd (%1,%3), %%xmm0 \n\t" - "pmaddwd 16(%1,%3), %%xmm1 \n\t" - "paddd %%xmm0, %%xmm7 \n\t" - "paddd %%xmm1, %%xmm7 \n\t" - "add $32, %3 \n\t" - "js 1b \n\t" - "movhlps %%xmm7, %%xmm2 \n\t" - "paddd %%xmm2, %%xmm7 \n\t" - "psrad %4, %%xmm7 \n\t" - "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t" - "paddd %%xmm2, %%xmm7 \n\t" - "movd %%xmm7, %2 \n\t" - : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o) - : "m"(sh) - ); - return res; -} - -void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) -{ - mm_flags = mm_support(); - - if (avctx->dsp_mask) { - if (avctx->dsp_mask & FF_MM_FORCE) - mm_flags |= (avctx->dsp_mask & 0xffff); - else - mm_flags &= ~(avctx->dsp_mask & 0xffff); - } - -#if 0 - av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); - if (mm_flags & FF_MM_MMX) - av_log(avctx, AV_LOG_INFO, " mmx"); - if (mm_flags & FF_MM_MMXEXT) - av_log(avctx, AV_LOG_INFO, " mmxext"); - if (mm_flags & FF_MM_3DNOW) - av_log(avctx, AV_LOG_INFO, " 3dnow"); - if (mm_flags & FF_MM_SSE) - av_log(avctx, AV_LOG_INFO, " sse"); - if (mm_flags & FF_MM_SSE2) - av_log(avctx, AV_LOG_INFO, " sse2"); - av_log(avctx, AV_LOG_INFO, "\n"); -#endif - - if (mm_flags & FF_MM_MMX) { - const int idct_algo= avctx->idct_algo; - - if(avctx->lowres==0){ - if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ - c->idct_put= ff_simple_idct_put_mmx; - c->idct_add= ff_simple_idct_add_mmx; - c->idct = ff_simple_idct_mmx; - c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; -#ifdef CONFIG_GPL - }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ - if(mm_flags & FF_MM_MMXEXT){ - c->idct_put= ff_libmpeg2mmx2_idct_put; - c->idct_add= ff_libmpeg2mmx2_idct_add; - c->idct = ff_mmxext_idct; - }else{ - c->idct_put= ff_libmpeg2mmx_idct_put; - c->idct_add= ff_libmpeg2mmx_idct_add; - c->idct = ff_mmx_idct; - } - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; -#endif - }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER) && - idct_algo==FF_IDCT_VP3){ - if(mm_flags & FF_MM_SSE2){ - c->idct_put= ff_vp3_idct_put_sse2; - c->idct_add= ff_vp3_idct_add_sse2; - c->idct = ff_vp3_idct_sse2; - c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; - }else{ - c->idct_put= ff_vp3_idct_put_mmx; - c->idct_add= ff_vp3_idct_add_mmx; - c->idct = ff_vp3_idct_mmx; - c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; - } - }else if(idct_algo==FF_IDCT_CAVS){ - c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; - }else if(idct_algo==FF_IDCT_XVIDMMX){ - if(mm_flags & FF_MM_SSE2){ - c->idct_put= ff_idct_xvid_sse2_put; - c->idct_add= ff_idct_xvid_sse2_add; - c->idct = ff_idct_xvid_sse2; - c->idct_permutation_type= FF_SSE2_IDCT_PERM; - }else if(mm_flags & FF_MM_MMXEXT){ - c->idct_put= ff_idct_xvid_mmx2_put; - c->idct_add= ff_idct_xvid_mmx2_add; - c->idct = ff_idct_xvid_mmx2; - }else{ - c->idct_put= ff_idct_xvid_mmx_put; - c->idct_add= ff_idct_xvid_mmx_add; - c->idct = ff_idct_xvid_mmx; - } - } - } - - c->put_pixels_clamped = put_pixels_clamped_mmx; - c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; - c->add_pixels_clamped = add_pixels_clamped_mmx; - c->clear_block = clear_block_mmx; - c->clear_blocks = clear_blocks_mmx; - if (mm_flags & FF_MM_SSE) - c->clear_block = 
clear_block_sse; - -#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ - c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ - c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU - - SET_HPEL_FUNCS(put, 0, 16, mmx); - SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx); - SET_HPEL_FUNCS(avg, 0, 16, mmx); - SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx); - SET_HPEL_FUNCS(put, 1, 8, mmx); - SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); - SET_HPEL_FUNCS(avg, 1, 8, mmx); - SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); - - c->gmc= gmc_mmx; - - c->add_bytes= add_bytes_mmx; - c->add_bytes_l2= add_bytes_l2_mmx; - - c->draw_edges = draw_edges_mmx; - - if (ENABLE_ANY_H263) { - c->h263_v_loop_filter= h263_v_loop_filter_mmx; - c->h263_h_loop_filter= h263_h_loop_filter_mmx; - } - c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd; - c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; - c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd; - - c->h264_idct_dc_add= - c->h264_idct_add= ff_h264_idct_add_mmx; - c->h264_idct8_dc_add= - c->h264_idct8_add= ff_h264_idct8_add_mmx; - - c->h264_idct_add16 = ff_h264_idct_add16_mmx; - c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; - c->h264_idct_add8 = ff_h264_idct_add8_mmx; - c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; - - if (mm_flags & FF_MM_MMXEXT) { - c->prefetch = prefetch_mmx2; - - c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; - c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; - - c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; - - c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; - c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; - - c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; - c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; - c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; - - c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; - c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; - c->h264_idct_add16 = ff_h264_idct_add16_mmx2; - c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; - c->h264_idct_add8 = ff_h264_idct_add8_mmx2; - c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; - - if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) { - c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2; - c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2; - } - } - -#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \ - c->PFX ## 
_pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU - - SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2); - - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2); - - SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2); - SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2); - SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2); - - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; - c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2; - c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2; - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; - c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; - c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; - c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; - c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; - c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; - - c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; - c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; - c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; - c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; - c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; - c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; - c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; - c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; - - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; - c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; - c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; - c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; - - if (ENABLE_CAVS_DECODER) - ff_cavsdsp_init_mmx2(c, avctx); - - if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER) - ff_vc1dsp_init_mmx(c, avctx); - - c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; - } else if (mm_flags & FF_MM_3DNOW) { - c->prefetch = prefetch_3dnow; - - c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; - c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; - - c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; - - c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; - c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; - - c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; - 
c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; - c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; - } - - SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow); - - SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow); - SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow); - - SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow); - SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow); - SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); - - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; - - if (ENABLE_CAVS_DECODER) - ff_cavsdsp_init_3dnow(c, avctx); - } - - -#define H264_QPEL_FUNCS(x, y, CPU)\ - c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ - c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ - c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ - c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; - if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){ - // these functions are slower than mmx on AMD, but faster on Intel -/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma - c->put_pixels_tab[0][0] = put_pixels16_sse2; - c->avg_pixels_tab[0][0] = avg_pixels16_sse2; -*/ - H264_QPEL_FUNCS(0, 0, sse2); - } - if(mm_flags & FF_MM_SSE2){ - c->h264_idct8_add = ff_h264_idct8_add_sse2; - c->h264_idct8_add4= ff_h264_idct8_add4_sse2; - - H264_QPEL_FUNCS(0, 1, sse2); - H264_QPEL_FUNCS(0, 2, sse2); - H264_QPEL_FUNCS(0, 3, sse2); - H264_QPEL_FUNCS(1, 1, sse2); - H264_QPEL_FUNCS(1, 2, sse2); - H264_QPEL_FUNCS(1, 3, sse2); - H264_QPEL_FUNCS(2, 1, sse2); - H264_QPEL_FUNCS(2, 2, sse2); - H264_QPEL_FUNCS(2, 3, sse2); - H264_QPEL_FUNCS(3, 1, sse2); - H264_QPEL_FUNCS(3, 2, sse2); - H264_QPEL_FUNCS(3, 3, sse2); - } -#ifdef HAVE_SSSE3 - if(mm_flags & FF_MM_SSSE3){ - H264_QPEL_FUNCS(1, 0, ssse3); - H264_QPEL_FUNCS(1, 1, ssse3); - H264_QPEL_FUNCS(1, 2, ssse3); - H264_QPEL_FUNCS(1, 3, ssse3); - H264_QPEL_FUNCS(2, 0, ssse3); - H264_QPEL_FUNCS(2, 1, ssse3); - H264_QPEL_FUNCS(2, 2, ssse3); - H264_QPEL_FUNCS(2, 3, ssse3); - H264_QPEL_FUNCS(3, 0, ssse3); - H264_QPEL_FUNCS(3, 1, ssse3); - H264_QPEL_FUNCS(3, 2, ssse3); - H264_QPEL_FUNCS(3, 3, ssse3); - c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd; - c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; - c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; - c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; - } -#endif - -#if defined(CONFIG_GPL) && defined(HAVE_YASM) - if( mm_flags&FF_MM_MMXEXT ){ 
-#ifdef ARCH_X86_32 - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; -#endif - if( mm_flags&FF_MM_SSE2 ){ - c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; - c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; - } - } -#endif - -#ifdef CONFIG_SNOW_DECODER - if(mm_flags & FF_MM_SSE2 & 0){ - c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; -#ifdef HAVE_7REGS - c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; -#endif - c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; - } - else{ - if(mm_flags & FF_MM_MMXEXT){ - c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; -#ifdef HAVE_7REGS - c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; -#endif - } - c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; - } -#endif - - if(mm_flags & FF_MM_3DNOW){ - c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; - c->vector_fmul = vector_fmul_3dnow; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16 = float_to_int16_3dnow; - c->float_to_int16_interleave = float_to_int16_interleave_3dnow; - } - } - if(mm_flags & FF_MM_3DNOWEXT){ - c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; - c->vector_fmul_window = vector_fmul_window_3dnow2; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16_interleave = float_to_int16_interleave_3dn2; - } - } - if(mm_flags & FF_MM_SSE){ - c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; - c->ac3_downmix = ac3_downmix_sse; - c->vector_fmul = vector_fmul_sse; - c->vector_fmul_reverse = vector_fmul_reverse_sse; - c->vector_fmul_add_add = vector_fmul_add_add_sse; - c->vector_fmul_window = vector_fmul_window_sse; - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; - c->float_to_int16 = float_to_int16_sse; - c->float_to_int16_interleave = float_to_int16_interleave_sse; - } - if(mm_flags & FF_MM_3DNOW) - c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse - if(mm_flags & FF_MM_SSE2){ - c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; - c->float_to_int16 = float_to_int16_sse2; - c->float_to_int16_interleave = float_to_int16_interleave_sse2; - c->add_int16 = add_int16_sse2; - c->sub_int16 = sub_int16_sse2; - c->scalarproduct_int16 = scalarproduct_int16_sse2; - } - } - - if (ENABLE_ENCODERS) - dsputilenc_init_mmx(c, avctx); - -#if 0 - // for speed testing - get_pixels = just_return; - put_pixels_clamped = just_return; - add_pixels_clamped = just_return; - - pix_abs16x16 = just_return; - pix_abs16x16_x2 = just_return; - pix_abs16x16_y2 = just_return; - pix_abs16x16_xy2 = just_return; - - put_pixels_tab[0] = just_return; - put_pixels_tab[1] = just_return; - put_pixels_tab[2] = just_return; - put_pixels_tab[3] = just_return; - - put_no_rnd_pixels_tab[0] = just_return; - put_no_rnd_pixels_tab[1] = just_return; - put_no_rnd_pixels_tab[2] = just_return; - put_no_rnd_pixels_tab[3] = just_return; - - avg_pixels_tab[0] = just_return; - avg_pixels_tab[1] = just_return; - avg_pixels_tab[2] = just_return; - avg_pixels_tab[3] = just_return; - - avg_no_rnd_pixels_tab[0] = just_return; - avg_no_rnd_pixels_tab[1] = just_return; - avg_no_rnd_pixels_tab[2] = just_return; - avg_no_rnd_pixels_tab[3] = just_return; - - //av_fdct = just_return; - //ff_idct = just_return; -#endif -}
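The hunk above ends with dsputil_init_mmx(), which probes the CPU once and then overwrites the generic DSPContext function pointers with MMX/MMX2/3DNow!/SSE/SSE2/SSSE3 variants. As a rough illustration of that dispatch pattern only — the struct, flag names and functions below are invented for the sketch and are not part of this changeset:

```c
#include <stdint.h>

#define EX_FLAG_MMX  0x0001   /* hypothetical feature bits, not the FF_MM_* values */
#define EX_FLAG_SSE2 0x0004

typedef struct ExampleDSP {
    void (*clear_block)(int16_t *block);   /* operates on 64 coefficients */
} ExampleDSP;

static void clear_block_c(int16_t *block)
{
    for (int i = 0; i < 64; i++)
        block[i] = 0;
}

/* Stand-in for the asm versions; the real init points at
 * clear_block_mmx / clear_block_sse defined in the file above. */
static void clear_block_simd(int16_t *block) { clear_block_c(block); }

static void example_dsp_init(ExampleDSP *c, int cpu_flags)
{
    c->clear_block = clear_block_c;        /* portable default */
    if (cpu_flags & EX_FLAG_MMX)
        c->clear_block = clear_block_simd; /* MMX-class replacement */
    if (cpu_flags & EX_FLAG_SSE2)
        c->clear_block = clear_block_simd; /* widest available SIMD wins last */
}
```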
--- a/i386/dsputil_mmx.h Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,154 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_I386_DSPUTIL_MMX_H -#define AVCODEC_I386_DSPUTIL_MMX_H - -#include <stdint.h> -#include "libavcodec/dsputil.h" - -typedef struct { uint64_t a, b; } xmm_reg; - -extern const uint64_t ff_bone; -extern const uint64_t ff_wtwo; - -extern const uint64_t ff_pdw_80000000[2]; - -extern const uint64_t ff_pw_3; -extern const uint64_t ff_pw_4; -extern const xmm_reg ff_pw_5; -extern const xmm_reg ff_pw_8; -extern const uint64_t ff_pw_15; -extern const xmm_reg ff_pw_16; -extern const uint64_t ff_pw_20; -extern const xmm_reg ff_pw_28; -extern const xmm_reg ff_pw_32; -extern const uint64_t ff_pw_42; -extern const uint64_t ff_pw_64; -extern const uint64_t ff_pw_96; -extern const uint64_t ff_pw_128; -extern const uint64_t ff_pw_255; - -extern const uint64_t ff_pb_1; -extern const uint64_t ff_pb_3; -extern const uint64_t ff_pb_7; -extern const uint64_t ff_pb_1F; -extern const uint64_t ff_pb_3F; -extern const uint64_t ff_pb_81; -extern const uint64_t ff_pb_A1; -extern const uint64_t ff_pb_FC; - -extern const double ff_pd_1[2]; -extern const double ff_pd_2[2]; - -#define LOAD4(stride,in,a,b,c,d)\ - "movq 0*"#stride"+"#in", "#a"\n\t"\ - "movq 1*"#stride"+"#in", "#b"\n\t"\ - "movq 2*"#stride"+"#in", "#c"\n\t"\ - "movq 3*"#stride"+"#in", "#d"\n\t" - -#define STORE4(stride,out,a,b,c,d)\ - "movq "#a", 0*"#stride"+"#out"\n\t"\ - "movq "#b", 1*"#stride"+"#out"\n\t"\ - "movq "#c", 2*"#stride"+"#out"\n\t"\ - "movq "#d", 3*"#stride"+"#out"\n\t" - -/* in/out: mma=mma+mmb, mmb=mmb-mma */ -#define SUMSUB_BA( a, b ) \ - "paddw "#b", "#a" \n\t"\ - "paddw "#b", "#b" \n\t"\ - "psubw "#a", "#b" \n\t" - -#define SBUTTERFLY(a,b,t,n,m)\ - "mov" #m " " #a ", " #t " \n\t" /* abcd */\ - "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ - "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ - -#define TRANSPOSE4(a,b,c,d,t)\ - SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ - SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ - SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ - SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ - -// e,f,g,h can be memory -// out: a,d,t,c -#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\ - "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\ - "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\ - "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\ - "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\ - SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\ - /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\ - SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\ - /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\ - SBUTTERFLY(a, 
c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\ - /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\ - SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\ - /* c= a3 b3 c3 d3 e3 f3 g3 h3 */ - -#ifdef ARCH_X86_64 -// permutes 01234567 -> 05736421 -#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ - SBUTTERFLY(a,b,%%xmm8,wd,dqa)\ - SBUTTERFLY(c,d,b,wd,dqa)\ - SBUTTERFLY(e,f,d,wd,dqa)\ - SBUTTERFLY(g,h,f,wd,dqa)\ - SBUTTERFLY(a,c,h,dq,dqa)\ - SBUTTERFLY(%%xmm8,b,c,dq,dqa)\ - SBUTTERFLY(e,g,b,dq,dqa)\ - SBUTTERFLY(d,f,g,dq,dqa)\ - SBUTTERFLY(a,e,f,qdq,dqa)\ - SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\ - SBUTTERFLY(h,b,d,qdq,dqa)\ - SBUTTERFLY(c,g,b,qdq,dqa)\ - "movdqa %%xmm8, "#g" \n\t" -#else -#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ - "movdqa "#h", "#t" \n\t"\ - SBUTTERFLY(a,b,h,wd,dqa)\ - "movdqa "#h", 16"#t" \n\t"\ - "movdqa "#t", "#h" \n\t"\ - SBUTTERFLY(c,d,b,wd,dqa)\ - SBUTTERFLY(e,f,d,wd,dqa)\ - SBUTTERFLY(g,h,f,wd,dqa)\ - SBUTTERFLY(a,c,h,dq,dqa)\ - "movdqa "#h", "#t" \n\t"\ - "movdqa 16"#t", "#h" \n\t"\ - SBUTTERFLY(h,b,c,dq,dqa)\ - SBUTTERFLY(e,g,b,dq,dqa)\ - SBUTTERFLY(d,f,g,dq,dqa)\ - SBUTTERFLY(a,e,f,qdq,dqa)\ - SBUTTERFLY(h,d,e,qdq,dqa)\ - "movdqa "#h", 16"#t" \n\t"\ - "movdqa "#t", "#h" \n\t"\ - SBUTTERFLY(h,b,d,qdq,dqa)\ - SBUTTERFLY(c,g,b,qdq,dqa)\ - "movdqa 16"#t", "#g" \n\t" -#endif - -#define MOVQ_WONE(regd) \ - __asm__ volatile ( \ - "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ - "psrlw $15, %%" #regd ::) - -void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx); - -#endif /* AVCODEC_I386_DSPUTIL_MMX_H */
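The removed header above is largely transpose helpers built from SBUTTERFLY (punpckl/punpckh interleaves). What TRANSPOSE4 computes on four MMX registers of packed 16-bit words is an ordinary 4x4 transpose; for reference, a plain-C, array-based equivalent (illustrative only, not part of the changeset):

```c
#include <stdint.h>

/* Same data movement as TRANSPOSE4, expressed over arrays instead of
 * MMX registers: element (row, col) ends up at (col, row). */
static void transpose4x4_words(int16_t dst[4][4], const int16_t src[4][4])
{
    for (int row = 0; row < 4; row++)
        for (int col = 0; col < 4; col++)
            dst[col][row] = src[row][col];
}
```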
--- a/i386/dsputil_mmx_avg_template.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,896 +0,0 @@ -/* - * DSP utils : average functions are compiled twice for 3dnow/mmx2 - * Copyright (c) 2000, 2001 Fabrice Bellard. - * Copyright (c) 2002-2004 Michael Niedermayer - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm - clobber bug - now it will work with 2.95.2 and also with -fPIC - */ -static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $4, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "movd (%2), %%mm2 \n\t" - "movd 4(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "movd 8(%2), %%mm2 \n\t" - "movd 12(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $16, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - - -static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ 
- __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... -/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "pcmpeqb %%mm6, %%mm6 \n\t" - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movd (%1), %%mm0 \n\t" - "movd (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $4, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 4(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movd (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movd (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 8(%2), %%mm0 \n\t" - PAVGB" 12(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movd %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movd %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $16, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - - -static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - "movq %%mm0, (%3) \n\t" - "add %5, %3 \n\t" - PAVGB" (%3), %%mm1 \n\t" - "movq %%mm1, (%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%1, %3), %%mm3 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 9(%1), %%mm2 \n\t" - PAVGB" 9(%1, %3), %%mm3 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm2, 8(%2) \n\t" - "movq %%mm3, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%1, %3), %%mm3 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 9(%1), %%mm2 \n\t" - PAVGB" 9(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq %%mm2, 8(%2) \n\t" - "movq %%mm3, 8(%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" 8(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - PAVGB" 16(%2), %%mm0 \n\t" - PAVGB" 24(%2), %%mm1 \n\t" - PAVGB" (%3), %%mm0 \n\t" - PAVGB" 8(%3), %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... -/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - __asm__ volatile( - "pcmpeqb %%mm6, %%mm6 \n\t" - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%2), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "pxor %%mm6, %%mm2 \n\t" - "pxor %%mm6, %%mm3 \n\t" - PAVGB" %%mm2, %%mm0 \n\t" - PAVGB" %%mm3, %%mm1 \n\t" - "pxor %%mm6, %%mm0 \n\t" - "pxor %%mm6, %%mm1 \n\t" - "movq %%mm0, (%3) \n\t" - "movq %%mm1, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -//the following should be used, though better not with gcc ... 
-/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) - :"r"(src1Stride), "r"(dstStride) - :"memory");*/ -} - -/* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm0 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm0 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D" (block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -/* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "psubusb %%mm6, %%mm1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D" (block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%2), %%mm0 \n\t" - "movq (%2, %3), %%mm1 \n\t" - PAVGB" (%1), %%mm0 \n\t" - PAVGB" (%1, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%2), %%mm0 \n\t" - "movq (%2, %3), %%mm1 \n\t" - PAVGB" (%1), %%mm0 \n\t" - PAVGB" (%1, %3), %%mm1 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add 
%%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm2 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm2 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm2 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm2 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm2, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - "sub %3, %2 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - "movq (%2, %3), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - PAVGB" %%mm3, %%mm0 \n\t" - PAVGB" %%mm4, %%mm1 \n\t" - "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - "movq (%2, %3), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - PAVGB" %%mm3, %%mm2 \n\t" - PAVGB" %%mm4, %%mm1 \n\t" - "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -/* Note this is not correctly rounded, but this function is only - * used for B-frames so it does not matter. 
*/ -static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BONE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - PAVGB" 1(%1), %%mm0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "psubusb %%mm6, %%mm2 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm0 \n\t" - PAVGB" %%mm2, %%mm1 \n\t" - PAVGB" (%2), %%mm0 \n\t" - PAVGB" (%2, %3), %%mm1 \n\t" - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" - "add %%"REG_a", %2 \n\t" - "add %%"REG_a", %1 \n\t" - PAVGB" %%mm1, %%mm2 \n\t" - PAVGB" %%mm0, %%mm1 \n\t" - PAVGB" (%2), %%mm2 \n\t" - PAVGB" (%2, %3), %%mm1 \n\t" - "movq %%mm2, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r" ((x86_reg)line_size) - :"%"REG_a, "memory"); -} - -static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - do { - __asm__ volatile( - "movd (%1), %%mm0 \n\t" - "movd (%1, %2), %%mm1 \n\t" - "movd (%1, %2, 2), %%mm2 \n\t" - "movd (%1, %3), %%mm3 \n\t" - PAVGB" (%0), %%mm0 \n\t" - PAVGB" (%0, %2), %%mm1 \n\t" - PAVGB" (%0, %2, 2), %%mm2 \n\t" - PAVGB" (%0, %3), %%mm3 \n\t" - "movd %%mm0, (%1) \n\t" - "movd %%mm1, (%1, %2) \n\t" - "movd %%mm2, (%1, %2, 2) \n\t" - "movd %%mm3, (%1, %3) \n\t" - ::"S"(pixels), "D"(block), - "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size) - :"memory"); - block += 4*line_size; - pixels += 4*line_size; - h -= 4; - } while(h > 0); -} - -//FIXME the following could be optimized too ... 
-static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); - DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); -} -static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_pixels8_y2)(block , pixels , line_size, h); - DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); - DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8)(block , pixels , line_size, h); - DEF(avg_pixels8)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_x2)(block , pixels , line_size, h); - DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_y2)(block , pixels , line_size, h); - DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); -} -static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg_pixels8_xy2)(block , pixels , line_size, h); - DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); -} - -#define QPEL_2TAP_L3(OPNAME) \ -static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%1,%2), %%mm0 \n\t"\ - "movq 8(%1,%2), %%mm1 \n\t"\ - PAVGB" (%1,%3), %%mm0 \n\t"\ - PAVGB" 8(%1,%3), %%mm1 \n\t"\ - PAVGB" (%1), %%mm0 \n\t"\ - PAVGB" 8(%1), %%mm1 \n\t"\ - STORE_OP( (%1,%4),%%mm0)\ - STORE_OP(8(%1,%4),%%mm1)\ - "movq %%mm0, (%1,%4) \n\t"\ - "movq %%mm1, 8(%1,%4) \n\t"\ - "add %5, %1 \n\t"\ - "decl %0 \n\t"\ - "jnz 1b \n\t"\ - :"+g"(h), "+r"(src)\ - :"r"((x86_reg)off1), "r"((x86_reg)off2),\ - "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ - :"memory"\ - );\ -}\ -static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%1,%2), %%mm0 \n\t"\ - PAVGB" (%1,%3), %%mm0 \n\t"\ - PAVGB" (%1), %%mm0 \n\t"\ - STORE_OP((%1,%4),%%mm0)\ - "movq %%mm0, (%1,%4) \n\t"\ - "add %5, %1 \n\t"\ - "decl %0 \n\t"\ - "jnz 1b \n\t"\ - :"+g"(h), "+r"(src)\ - :"r"((x86_reg)off1), "r"((x86_reg)off2),\ - "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ - :"memory"\ - );\ -} - -#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" -QPEL_2TAP_L3(avg_) -#undef STORE_OP -#define STORE_OP(a,b) -QPEL_2TAP_L3(put_) -#undef STORE_OP -#undef QPEL_2TAP_L3
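The averaging template above is built around PAVGB, which rounds up, while the put_no_rnd_* variants recover the truncating average by XOR-ing inputs and output with an all-ones register (the pcmpeqb %%mm6 / pxor / PAVGB / pxor pattern). A scalar sketch of the two behaviours, illustrative only and not part of the changeset:

```c
#include <stdint.h>

static uint8_t avg_rnd(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);    /* what PAVGB computes */
}

/* Complementing around PAVGB turns its round-up into truncation:
 * ~(( ~a + ~b + 1) >> 1) == (a + b) >> 1 for 8-bit values. */
static uint8_t avg_no_rnd(uint8_t a, uint8_t b)
{
    unsigned na = a ^ 0xff, nb = b ^ 0xff; /* pxor with all-ones (mm6) */
    unsigned r  = (na + nb + 1) >> 1;      /* PAVGB on the complements */
    return (uint8_t)(r ^ 0xff);            /* complement the result    */
}
```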
--- a/i386/dsputil_mmx_qns_template.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,101 +0,0 @@ -/* - * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3 - * Copyright (c) 2004 Michael Niedermayer - * - * MMX optimization by Michael Niedermayer <michaelni@gmx.at> - * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0)) - -static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale) -{ - x86_reg i=0; - - assert(FFABS(scale) < MAX_ABS); - scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; - - SET_RND(mm6); - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movd %4, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) - "paddw (%2, %0), %%mm0 \n\t" - "paddw 8(%2, %0), %%mm1 \n\t" - "psraw $6, %%mm0 \n\t" - "psraw $6, %%mm1 \n\t" - "pmullw (%3, %0), %%mm0 \n\t" - "pmullw 8(%3, %0), %%mm1 \n\t" - "pmaddwd %%mm0, %%mm0 \n\t" - "pmaddwd %%mm1, %%mm1 \n\t" - "paddd %%mm1, %%mm0 \n\t" - "psrld $4, %%mm0 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "add $16, %0 \n\t" - "cmp $128, %0 \n\t" //FIXME optimize & bench - " jb 1b \n\t" - PHADDD(%%mm7, %%mm6) - "psrld $2, %%mm7 \n\t" - "movd %%mm7, %0 \n\t" - - : "+r" (i) - : "r"(basis), "r"(rem), "r"(weight), "g"(scale) - ); - return i; -} - -static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale) -{ - x86_reg i=0; - - if(FFABS(scale) < MAX_ABS){ - scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; - SET_RND(mm6); - __asm__ volatile( - "movd %3, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - "punpcklwd %%mm5, %%mm5 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%1, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) - "paddw (%2, %0), %%mm0 \n\t" - "paddw 8(%2, %0), %%mm1 \n\t" - "movq %%mm0, (%2, %0) \n\t" - "movq %%mm1, 8(%2, %0) \n\t" - "add $16, %0 \n\t" - "cmp $128, %0 \n\t" // FIXME optimize & bench - " jb 1b \n\t" - - : "+r" (i) - : "r"(basis), "r"(rem), "g"(scale) - ); - }else{ - for(i=0; i<8*8; i++){ - rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); - } - } -}
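The QNS template above ends with add_8x8basis(), whose own C fallback (taken when |scale| >= MAX_ABS) states the operation most plainly: add a scaled, rounded basis function into the 8x8 residual. A standalone version of that fallback loop, with placeholder shift constants standing in for BASIS_SHIFT/RECON_SHIFT from dsputil.h (illustrative only):

```c
#include <stdint.h>

/* EX_BASIS_SHIFT / EX_RECON_SHIFT are placeholders for the BASIS_SHIFT and
 * RECON_SHIFT constants the template inherits from the surrounding code. */
#define EX_BASIS_SHIFT 16
#define EX_RECON_SHIFT 6

static void add_8x8basis_c(int16_t rem[64], const int16_t basis[64], int scale)
{
    for (int i = 0; i < 64; i++)
        rem[i] += (basis[i] * scale +
                   (1 << (EX_BASIS_SHIFT - EX_RECON_SHIFT - 1)))
                  >> (EX_BASIS_SHIFT - EX_RECON_SHIFT);
}
```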
--- a/i386/dsputil_mmx_rnd_template.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,590 +0,0 @@ -/* - * DSP utils mmx functions are compiled twice for rnd/no_rnd - * Copyright (c) 2000, 2001 Fabrice Bellard. - * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> - * and improved by Zdenek Kabelac <kabi@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -// put_pixels -static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "add $8, %2 \n\t" - PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) - "movq %%mm4, (%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm5, (%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 16(%2), %%mm1 \n\t" - "add %4, %1 \n\t" - "movq (%1), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - "add $32, %2 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "add %5, %3 \n\t" - "movq %%mm5, (%3) \n\t" - "add %5, %3 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - -static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t 
*pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "testl $1, %0 \n\t" - " jz 1f \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - "add $16, %2 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "movq %%mm5, 8(%3) \n\t" - "add %5, %3 \n\t" - "decl %0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%2), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 8(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "movq %%mm5, 8(%3) \n\t" - "add %5, %3 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 16(%2), %%mm1 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "add %4, %1 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%3) \n\t" - "movq %%mm5, 8(%3) \n\t" - "add %5, %3 \n\t" - "add $32, %2 \n\t" - "subl $2, %0 \n\t" - "jnz 1b \n\t" -#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used - :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#else - :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) -#endif - :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) - :"memory"); -} - -static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"),%%mm2 \n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"),%%mm0 \n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 
for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "movq %%mm4, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// avg_pixels -static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "movd %1, %%mm1 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - "movd %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -// in case more speed is needed - unroling would certainly help -static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %0, %%mm0 \n\t" - "movq %1, %%mm1 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, %0 \n\t" - "movq 8%0, %%mm0 \n\t" - "movq 8%1, %%mm1 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - "movq %%mm2, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } - while (--h); -} - -static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } while (--h); -} - -static av_unused void DEF(avg, 
pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq %2, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - :"+m"(*dst) - :"m"(*src1), "m"(*src2) - :"memory"); - dst += dstStride; - src1 += src1Stride; - src2 += 8; - } while (--h); -} - -static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq 1%1, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - "movq 8%1, %%mm0 \n\t" - "movq 9%1, %%mm1 \n\t" - "movq 8%0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, 8%0 \n\t" - :"+m"(*block) - :"m"(*pixels) - :"memory"); - pixels += line_size; - block += line_size; - } while (--h); -} - -static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) -{ - MOVQ_BFE(mm6); - JUMPALIGN(); - do { - __asm__ volatile( - "movq %1, %%mm0 \n\t" - "movq %2, %%mm1 \n\t" - "movq %0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, %0 \n\t" - "movq 8%1, %%mm0 \n\t" - "movq 8%2, %%mm1 \n\t" - "movq 8%0, %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, 8%0 \n\t" - :"+m"(*dst) - :"m"(*src1), "m"(*src2) - :"memory"); - dst += dstStride; - src1 += src1Stride; - src2 += 16; - } while (--h); -} - -static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm2, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"REG_a", %1 \n\t" - "add %%"REG_a", %2 \n\t" - - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :REG_a, "memory"); -} - -// this routine is 'slightly' suboptimal but mostly unused -static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"REG_a", %%"REG_a" \n\t" - "add %3, %1 \n\t" - ASMALIGN(3) - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - 
"punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"REG_a") \n\t" - "add %3, %%"REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :REG_a, "memory"); -} - -//FIXME optimize -static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put, pixels8_y2)(block , pixels , line_size, h); - DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); -} - -static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(put, pixels8_xy2)(block , pixels , line_size, h); - DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg, pixels8_y2)(block , pixels , line_size, h); - DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); -} - -static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ - DEF(avg, pixels8_xy2)(block , pixels , line_size, h); - DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); -}
--- a/i386/dsputil_yasm.asm Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ -;****************************************************************************** -;* MMX optimized DSP utils -;* Copyright (c) 2008 Loren Merritt -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86inc.asm" - -section .text align=16 - -%macro PSWAPD_SSE 2 - pshufw %1, %2, 0x4e -%endmacro -%macro PSWAPD_3DN1 2 - movq %1, %2 - psrlq %1, 32 - punpckldq %1, %2 -%endmacro - -%macro FLOAT_TO_INT16_INTERLEAVE6 1 -; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) -cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 -%ifdef ARCH_X86_64 - %define lend r10d - mov lend, r2d -%else - %define lend dword r2m -%endif - mov src1q, [srcq+1*gprsize] - mov src2q, [srcq+2*gprsize] - mov src3q, [srcq+3*gprsize] - mov src4q, [srcq+4*gprsize] - mov src5q, [srcq+5*gprsize] - mov srcq, [srcq] - sub src1q, srcq - sub src2q, srcq - sub src3q, srcq - sub src4q, srcq - sub src5q, srcq -.loop: - cvtps2pi mm0, [srcq] - cvtps2pi mm1, [srcq+src1q] - cvtps2pi mm2, [srcq+src2q] - cvtps2pi mm3, [srcq+src3q] - cvtps2pi mm4, [srcq+src4q] - cvtps2pi mm5, [srcq+src5q] - packssdw mm0, mm3 - packssdw mm1, mm4 - packssdw mm2, mm5 - pswapd mm3, mm0 - punpcklwd mm0, mm1 - punpckhwd mm1, mm2 - punpcklwd mm2, mm3 - pswapd mm3, mm0 - punpckldq mm0, mm2 - punpckhdq mm2, mm1 - punpckldq mm1, mm3 - movq [dstq ], mm0 - movq [dstq+16], mm2 - movq [dstq+ 8], mm1 - add srcq, 8 - add dstq, 24 - sub lend, 2 - jg .loop - emms - RET -%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 - -%define pswapd PSWAPD_SSE -FLOAT_TO_INT16_INTERLEAVE6 sse -%define cvtps2pi pf2id -%define pswapd PSWAPD_3DN1 -FLOAT_TO_INT16_INTERLEAVE6 3dnow -%undef pswapd -FLOAT_TO_INT16_INTERLEAVE6 3dn2 -%undef cvtps2pi -
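The deleted yasm file provides float_to_int16_interleave6 for SSE and two 3DNow! flavours, reading six planar float channels and writing them interleaved as int16. A scalar reference for the same operation, assuming the input samples are already scaled to the int16 range (the SIMD paths convert and pack with cvtps2pi/pf2id and packssdw, which saturate); a sketch, not code from this tree:

#include <stdint.h>
#include <math.h>

static int16_t clip_int16_c(long v)
{
    return v < -32768 ? -32768 : v > 32767 ? 32767 : (int16_t)v;
}

/* Six planar float channels in, one interleaved int16 stream out. */
static void float_to_int16_interleave6_c(int16_t *dst, const float **src, int len)
{
    for (int i = 0; i < len; i++)
        for (int ch = 0; ch < 6; ch++)
            dst[6 * i + ch] = clip_int16_c(lrintf(src[ch][i]));
}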
--- a/i386/dsputilenc_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1441 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard. - * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev <nickols_k@mail.ru> - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" -#include "dsputil_mmx.h" - - -static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) -{ - __asm__ volatile( - "mov $-128, %%"REG_a" \n\t" - "pxor %%mm7, %%mm7 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0), %%mm0 \n\t" - "movq (%0, %2), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "movq %%mm0, (%1, %%"REG_a") \n\t" - "movq %%mm1, 8(%1, %%"REG_a") \n\t" - "movq %%mm2, 16(%1, %%"REG_a") \n\t" - "movq %%mm3, 24(%1, %%"REG_a") \n\t" - "add %3, %0 \n\t" - "add $32, %%"REG_a" \n\t" - "js 1b \n\t" - : "+r" (pixels) - : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2) - : "%"REG_a - ); -} - -static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size) -{ - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "movq (%0), %%xmm0 \n\t" - "movq (%0, %2), %%xmm1 \n\t" - "movq (%0, %2,2), %%xmm2 \n\t" - "movq (%0, %3), %%xmm3 \n\t" - "lea (%0,%2,4), %0 \n\t" - "punpcklbw %%xmm7, %%xmm0 \n\t" - "punpcklbw %%xmm7, %%xmm1 \n\t" - "punpcklbw %%xmm7, %%xmm2 \n\t" - "punpcklbw %%xmm7, %%xmm3 \n\t" - "movdqa %%xmm0, (%1) \n\t" - "movdqa %%xmm1, 16(%1) \n\t" - "movdqa %%xmm2, 32(%1) \n\t" - "movdqa %%xmm3, 48(%1) \n\t" - "movq (%0), %%xmm0 \n\t" - "movq (%0, %2), %%xmm1 \n\t" - "movq (%0, %2,2), %%xmm2 \n\t" - "movq (%0, %3), %%xmm3 \n\t" - "punpcklbw %%xmm7, %%xmm0 \n\t" - "punpcklbw %%xmm7, %%xmm1 \n\t" - "punpcklbw %%xmm7, %%xmm2 \n\t" - "punpcklbw %%xmm7, %%xmm3 \n\t" - "movdqa %%xmm0, 64(%1) \n\t" - "movdqa %%xmm1, 80(%1) \n\t" - "movdqa %%xmm2, 96(%1) \n\t" - "movdqa %%xmm3, 112(%1) \n\t" - : "+r" (pixels) - : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3) - ); -} - -static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) -{ - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "mov $-128, %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0), %%mm0 \n\t" - "movq (%1), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" 
- "movq %%mm1, 8(%2, %%"REG_a") \n\t" - "add %3, %0 \n\t" - "add %3, %1 \n\t" - "add $16, %%"REG_a" \n\t" - "jnz 1b \n\t" - : "+r" (s1), "+r" (s2) - : "r" (block+64), "r" ((x86_reg)stride) - : "%"REG_a - ); -} - -static int pix_sum16_mmx(uint8_t * pix, int line_size){ - const int h=16; - int sum; - x86_reg index= -line_size*h; - - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "pxor %%mm6, %%mm6 \n\t" - "1: \n\t" - "movq (%2, %1), %%mm0 \n\t" - "movq (%2, %1), %%mm1 \n\t" - "movq 8(%2, %1), %%mm2 \n\t" - "movq 8(%2, %1), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "paddw %%mm1, %%mm3 \n\t" - "paddw %%mm3, %%mm6 \n\t" - "add %3, %1 \n\t" - " js 1b \n\t" - "movq %%mm6, %%mm5 \n\t" - "psrlq $32, %%mm6 \n\t" - "paddw %%mm5, %%mm6 \n\t" - "movq %%mm6, %%mm5 \n\t" - "psrlq $16, %%mm6 \n\t" - "paddw %%mm5, %%mm6 \n\t" - "movd %%mm6, %0 \n\t" - "andl $0xFFFF, %0 \n\t" - : "=&r" (sum), "+r" (index) - : "r" (pix - index), "r" ((x86_reg)line_size) - ); - - return sum; -} - -static int pix_norm1_mmx(uint8_t *pix, int line_size) { - int tmp; - __asm__ volatile ( - "movl $16,%%ecx\n" - "pxor %%mm0,%%mm0\n" - "pxor %%mm7,%%mm7\n" - "1:\n" - "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ - "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ - - "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ - - "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ - "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ - - "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ - "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ - "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ - - "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ - "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ - - "pmaddwd %%mm3,%%mm3\n" - "pmaddwd %%mm4,%%mm4\n" - - "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, - pix2^2+pix3^2+pix6^2+pix7^2) */ - "paddd %%mm3,%%mm4\n" - "paddd %%mm2,%%mm7\n" - - "add %2, %0\n" - "paddd %%mm4,%%mm7\n" - "dec %%ecx\n" - "jnz 1b\n" - - "movq %%mm7,%%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7,%%mm1\n" - "movd %%mm1,%1\n" - : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); - return tmp; -} - -static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - int tmp; - __asm__ volatile ( - "movl %4,%%ecx\n" - "shr $1,%%ecx\n" - "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ - "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ - "1:\n" - "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ - "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ - "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ - "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1,%%mm5\n" - "movq %%mm3,%%mm6\n" - "psubusb %%mm2,%%mm1\n" - "psubusb %%mm4,%%mm3\n" - "psubusb %%mm5,%%mm2\n" - "psubusb %%mm6,%%mm4\n" - - "por %%mm1,%%mm2\n" - "por %%mm3,%%mm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2,%%mm1\n" - "movq %%mm4,%%mm3\n" - - "punpckhbw %%mm0,%%mm2\n" - "punpckhbw %%mm0,%%mm4\n" - "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ - "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ - - "pmaddwd %%mm2,%%mm2\n" - "pmaddwd %%mm4,%%mm4\n" - "pmaddwd %%mm1,%%mm1\n" - "pmaddwd %%mm3,%%mm3\n" - - "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ - "lea 
(%1,%3,2), %1\n" /* pix2 += 2*line_size */ - - "paddd %%mm2,%%mm1\n" - "paddd %%mm4,%%mm3\n" - "paddd %%mm1,%%mm7\n" - "paddd %%mm3,%%mm7\n" - - "decl %%ecx\n" - "jnz 1b\n" - - "movq %%mm7,%%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7,%%mm1\n" - "movd %%mm1,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp; -} - -static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - int tmp; - __asm__ volatile ( - "movl %4,%%ecx\n" - "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ - "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ - "1:\n" - "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ - "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ - "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ - "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1,%%mm5\n" - "movq %%mm3,%%mm6\n" - "psubusb %%mm2,%%mm1\n" - "psubusb %%mm4,%%mm3\n" - "psubusb %%mm5,%%mm2\n" - "psubusb %%mm6,%%mm4\n" - - "por %%mm1,%%mm2\n" - "por %%mm3,%%mm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2,%%mm1\n" - "movq %%mm4,%%mm3\n" - - "punpckhbw %%mm0,%%mm2\n" - "punpckhbw %%mm0,%%mm4\n" - "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ - "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ - - "pmaddwd %%mm2,%%mm2\n" - "pmaddwd %%mm4,%%mm4\n" - "pmaddwd %%mm1,%%mm1\n" - "pmaddwd %%mm3,%%mm3\n" - - "add %3,%0\n" - "add %3,%1\n" - - "paddd %%mm2,%%mm1\n" - "paddd %%mm4,%%mm3\n" - "paddd %%mm1,%%mm7\n" - "paddd %%mm3,%%mm7\n" - - "decl %%ecx\n" - "jnz 1b\n" - - "movq %%mm7,%%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7,%%mm1\n" - "movd %%mm1,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp; -} - -static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - int tmp; - __asm__ volatile ( - "shr $1,%2\n" - "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ - "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ - "1:\n" - "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */ - "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */ - "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */ - "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movdqa %%xmm1,%%xmm5\n" - "movdqa %%xmm3,%%xmm6\n" - "psubusb %%xmm2,%%xmm1\n" - "psubusb %%xmm4,%%xmm3\n" - "psubusb %%xmm5,%%xmm2\n" - "psubusb %%xmm6,%%xmm4\n" - - "por %%xmm1,%%xmm2\n" - "por %%xmm3,%%xmm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movdqa %%xmm2,%%xmm1\n" - "movdqa %%xmm4,%%xmm3\n" - - "punpckhbw %%xmm0,%%xmm2\n" - "punpckhbw %%xmm0,%%xmm4\n" - "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ - "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ - - "pmaddwd %%xmm2,%%xmm2\n" - "pmaddwd %%xmm4,%%xmm4\n" - "pmaddwd %%xmm1,%%xmm1\n" - "pmaddwd %%xmm3,%%xmm3\n" - - "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ - "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ - - "paddd %%xmm2,%%xmm1\n" - "paddd %%xmm4,%%xmm3\n" - "paddd %%xmm1,%%xmm7\n" - "paddd %%xmm3,%%xmm7\n" - - "decl %2\n" - "jnz 1b\n" - - "movdqa %%xmm7,%%xmm1\n" - "psrldq $8, %%xmm7\n" /* shift hi qword to lo */ - "paddd %%xmm1,%%xmm7\n" - "movdqa 
%%xmm7,%%xmm1\n" - "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ - "paddd %%xmm1,%%xmm7\n" - "movd %%xmm7,%3\n" - : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) - : "r" ((x86_reg)line_size)); - return tmp; -} - -static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { - int tmp; - __asm__ volatile ( - "movl %3,%%ecx\n" - "pxor %%mm7,%%mm7\n" - "pxor %%mm6,%%mm6\n" - - "movq (%0),%%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm0\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm2\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2,%0\n" - - "movq (%0),%%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm4\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm5\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2,%0\n" - "1:\n" - - "movq (%0),%%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm0\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm2\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2,%0\n" - - "movq (%0),%%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm4\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm5\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2,%0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7,%%mm0\n" - "punpckhwd %%mm7,%%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6,%%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6,%%mm0\n" - "movd %%mm0,%1\n" - : "+r" (pix1), "=r"(tmp) - : "r" ((x86_reg)line_size) , "g" (h-2) - : "%ecx"); - return tmp; -} - -static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { - int tmp; - uint8_t * pix= pix1; - __asm__ volatile ( - "movl %3,%%ecx\n" - "pxor %%mm7,%%mm7\n" - "pxor %%mm6,%%mm6\n" - - "movq (%0),%%mm0\n" - "movq 1(%0),%%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm0\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm2\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2,%0\n" - - "movq (%0),%%mm4\n" - "movq 1(%0),%%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, 
%%mm3\n" - "punpcklbw %%mm7,%%mm4\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm5\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2,%0\n" - "1:\n" - - "movq (%0),%%mm0\n" - "movq 1(%0),%%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm0\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm2\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2,%0\n" - - "movq (%0),%%mm4\n" - "movq 1(%0),%%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7,%%mm4\n" - "punpcklbw %%mm7,%%mm1\n" - "punpckhbw %%mm7,%%mm5\n" - "punpckhbw %%mm7,%%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2,%0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7,%%mm0\n" - "punpckhwd %%mm7,%%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6,%%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6,%%mm0\n" - "movd %%mm0,%1\n" - : "+r" (pix1), "=r"(tmp) - : "r" ((x86_reg)line_size) , "g" (h-2) - : "%ecx"); - return tmp + hf_noise8_mmx(pix+8, line_size, h); -} - -static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - MpegEncContext *c = p; - int score1, score2; - - if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); - else score1 = sse16_mmx(c, pix1, pix2, line_size, h); - score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); - - if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; - else return score1 + FFABS(score2)*8; -} - -static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - MpegEncContext *c = p; - int score1= sse8_mmx(c, pix1, pix2, line_size, h); - int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); - - if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; - else return score1 + FFABS(score2)*8; -} - -static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { - int tmp; - - assert( (((int)pix) & 7) == 0); - assert((line_size &7) ==0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), %%mm2\n"\ - "movq 8(%0), %%mm3\n"\ - "add %2,%0\n"\ - "movq %%mm2, " #out0 "\n"\ - "movq %%mm3, " #out1 "\n"\ - "psubusb " #in0 ", %%mm2\n"\ - "psubusb " #in1 ", %%mm3\n"\ - "psubusb " #out0 ", " #in0 "\n"\ - "psubusb " #out1 ", " #in1 "\n"\ - "por %%mm2, " #in0 "\n"\ - "por %%mm3, " #in1 "\n"\ - "movq " #in0 ", %%mm2\n"\ - "movq " #in1 ", %%mm3\n"\ - "punpcklbw %%mm7, " #in0 "\n"\ - "punpcklbw %%mm7, " #in1 "\n"\ - "punpckhbw %%mm7, %%mm2\n"\ - "punpckhbw %%mm7, %%mm3\n"\ - "paddw " #in1 ", 
" #in0 "\n"\ - "paddw %%mm3, %%mm2\n"\ - "paddw %%mm2, " #in0 "\n"\ - "paddw " #in0 ", %%mm6\n" - - - __asm__ volatile ( - "movl %3,%%ecx\n" - "pxor %%mm6,%%mm6\n" - "pxor %%mm7,%%mm7\n" - "movq (%0),%%mm0\n" - "movq 8(%0),%%mm1\n" - "add %2,%0\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movq %%mm6,%%mm0\n" - "psrlq $32, %%mm6\n" - "paddw %%mm6,%%mm0\n" - "movq %%mm0,%%mm6\n" - "psrlq $16, %%mm0\n" - "paddw %%mm6,%%mm0\n" - "movd %%mm0,%1\n" - : "+r" (pix), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp & 0xFFFF; -} -#undef SUM - -static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { - int tmp; - - assert( (((int)pix) & 7) == 0); - assert((line_size &7) ==0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), " #out0 "\n"\ - "movq 8(%0), " #out1 "\n"\ - "add %2,%0\n"\ - "psadbw " #out0 ", " #in0 "\n"\ - "psadbw " #out1 ", " #in1 "\n"\ - "paddw " #in1 ", " #in0 "\n"\ - "paddw " #in0 ", %%mm6\n" - - __asm__ volatile ( - "movl %3,%%ecx\n" - "pxor %%mm6,%%mm6\n" - "pxor %%mm7,%%mm7\n" - "movq (%0),%%mm0\n" - "movq 8(%0),%%mm1\n" - "add %2,%0\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movd %%mm6,%1\n" - : "+r" (pix), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp; -} -#undef SUM - -static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - int tmp; - - assert( (((int)pix1) & 7) == 0); - assert( (((int)pix2) & 7) == 0); - assert((line_size &7) ==0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0),%%mm2\n"\ - "movq (%1)," #out0 "\n"\ - "movq 8(%0),%%mm3\n"\ - "movq 8(%1)," #out1 "\n"\ - "add %3,%0\n"\ - "add %3,%1\n"\ - "psubb " #out0 ", %%mm2\n"\ - "psubb " #out1 ", %%mm3\n"\ - "pxor %%mm7, %%mm2\n"\ - "pxor %%mm7, %%mm3\n"\ - "movq %%mm2, " #out0 "\n"\ - "movq %%mm3, " #out1 "\n"\ - "psubusb " #in0 ", %%mm2\n"\ - "psubusb " #in1 ", %%mm3\n"\ - "psubusb " #out0 ", " #in0 "\n"\ - "psubusb " #out1 ", " #in1 "\n"\ - "por %%mm2, " #in0 "\n"\ - "por %%mm3, " #in1 "\n"\ - "movq " #in0 ", %%mm2\n"\ - "movq " #in1 ", %%mm3\n"\ - "punpcklbw %%mm7, " #in0 "\n"\ - "punpcklbw %%mm7, " #in1 "\n"\ - "punpckhbw %%mm7, %%mm2\n"\ - "punpckhbw %%mm7, %%mm3\n"\ - "paddw " #in1 ", " #in0 "\n"\ - "paddw %%mm3, %%mm2\n"\ - "paddw %%mm2, " #in0 "\n"\ - "paddw " #in0 ", %%mm6\n" - - - __asm__ volatile ( - "movl %4,%%ecx\n" - "pxor %%mm6,%%mm6\n" - "pcmpeqw %%mm7,%%mm7\n" - "psllw $15, %%mm7\n" - "packsswb %%mm7, %%mm7\n" - "movq (%0),%%mm0\n" - "movq (%1),%%mm2\n" - "movq 8(%0),%%mm1\n" - "movq 8(%1),%%mm3\n" - "add %3,%0\n" - "add %3,%1\n" - "psubb %%mm2, %%mm0\n" - "psubb %%mm3, %%mm1\n" - "pxor %%mm7, %%mm0\n" - "pxor %%mm7, %%mm1\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movq %%mm6,%%mm0\n" - "psrlq $32, %%mm6\n" - "paddw %%mm6,%%mm0\n" - "movq %%mm0,%%mm6\n" - "psrlq $16, %%mm0\n" - "paddw %%mm6,%%mm0\n" - "movd %%mm0,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp & 0x7FFF; -} -#undef SUM - -static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { - int tmp; - - assert( (((int)pix1) & 7) == 0); - assert( (((int)pix2) & 7) == 0); - assert((line_size &7) ==0); - -#define SUM(in0, in1, out0, 
out1) \ - "movq (%0)," #out0 "\n"\ - "movq (%1),%%mm2\n"\ - "movq 8(%0)," #out1 "\n"\ - "movq 8(%1),%%mm3\n"\ - "add %3,%0\n"\ - "add %3,%1\n"\ - "psubb %%mm2, " #out0 "\n"\ - "psubb %%mm3, " #out1 "\n"\ - "pxor %%mm7, " #out0 "\n"\ - "pxor %%mm7, " #out1 "\n"\ - "psadbw " #out0 ", " #in0 "\n"\ - "psadbw " #out1 ", " #in1 "\n"\ - "paddw " #in1 ", " #in0 "\n"\ - "paddw " #in0 ", %%mm6\n" - - __asm__ volatile ( - "movl %4,%%ecx\n" - "pxor %%mm6,%%mm6\n" - "pcmpeqw %%mm7,%%mm7\n" - "psllw $15, %%mm7\n" - "packsswb %%mm7, %%mm7\n" - "movq (%0),%%mm0\n" - "movq (%1),%%mm2\n" - "movq 8(%0),%%mm1\n" - "movq 8(%1),%%mm3\n" - "add %3,%0\n" - "add %3,%1\n" - "psubb %%mm2, %%mm0\n" - "psubb %%mm3, %%mm1\n" - "pxor %%mm7, %%mm0\n" - "pxor %%mm7, %%mm1\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movd %%mm6,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) - : "r" ((x86_reg)line_size) , "m" (h) - : "%ecx"); - return tmp; -} -#undef SUM - -static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ - x86_reg i=0; - __asm__ volatile( - "1: \n\t" - "movq (%2, %0), %%mm0 \n\t" - "movq (%1, %0), %%mm1 \n\t" - "psubb %%mm0, %%mm1 \n\t" - "movq %%mm1, (%3, %0) \n\t" - "movq 8(%2, %0), %%mm0 \n\t" - "movq 8(%1, %0), %%mm1 \n\t" - "psubb %%mm0, %%mm1 \n\t" - "movq %%mm1, 8(%3, %0) \n\t" - "add $16, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (i) - : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15) - ); - for(; i<w; i++) - dst[i+0] = src1[i+0]-src2[i+0]; -} - -static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ - x86_reg i=0; - uint8_t l, lt; - - __asm__ volatile( - "1: \n\t" - "movq -1(%1, %0), %%mm0 \n\t" // LT - "movq (%1, %0), %%mm1 \n\t" // T - "movq -1(%2, %0), %%mm2 \n\t" // L - "movq (%2, %0), %%mm3 \n\t" // X - "movq %%mm2, %%mm4 \n\t" // L - "psubb %%mm0, %%mm2 \n\t" - "paddb %%mm1, %%mm2 \n\t" // L + T - LT - "movq %%mm4, %%mm5 \n\t" // L - "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) - "pminub %%mm5, %%mm1 \n\t" // min(T, L) - "pminub %%mm2, %%mm4 \n\t" - "pmaxub %%mm1, %%mm4 \n\t" - "psubb %%mm4, %%mm3 \n\t" // dst - pred - "movq %%mm3, (%3, %0) \n\t" - "add $8, %0 \n\t" - "cmp %4, %0 \n\t" - " jb 1b \n\t" - : "+r" (i) - : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) - ); - - l= *left; - lt= *left_top; - - dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); - - *left_top= src1[w-1]; - *left = src2[w-1]; -} - -#define DIFF_PIXELS_1(m,a,t,p1,p2)\ - "mov"#m" "#p1", "#a" \n\t"\ - "mov"#m" "#p2", "#t" \n\t"\ - "punpcklbw "#a", "#t" \n\t"\ - "punpcklbw "#a", "#a" \n\t"\ - "psubw "#t", "#a" \n\t"\ - -#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\ - uint8_t *p1b=p1, *p2b=p2;\ - __asm__ volatile(\ - DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\ - DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\ - DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\ - "add %4, %1 \n\t"\ - "add %4, %2 \n\t"\ - DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\ - DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\ - DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\ - DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\ - "mov"#m1" "#mm"0, %0 \n\t"\ - DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\ - "mov"#m1" %0, "#mm"0 \n\t"\ - : "+m"(temp), "+r"(p1b), "+r"(p2b)\ - : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\ - );\ -} - //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp) - -#define 
DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp) -#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp) - -#define LBUTTERFLY2(a1,b1,a2,b2)\ - "paddw " #b1 ", " #a1 " \n\t"\ - "paddw " #b2 ", " #a2 " \n\t"\ - "paddw " #b1 ", " #b1 " \n\t"\ - "paddw " #b2 ", " #b2 " \n\t"\ - "psubw " #a1 ", " #b1 " \n\t"\ - "psubw " #a2 ", " #b2 " \n\t" - -#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\ - LBUTTERFLY2(m0, m1, m2, m3)\ - LBUTTERFLY2(m4, m5, m6, m7)\ - LBUTTERFLY2(m0, m2, m1, m3)\ - LBUTTERFLY2(m4, m6, m5, m7)\ - LBUTTERFLY2(m0, m4, m1, m5)\ - LBUTTERFLY2(m2, m6, m3, m7)\ - -#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7) - -#define MMABS_MMX(a,z)\ - "pxor " #z ", " #z " \n\t"\ - "pcmpgtw " #a ", " #z " \n\t"\ - "pxor " #z ", " #a " \n\t"\ - "psubw " #z ", " #a " \n\t" - -#define MMABS_MMX2(a,z)\ - "pxor " #z ", " #z " \n\t"\ - "psubw " #a ", " #z " \n\t"\ - "pmaxsw " #z ", " #a " \n\t" - -#define MMABS_SSSE3(a,z)\ - "pabsw " #a ", " #a " \n\t" - -#define MMABS_SUM(a,z, sum)\ - MMABS(a,z)\ - "paddusw " #a ", " #sum " \n\t" - -#define MMABS_SUM_8x8_NOSPILL\ - MMABS(%%xmm0, %%xmm8)\ - MMABS(%%xmm1, %%xmm9)\ - MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\ - MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\ - MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\ - MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\ - MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\ - MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\ - "paddusw %%xmm1, %%xmm0 \n\t" - -#ifdef ARCH_X86_64 -#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL -#else -#define MMABS_SUM_8x8_SSE2\ - "movdqa %%xmm7, (%1) \n\t"\ - MMABS(%%xmm0, %%xmm7)\ - MMABS(%%xmm1, %%xmm7)\ - MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\ - MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\ - MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\ - MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\ - MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\ - "movdqa (%1), %%xmm2 \n\t"\ - MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ - "paddusw %%xmm1, %%xmm0 \n\t" -#endif - -/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to - * about 100k on extreme inputs. But that's very unlikely to occur in natural video, - * and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
*/ -#define HSUM_MMX(a, t, dst)\ - "movq "#a", "#t" \n\t"\ - "psrlq $32, "#a" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "movq "#a", "#t" \n\t"\ - "psrlq $16, "#a" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "movd "#a", "#dst" \n\t"\ - -#define HSUM_MMX2(a, t, dst)\ - "pshufw $0x0E, "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "pshufw $0x01, "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "movd "#a", "#dst" \n\t"\ - -#define HSUM_SSE2(a, t, dst)\ - "movhlps "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "pshuflw $0x0E, "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "pshuflw $0x01, "#a", "#t" \n\t"\ - "paddusw "#t", "#a" \n\t"\ - "movd "#a", "#dst" \n\t"\ - -#define HADAMARD8_DIFF_MMX(cpu) \ -static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ - DECLARE_ALIGNED_8(uint64_t, temp[13]);\ - int sum;\ -\ - assert(h==8);\ -\ - DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ -\ - __asm__ volatile(\ - HADAMARD48\ -\ - "movq %%mm7, 96(%1) \n\t"\ -\ - TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ - STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ -\ - "movq 96(%1), %%mm7 \n\t"\ - TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ - STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\ -\ - : "=r" (sum)\ - : "r"(temp)\ - );\ -\ - DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\ -\ - __asm__ volatile(\ - HADAMARD48\ -\ - "movq %%mm7, 96(%1) \n\t"\ -\ - TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ - STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ -\ - "movq 96(%1), %%mm7 \n\t"\ - TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ - "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ - "movq %%mm6, %%mm7 \n\t"\ - "movq %%mm0, %%mm6 \n\t"\ -\ - LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ -\ - HADAMARD48\ - "movq %%mm7, 64(%1) \n\t"\ - MMABS(%%mm0, %%mm7)\ - MMABS(%%mm1, %%mm7)\ - MMABS_SUM(%%mm2, %%mm7, %%mm0)\ - MMABS_SUM(%%mm3, %%mm7, %%mm1)\ - MMABS_SUM(%%mm4, %%mm7, %%mm0)\ - MMABS_SUM(%%mm5, %%mm7, %%mm1)\ - MMABS_SUM(%%mm6, %%mm7, %%mm0)\ - "movq 64(%1), %%mm2 \n\t"\ - MMABS_SUM(%%mm2, %%mm7, %%mm1)\ - "paddusw %%mm1, %%mm0 \n\t"\ - "movq %%mm0, 64(%1) \n\t"\ -\ - LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ - LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\ -\ - HADAMARD48\ - "movq %%mm7, (%1) \n\t"\ - MMABS(%%mm0, %%mm7)\ - MMABS(%%mm1, %%mm7)\ - MMABS_SUM(%%mm2, %%mm7, %%mm0)\ - MMABS_SUM(%%mm3, %%mm7, %%mm1)\ - MMABS_SUM(%%mm4, %%mm7, %%mm0)\ - MMABS_SUM(%%mm5, %%mm7, %%mm1)\ - MMABS_SUM(%%mm6, %%mm7, %%mm0)\ - "movq (%1), %%mm2 \n\t"\ - MMABS_SUM(%%mm2, %%mm7, %%mm1)\ - "paddusw 64(%1), %%mm0 \n\t"\ - "paddusw %%mm1, %%mm0 \n\t"\ -\ - HSUM(%%mm0, %%mm1, %0)\ -\ - : "=r" (sum)\ - : "r"(temp)\ - );\ - return sum&0xFFFF;\ -}\ -WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) - -#define HADAMARD8_DIFF_SSE2(cpu) \ -static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ - DECLARE_ALIGNED_16(uint64_t, temp[4]);\ - int sum;\ -\ - assert(h==8);\ -\ - DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\ -\ - __asm__ volatile(\ - HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\ - TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\ - HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\ - MMABS_SUM_8x8\ - HSUM_SSE2(%%xmm0, %%xmm1, %0)\ - : "=r" (sum)\ - : "r"(temp)\ - );\ - return sum&0xFFFF;\ -}\ -WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) - -#define MMABS(a,z) MMABS_MMX(a,z) -#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) -HADAMARD8_DIFF_MMX(mmx) -#undef MMABS 
-#undef HSUM - -#define MMABS(a,z) MMABS_MMX2(a,z) -#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 -#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) -HADAMARD8_DIFF_MMX(mmx2) -HADAMARD8_DIFF_SSE2(sse2) -#undef MMABS -#undef MMABS_SUM_8x8 -#undef HSUM - -#ifdef HAVE_SSSE3 -#define MMABS(a,z) MMABS_SSSE3(a,z) -#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL -HADAMARD8_DIFF_SSE2(ssse3) -#undef MMABS -#undef MMABS_SUM_8x8 -#endif - -#define DCT_SAD4(m,mm,o)\ - "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ - "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ - "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ - "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ - MMABS_SUM(mm##2, mm##6, mm##0)\ - MMABS_SUM(mm##3, mm##7, mm##1)\ - MMABS_SUM(mm##4, mm##6, mm##0)\ - MMABS_SUM(mm##5, mm##7, mm##1)\ - -#define DCT_SAD_MMX\ - "pxor %%mm0, %%mm0 \n\t"\ - "pxor %%mm1, %%mm1 \n\t"\ - DCT_SAD4(q, %%mm, 0)\ - DCT_SAD4(q, %%mm, 8)\ - DCT_SAD4(q, %%mm, 64)\ - DCT_SAD4(q, %%mm, 72)\ - "paddusw %%mm1, %%mm0 \n\t"\ - HSUM(%%mm0, %%mm1, %0) - -#define DCT_SAD_SSE2\ - "pxor %%xmm0, %%xmm0 \n\t"\ - "pxor %%xmm1, %%xmm1 \n\t"\ - DCT_SAD4(dqa, %%xmm, 0)\ - DCT_SAD4(dqa, %%xmm, 64)\ - "paddusw %%xmm1, %%xmm0 \n\t"\ - HSUM(%%xmm0, %%xmm1, %0) - -#define DCT_SAD_FUNC(cpu) \ -static int sum_abs_dctelem_##cpu(DCTELEM *block){\ - int sum;\ - __asm__ volatile(\ - DCT_SAD\ - :"=r"(sum)\ - :"r"(block)\ - );\ - return sum&0xFFFF;\ -} - -#define DCT_SAD DCT_SAD_MMX -#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) -#define MMABS(a,z) MMABS_MMX(a,z) -DCT_SAD_FUNC(mmx) -#undef MMABS -#undef HSUM - -#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) -#define MMABS(a,z) MMABS_MMX2(a,z) -DCT_SAD_FUNC(mmx2) -#undef HSUM -#undef DCT_SAD - -#define DCT_SAD DCT_SAD_SSE2 -#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) -DCT_SAD_FUNC(sse2) -#undef MMABS - -#ifdef HAVE_SSSE3 -#define MMABS(a,z) MMABS_SSSE3(a,z) -DCT_SAD_FUNC(ssse3) -#undef MMABS -#endif -#undef HSUM -#undef DCT_SAD - -static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ - int sum; - x86_reg i=size; - __asm__ volatile( - "pxor %%mm4, %%mm4 \n" - "1: \n" - "sub $8, %0 \n" - "movq (%2,%0), %%mm2 \n" - "movq (%3,%0,2), %%mm0 \n" - "movq 8(%3,%0,2), %%mm1 \n" - "punpckhbw %%mm2, %%mm3 \n" - "punpcklbw %%mm2, %%mm2 \n" - "psraw $8, %%mm3 \n" - "psraw $8, %%mm2 \n" - "psubw %%mm3, %%mm1 \n" - "psubw %%mm2, %%mm0 \n" - "pmaddwd %%mm1, %%mm1 \n" - "pmaddwd %%mm0, %%mm0 \n" - "paddd %%mm1, %%mm4 \n" - "paddd %%mm0, %%mm4 \n" - "jg 1b \n" - "movq %%mm4, %%mm3 \n" - "psrlq $32, %%mm3 \n" - "paddd %%mm3, %%mm4 \n" - "movd %%mm4, %1 \n" - :"+r"(i), "=r"(sum) - :"r"(pix1), "r"(pix2) - ); - return sum; -} - -#define PHADDD(a, t)\ - "movq "#a", "#t" \n\t"\ - "psrlq $32, "#a" \n\t"\ - "paddd "#t", "#a" \n\t" -/* - pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31] - pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31] - pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30] - */ -#define PMULHRW(x, y, s, o)\ - "pmulhw " #s ", "#x " \n\t"\ - "pmulhw " #s ", "#y " \n\t"\ - "paddw " #o ", "#x " \n\t"\ - "paddw " #o ", "#y " \n\t"\ - "psraw $1, "#x " \n\t"\ - "psraw $1, "#y " \n\t" -#define DEF(x) x ## _mmx -#define SET_RND MOVQ_WONE -#define SCALE_OFFSET 1 - -#include "dsputil_mmx_qns_template.c" - -#undef DEF -#undef SET_RND -#undef SCALE_OFFSET -#undef PMULHRW - -#define DEF(x) x ## _3dnow -#define SET_RND(x) -#define SCALE_OFFSET 0 -#define PMULHRW(x, y, s, o)\ - "pmulhrw " #s ", "#x " \n\t"\ - "pmulhrw " #s ", "#y " \n\t" - -#include "dsputil_mmx_qns_template.c" - -#undef DEF -#undef SET_RND -#undef SCALE_OFFSET -#undef PMULHRW - -#ifdef HAVE_SSSE3 
-#undef PHADDD -#define DEF(x) x ## _ssse3 -#define SET_RND(x) -#define SCALE_OFFSET -1 -#define PHADDD(a, t)\ - "pshufw $0x0E, "#a", "#t" \n\t"\ - "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ -#define PMULHRW(x, y, s, o)\ - "pmulhrsw " #s ", "#x " \n\t"\ - "pmulhrsw " #s ", "#y " \n\t" - -#include "dsputil_mmx_qns_template.c" - -#undef DEF -#undef SET_RND -#undef SCALE_OFFSET -#undef PMULHRW -#undef PHADDD -#endif //HAVE_SSSE3 - - -/* FLAC specific */ -void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag, - double *autoc); - - -void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) -{ - if (mm_flags & FF_MM_MMX) { - const int dct_algo = avctx->dct_algo; - if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ - if(mm_flags & FF_MM_SSE2){ - c->fdct = ff_fdct_sse2; - }else if(mm_flags & FF_MM_MMXEXT){ - c->fdct = ff_fdct_mmx2; - }else{ - c->fdct = ff_fdct_mmx; - } - } - - c->get_pixels = get_pixels_mmx; - c->diff_pixels = diff_pixels_mmx; - c->pix_sum = pix_sum16_mmx; - - c->diff_bytes= diff_bytes_mmx; - c->sum_abs_dctelem= sum_abs_dctelem_mmx; - - c->hadamard8_diff[0]= hadamard8_diff16_mmx; - c->hadamard8_diff[1]= hadamard8_diff_mmx; - - c->pix_norm1 = pix_norm1_mmx; - c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx; - c->sse[1] = sse8_mmx; - c->vsad[4]= vsad_intra16_mmx; - - c->nsse[0] = nsse16_mmx; - c->nsse[1] = nsse8_mmx; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->vsad[0] = vsad16_mmx; - } - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->try_8x8basis= try_8x8basis_mmx; - } - c->add_8x8basis= add_8x8basis_mmx; - - c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; - - - if (mm_flags & FF_MM_MMXEXT) { - c->sum_abs_dctelem= sum_abs_dctelem_mmx2; - c->hadamard8_diff[0]= hadamard8_diff16_mmx2; - c->hadamard8_diff[1]= hadamard8_diff_mmx2; - c->vsad[4]= vsad_intra16_mmx2; - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->vsad[0] = vsad16_mmx2; - } - - c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; - } - - if(mm_flags & FF_MM_SSE2){ - c->get_pixels = get_pixels_sse2; - c->sum_abs_dctelem= sum_abs_dctelem_sse2; - c->hadamard8_diff[0]= hadamard8_diff16_sse2; - c->hadamard8_diff[1]= hadamard8_diff_sse2; - if (ENABLE_FLAC_ENCODER) - c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2; - } - -#ifdef HAVE_SSSE3 - if(mm_flags & FF_MM_SSSE3){ - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->try_8x8basis= try_8x8basis_ssse3; - } - c->add_8x8basis= add_8x8basis_ssse3; - c->sum_abs_dctelem= sum_abs_dctelem_ssse3; - c->hadamard8_diff[0]= hadamard8_diff16_ssse3; - c->hadamard8_diff[1]= hadamard8_diff_ssse3; - } -#endif - - if(mm_flags & FF_MM_3DNOW){ - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->try_8x8basis= try_8x8basis_3dnow; - } - c->add_8x8basis= add_8x8basis_3dnow; - } - } - - dsputil_init_pix_mmx(c, avctx); -}
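Among the kernels removed here, the sse8/sse16 family computes a plain sum of squared byte differences; because MMX has no packed byte absolute-difference instruction, the code derives |a-b| from two saturated subtractions ORed together, as its own comments describe, then squares and accumulates with pmaddwd. A scalar model of what those functions return, illustrative only:

#include <stdint.h>

/* Sum of squared differences over a 16-pixel-wide block of height h. */
static int sse16_c(const uint8_t *pix1, const uint8_t *pix2,
                   int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}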
--- a/i386/fdct_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,580 +0,0 @@ -/* - * MMX optimized forward DCT - * The gcc porting is Copyright (c) 2001 Fabrice Bellard. - * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> - * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. - * - * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT - * - * Intel Application Note AP-922 - fast, precise implementation of DCT - * http://developer.intel.com/vtune/cbts/appnotes.htm - * - * Also of inspiration: - * a page about fdct at http://www.geocities.com/ssavekar/dct.htm - * Skal's fdct at http://skal.planet-d.net/coding/dct.html - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/common.h" -#include "libavcodec/dsputil.h" - -#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) - -////////////////////////////////////////////////////////////////////// -// -// constants for the forward DCT -// ----------------------------- -// -// Be sure to check that your compiler is aligning all constants to QWORD -// (8-byte) memory boundaries! Otherwise the unaligned memory access will -// severely stall MMX execution. 
-// -////////////////////////////////////////////////////////////////////// - -#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy -#define SHIFT_FRW_COL BITS_FRW_ACC -#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) -#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) -//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) - -#define X8(x) x,x,x,x,x,x,x,x - -//concatenated table, for forward DCT transformation -static const int16_t fdct_tg_all_16[24] ATTR_ALIGN(16) = { - X8(13036), // tg * (2<<16) + 0.5 - X8(27146), // tg * (2<<16) + 0.5 - X8(-21746) // tg * (2<<16) + 0.5 -}; - -static const int16_t ocos_4_16[8] ATTR_ALIGN(16) = { - X8(23170) //cos * (2<<15) + 0.5 -}; - -static const int16_t fdct_one_corr[8] ATTR_ALIGN(16) = { X8(1) }; - -static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; - -static struct -{ - const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16); -} fdct_r_row_sse2 ATTR_ALIGN(16)= -{{ - RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW -}}; -//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; - -static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table - 16384, 16384, 22725, 19266, - 16384, 16384, 12873, 4520, - 21407, 8867, 19266, -4520, - -8867, -21407, -22725, -12873, - 16384, -16384, 12873, -22725, - -16384, 16384, 4520, 19266, - 8867, -21407, 4520, -12873, - 21407, -8867, 19266, -22725, - - 22725, 22725, 31521, 26722, - 22725, 22725, 17855, 6270, - 29692, 12299, 26722, -6270, - -12299, -29692, -31521, -17855, - 22725, -22725, 17855, -31521, - -22725, 22725, 6270, 26722, - 12299, -29692, 6270, -17855, - 29692, -12299, 26722, -31521, - - 21407, 21407, 29692, 25172, - 21407, 21407, 16819, 5906, - 27969, 11585, 25172, -5906, - -11585, -27969, -29692, -16819, - 21407, -21407, 16819, -29692, - -21407, 21407, 5906, 25172, - 11585, -27969, 5906, -16819, - 27969, -11585, 25172, -29692, - - 19266, 19266, 26722, 22654, - 19266, 19266, 15137, 5315, - 25172, 10426, 22654, -5315, - -10426, -25172, -26722, -15137, - 19266, -19266, 15137, -26722, - -19266, 19266, 5315, 22654, - 10426, -25172, 5315, -15137, - 25172, -10426, 22654, -26722, - - 16384, 16384, 22725, 19266, - 16384, 16384, 12873, 4520, - 21407, 8867, 19266, -4520, - -8867, -21407, -22725, -12873, - 16384, -16384, 12873, -22725, - -16384, 16384, 4520, 19266, - 8867, -21407, 4520, -12873, - 21407, -8867, 19266, -22725, - - 19266, 19266, 26722, 22654, - 19266, 19266, 15137, 5315, - 25172, 10426, 22654, -5315, - -10426, -25172, -26722, -15137, - 19266, -19266, 15137, -26722, - -19266, 19266, 5315, 22654, - 10426, -25172, 5315, -15137, - 25172, -10426, 22654, -26722, - - 21407, 21407, 29692, 25172, - 21407, 21407, 16819, 5906, - 27969, 11585, 25172, -5906, - -11585, -27969, -29692, -16819, - 21407, -21407, 16819, -29692, - -21407, 21407, 5906, 25172, - 11585, -27969, 5906, -16819, - 27969, -11585, 25172, -29692, - - 22725, 22725, 31521, 26722, - 22725, 22725, 17855, 6270, - 29692, 12299, 26722, -6270, - -12299, -29692, -31521, -17855, - 22725, -22725, 17855, -31521, - -22725, 22725, 6270, 26722, - 12299, -29692, 6270, -17855, - 29692, -12299, 26722, -31521, -}; - -static struct -{ - const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16); -} tab_frw_01234567_sse2 ATTR_ALIGN(16) = -{{ -//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table -#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ - C4, C4, C5, C7, C2, C6, C3, -C7, \ - -C4, C4, C7, C3, C6, -C2, C7, -C5, \ - C4, -C4, C5, -C1, C2, -C6, C3, 
-C1, -// c1..c7 * cos(pi/4) * 2^15 -#define C1 22725 -#define C2 21407 -#define C3 19266 -#define C4 16384 -#define C5 12873 -#define C6 8867 -#define C7 4520 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 31521 -#define C2 29692 -#define C3 26722 -#define C4 22725 -#define C5 17855 -#define C6 12299 -#define C7 6270 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 29692 -#define C2 27969 -#define C3 25172 -#define C4 21407 -#define C5 16819 -#define C6 11585 -#define C7 5906 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 26722 -#define C2 25172 -#define C3 22654 -#define C4 19266 -#define C5 15137 -#define C6 10426 -#define C7 5315 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 22725 -#define C2 21407 -#define C3 19266 -#define C4 16384 -#define C5 12873 -#define C6 8867 -#define C7 4520 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 26722 -#define C2 25172 -#define C3 22654 -#define C4 19266 -#define C5 15137 -#define C6 10426 -#define C7 5315 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 29692 -#define C2 27969 -#define C3 25172 -#define C4 21407 -#define C5 16819 -#define C6 11585 -#define C7 5906 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 31521 -#define C2 29692 -#define C3 26722 -#define C4 22725 -#define C5 17855 -#define C6 12299 -#define C7 6270 -TABLE_SSE2 -}}; - -#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long - -#define FDCT_COL(cpu, mm, mov)\ -static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ -{\ - __asm__ volatile (\ - #mov" 16(%0), %%"#mm"0 \n\t" \ - #mov" 96(%0), %%"#mm"1 \n\t" \ - #mov" %%"#mm"0, %%"#mm"2 \n\t" \ - #mov" 32(%0), %%"#mm"3 \n\t" \ - "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ - #mov" 80(%0), %%"#mm"4 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ - #mov" (%0), %%"#mm"5 \n\t" \ - "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ - "paddsw 112(%0), %%"#mm"5 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ - #mov" %%"#mm"0, %%"#mm"6 \n\t" \ - "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ - #mov" 16(%1), %%"#mm"1 \n\t" \ - "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ - #mov" 48(%0), %%"#mm"7 \n\t" \ - "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ - "paddsw 64(%0), %%"#mm"7 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ - "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ - #mov" %%"#mm"5, %%"#mm"4 \n\t" \ - "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ - "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ - "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ - "por (%2), %%"#mm"1 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ - "pmulhw 16(%1), %%"#mm"5 \n\t" \ - #mov" %%"#mm"4, %%"#mm"7 \n\t" \ - "psubsw 80(%0), %%"#mm"3 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ - #mov" %%"#mm"1, 32(%3) \n\t" \ - "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ - #mov" 48(%0), %%"#mm"1 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ - "psubsw 64(%0), %%"#mm"1 \n\t" \ - #mov" %%"#mm"2, %%"#mm"6 \n\t" \ - #mov" %%"#mm"4, 64(%3) \n\t" \ - "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ - "pmulhw (%4), %%"#mm"2 \n\t" \ - "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ - "pmulhw (%4), %%"#mm"6 \n\t" \ - "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ - "por (%2), %%"#mm"5 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ - 
"por (%2), %%"#mm"2 \n\t" \ - #mov" %%"#mm"1, %%"#mm"4 \n\t" \ - #mov" (%0), %%"#mm"3 \n\t" \ - "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ - "psubsw 112(%0), %%"#mm"3 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ - #mov" (%1), %%"#mm"0 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ - #mov" 32(%1), %%"#mm"6 \n\t" \ - "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ - #mov" %%"#mm"7, (%3) \n\t" \ - "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ - #mov" %%"#mm"5, 96(%3) \n\t" \ - #mov" %%"#mm"3, %%"#mm"7 \n\t" \ - #mov" 32(%1), %%"#mm"5 \n\t" \ - "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ - "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ - "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ - "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ - "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ - "pmulhw (%1), %%"#mm"3 \n\t" \ - "por (%2), %%"#mm"0 \n\t" \ - "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ - #mov" %%"#mm"0, 16(%3) \n\t" \ - "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ - #mov" %%"#mm"7, 48(%3) \n\t" \ - "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ - #mov" %%"#mm"5, 80(%3) \n\t" \ - #mov" %%"#mm"3, 112(%3) \n\t" \ - : \ - : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ - "r" (out + offset), "r" (ocos_4_16)); \ -} - -FDCT_COL(mmx, mm, movq) -FDCT_COL(sse2, xmm, movdqa) - -static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) -{ - __asm__ volatile( -#define FDCT_ROW_SSE2_H1(i,t) \ - "movq " #i "(%0), %%xmm2 \n\t" \ - "movq " #i "+8(%0), %%xmm0 \n\t" \ - "movdqa " #t "+32(%1), %%xmm3 \n\t" \ - "movdqa " #t "+48(%1), %%xmm7 \n\t" \ - "movdqa " #t "(%1), %%xmm4 \n\t" \ - "movdqa " #t "+16(%1), %%xmm5 \n\t" - -#define FDCT_ROW_SSE2_H2(i,t) \ - "movq " #i "(%0), %%xmm2 \n\t" \ - "movq " #i "+8(%0), %%xmm0 \n\t" \ - "movdqa " #t "+32(%1), %%xmm3 \n\t" \ - "movdqa " #t "+48(%1), %%xmm7 \n\t" - -#define FDCT_ROW_SSE2(i) \ - "movq %%xmm2, %%xmm1 \n\t" \ - "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "psubsw %%xmm0, %%xmm2 \n\t" \ - "punpckldq %%xmm2, %%xmm1 \n\t" \ - "pshufd $78, %%xmm1, %%xmm2 \n\t" \ - "pmaddwd %%xmm2, %%xmm3 \n\t" \ - "pmaddwd %%xmm1, %%xmm7 \n\t" \ - "pmaddwd %%xmm5, %%xmm2 \n\t" \ - "pmaddwd %%xmm4, %%xmm1 \n\t" \ - "paddd %%xmm7, %%xmm3 \n\t" \ - "paddd %%xmm2, %%xmm1 \n\t" \ - "paddd %%xmm6, %%xmm3 \n\t" \ - "paddd %%xmm6, %%xmm1 \n\t" \ - "psrad %3, %%xmm3 \n\t" \ - "psrad %3, %%xmm1 \n\t" \ - "packssdw %%xmm3, %%xmm1 \n\t" \ - "movdqa %%xmm1, " #i "(%4) \n\t" - - "movdqa (%2), %%xmm6 \n\t" - FDCT_ROW_SSE2_H1(0,0) - FDCT_ROW_SSE2(0) - FDCT_ROW_SSE2_H2(64,0) - FDCT_ROW_SSE2(64) - - FDCT_ROW_SSE2_H1(16,64) - FDCT_ROW_SSE2(16) - FDCT_ROW_SSE2_H2(112,64) - FDCT_ROW_SSE2(112) - - FDCT_ROW_SSE2_H1(32,128) - FDCT_ROW_SSE2(32) - FDCT_ROW_SSE2_H2(96,128) - FDCT_ROW_SSE2(96) - - FDCT_ROW_SSE2_H1(48,192) - FDCT_ROW_SSE2(48) - FDCT_ROW_SSE2_H2(80,192) - FDCT_ROW_SSE2(80) - : - : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) - ); -} - -static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) -{ - __asm__ volatile ( - "pshufw $0x1B, 8(%0), %%mm5 \n\t" - "movq (%0), %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "paddsw %%mm5, %%mm0 \n\t" - "psubsw %%mm5, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "punpckldq %%mm1, %%mm0 \n\t" - "punpckhdq %%mm1, %%mm2 \n\t" - "movq (%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm5 \n\t" - "movq 32(%1), %%mm6 \n\t" - "movq 40(%1), %%mm7 \n\t" - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd 
%%mm0, %%mm4 \n\t" - "pmaddwd %%mm2, %%mm5 \n\t" - "pmaddwd %%mm0, %%mm6 \n\t" - "pmaddwd %%mm2, %%mm7 \n\t" - "pmaddwd 48(%1), %%mm0 \n\t" - "pmaddwd 56(%1), %%mm2 \n\t" - "paddd %%mm1, %%mm3 \n\t" - "paddd %%mm4, %%mm5 \n\t" - "paddd %%mm6, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "movq (%2), %%mm0 \n\t" - "paddd %%mm0, %%mm3 \n\t" - "paddd %%mm0, %%mm5 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" - "packssdw %%mm5, %%mm3 \n\t" - "packssdw %%mm2, %%mm7 \n\t" - "movq %%mm3, (%3) \n\t" - "movq %%mm7, 8(%3) \n\t" - : - : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); -} - -static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) -{ - //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) - __asm__ volatile( - "movd 12(%0), %%mm1 \n\t" - "punpcklwd 8(%0), %%mm1 \n\t" - "movq %%mm1, %%mm2 \n\t" - "psrlq $0x20, %%mm1 \n\t" - "movq 0(%0), %%mm0 \n\t" - "punpcklwd %%mm2, %%mm1 \n\t" - "movq %%mm0, %%mm5 \n\t" - "paddsw %%mm1, %%mm0 \n\t" - "psubsw %%mm1, %%mm5 \n\t" - "movq %%mm0, %%mm2 \n\t" - "punpckldq %%mm5, %%mm0 \n\t" - "punpckhdq %%mm5, %%mm2 \n\t" - "movq 0(%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm5 \n\t" - "movq 32(%1), %%mm6 \n\t" - "movq 40(%1), %%mm7 \n\t" - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm0, %%mm4 \n\t" - "pmaddwd %%mm2, %%mm5 \n\t" - "pmaddwd %%mm0, %%mm6 \n\t" - "pmaddwd %%mm2, %%mm7 \n\t" - "pmaddwd 48(%1), %%mm0 \n\t" - "pmaddwd 56(%1), %%mm2 \n\t" - "paddd %%mm1, %%mm3 \n\t" - "paddd %%mm4, %%mm5 \n\t" - "paddd %%mm6, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "movq (%2), %%mm0 \n\t" - "paddd %%mm0, %%mm3 \n\t" - "paddd %%mm0, %%mm5 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" - "packssdw %%mm5, %%mm3 \n\t" - "packssdw %%mm2, %%mm7 \n\t" - "movq %%mm3, 0(%3) \n\t" - "movq %%mm7, 8(%3) \n\t" - : - : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); -} - -void ff_fdct_mmx(int16_t *block) -{ - int64_t align_tmp[16] ATTR_ALIGN(8); - int16_t * block1= (int16_t*)align_tmp; - const int16_t *table= tab_frw_01234567; - int i; - - fdct_col_mmx(block, block1, 0); - fdct_col_mmx(block, block1, 4); - - for(i=8;i>0;i--) { - fdct_row_mmx(block1, block, table); - block1 += 8; - table += 32; - block += 8; - } -} - -void ff_fdct_mmx2(int16_t *block) -{ - int64_t align_tmp[16] ATTR_ALIGN(8); - int16_t *block1= (int16_t*)align_tmp; - const int16_t *table= tab_frw_01234567; - int i; - - fdct_col_mmx(block, block1, 0); - fdct_col_mmx(block, block1, 4); - - for(i=8;i>0;i--) { - fdct_row_mmx2(block1, block, table); - block1 += 8; - table += 32; - block += 8; - } -} - -void ff_fdct_sse2(int16_t *block) -{ - int64_t align_tmp[16] ATTR_ALIGN(16); - int16_t * const block1= (int16_t*)align_tmp; - - fdct_col_sse2(block, block1, 0); - fdct_row_sse2(block1, block); -} -
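Underneath the fixed-point arithmetic, ff_fdct_mmx(), ff_fdct_mmx2() and ff_fdct_sse2() above all compute a separable 8x8 forward DCT-II: one 1-D pass down the columns (fdct_col_*) followed by one across the rows (fdct_row_*). The floating-point sketch below shows only that structure; it is not a bit-exact model of the AP-922 scaling and rounding, and the names fdct_1d and fdct_ref are mine.

#include <math.h>
#include <stdint.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Orthonormal 1-D DCT-II of 8 samples. */
static void fdct_1d(const double in[8], double out[8])
{
    for (int k = 0; k < 8; k++) {
        double sum = 0.0;
        for (int n = 0; n < 8; n++)
            sum += in[n] * cos(M_PI * (2 * n + 1) * k / 16.0);
        out[k] = sum * (k == 0 ? sqrt(1.0 / 8.0) : sqrt(2.0 / 8.0));
    }
}

/* Column pass, then row pass, as ff_fdct_mmx() above does (scaling differs). */
static void fdct_ref(int16_t block[64])
{
    double tmp[64], col[8], res[8];
    for (int x = 0; x < 8; x++) {
        for (int y = 0; y < 8; y++)
            col[y] = block[8 * y + x];
        fdct_1d(col, res);
        for (int y = 0; y < 8; y++)
            tmp[8 * y + x] = res[y];
    }
    for (int y = 0; y < 8; y++) {
        fdct_1d(tmp + 8 * y, res);
        for (int x = 0; x < 8; x++)
            block[8 * y + x] = (int16_t)lrint(res[x]);
    }
}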
--- a/i386/fft_3dn.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -/* - * FFT/MDCT transform with 3DNow! optimizations - * Copyright (c) 2008 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define EMULATE_3DNOWEXT -#include "fft_3dn2.c"
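fft_3dn.c builds the plain-3DNow! flavour by recompiling fft_3dn2.c with EMULATE_3DNOWEXT defined, which expands PSWAPD into a movq/psrlq/punpckldq sequence. The net effect of either form is simply exchanging the two 32-bit halves of a 64-bit register; a scalar sketch (function names are mine):

#include <stdint.h>

/* What pswapd produces: the two 32-bit halves of the value exchanged. */
static inline uint64_t pswapd_value(uint64_t s)
{
    return (s >> 32) | (s << 32);
}

/* The EMULATE_3DNOWEXT expansion, step by step. */
static inline uint64_t pswapd_emulated(uint64_t s)
{
    uint64_t d = s;                       /* movq      s, d                     */
    d >>= 32;                             /* psrlq $32, d: low half = high(s)   */
    d |= (s & 0xffffffffULL) << 32;       /* punpckldq s, d: high half = low(s) */
    return d;
}

Both functions return the same value for every input.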
--- a/i386/fft_3dn2.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,173 +0,0 @@ -/* - * FFT/MDCT transform with Extended 3DNow! optimizations - * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" - -DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 }; - -#ifdef EMULATE_3DNOWEXT -#define PSWAPD(s,d)\ - "movq "#s","#d"\n"\ - "psrlq $32,"#d"\n"\ - "punpckldq "#s","#d"\n" -#define ff_fft_calc_3dn2 ff_fft_calc_3dn -#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn -#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn -#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn -#define ff_imdct_half_3dn2 ff_imdct_half_3dn -#else -#define PSWAPD(s,d) "pswapd "#s","#d"\n" -#endif - -void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); -void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); - -void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) -{ - int n = 1<<s->nbits; - int i; - ff_fft_dispatch_interleave_3dn2(z, s->nbits); - __asm__ volatile("femms"); - if(n <= 8) - for(i=0; i<n; i+=2) - FFSWAP(FFTSample, z[i].im, z[i+1].re); -} - -void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) -{ - x86_reg j, k; - long n = 1 << s->nbits; - long n2 = n >> 1; - long n4 = n >> 2; - long n8 = n >> 3; - const uint16_t *revtab = s->fft.revtab; - const FFTSample *tcos = s->tcos; - const FFTSample *tsin = s->tsin; - const FFTSample *in1, *in2; - FFTComplex *z = (FFTComplex *)output; - - /* pre rotation */ - in1 = input; - in2 = input + n2 - 1; -#ifdef EMULATE_3DNOWEXT - __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31)); -#endif - for(k = 0; k < n4; k++) { - // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it - __asm__ volatile( - "movd %0, %%mm0 \n" - "movd %2, %%mm1 \n" - "punpckldq %1, %%mm0 \n" - "punpckldq %3, %%mm1 \n" - "movq %%mm0, %%mm2 \n" - PSWAPD( %%mm1, %%mm3 ) - "pfmul %%mm1, %%mm0 \n" - "pfmul %%mm3, %%mm2 \n" -#ifdef EMULATE_3DNOWEXT - "movq %%mm0, %%mm1 \n" - "punpckhdq %%mm2, %%mm0 \n" - "punpckldq %%mm2, %%mm1 \n" - "pxor %%mm7, %%mm0 \n" - "pfadd %%mm1, %%mm0 \n" -#else - "pfpnacc %%mm2, %%mm0 \n" -#endif - ::"m"(in2[-2*k]), "m"(in1[2*k]), - "m"(tcos[k]), "m"(tsin[k]) - ); - __asm__ volatile( - "movq %%mm0, %0 \n\t" - :"=m"(z[revtab[k]]) - ); - } - - ff_fft_dispatch_3dn2(z, s->fft.nbits); - -#define CMUL(j,mm0,mm1)\ - "movq (%2,"#j",2), %%mm6 \n"\ - "movq 8(%2,"#j",2), "#mm0"\n"\ - "movq %%mm6, "#mm1"\n"\ - "movq "#mm0",%%mm7 \n"\ - "pfmul (%3,"#j"), %%mm6 \n"\ - "pfmul (%4,"#j"), "#mm0"\n"\ - "pfmul (%4,"#j"), "#mm1"\n"\ - "pfmul (%3,"#j"), %%mm7 \n"\ - "pfsub %%mm6, "#mm0"\n"\ - "pfadd %%mm7, "#mm1"\n" - - /* post rotation */ - j = -n2; - k = 
n2-8; - __asm__ volatile( - "1: \n" - CMUL(%0, %%mm0, %%mm1) - CMUL(%1, %%mm2, %%mm3) - "movd %%mm0, (%2,%0,2) \n" - "movd %%mm1,12(%2,%1,2) \n" - "movd %%mm2, (%2,%1,2) \n" - "movd %%mm3,12(%2,%0,2) \n" - "psrlq $32, %%mm0 \n" - "psrlq $32, %%mm1 \n" - "psrlq $32, %%mm2 \n" - "psrlq $32, %%mm3 \n" - "movd %%mm0, 8(%2,%0,2) \n" - "movd %%mm1, 4(%2,%1,2) \n" - "movd %%mm2, 8(%2,%1,2) \n" - "movd %%mm3, 4(%2,%0,2) \n" - "sub $8, %1 \n" - "add $8, %0 \n" - "jl 1b \n" - :"+r"(j), "+r"(k) - :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) - :"memory" - ); - __asm__ volatile("femms"); -} - -void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) -{ - x86_reg j, k; - long n = 1 << s->nbits; - long n4 = n >> 2; - - ff_imdct_half_3dn2(s, output+n4, input); - - j = -n; - k = n-8; - __asm__ volatile( - "movq %4, %%mm7 \n" - "1: \n" - PSWAPD((%2,%1), %%mm0) - PSWAPD((%3,%0), %%mm1) - "pxor %%mm7, %%mm0 \n" - "movq %%mm1, (%3,%1) \n" - "movq %%mm0, (%2,%0) \n" - "sub $8, %1 \n" - "add $8, %0 \n" - "jl 1b \n" - :"+r"(j), "+r"(k) - :"r"(output+n4), "r"(output+n4*3), - "m"(*m1m1) - ); - __asm__ volatile("femms"); -} -
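The CMUL macro in the post rotation above spends four pfmul plus a pfsub/pfadd pair per output pair, which is the ordinary complex product (a+bi)(c+di) = (ac-bd) + (ad+bc)i with the twiddles taken from tcos/tsin. A scalar sketch of that product; the struct and function names are mine, and the packed operand ordering and sign conventions of the real rotation are folded into the tables.

typedef struct { float re, im; } cplx;

/* (a.re + i*a.im) * (b.re + i*b.im): four multiplies, one subtract, one add,
 * mirroring the pfmul/pfsub/pfadd pattern of CMUL. */
static cplx cmul(cplx a, cplx b)
{
    cplx r;
    r.re = a.re * b.re - a.im * b.im;
    r.im = a.re * b.im + a.im * b.re;
    return r;
}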
--- a/i386/fft_mmx.asm Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,467 +0,0 @@ -;****************************************************************************** -;* FFT transform with SSE/3DNow optimizations -;* Copyright (c) 2008 Loren Merritt -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -; These functions are not individually interchangeable with the C versions. -; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results -; in blocks as conventient to the vector size. -; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) - -%include "x86inc.asm" - -SECTION_RODATA - -%define M_SQRT1_2 0.70710678118654752440 -ps_root2: times 4 dd M_SQRT1_2 -ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 -ps_m1p1: dd 1<<31, 0 - -%assign i 16 -%rep 13 -cextern ff_cos_ %+ i -%assign i i<<1 -%endrep - -%ifdef ARCH_X86_64 - %define pointer dq -%else - %define pointer dd -%endif - -%macro IF0 1+ -%endmacro -%macro IF1 1+ - %1 -%endmacro - -section .text align=16 - -%macro T2_3DN 4 ; z0, z1, mem0, mem1 - mova %1, %3 - mova %2, %1 - pfadd %1, %4 - pfsub %2, %4 -%endmacro - -%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 - mova %5, %3 - pfsub %3, %4 - pfadd %5, %4 ; {t6,t5} - pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7} - mova %6, %1 - pswapd %3, %3 - pfadd %1, %5 ; {r0,i0} - pfsub %6, %5 ; {r2,i2} - mova %4, %2 - pfadd %2, %3 ; {r1,i1} - pfsub %4, %3 ; {r3,i3} - SWAP %3, %6 -%endmacro - -; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} -; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} -%macro T4_SSE 3 - mova %3, %1 - shufps %1, %2, 0x64 ; {r0,i0,r3,i2} - shufps %3, %2, 0xce ; {r1,i1,r2,i3} - mova %2, %1 - addps %1, %3 ; {t1,t2,t6,t5} - subps %2, %3 ; {t3,t4,t8,t7} - mova %3, %1 - shufps %1, %2, 0x44 ; {t1,t2,t3,t4} - shufps %3, %2, 0xbe ; {t6,t5,t7,t8} - mova %2, %1 - addps %1, %3 ; {r0,i0,r1,i1} - subps %2, %3 ; {r2,i2,r3,i3} - mova %3, %1 - shufps %1, %2, 0x88 ; {r0,r1,r2,r3} - shufps %3, %2, 0xdd ; {i0,i1,i2,i3} - SWAP %2, %3 -%endmacro - -%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 - mova %5, %3 - shufps %3, %4, 0x44 ; {r4,i4,r6,i6} - shufps %5, %4, 0xee ; {r5,i5,r7,i7} - mova %6, %3 - subps %3, %5 ; {r5,i5,r7,i7} - addps %6, %5 ; {t1,t2,t3,t4} - mova %5, %3 - shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} - mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7} - mulps %5, [ps_root2 GLOBAL] - addps %3, %5 ; {t8,t7,ta,t9} - mova %5, %6 - shufps %6, %3, 0x36 ; {t3,t2,t9,t8} - shufps %5, %3, 0x9c ; {t1,t4,t7,ta} - mova %3, %6 - addps %6, %5 ; {t1,t2,t9,ta} - subps %3, %5 ; {t6,t5,tc,tb} - mova %5, %6 - shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} - shufps %5, %3, 0x8d ; {t2,ta,t6,tc} - mova %3, %1 - mova %4, %2 - addps %1, %6 ; {r0,r1,r2,r3} - addps %2, %5 ; {i0,i1,i2,i3} - subps %3, %6 ; 
{r4,r5,r6,r7} - subps %4, %5 ; {i4,i5,i6,i7} -%endmacro - -; scheduled for cpu-bound sizes -%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim -IF%1 mova m4, Z(4) -IF%1 mova m5, Z(5) - mova m0, %2 ; wre - mova m2, m4 - mova m1, %3 ; wim - mova m3, m5 - mulps m2, m0 ; r2*wre -IF%1 mova m6, Z(6) - mulps m3, m1 ; i2*wim -IF%1 mova m7, Z(7) - mulps m4, m1 ; r2*wim - mulps m5, m0 ; i2*wre - addps m2, m3 ; r2*wre + i2*wim - mova m3, m1 - mulps m1, m6 ; r3*wim - subps m5, m4 ; i2*wre - r2*wim - mova m4, m0 - mulps m3, m7 ; i3*wim - mulps m4, m6 ; r3*wre - mulps m0, m7 ; i3*wre - subps m4, m3 ; r3*wre - i3*wim - mova m3, Z(0) - addps m0, m1 ; i3*wre + r3*wim - mova m1, m4 - addps m4, m2 ; t5 - subps m1, m2 ; t3 - subps m3, m4 ; r2 - addps m4, Z(0) ; r0 - mova m6, Z(2) - mova Z(4), m3 - mova Z(0), m4 - mova m3, m5 - subps m5, m0 ; t4 - mova m4, m6 - subps m6, m5 ; r3 - addps m5, m4 ; r1 - mova Z(6), m6 - mova Z(2), m5 - mova m2, Z(3) - addps m3, m0 ; t6 - subps m2, m1 ; i3 - mova m7, Z(1) - addps m1, Z(3) ; i1 - mova Z(7), m2 - mova Z(3), m1 - mova m4, m7 - subps m7, m3 ; i2 - addps m3, m4 ; i0 - mova Z(5), m7 - mova Z(1), m3 -%endmacro - -; scheduled to avoid store->load aliasing -%macro PASS_BIG 1 ; (!interleave) - mova m4, Z(4) ; r2 - mova m5, Z(5) ; i2 - mova m2, m4 - mova m0, [wq] ; wre - mova m3, m5 - mova m1, [wq+o1q] ; wim - mulps m2, m0 ; r2*wre - mova m6, Z(6) ; r3 - mulps m3, m1 ; i2*wim - mova m7, Z(7) ; i3 - mulps m4, m1 ; r2*wim - mulps m5, m0 ; i2*wre - addps m2, m3 ; r2*wre + i2*wim - mova m3, m1 - mulps m1, m6 ; r3*wim - subps m5, m4 ; i2*wre - r2*wim - mova m4, m0 - mulps m3, m7 ; i3*wim - mulps m4, m6 ; r3*wre - mulps m0, m7 ; i3*wre - subps m4, m3 ; r3*wre - i3*wim - mova m3, Z(0) - addps m0, m1 ; i3*wre + r3*wim - mova m1, m4 - addps m4, m2 ; t5 - subps m1, m2 ; t3 - subps m3, m4 ; r2 - addps m4, Z(0) ; r0 - mova m6, Z(2) - mova Z(4), m3 - mova Z(0), m4 - mova m3, m5 - subps m5, m0 ; t4 - mova m4, m6 - subps m6, m5 ; r3 - addps m5, m4 ; r1 -IF%1 mova Z(6), m6 -IF%1 mova Z(2), m5 - mova m2, Z(3) - addps m3, m0 ; t6 - subps m2, m1 ; i3 - mova m7, Z(1) - addps m1, Z(3) ; i1 -IF%1 mova Z(7), m2 -IF%1 mova Z(3), m1 - mova m4, m7 - subps m7, m3 ; i2 - addps m3, m4 ; i0 -IF%1 mova Z(5), m7 -IF%1 mova Z(1), m3 -%if %1==0 - mova m4, m5 ; r1 - mova m0, m6 ; r3 - unpcklps m5, m1 - unpckhps m4, m1 - unpcklps m6, m2 - unpckhps m0, m2 - mova m1, Z(0) - mova m2, Z(4) - mova Z(2), m5 - mova Z(3), m4 - mova Z(6), m6 - mova Z(7), m0 - mova m5, m1 ; r0 - mova m4, m2 ; r2 - unpcklps m1, m3 - unpckhps m5, m3 - unpcklps m2, m7 - unpckhps m4, m7 - mova Z(0), m1 - mova Z(1), m5 - mova Z(4), m2 - mova Z(5), m4 -%endif -%endmacro - -%macro PUNPCK 3 - mova %3, %1 - punpckldq %1, %2 - punpckhdq %3, %2 -%endmacro - -INIT_XMM - -%define Z(x) [r0+mmsize*x] - -align 16 -fft4_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova Z(0), m0 - mova Z(1), m1 - ret - -align 16 -fft8_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova m2, Z(2) - mova m3, Z(3) - T8_SSE m0, m1, m2, m3, m4, m5 - mova Z(0), m0 - mova Z(1), m1 - mova Z(2), m2 - mova Z(3), m3 - ret - -align 16 -fft16_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova m2, Z(2) - mova m3, Z(3) - T8_SSE m0, m1, m2, m3, m4, m5 - mova m4, Z(4) - mova m5, Z(5) - mova Z(0), m0 - mova Z(1), m1 - mova Z(2), m2 - mova Z(3), m3 - T4_SSE m4, m5, m6 - mova m6, Z(6) - mova m7, Z(7) - T4_SSE m6, m7, m0 - PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL] - ret - - -INIT_MMX - -%macro FFT48_3DN 1 -align 16 -fft4%1: - T2_3DN m0, m1, 
Z(0), Z(1) - mova m2, Z(2) - mova m3, Z(3) - T4_3DN m0, m1, m2, m3, m4, m5 - PUNPCK m0, m1, m4 - PUNPCK m2, m3, m5 - mova Z(0), m0 - mova Z(1), m4 - mova Z(2), m2 - mova Z(3), m5 - ret - -align 16 -fft8%1: - T2_3DN m0, m1, Z(0), Z(1) - mova m2, Z(2) - mova m3, Z(3) - T4_3DN m0, m1, m2, m3, m4, m5 - mova Z(0), m0 - mova Z(2), m2 - T2_3DN m4, m5, Z(4), Z(5) - T2_3DN m6, m7, Z(6), Z(7) - pswapd m0, m5 - pswapd m2, m7 - pxor m0, [ps_m1p1 GLOBAL] - pxor m2, [ps_m1p1 GLOBAL] - pfsub m5, m0 - pfadd m7, m2 - pfmul m5, [ps_root2 GLOBAL] - pfmul m7, [ps_root2 GLOBAL] - T4_3DN m1, m3, m5, m7, m0, m2 - mova Z(5), m5 - mova Z(7), m7 - mova m0, Z(0) - mova m2, Z(2) - T4_3DN m0, m2, m4, m6, m5, m7 - PUNPCK m0, m1, m5 - PUNPCK m2, m3, m7 - mova Z(0), m0 - mova Z(1), m5 - mova Z(2), m2 - mova Z(3), m7 - PUNPCK m4, Z(5), m5 - PUNPCK m6, Z(7), m7 - mova Z(4), m4 - mova Z(5), m5 - mova Z(6), m6 - mova Z(7), m7 - ret -%endmacro - -FFT48_3DN _3dn2 - -%macro pswapd 2 -%ifidn %1, %2 - movd [r0+12], %1 - punpckhdq %1, [r0+8] -%else - movq %1, %2 - psrlq %1, 32 - punpckldq %1, %2 -%endif -%endmacro - -FFT48_3DN _3dn - - -%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] - -%macro DECL_PASS 2+ ; name, payload -align 16 -%1: -DEFINE_ARGS z, w, n, o1, o3 - lea o3q, [nq*3] - lea o1q, [nq*8] - shl o3q, 4 -.loop: - %2 - add zq, mmsize*2 - add wq, mmsize - sub nd, mmsize/8 - jg .loop - rep ret -%endmacro - -INIT_XMM -DECL_PASS pass_sse, PASS_BIG 1 -DECL_PASS pass_interleave_sse, PASS_BIG 0 - -INIT_MMX -%define mulps pfmul -%define addps pfadd -%define subps pfsub -%define unpcklps punpckldq -%define unpckhps punpckhdq -DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] -DECL_PASS pass_interleave_3dn, PASS_BIG 0 -%define pass_3dn2 pass_3dn -%define pass_interleave_3dn2 pass_interleave_3dn - - -%macro DECL_FFT 2-3 ; nbits, cpu, suffix -%xdefine list_of_fft fft4%2, fft8%2 -%if %1==5 -%xdefine list_of_fft list_of_fft, fft16%2 -%endif - -%assign n 1<<%1 -%rep 17-%1 -%assign n2 n/2 -%assign n4 n/4 -%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 - -align 16 -fft %+ n %+ %3%2: - call fft %+ n2 %+ %2 - add r0, n*4 - (n&(-2<<%1)) - call fft %+ n4 %+ %2 - add r0, n*2 - (n2&(-2<<%1)) - call fft %+ n4 %+ %2 - sub r0, n*6 + (n2&(-2<<%1)) - lea r1, [ff_cos_ %+ n GLOBAL] - mov r2d, n4/2 - jmp pass%3%2 - -%assign n n*2 -%endrep -%undef n - -align 8 -dispatch_tab%3%2: pointer list_of_fft - -; On x86_32, this function does the register saving and restoring for all of fft. -; The others pass args in registers and don't spill anything. -cglobal fft_dispatch%3%2, 2,5,0, z, nbits - lea r2, [dispatch_tab%3%2 GLOBAL] - mov r2, [r2 + (nbitsq-2)*gprsize] - call r2 - RET -%endmacro ; DECL_FFT - -DECL_FFT 5, _sse -DECL_FFT 5, _sse, _interleave -DECL_FFT 4, _3dn -DECL_FFT 4, _3dn, _interleave -DECL_FFT 4, _3dn2 -DECL_FFT 4, _3dn2, _interleave -
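As the header comment of fft_mmx.asm says, the SSE/3DNow! passes keep intermediate results as blocks of 4 (or 2) reals followed by 4 (or 2) imaginaries rather than interleaved FFTComplex, with the _interleave dispatch variants restoring interleaved order at the end. A scalar sketch of the repacking for the SSE case; cplx and the function name are mine, and n is assumed to be a multiple of 4.

#include <stddef.h>

typedef struct { float re, im; } cplx;

/* Interleaved {re,im} pairs -> the {4x re, 4x im, 4x re, ...} block layout
 * the SSE passes use internally. */
static void interleaved_to_blocks(const cplx *in, float *out, size_t n)
{
    for (size_t i = 0; i < n; i += 4) {
        for (size_t j = 0; j < 4; j++)
            out[2 * i + j] = in[i + j].re;
        for (size_t j = 0; j < 4; j++)
            out[2 * i + 4 + j] = in[i + j].im;
    }
}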
--- a/i386/fft_sse.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ -/* - * FFT/MDCT transform with SSE optimizations - * Copyright (c) 2008 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" - -static const int m1m1m1m1[4] __attribute__((aligned(16))) = - { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; - -void ff_fft_dispatch_sse(FFTComplex *z, int nbits); -void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); - -void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) -{ - int n = 1 << s->nbits; - - ff_fft_dispatch_interleave_sse(z, s->nbits); - - if(n <= 16) { - x86_reg i = -8*n; - __asm__ volatile( - "1: \n" - "movaps (%0,%1), %%xmm0 \n" - "movaps %%xmm0, %%xmm1 \n" - "unpcklps 16(%0,%1), %%xmm0 \n" - "unpckhps 16(%0,%1), %%xmm1 \n" - "movaps %%xmm0, (%0,%1) \n" - "movaps %%xmm1, 16(%0,%1) \n" - "add $32, %0 \n" - "jl 1b \n" - :"+r"(i) - :"r"(z+n) - :"memory" - ); - } -} - -void ff_fft_permute_sse(FFTContext *s, FFTComplex *z) -{ - int n = 1 << s->nbits; - int i; - for(i=0; i<n; i+=2) { - __asm__ volatile( - "movaps %2, %%xmm0 \n" - "movlps %%xmm0, %0 \n" - "movhps %%xmm0, %1 \n" - :"=m"(s->tmp_buf[s->revtab[i]]), - "=m"(s->tmp_buf[s->revtab[i+1]]) - :"m"(z[i]) - ); - } - memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); -} - -void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input) -{ - av_unused x86_reg i, j, k, l; - long n = 1 << s->nbits; - long n2 = n >> 1; - long n4 = n >> 2; - long n8 = n >> 3; - const uint16_t *revtab = s->fft.revtab + n8; - const FFTSample *tcos = s->tcos; - const FFTSample *tsin = s->tsin; - FFTComplex *z = (FFTComplex *)output; - - /* pre rotation */ - for(k=n8-2; k>=0; k-=2) { - __asm__ volatile( - "movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1].re, z[k+1].im } - "movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } - "movaps %%xmm0, %%xmm2 \n" - "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } - "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } - "movlps (%3,%1), %%xmm4 \n" - "movlps (%4,%1), %%xmm5 \n" - "movhps -8(%3,%0), %%xmm4 \n" // { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } - "movhps -8(%4,%0), %%xmm5 \n" // { sin[k], sin[k+1], sin[-k-2], sin[-k-1] } - "movaps %%xmm0, %%xmm2 \n" - "movaps %%xmm1, %%xmm3 \n" - "mulps %%xmm5, %%xmm0 \n" // re*sin - "mulps %%xmm4, %%xmm1 \n" // im*cos - "mulps %%xmm4, %%xmm2 \n" // re*cos - "mulps %%xmm5, %%xmm3 \n" // im*sin - "subps %%xmm0, %%xmm1 \n" // -> re - "addps %%xmm3, %%xmm2 \n" // -> im - "movaps %%xmm1, %%xmm0 \n" - "unpcklps %%xmm2, %%xmm1 \n" // { z[k], z[k+1] } - "unpckhps %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] } - ::"r"(-4*k), 
"r"(4*k), - "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8) - ); -#ifdef ARCH_X86_64 - // if we have enough regs, don't let gcc make the luts latency-bound - // but if not, latency is faster than spilling - __asm__("movlps %%xmm0, %0 \n" - "movhps %%xmm0, %1 \n" - "movlps %%xmm1, %2 \n" - "movhps %%xmm1, %3 \n" - :"=m"(z[revtab[-k-2]]), - "=m"(z[revtab[-k-1]]), - "=m"(z[revtab[ k ]]), - "=m"(z[revtab[ k+1]]) - ); -#else - __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]])); - __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]])); - __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]])); - __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]])); -#endif - } - - ff_fft_dispatch_sse(z, s->fft.nbits); - - /* post rotation + reinterleave + reorder */ - -#define CMUL(j,xmm0,xmm1)\ - "movaps (%2,"#j",2), %%xmm6 \n"\ - "movaps 16(%2,"#j",2), "#xmm0"\n"\ - "movaps %%xmm6, "#xmm1"\n"\ - "movaps "#xmm0",%%xmm7 \n"\ - "mulps (%3,"#j"), %%xmm6 \n"\ - "mulps (%4,"#j"), "#xmm0"\n"\ - "mulps (%4,"#j"), "#xmm1"\n"\ - "mulps (%3,"#j"), %%xmm7 \n"\ - "subps %%xmm6, "#xmm0"\n"\ - "addps %%xmm7, "#xmm1"\n" - - j = -n2; - k = n2-16; - __asm__ volatile( - "1: \n" - CMUL(%0, %%xmm0, %%xmm1) - CMUL(%1, %%xmm4, %%xmm5) - "shufps $0x1b, %%xmm1, %%xmm1 \n" - "shufps $0x1b, %%xmm5, %%xmm5 \n" - "movaps %%xmm4, %%xmm6 \n" - "unpckhps %%xmm1, %%xmm4 \n" - "unpcklps %%xmm1, %%xmm6 \n" - "movaps %%xmm0, %%xmm2 \n" - "unpcklps %%xmm5, %%xmm0 \n" - "unpckhps %%xmm5, %%xmm2 \n" - "movaps %%xmm6, (%2,%1,2) \n" - "movaps %%xmm4, 16(%2,%1,2) \n" - "movaps %%xmm0, (%2,%0,2) \n" - "movaps %%xmm2, 16(%2,%0,2) \n" - "sub $16, %1 \n" - "add $16, %0 \n" - "jl 1b \n" - :"+&r"(j), "+&r"(k) - :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) - :"memory" - ); -} - -void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input) -{ - x86_reg j, k; - long n = 1 << s->nbits; - long n4 = n >> 2; - - ff_imdct_half_sse(s, output+n4, input); - - j = -n; - k = n-16; - __asm__ volatile( - "movaps %4, %%xmm7 \n" - "1: \n" - "movaps (%2,%1), %%xmm0 \n" - "movaps (%3,%0), %%xmm1 \n" - "shufps $0x1b, %%xmm0, %%xmm0 \n" - "shufps $0x1b, %%xmm1, %%xmm1 \n" - "xorps %%xmm7, %%xmm0 \n" - "movaps %%xmm1, (%3,%1) \n" - "movaps %%xmm0, (%2,%0) \n" - "sub $16, %1 \n" - "add $16, %0 \n" - "jl 1b \n" - :"+r"(j), "+r"(k) - :"r"(output+n4), "r"(output+n4*3), - "m"(*m1m1m1m1) - ); -} -
--- a/i386/flacdsp_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,139 +0,0 @@ -/* - * MMX optimized FLAC DSP utils - * Copyright (c) 2007 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "dsputil_mmx.h" - -static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data) -{ - double c = 2.0 / (len-1.0); - int n2 = len>>1; - x86_reg i = -n2*sizeof(int32_t); - x86_reg j = n2*sizeof(int32_t); - __asm__ volatile( - "movsd %0, %%xmm7 \n\t" - "movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t" - "movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - "subpd %%xmm5, %%xmm7 \n\t" - "addsd %%xmm6, %%xmm7 \n\t" - ::"m"(c) - ); -#define WELCH(MOVPD, offset)\ - __asm__ volatile(\ - "1: \n\t"\ - "movapd %%xmm7, %%xmm1 \n\t"\ - "mulpd %%xmm1, %%xmm1 \n\t"\ - "movapd %%xmm6, %%xmm0 \n\t"\ - "subpd %%xmm1, %%xmm0 \n\t"\ - "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ - "cvtpi2pd (%3,%0), %%xmm2 \n\t"\ - "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\ - "mulpd %%xmm0, %%xmm2 \n\t"\ - "mulpd %%xmm1, %%xmm3 \n\t"\ - "movapd %%xmm2, (%2,%0,2) \n\t"\ - MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\ - "subpd %%xmm5, %%xmm7 \n\t"\ - "sub $8, %1 \n\t"\ - "add $8, %0 \n\t"\ - "jl 1b \n\t"\ - :"+&r"(i), "+&r"(j)\ - :"r"(w_data+n2), "r"(data+n2)\ - ); - if(len&1) - WELCH("movupd", -1) - else - WELCH("movapd", -2) -#undef WELCH -} - -void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag, - double *autoc) -{ - double tmp[len + lag + 2]; - double *data1 = tmp + lag; - int j; - - if((x86_reg)data1 & 15) - data1++; - - apply_welch_window_sse2(data, len, data1); - - for(j=0; j<lag; j++) - data1[j-lag]= 0.0; - data1[len] = 0.0; - - for(j=0; j<lag; j+=2){ - x86_reg i = -len*sizeof(double); - if(j == lag-2) { - __asm__ volatile( - "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm2 \n\t" - "1: \n\t" - "movapd (%4,%0), %%xmm3 \n\t" - "movupd -8(%5,%0), %%xmm4 \n\t" - "movapd (%5,%0), %%xmm5 \n\t" - "mulpd %%xmm3, %%xmm4 \n\t" - "mulpd %%xmm3, %%xmm5 \n\t" - "mulpd -16(%5,%0), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm1 \n\t" - "addpd %%xmm5, %%xmm0 \n\t" - "addpd %%xmm3, %%xmm2 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - "movhlps %%xmm0, %%xmm3 \n\t" - "movhlps %%xmm1, %%xmm4 \n\t" - "movhlps %%xmm2, %%xmm5 \n\t" - "addsd %%xmm3, %%xmm0 \n\t" - "addsd %%xmm4, %%xmm1 \n\t" - "addsd %%xmm5, %%xmm2 \n\t" - "movsd %%xmm0, %1 \n\t" - "movsd %%xmm1, %2 \n\t" - "movsd %%xmm2, %3 \n\t" - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2]) - :"r"(data1+len), "r"(data1+len-j) - ); - } else { - __asm__ volatile( - "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t" - "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t" - "1: \n\t" - "movapd (%3,%0), %%xmm3 \n\t" 
- "movupd -8(%4,%0), %%xmm4 \n\t" - "mulpd %%xmm3, %%xmm4 \n\t" - "mulpd (%4,%0), %%xmm3 \n\t" - "addpd %%xmm4, %%xmm1 \n\t" - "addpd %%xmm3, %%xmm0 \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - "movhlps %%xmm0, %%xmm3 \n\t" - "movhlps %%xmm1, %%xmm4 \n\t" - "addsd %%xmm3, %%xmm0 \n\t" - "addsd %%xmm4, %%xmm1 \n\t" - "movsd %%xmm0, %1 \n\t" - "movsd %%xmm1, %2 \n\t" - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) - :"r"(data1+len), "r"(data1+len-j) - ); - } - } -}
--- a/i386/h264_deblock_sse2.asm Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,747 +0,0 @@ -;***************************************************************************** -;* deblock-a.asm: h264 encoder library -;***************************************************************************** -;* Copyright (C) 2005-2008 x264 project -;* -;* Authors: Loren Merritt <lorenm@u.washington.edu> -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. -;***************************************************************************** - -%include "x86inc.asm" - -SECTION_RODATA -pb_00: times 16 db 0x00 -pb_01: times 16 db 0x01 -pb_03: times 16 db 0x03 -pb_a1: times 16 db 0xa1 - -SECTION .text - -; expands to [base],...,[base+7*stride] -%define PASS8ROWS(base, base3, stride, stride3) \ - [base], [base+stride], [base+stride*2], [base3], \ - [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] - -; in: 8 rows of 4 bytes in %1..%8 -; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 -%endmacro - -; in: 4 rows of 8 bytes in m0..m3 -; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 - punpckhdq m4, m4 - punpckhdq m5, m5 - punpckhdq m6, m6 - - punpcklbw m0, m1 - punpcklbw m2, m3 - movq m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd %1, m0 - punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 - punpckhdq m1, m1 - movd %4, m1 - - punpckhdq m3, m3 - punpcklbw m4, m5 - punpcklbw m6, m3 - movq m5, m4 - punpcklwd m4, m6 - punpckhwd m5, m6 - movd %5, m4 - punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 - punpckhdq m5, m5 - movd %8, m5 -%endmacro - -%macro SBUTTERFLY 4 - movq %4, %2 - punpckl%1 %2, %3 - punpckh%1 %4, %3 -%endmacro - -; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 -; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] -%macro TRANSPOSE6x8_MEM 9 - movq m0, %1 - movq m1, %2 - movq m2, %3 - movq m3, %4 - movq m4, %5 - movq m5, %6 - movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY bw, m6, %8, m5 - SBUTTERFLY wd, m0, m2, m1 - SBUTTERFLY wd, m4, m6, m2 - punpckhdq m0, m4 - movq [%9+0x00], m0 - SBUTTERFLY wd, m7, [%9+0x10], m6 - SBUTTERFLY wd, m3, m5, m4 - SBUTTERFLY dq, m7, m3, m0 - SBUTTERFLY dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 -%endmacro - -; in: 8 rows of 8 in %1..%8 -; out: 8 rows of 8 in %9..%16 -%macro TRANSPOSE8x8_MEM 16 - movq m0, %1 - movq m1, %2 - movq m2, %3 - movq m3, %4 - movq m4, %5 - movq m5, %6 - movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - SBUTTERFLY bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY wd, m0, m2, m3 - SBUTTERFLY wd, m4, m6, m2 - SBUTTERFLY wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY wd, m2, m5, m1 - SBUTTERFLY dq, m0, m4, m5 - SBUTTERFLY dq, m7, m2, m4 - movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY dq, m3, %11, m0 - SBUTTERFLY dq, m6, m1, m5 - movq %11, m3 - movq %12, m0 - movq %15, m6 - movq %16, m5 -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT 5 - mova %5, %2 - mova %4, %1 - psubusb %5, %1 - psubusb %4, %2 - por %4, %5 - psubusb %4, %3 -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT2 5 - mova %5, %2 - mova %4, %1 - psubusb %5, %1 - psubusb %4, %2 - psubusb %5, %3 - psubusb %4, %3 - pcmpeqb %4, %5 -%endmacro - -%macro SPLATW 1 -%ifidn m0, xmm0 - pshuflw %1, %1, 0 - punpcklqdq %1, %1 -%else - pshufw %1, %1, 0 -%endif -%endmacro - -; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 -; out: m5=beta-1, m7=mask, %3=alpha-1 -; clobbers: m4,m6 -%macro LOAD_MASK 2-3 - movd m4, %1 - movd m5, %2 - SPLATW m4 - SPLATW m5 - packuswb m4, m4 ; 16x alpha-1 - packuswb m5, m5 ; 16x beta-1 -%if %0>2 - mova %3, m4 -%endif - DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 - DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 - por m7, m4 - DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 - por m7, m4 - pxor m6, m6 - pcmpeqb m7, m6 -%endmacro - -; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) -; out: m1=p0' m2=q0' -; clobbers: m0,3-6 -%macro DEBLOCK_P0_Q0 0 - mova m5, m1 - pxor m5, m2 ; p0^q0 - pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 - pcmpeqb m4, m4 - pxor m3, m4 - pavgb m3, m0 ; (p1 - q1 + 256)>>1 - pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 - pxor m4, m1 - pavgb m4, m2 ; (q0 - p0 + 256)>>1 - pavgb m3, m5 - paddusb m3, m4 ; d+128+33 - mova m6, [pb_a1 GLOBAL] - psubusb m6, m3 - psubusb m3, [pb_a1 GLOBAL] - pminub m6, m7 - pminub m3, m7 - psubusb m1, m6 - psubusb m2, m3 - paddusb m1, m3 - paddusb m2, m6 -%endmacro - -; in: m1=p0 m2=q0 -; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp -; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) -; clobbers: q2, tmp, tc0 -%macro LUMA_Q1 6 - mova %6, m1 - pavgb %6, m2 - pavgb %2, %6 ; avg(p2,avg(p0,q0)) - pxor %6, %3 - pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 - psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - mova %6, %1 - psubusb %6, %5 - paddusb %5, %1 - pmaxub %2, %6 - pminub %2, %5 - mova %4, %2 -%endmacro - -%ifdef ARCH_X86_64 -;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 
-;----------------------------------------------------------------------------- -INIT_XMM -cglobal x264_deblock_v_luma_sse2 - movd m8, [r4] ; tc0 - lea r4, [r1*3] - dec r2d ; alpha-1 - neg r4 - dec r3d ; beta-1 - add r4, r0 ; pix-3*stride - - mova m0, [r4+r1] ; p1 - mova m1, [r4+2*r1] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r1] ; q1 - LOAD_MASK r2d, r3d - - punpcklbw m8, m8 - punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - pcmpeqb m9, m9 - pcmpeqb m9, m8 - pandn m9, m7 - pand m8, m9 - - movdqa m3, [r4] ; p2 - DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 - pand m6, m9 - mova m7, m8 - psubb m7, m6 - pand m6, m8 - LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - - movdqa m4, [r0+2*r1] ; q2 - DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 - pand m6, m9 - pand m8, m6 - psubb m7, m6 - mova m3, [r0+r1] - LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 - - DEBLOCK_P0_Q0 - mova [r4+2*r1], m1 - mova [r0], m2 - ret - -;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -INIT_MMX -cglobal x264_deblock_h_luma_sse2 - movsxd r10, esi - lea r11, [r10+r10*2] - lea rax, [r0-4] - lea r9, [r0-4+r11] - sub rsp, 0x68 - %define pix_tmp rsp - - ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp - lea rax, [rax+r10*8] - lea r9, [r9 +r10*8] - TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 - - ; vertical filter - ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them - lea r0, [pix_tmp+0x30] - mov esi, 0x10 - call x264_deblock_v_luma_sse2 - - ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - add rax, 2 - add r9, 2 - movq m0, [pix_tmp+0x18] - movq m1, [pix_tmp+0x28] - movq m2, [pix_tmp+0x38] - movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) - - shl r10, 3 - sub rax, r10 - sub r9, r10 - shr r10, 3 - movq m0, [pix_tmp+0x10] - movq m1, [pix_tmp+0x20] - movq m2, [pix_tmp+0x30] - movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) - - add rsp, 0x68 - ret - -%else - -%macro DEBLOCK_LUMA 3 -;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_%1, 5,5 - lea r4, [r1*3] - dec r2 ; alpha-1 - neg r4 - dec r3 ; beta-1 - add r4, r0 ; pix-3*stride - %assign pad 2*%3+12-(stack_offset&15) - SUB esp, pad - - mova m0, [r4+r1] ; p1 - mova m1, [r4+2*r1] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r1] ; q1 - LOAD_MASK r2, r3 - - mov r3, r4m - movd m4, [r3] ; tc0 - punpcklbw m4, m4 - punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - mova [esp+%3], m4 ; tc - pcmpeqb m3, m3 - pcmpgtb m4, m3 - pand m4, m7 - mova [esp], m4 ; mask - - mova m3, [r4] ; p2 - DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 - pand m6, m4 - pand m4, [esp+%3] ; tc - mova m7, m4 - psubb m7, m6 - pand m6, m4 - LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - - mova m4, [r0+2*r1] ; q2 - DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 - mova m5, [esp] ; mask - pand m6, m5 - mova m5, [esp+%3] ; tc - pand m5, m6 - psubb m7, m6 - mova m3, [r0+r1] - LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 - - DEBLOCK_P0_Q0 - mova [r4+2*r1], m1 
- mova [r0], m2 - ADD esp, pad - RET - -;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -INIT_MMX -cglobal x264_deblock_h_luma_%1, 0,5 - mov r0, r0m - mov r3, r1m - lea r4, [r3*3] - sub r0, 4 - lea r1, [r0+r4] - %assign pad 0x78-(stack_offset&15) - SUB esp, pad -%define pix_tmp esp+12 - - ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp - lea r0, [r0+r3*8] - lea r1, [r1+r3*8] - TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 - - ; vertical filter - lea r0, [pix_tmp+0x30] - PUSH dword r4m - PUSH dword r3m - PUSH dword r2m - PUSH dword 16 - PUSH dword r0 - call x264_deblock_%2_luma_%1 -%ifidn %2, v8 - add dword [esp ], 8 ; pix_tmp+0x38 - add dword [esp+16], 2 ; tc0+2 - call x264_deblock_%2_luma_%1 -%endif - ADD esp, 20 - - ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - mov r0, r0m - sub r0, 2 - lea r1, [r0+r4] - - movq m0, [pix_tmp+0x10] - movq m1, [pix_tmp+0x20] - movq m2, [pix_tmp+0x30] - movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) - - lea r0, [r0+r3*8] - lea r1, [r1+r3*8] - movq m0, [pix_tmp+0x18] - movq m1, [pix_tmp+0x28] - movq m2, [pix_tmp+0x38] - movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) - - ADD esp, pad - RET -%endmacro ; DEBLOCK_LUMA - -INIT_XMM -DEBLOCK_LUMA sse2, v, 16 - -%endif ; ARCH - - - -%macro LUMA_INTRA_P012 4 ; p0..p3 in memory - mova t0, p2 - mova t1, p0 - pavgb t0, p1 - pavgb t1, q0 - pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 - mova t5, t1 - mova t2, p2 - mova t3, p0 - paddb t2, p1 - paddb t3, q0 - paddb t2, t3 - mova t3, t2 - mova t4, t2 - psrlw t2, 1 - pavgb t2, mpb_00 - pxor t2, t0 - pand t2, mpb_01 - psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; - - mova t1, p2 - mova t2, p2 - pavgb t1, q1 - psubb t2, q1 - paddb t3, t3 - psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 - pand t2, mpb_01 - psubb t1, t2 - pavgb t1, p1 - pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 - psrlw t3, 2 - pavgb t3, mpb_00 - pxor t3, t1 - pand t3, mpb_01 - psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 - - mova t3, p0 - mova t2, p0 - pxor t3, q1 - pavgb t2, q1 - pand t3, mpb_01 - psubb t2, t3 - pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 - - pxor t1, t2 - pxor t2, p0 - pand t1, mask1p - pand t2, mask0 - pxor t1, t2 - pxor t1, p0 - mova %1, t1 ; store p0 - - mova t1, %4 ; p3 - mova t2, t1 - pavgb t1, p2 - paddb t2, p2 - pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 - paddb t2, t2 - paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 - psrlw t2, 2 - pavgb t2, mpb_00 - pxor t2, t1 - pand t2, mpb_01 - psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 - - pxor t0, p1 - pxor t1, p2 - pand t0, mask1p - pand t1, mask1p - pxor t0, p1 - pxor t1, p2 - mova %2, t0 ; store p1 - mova %3, t1 ; store p2 -%endmacro - -%macro LUMA_INTRA_SWAP_PQ 0 - %define q1 m0 - %define q0 m1 - %define p0 m2 - %define p1 m3 - %define p2 q2 - %define mask1p mask1q -%endmacro - -%macro DEBLOCK_LUMA_INTRA 2 - %define p1 m0 - %define p0 m1 - %define q0 m2 - %define q1 m3 - %define t0 m4 - %define t1 m5 - %define t2 m6 - %define t3 m7 -%ifdef ARCH_X86_64 - %define p2 m8 - %define q2 m9 - %define t4 m10 - %define t5 m11 - %define mask0 m12 - %define mask1p m13 - %define mask1q [rsp-24] - %define mpb_00 m14 - %define mpb_01 m15 -%else - %define spill(x) [esp+16*x+((stack_offset+4)&15)] - %define p2 [r4+r1] - %define 
q2 [r0+2*r1] - %define t4 spill(0) - %define t5 spill(1) - %define mask0 spill(2) - %define mask1p spill(3) - %define mask1q spill(4) - %define mpb_00 [pb_00 GLOBAL] - %define mpb_01 [pb_01 GLOBAL] -%endif - -;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6 -%ifndef ARCH_X86_64 - sub esp, 0x60 -%endif - lea r4, [r1*4] - lea r5, [r1*3] ; 3*stride - dec r2d ; alpha-1 - jl .end - neg r4 - dec r3d ; beta-1 - jl .end - add r4, r0 ; pix-4*stride - mova p1, [r4+2*r1] - mova p0, [r4+r5] - mova q0, [r0] - mova q1, [r0+r1] -%ifdef ARCH_X86_64 - pxor mpb_00, mpb_00 - mova mpb_01, [pb_01 GLOBAL] - LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 - SWAP 7, 12 ; m12=mask0 - pavgb t5, mpb_00 - pavgb t5, mpb_01 ; alpha/4+1 - movdqa p2, [r4+r1] - movdqa q2, [r0+2*r1] - DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 - DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 - DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 - pand t0, mask0 - pand t4, t0 - pand t2, t0 - mova mask1q, t4 - mova mask1p, t2 -%else - LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 - mova m4, t5 - mova mask0, m7 - pavgb m4, [pb_00 GLOBAL] - pavgb m4, [pb_01 GLOBAL] ; alpha/4+1 - DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 - pand m6, mask0 - DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 - pand m4, m6 - mova mask1p, m4 - DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 - pand m4, m6 - mova mask1q, m4 -%endif - LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] - LUMA_INTRA_SWAP_PQ - LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] -.end: -%ifndef ARCH_X86_64 - add esp, 0x60 -%endif - RET - -INIT_MMX -%ifdef ARCH_X86_64 -;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1 - movsxd r10, r1d - lea r11, [r10*3] - lea rax, [r0-4] - lea r9, [r0-4+r11] - sub rsp, 0x88 - %define pix_tmp rsp - - ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea rax, [rax+r10*8] - lea r9, [r9+r10*8] - TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) - - lea r0, [pix_tmp+0x40] - mov r1, 0x10 - call x264_deblock_v_luma_intra_%1 - - ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - lea r9, [rax+r11] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) - shl r10, 3 - sub rax, r10 - sub r9, r10 - shr r10, 3 - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) - add rsp, 0x88 - ret -%else -cglobal x264_deblock_h_luma_intra_%1, 2,4 - lea r3, [r1*3] - sub r0, 4 - lea r2, [r0+r3] -%assign pad 0x8c-(stack_offset&15) - SUB rsp, pad - %define pix_tmp rsp - - ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r0, [r0+r1*8] - lea r2, [r2+r1*8] - TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) - - lea r0, [pix_tmp+0x40] - PUSH dword r3m - PUSH dword r2m - PUSH dword 16 - PUSH r0 - call 
x264_deblock_%2_luma_intra_%1 -%ifidn %2, v8 - add dword [rsp], 8 ; pix_tmp+8 - call x264_deblock_%2_luma_intra_%1 -%endif - ADD esp, 16 - - mov r1, r1m - mov r0, r0m - lea r3, [r1*3] - sub r0, 4 - lea r2, [r0+r3] - ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) - lea r0, [r0+r1*8] - lea r2, [r2+r1*8] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) - ADD rsp, pad - RET -%endif ; ARCH_X86_64 -%endmacro ; DEBLOCK_LUMA_INTRA - -INIT_XMM -DEBLOCK_LUMA_INTRA sse2, v -%ifndef ARCH_X86_64 -INIT_MMX -DEBLOCK_LUMA_INTRA mmxext, v8 -%endif
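DEBLOCK_P0_Q0 above carries out the H.264 normal-mode p0/q0 update entirely through unsigned byte averages so that nothing leaves the 8-bit domain. Written out in scalar C, the operation it corresponds to (to my reading) is the familiar clipped delta from the standard; the helper names are mine and tc is the per-edge clipping bound the callers derive from tc0 and the masks.

#include <stdint.h>

static inline int clip3(int x, int lo, int hi)
{
    return x < lo ? lo : x > hi ? hi : x;
}

/* Normal-mode edge update for one pixel pair:
 * delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc) */
static void deblock_p0q0_ref(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
{
    int delta = clip3((((*q0 - *p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc);
    *p0 = (uint8_t)clip3(*p0 + delta, 0, 255);
    *q0 = (uint8_t)clip3(*q0 - delta, 0, 255);
}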
--- a/i386/h264_i386.h Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,155 +0,0 @@ -/* - * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder - * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file h264_i386.h - * H.264 / AVC / MPEG4 part10 codec. - * non-MMX i386-specific optimizations for H.264 - * @author Michael Niedermayer <michaelni@gmx.at> - */ - -#ifndef AVCODEC_I386_H264_I386_H -#define AVCODEC_I386_H264_I386_H - -#include "libavcodec/cabac.h" - -//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet -//as that would make optimization work hard) -#if defined(ARCH_X86) && defined(HAVE_7REGS) && \ - defined(HAVE_EBX_AVAILABLE) && \ - !defined(BROKEN_RELOCATIONS) -static int decode_significance_x86(CABACContext *c, int max_coeff, - uint8_t *significant_coeff_ctx_base, - int *index){ - void *end= significant_coeff_ctx_base + max_coeff - 1; - int minusstart= -(int)significant_coeff_ctx_base; - int minusindex= 4-(int)index; - int coeff_count; - __asm__ volatile( - "movl "RANGE "(%3), %%esi \n\t" - "movl "LOW "(%3), %%ebx \n\t" - - "2: \n\t" - - BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx", - "%%bx", "%%esi", "%%eax", "%%al") - - "test $1, %%edx \n\t" - " jz 3f \n\t" - - BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", - "%%bx", "%%esi", "%%eax", "%%al") - - "mov %2, %%"REG_a" \n\t" - "movl %4, %%ecx \n\t" - "add %1, %%"REG_c" \n\t" - "movl %%ecx, (%%"REG_a") \n\t" - - "test $1, %%edx \n\t" - " jnz 4f \n\t" - - "add $4, %%"REG_a" \n\t" - "mov %%"REG_a", %2 \n\t" - - "3: \n\t" - "add $1, %1 \n\t" - "cmp %5, %1 \n\t" - " jb 2b \n\t" - "mov %2, %%"REG_a" \n\t" - "movl %4, %%ecx \n\t" - "add %1, %%"REG_c" \n\t" - "movl %%ecx, (%%"REG_a") \n\t" - "4: \n\t" - "add %6, %%eax \n\t" - "shr $2, %%eax \n\t" - - "movl %%esi, "RANGE "(%3) \n\t" - "movl %%ebx, "LOW "(%3) \n\t" - :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index) - :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex) - : "%"REG_c, "%ebx", "%edx", "%esi", "memory" - ); - return coeff_count; -} - -static int decode_significance_8x8_x86(CABACContext *c, - uint8_t *significant_coeff_ctx_base, - int *index, const uint8_t *sig_off){ - int minusindex= 4-(int)index; - int coeff_count; - x86_reg last=0; - __asm__ volatile( - "movl "RANGE "(%3), %%esi \n\t" - "movl "LOW "(%3), %%ebx \n\t" - - "mov %1, %%"REG_D" \n\t" - "2: \n\t" - - "mov %6, %%"REG_a" \n\t" - "movzbl (%%"REG_a", %%"REG_D"), %%edi \n\t" - "add %5, %%"REG_D" \n\t" - - BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx", - "%%bx", "%%esi", "%%eax", "%%al") - - "mov %1, %%edi \n\t" - "test $1, %%edx \n\t" - " jz 3f \n\t" - - "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t" - "add %5, %%"REG_D" 
\n\t" - - BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx", - "%%bx", "%%esi", "%%eax", "%%al") - - "mov %2, %%"REG_a" \n\t" - "mov %1, %%edi \n\t" - "movl %%edi, (%%"REG_a") \n\t" - - "test $1, %%edx \n\t" - " jnz 4f \n\t" - - "add $4, %%"REG_a" \n\t" - "mov %%"REG_a", %2 \n\t" - - "3: \n\t" - "addl $1, %%edi \n\t" - "mov %%edi, %1 \n\t" - "cmpl $63, %%edi \n\t" - " jb 2b \n\t" - "mov %2, %%"REG_a" \n\t" - "movl %%edi, (%%"REG_a") \n\t" - "4: \n\t" - "addl %4, %%eax \n\t" - "shr $2, %%eax \n\t" - - "movl %%esi, "RANGE "(%3) \n\t" - "movl %%ebx, "LOW "(%3) \n\t" - :"=&a"(coeff_count),"+m"(last), "+m"(index) - :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off) - : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory" - ); - return coeff_count; -} -#endif /* defined(ARCH_X86) && defined(HAVE_7REGS) && */ - /* defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) */ - -#endif /* AVCODEC_I386_H264_I386_H */
--- a/i386/h264dsp_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2208 +0,0 @@ -/* - * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil_mmx.h" - -DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; -DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; - -/***********************************/ -/* IDCT */ - -#define SUMSUB_BADC( a, b, c, d ) \ - "paddw "#b", "#a" \n\t"\ - "paddw "#d", "#c" \n\t"\ - "paddw "#b", "#b" \n\t"\ - "paddw "#d", "#d" \n\t"\ - "psubw "#a", "#b" \n\t"\ - "psubw "#c", "#d" \n\t" - -#define SUMSUBD2_AB( a, b, t ) \ - "movq "#b", "#t" \n\t"\ - "psraw $1 , "#b" \n\t"\ - "paddw "#a", "#b" \n\t"\ - "psraw $1 , "#a" \n\t"\ - "psubw "#t", "#a" \n\t" - -#define IDCT4_1D( s02, s13, d02, d13, t ) \ - SUMSUB_BA ( s02, d02 )\ - SUMSUBD2_AB( s13, d13, t )\ - SUMSUB_BADC( d13, s02, s13, d02 ) - -#define STORE_DIFF_4P( p, t, z ) \ - "psraw $6, "#p" \n\t"\ - "movd (%0), "#t" \n\t"\ - "punpcklbw "#z", "#t" \n\t"\ - "paddsw "#t", "#p" \n\t"\ - "packuswb "#z", "#p" \n\t"\ - "movd "#p", (%0) \n\t" - -static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - /* Load dct coeffs */ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm1 \n\t" - "movq 16(%0), %%mm2 \n\t" - "movq 24(%0), %%mm3 \n\t" - :: "r"(block) ); - - __asm__ volatile( - /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ - IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) - - "movq %0, %%mm6 \n\t" - /* in: 1,4,0,2 out: 1,2,3,0 */ - TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) - - "paddw %%mm6, %%mm3 \n\t" - - /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ - IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) - - "pxor %%mm7, %%mm7 \n\t" - :: "m"(ff_pw_32)); - - __asm__ volatile( - STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) - "add %1, %0 \n\t" - STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) - "add %1, %0 \n\t" - STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) - "add %1, %0 \n\t" - STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) - : "+r"(dst) - : "r" ((x86_reg)stride) - ); -} - -static inline void h264_idct8_1d(int16_t *block) -{ - __asm__ volatile( - "movq 112(%0), %%mm7 \n\t" - "movq 80(%0), %%mm0 \n\t" - "movq 48(%0), %%mm3 \n\t" - "movq 16(%0), %%mm5 \n\t" - - "movq %%mm0, %%mm4 \n\t" - "movq %%mm5, %%mm1 \n\t" - "psraw $1, %%mm4 \n\t" - "psraw $1, %%mm1 \n\t" - "paddw %%mm0, %%mm4 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "paddw %%mm7, %%mm4 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psubw %%mm5, %%mm4 \n\t" - "paddw %%mm3, %%mm1 \n\t" - - "psubw %%mm3, %%mm5 \n\t" - "psubw %%mm3, %%mm0 \n\t" - "paddw %%mm7, %%mm5 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psraw $1, %%mm3 \n\t" - "psraw $1, %%mm7 \n\t" - "psubw %%mm3, %%mm5 \n\t" - "psubw %%mm7, %%mm0 
\n\t" - - "movq %%mm4, %%mm3 \n\t" - "movq %%mm1, %%mm7 \n\t" - "psraw $2, %%mm1 \n\t" - "psraw $2, %%mm3 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "psraw $2, %%mm5 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psraw $2, %%mm0 \n\t" - "psubw %%mm4, %%mm5 \n\t" - "psubw %%mm0, %%mm7 \n\t" - - "movq 32(%0), %%mm2 \n\t" - "movq 96(%0), %%mm6 \n\t" - "movq %%mm2, %%mm4 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psraw $1, %%mm4 \n\t" - "psraw $1, %%mm6 \n\t" - "psubw %%mm0, %%mm4 \n\t" - "paddw %%mm2, %%mm6 \n\t" - - "movq (%0), %%mm2 \n\t" - "movq 64(%0), %%mm0 \n\t" - SUMSUB_BA( %%mm0, %%mm2 ) - SUMSUB_BA( %%mm6, %%mm0 ) - SUMSUB_BA( %%mm4, %%mm2 ) - SUMSUB_BA( %%mm7, %%mm6 ) - SUMSUB_BA( %%mm5, %%mm4 ) - SUMSUB_BA( %%mm3, %%mm2 ) - SUMSUB_BA( %%mm1, %%mm0 ) - :: "r"(block) - ); -} - -static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - int i; - int16_t __attribute__ ((aligned(8))) b2[64]; - - block[0] += 32; - - for(i=0; i<2; i++){ - DECLARE_ALIGNED_8(uint64_t, tmp); - - h264_idct8_1d(block+4*i); - - __asm__ volatile( - "movq %%mm7, %0 \n\t" - TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) - "movq %%mm0, 8(%1) \n\t" - "movq %%mm6, 24(%1) \n\t" - "movq %%mm7, 40(%1) \n\t" - "movq %%mm4, 56(%1) \n\t" - "movq %0, %%mm7 \n\t" - TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) - "movq %%mm7, (%1) \n\t" - "movq %%mm1, 16(%1) \n\t" - "movq %%mm0, 32(%1) \n\t" - "movq %%mm3, 48(%1) \n\t" - : "=m"(tmp) - : "r"(b2+32*i) - : "memory" - ); - } - - for(i=0; i<2; i++){ - h264_idct8_1d(b2+4*i); - - __asm__ volatile( - "psraw $6, %%mm7 \n\t" - "psraw $6, %%mm6 \n\t" - "psraw $6, %%mm5 \n\t" - "psraw $6, %%mm4 \n\t" - "psraw $6, %%mm3 \n\t" - "psraw $6, %%mm2 \n\t" - "psraw $6, %%mm1 \n\t" - "psraw $6, %%mm0 \n\t" - - "movq %%mm7, (%0) \n\t" - "movq %%mm5, 16(%0) \n\t" - "movq %%mm3, 32(%0) \n\t" - "movq %%mm1, 48(%0) \n\t" - "movq %%mm0, 64(%0) \n\t" - "movq %%mm2, 80(%0) \n\t" - "movq %%mm4, 96(%0) \n\t" - "movq %%mm6, 112(%0) \n\t" - :: "r"(b2+4*i) - : "memory" - ); - } - - add_pixels_clamped_mmx(b2, dst, stride); -} - -#define STORE_DIFF_8P( p, d, t, z )\ - "movq "#d", "#t" \n"\ - "psraw $6, "#p" \n"\ - "punpcklbw "#z", "#t" \n"\ - "paddsw "#t", "#p" \n"\ - "packuswb "#p", "#p" \n"\ - "movq "#p", "#d" \n" - -#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ - "movdqa "#c", "#a" \n"\ - "movdqa "#g", "#e" \n"\ - "psraw $1, "#c" \n"\ - "psraw $1, "#g" \n"\ - "psubw "#e", "#c" \n"\ - "paddw "#a", "#g" \n"\ - "movdqa "#b", "#e" \n"\ - "psraw $1, "#e" \n"\ - "paddw "#b", "#e" \n"\ - "paddw "#d", "#e" \n"\ - "paddw "#f", "#e" \n"\ - "movdqa "#f", "#a" \n"\ - "psraw $1, "#a" \n"\ - "paddw "#f", "#a" \n"\ - "paddw "#h", "#a" \n"\ - "psubw "#b", "#a" \n"\ - "psubw "#d", "#b" \n"\ - "psubw "#d", "#f" \n"\ - "paddw "#h", "#b" \n"\ - "psubw "#h", "#f" \n"\ - "psraw $1, "#d" \n"\ - "psraw $1, "#h" \n"\ - "psubw "#d", "#b" \n"\ - "psubw "#h", "#f" \n"\ - "movdqa "#e", "#d" \n"\ - "movdqa "#a", "#h" \n"\ - "psraw $2, "#d" \n"\ - "psraw $2, "#h" \n"\ - "paddw "#f", "#d" \n"\ - "paddw "#b", "#h" \n"\ - "psraw $2, "#f" \n"\ - "psraw $2, "#b" \n"\ - "psubw "#f", "#e" \n"\ - "psubw "#a", "#b" \n"\ - "movdqa 0x00(%1), "#a" \n"\ - "movdqa 0x40(%1), "#f" \n"\ - SUMSUB_BA(f, a)\ - SUMSUB_BA(g, f)\ - SUMSUB_BA(c, a)\ - SUMSUB_BA(e, g)\ - SUMSUB_BA(b, c)\ - SUMSUB_BA(h, a)\ - SUMSUB_BA(d, f) - -static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) -{ - __asm__ volatile( - "movdqa 0x10(%1), %%xmm1 \n" - "movdqa 0x20(%1), %%xmm2 \n" - "movdqa 0x30(%1), %%xmm3 \n" - "movdqa 0x50(%1), %%xmm5 \n" - "movdqa 
0x60(%1), %%xmm6 \n" - "movdqa 0x70(%1), %%xmm7 \n" - H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) - TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) - "paddw %4, %%xmm4 \n" - "movdqa %%xmm4, 0x00(%1) \n" - "movdqa %%xmm2, 0x40(%1) \n" - H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) - "movdqa %%xmm6, 0x60(%1) \n" - "movdqa %%xmm7, 0x70(%1) \n" - "pxor %%xmm7, %%xmm7 \n" - STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) - "lea (%0,%2,4), %0 \n" - STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) - "movdqa 0x60(%1), %%xmm0 \n" - "movdqa 0x70(%1), %%xmm1 \n" - STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) - :"+r"(dst) - :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) - ); -} - -static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) -{ - int dc = (block[0] + 32) >> 6; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dst+0*stride)), - "+m"(*(uint32_t*)(dst+1*stride)), - "+m"(*(uint32_t*)(dst+2*stride)), - "+m"(*(uint32_t*)(dst+3*stride)) - ); -} - -static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) -{ - int dc = (block[0] + 32) >> 6; - int y; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - for(y=2; y--; dst += 4*stride){ - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint64_t*)(dst+0*stride)), - "+m"(*(uint64_t*)(dst+1*stride)), - "+m"(*(uint64_t*)(dst+2*stride)), - "+m"(*(uint64_t*)(dst+3*stride)) - ); - } -} - -//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split -static const uint8_t scan8[16 + 2*4]={ - 4+1*8, 5+1*8, 4+2*8, 5+2*8, - 6+1*8, 7+1*8, 6+2*8, 7+2*8, - 4+3*8, 5+3*8, 4+4*8, 5+4*8, - 6+3*8, 7+3*8, 6+4*8, 7+4*8, - 1+1*8, 2+1*8, - 1+2*8, 2+2*8, - 1+4*8, 2+4*8, - 1+5*8, 2+5*8, -}; - -static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ]) - ff_h264_idct_add_mmx(dst + block_offset[i], block + 
i*16, stride); - } -} - -static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - if(nnzc[ scan8[i] ]) - ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride); - } -} - - -static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ] || block[i*16]) - ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); - else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=16; i<16+8; i++){ - if(nnzc[ scan8[i] ] || block[i*16]) - ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=16; i<16+8; i++){ - if(nnzc[ scan8[i] ]) - ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - else if(block[i*16]) - ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - } -} - -/***********************************/ -/* deblocking */ - -// out: o = |x-y|>a -// clobbers: t -#define DIFF_GT_MMX(x,y,a,o,t)\ - "movq "#y", "#t" \n\t"\ - "movq "#x", "#o" \n\t"\ - "psubusb "#x", "#t" \n\t"\ - "psubusb "#y", "#o" \n\t"\ - "por "#t", "#o" \n\t"\ - "psubusb "#a", "#o" \n\t" - -// out: o = |x-y|>a -// clobbers: t -#define DIFF_GT2_MMX(x,y,a,o,t)\ - "movq "#y", "#t" \n\t"\ - "movq "#x", "#o" \n\t"\ - "psubusb "#x", "#t" \n\t"\ - "psubusb "#y", "#o" \n\t"\ - "psubusb "#a", "#t" \n\t"\ - "psubusb "#a", "#o" \n\t"\ - "pcmpeqb "#t", "#o" \n\t"\ - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 -// out: mm5=beta-1, mm7=mask -// clobbers: mm4,mm6 -#define H264_DEBLOCK_MASK(alpha1, beta1) \ - "pshufw $0, "#alpha1", %%mm4 \n\t"\ - "pshufw $0, "#beta1 ", %%mm5 
\n\t"\ - "packuswb %%mm4, %%mm4 \n\t"\ - "packuswb %%mm5, %%mm5 \n\t"\ - DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ - DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ - "por %%mm4, %%mm7 \n\t"\ - DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ - "por %%mm4, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pcmpeqb %%mm6, %%mm7 \n\t" - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) -// out: mm1=p0' mm2=q0' -// clobbers: mm0,3-6 -#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ - "movq %%mm1 , %%mm5 \n\t"\ - "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ - "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ - "pcmpeqb %%mm4 , %%mm4 \n\t"\ - "pxor %%mm4 , %%mm3 \n\t"\ - "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ - "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ - "pxor %%mm1 , %%mm4 \n\t"\ - "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ - "pavgb %%mm5 , %%mm3 \n\t"\ - "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ - "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ - "psubusb %%mm3 , %%mm6 \n\t"\ - "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ - "pminub %%mm7 , %%mm6 \n\t"\ - "pminub %%mm7 , %%mm3 \n\t"\ - "psubusb %%mm6 , %%mm1 \n\t"\ - "psubusb %%mm3 , %%mm2 \n\t"\ - "paddusb %%mm3 , %%mm1 \n\t"\ - "paddusb %%mm6 , %%mm2 \n\t" - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone -// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) -// clobbers: q2, tmp, tc0 -#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ - "movq %%mm1, "#tmp" \n\t"\ - "pavgb %%mm2, "#tmp" \n\t"\ - "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ - "pxor "q2addr", "#tmp" \n\t"\ - "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ - "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ - "movq "#p1", "#tmp" \n\t"\ - "psubusb "#tc0", "#tmp" \n\t"\ - "paddusb "#p1", "#tc0" \n\t"\ - "pmaxub "#tmp", "#q2" \n\t"\ - "pminub "#tc0", "#q2" \n\t"\ - "movq "#q2", "q1addr" \n\t" - -static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) -{ - DECLARE_ALIGNED_8(uint64_t, tmp0[2]); - - __asm__ volatile( - "movq (%1,%3), %%mm0 \n\t" //p1 - "movq (%1,%3,2), %%mm1 \n\t" //p0 - "movq (%2), %%mm2 \n\t" //q0 - "movq (%2,%3), %%mm3 \n\t" //q1 - H264_DEBLOCK_MASK(%6, %7) - - "movd %5, %%mm4 \n\t" - "punpcklbw %%mm4, %%mm4 \n\t" - "punpcklwd %%mm4, %%mm4 \n\t" - "pcmpeqb %%mm3, %%mm3 \n\t" - "movq %%mm4, %%mm6 \n\t" - "pcmpgtb %%mm3, %%mm4 \n\t" - "movq %%mm6, 8+%0 \n\t" - "pand %%mm4, %%mm7 \n\t" - "movq %%mm7, %0 \n\t" - - /* filter p1 */ - "movq (%1), %%mm3 \n\t" //p2 - DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 - "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta - "pand 8+%0, %%mm7 \n\t" // mask & tc0 - "movq %%mm7, %%mm4 \n\t" - "psubb %%mm6, %%mm7 \n\t" - "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 - H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) - - /* filter q1 */ - "movq (%2,%3,2), %%mm4 \n\t" //q2 - DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 - "pand %0, %%mm6 \n\t" - "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then - "pand %%mm6, %%mm5 \n\t" - "psubb %%mm6, %%mm7 \n\t" - "movq (%2,%3), %%mm3 \n\t" - H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) - - /* filter p0, q0 */ - H264_DEBLOCK_P0_Q0(%8, unused) - "movq %%mm1, (%1,%3,2) \n\t" - "movq %%mm2, (%2) \n\t" - - : "=m"(*tmp0) - : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), - 
"m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), - "m"(ff_bone) - ); -} - -static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - if((tc0[0] & tc0[1]) >= 0) - h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); - if((tc0[2] & tc0[3]) >= 0) - h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); -} -static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - //FIXME: could cut some load/stores by merging transpose with filter - // also, it only needs to transpose 6x8 - DECLARE_ALIGNED_8(uint8_t, trans[8*8]); - int i; - for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { - if((tc0[0] & tc0[1]) < 0) - continue; - transpose4x4(trans, pix-4, 8, stride); - transpose4x4(trans +4*8, pix, 8, stride); - transpose4x4(trans+4, pix-4+4*stride, 8, stride); - transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); - h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); - transpose4x4(pix-2, trans +2*8, stride, 8); - transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); - } -} - -static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) -{ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" //p1 - "movq (%0,%2), %%mm1 \n\t" //p0 - "movq (%1), %%mm2 \n\t" //q0 - "movq (%1,%2), %%mm3 \n\t" //q1 - H264_DEBLOCK_MASK(%4, %5) - "movd %3, %%mm6 \n\t" - "punpcklbw %%mm6, %%mm6 \n\t" - "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask - H264_DEBLOCK_P0_Q0(%6, %7) - "movq %%mm1, (%0,%2) \n\t" - "movq %%mm2, (%1) \n\t" - - :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), - "r"(*(uint32_t*)tc0), - "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) - ); -} - -static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); -} - -static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - //FIXME: could cut some load/stores by merging transpose with filter - DECLARE_ALIGNED_8(uint8_t, trans[8*4]); - transpose4x4(trans, pix-2, 8, stride); - transpose4x4(trans+4, pix-2+4*stride, 8, stride); - h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); - transpose4x4(pix-2, trans, stride, 8); - transpose4x4(pix-2+4*stride, trans+4, stride, 8); -} - -// p0 = (p0 + q1 + 2*p1 + 2) >> 2 -#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ - "movq "#p0", %%mm4 \n\t"\ - "pxor "#q1", %%mm4 \n\t"\ - "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ - "pavgb "#q1", "#p0" \n\t"\ - "psubusb %%mm4, "#p0" \n\t"\ - "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ - -static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) -{ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq (%0,%2), %%mm1 \n\t" - "movq (%1), %%mm2 \n\t" - "movq (%1,%2), %%mm3 \n\t" - H264_DEBLOCK_MASK(%3, %4) - "movq %%mm1, %%mm5 \n\t" - "movq %%mm2, %%mm6 \n\t" - H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' - H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' - "psubb %%mm5, %%mm1 \n\t" - "psubb %%mm6, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm2 \n\t" - "paddb %%mm5, %%mm1 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "movq %%mm1, (%0,%2) \n\t" - "movq %%mm2, (%1) \n\t" - :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), - "m"(alpha1), "m"(beta1), "m"(ff_bone) - ); -} - -static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, 
int alpha, int beta) -{ - h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); -} - -static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) -{ - //FIXME: could cut some load/stores by merging transpose with filter - DECLARE_ALIGNED_8(uint8_t, trans[8*4]); - transpose4x4(trans, pix-2, 8, stride); - transpose4x4(trans+4, pix-2+4*stride, 8, stride); - h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); - transpose4x4(pix-2, trans, stride, 8); - transpose4x4(pix-2+4*stride, trans+4, stride, 8); -} - -static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], - int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { - int dir; - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movq %0, %%mm6 \n\t" - "movq %1, %%mm5 \n\t" - "movq %2, %%mm4 \n\t" - ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) - ); - if(field) - __asm__ volatile( - "movq %0, %%mm5 \n\t" - "movq %1, %%mm4 \n\t" - ::"m"(ff_pb_3_1), "m"(ff_pb_7_3) - ); - - // could do a special case for dir==0 && edges==1, but it only reduces the - // average filter time by 1.2% - for( dir=1; dir>=0; dir-- ) { - const int d_idx = dir ? -8 : -1; - const int mask_mv = dir ? mask_mv1 : mask_mv0; - DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; - int b_idx, edge, l; - for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { - __asm__ volatile( - "pand %0, %%mm0 \n\t" - ::"m"(mask_dir) - ); - if(!(mask_mv & edge)) { - __asm__ volatile("pxor %%mm0, %%mm0 \n\t":); - for( l = bidir; l >= 0; l-- ) { - __asm__ volatile( - "movd %0, %%mm1 \n\t" - "punpckldq %1, %%mm1 \n\t" - "movq %%mm1, %%mm2 \n\t" - "psrlw $7, %%mm2 \n\t" - "pand %%mm6, %%mm2 \n\t" - "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1 - "punpckldq %%mm1, %%mm2 \n\t" - "pcmpeqb %%mm2, %%mm1 \n\t" - "paddb %%mm6, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] - "por %%mm1, %%mm0 \n\t" - - "movq %2, %%mm1 \n\t" - "movq %3, %%mm2 \n\t" - "psubw %4, %%mm1 \n\t" - "psubw %5, %%mm2 \n\t" - "packsswb %%mm2, %%mm1 \n\t" - "paddb %%mm5, %%mm1 \n\t" - "pminub %%mm4, %%mm1 \n\t" - "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit - "por %%mm1, %%mm0 \n\t" - ::"m"(ref[l][b_idx]), - "m"(ref[l][b_idx+d_idx]), - "m"(mv[l][b_idx][0]), - "m"(mv[l][b_idx+2][0]), - "m"(mv[l][b_idx+d_idx][0]), - "m"(mv[l][b_idx+d_idx+2][0]) - ); - } - } - __asm__ volatile( - "movd %0, %%mm1 \n\t" - "por %1, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] - ::"m"(nnz[b_idx]), - "m"(nnz[b_idx+d_idx]) - ); - __asm__ volatile( - "pcmpeqw %%mm7, %%mm0 \n\t" - "pcmpeqw %%mm7, %%mm0 \n\t" - "psrlw $15, %%mm0 \n\t" // nonzero -> 1 - "psrlw $14, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "por %%mm1, %%mm2 \n\t" - "psrlw $1, %%mm1 \n\t" - "pandn %%mm2, %%mm1 \n\t" - "movq %%mm1, %0 \n\t" - :"=m"(*bS[dir][edge]) - ::"memory" - ); - } - edges = 4; - step = 1; - } - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm1 \n\t" - "movq 16(%0), %%mm2 \n\t" - "movq 24(%0), %%mm3 \n\t" - TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) - "movq %%mm0, (%0) \n\t" - "movq %%mm3, 8(%0) \n\t" - "movq %%mm4, 16(%0) \n\t" - "movq %%mm2, 24(%0) \n\t" - ::"r"(bS[0]) - :"memory" - ); -} - -/***********************************/ -/* motion compensation */ - -#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - 
"psllw $2, "#T" \n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw %4, "#T" \n\t"\ - "paddw %5, "#A" \n\t"\ - "add %2, %0 \n\t"\ - "paddw "#F", "#A" \n\t"\ - "paddw "#A", "#T" \n\t"\ - "psraw $5, "#T" \n\t"\ - "packuswb "#T", "#T" \n\t"\ - OP(T, (%1), A, d)\ - "add %3, %1 \n\t" - -#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "paddw %4, "#A" \n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw %3, "#T" \n\t"\ - "paddw "#F", "#A" \n\t"\ - "add %2, %0 \n\t"\ - "paddw "#A", "#T" \n\t"\ - "mov"#q" "#T", "#OF"(%1) \n\t" - -#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) -#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) -#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) -#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) - - -#define QPEL_H264(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=4;\ -\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %5, %%mm4 \n\t"\ - "movq %6, %%mm5 \n\t"\ - "1: \n\t"\ - "movd -1(%0), %%mm1 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "movd 1(%0), %%mm3 \n\t"\ - "movd 2(%0), %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "movd -2(%0), %%mm0 \n\t"\ - "movd 3(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psllw $2, %%mm2 \n\t"\ - "psubw %%mm1, %%mm2 \n\t"\ - "pmullw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm6, d)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -}\ -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=4;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm4 \n\t"\ - "movq %1, %%mm5 \n\t"\ - :: "m"(ff_pw_5), "m"(ff_pw_16)\ - );\ - do{\ - __asm__ volatile(\ - "movd -1(%0), %%mm1 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "movd 1(%0), %%mm3 \n\t"\ - "movd 2(%0), %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "movd -2(%0), %%mm0 \n\t"\ - "movd 3(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psllw $2, %%mm2 \n\t"\ - "psubw %%mm1, %%mm2 \n\t"\ - "pmullw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "movd (%2), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - PAVGB" %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm6, d)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }while(--h);\ -}\ -static 
av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - src -= 2*srcStride;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -}\ -static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - int h=4;\ - int w=3;\ - src -= 2*srcStride+2;\ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ - \ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - tmp += 4;\ - src += 4 - 9*srcStride;\ - }\ - tmp -= 3*4;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "paddw 10(%0), %%mm0 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "paddw 8(%0), %%mm1 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "paddw 6(%0), %%mm2 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ - "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ - "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ - "paddsw %%mm2, %%mm0 \n\t"\ - "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ - "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ - "psraw $6, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, d)\ - "add $24, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %5, %%mm6 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, 
%%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq %6, %%mm5 \n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm6 \n\t"\ - :: "m"(ff_pw_5)\ - );\ - do{\ - __asm__ volatile(\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq %5, %%mm5 \n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "movq (%2), %%mm4 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - PAVGB" %%mm4, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_16)\ - : "memory"\ - );\ - }while(--h);\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - int w= 2;\ - src -= 2*srcStride;\ - \ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - QPEL_H264V(%%mm4, %%mm5, 
%%mm0, %%mm1, %%mm2, %%mm3, OP)\ - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - }\ - src += 4-(h+5)*srcStride;\ - dst += 4-h*dstStride;\ - }\ -}\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ - int w = (size+8)>>2;\ - src -= 2*srcStride+2;\ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - if(size==16){\ - __asm__ volatile(\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - }\ - tmp += 4;\ - src += 4 - (size+5)*srcStride;\ - }\ -}\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int w = size>>4;\ - do{\ - int h = size;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm3 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "movq 10(%0), %%mm4 \n\t"\ - "paddw %%mm4, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "paddw 18(%0), %%mm3 \n\t"\ - "paddw 16(%0), %%mm4 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "movq 12(%0), %%mm5 \n\t"\ 
- "paddw 6(%0), %%mm2 \n\t"\ - "paddw 14(%0), %%mm5 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "paddsw %%mm2, %%mm0 \n\t"\ - "paddsw %%mm5, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm5, %%mm3 \n\t"\ - "psraw $6, %%mm0 \n\t"\ - "psraw $6, %%mm3 \n\t"\ - "packuswb %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - tmp += 8 - size*24;\ - dst += 8 - size*dstStride;\ - }while(w--);\ -}\ -\ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ -}\ -\ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ -}\ -\ -static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 24(%1), %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - "packuswb %%mm1, %%mm1 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm1 \n\t"\ 
- OP(%%mm0, (%2), %%mm4, d)\ - OP(%%mm1, (%2,%4), %%mm5, d)\ - "lea (%0,%3,2), %0 \n\t"\ - "lea (%2,%4,2), %2 \n\t"\ - "movq 48(%1), %%mm0 \n\t"\ - "movq 72(%1), %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - "packuswb %%mm1, %%mm1 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm1 \n\t"\ - OP(%%mm0, (%2), %%mm4, d)\ - OP(%%mm1, (%2,%4), %%mm5, d)\ - :"+a"(src8), "+c"(src16), "+d"(dst)\ - :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ - :"memory");\ -}\ -static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - do{\ - __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 8(%1), %%mm1 \n\t"\ - "movq 48(%1), %%mm2 \n\t"\ - "movq 8+48(%1), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "psraw $5, %%mm2 \n\t"\ - "psraw $5, %%mm3 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - "packuswb %%mm3, %%mm2 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm2 \n\t"\ - OP(%%mm0, (%2), %%mm5, q)\ - OP(%%mm2, (%2,%4), %%mm5, q)\ - ::"a"(src8), "c"(src16), "d"(dst),\ - "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ - :"memory");\ - src8 += 2L*src8Stride;\ - src16 += 48;\ - dst += 2L*dstStride;\ - }while(h-=2);\ -}\ -static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ -}\ - - -#ifdef ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=16;\ - __asm__ volatile(\ - "pxor %%xmm15, %%xmm15 \n\t"\ - "movdqa %6, %%xmm14 \n\t"\ - "movdqa %7, %%xmm13 \n\t"\ - "1: \n\t"\ - "lddqu 3(%0), %%xmm1 \n\t"\ - "lddqu -5(%0), %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm15, %%xmm1 \n\t"\ - "punpcklbw %%xmm15, %%xmm0 \n\t"\ - "punpcklbw %%xmm15, %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm0, %%xmm6 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm0, %%xmm8 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm0, %%xmm9 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "movdqa %%xmm0, %%xmm10 \n\t"\ - "palignr $6, %%xmm0, %%xmm5 \n\t"\ - "palignr $6, %%xmm7, %%xmm10\n\t"\ - "palignr $8, %%xmm0, %%xmm4 \n\t"\ - "palignr $8, %%xmm7, %%xmm9 \n\t"\ - "palignr $10,%%xmm0, %%xmm3 \n\t"\ - "palignr $10,%%xmm7, %%xmm8 \n\t"\ - "paddw %%xmm1, %%xmm5 \n\t"\ - "paddw %%xmm0, %%xmm10 \n\t"\ - "palignr $12,%%xmm0, %%xmm2 \n\t"\ - "palignr $12,%%xmm7, %%xmm6 \n\t"\ - "palignr $14,%%xmm0, %%xmm1 \n\t"\ - "palignr $14,%%xmm7, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm8, %%xmm6 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm9, %%xmm0 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psllw $2, %%xmm6 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "psubw %%xmm0, %%xmm6 \n\t"\ - "paddw %%xmm13,%%xmm5 \n\t"\ - "paddw %%xmm13,%%xmm10 \n\t"\ - "pmullw %%xmm14,%%xmm2 \n\t"\ - "pmullw %%xmm14,%%xmm6 \n\t"\ - "lddqu (%2), %%xmm3 \n\t"\ - "paddw %%xmm5, %%xmm2 \n\t"\ - "paddw %%xmm10,%%xmm6 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "psraw $5, %%xmm6 \n\t"\ - "packuswb %%xmm2,%%xmm6 \n\t"\ - "pavgb %%xmm3, %%xmm6 \n\t"\ - OP(%%xmm6, (%1), %%xmm4, dqa)\ - "add %5, %0 \n\t"\ - "add %5, %1 \n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b 
\n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -} -#else // ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -} -#endif // ARCH_X86_64 - -#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa %0, %%xmm6 \n\t"\ - :: "m"(ff_pw_5)\ - );\ - do{\ - __asm__ volatile(\ - "lddqu -5(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $6, %%xmm0, %%xmm5 \n\t"\ - "palignr $8, %%xmm0, %%xmm4 \n\t"\ - "palignr $10,%%xmm0, %%xmm3 \n\t"\ - "paddw %%xmm1, %%xmm5 \n\t"\ - "palignr $12,%%xmm0, %%xmm2 \n\t"\ - "palignr $14,%%xmm0, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "movq (%2), %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw %5, %%xmm5 \n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm5, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - "pavgb %%xmm3, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_16)\ - : "memory"\ - );\ - }while(--h);\ -}\ -QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa %5, %%xmm6 \n\t"\ - "1: \n\t"\ - "lddqu -5(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $6, %%xmm0, %%xmm5 \n\t"\ - "palignr $8, %%xmm0, %%xmm4 \n\t"\ - "palignr $10,%%xmm0, %%xmm3 \n\t"\ - "paddw %%xmm1, %%xmm5 \n\t"\ - "palignr $12,%%xmm0, %%xmm2 \n\t"\ - "palignr $14,%%xmm0, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw %6, %%xmm5 \n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm5, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -}\ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, 
uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ - -#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - src -= 2*srcStride;\ - \ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movq (%0), %%xmm0 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm1 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm2 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm3 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm2 \n\t"\ - "punpcklbw %%xmm7, %%xmm3 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - }\ -}\ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -} - -static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ - int w = (size+8)>>3; - src -= 2*srcStride+2; - while(w--){ - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "movq (%0), %%xmm0 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm1 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm2 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm3 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm4 \n\t" - "add %2, %0 \n\t" - "punpcklbw %%xmm7, %%xmm0 \n\t" - "punpcklbw %%xmm7, 
%%xmm1 \n\t" - "punpcklbw %%xmm7, %%xmm2 \n\t" - "punpcklbw %%xmm7, %%xmm3 \n\t" - "punpcklbw %%xmm7, %%xmm4 \n\t" - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) - : "+a"(src) - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) - : "memory" - ); - if(size==16){ - __asm__ volatile( - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) - : "+a"(src) - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) - : "memory" - ); - } - tmp += 8; - src += 8 - (size+5)*srcStride; - } -} - -#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int h = size;\ - if(size == 16){\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 32(%0), %%xmm4 \n\t"\ - "movdqa 16(%0), %%xmm5 \n\t"\ - "movdqa (%0), %%xmm7 \n\t"\ - "movdqa %%xmm4, %%xmm3 \n\t"\ - "movdqa %%xmm4, %%xmm2 \n\t"\ - "movdqa %%xmm4, %%xmm1 \n\t"\ - "movdqa %%xmm4, %%xmm0 \n\t"\ - "palignr $10, %%xmm5, %%xmm0 \n\t"\ - "palignr $8, %%xmm5, %%xmm1 \n\t"\ - "palignr $6, %%xmm5, %%xmm2 \n\t"\ - "palignr $4, %%xmm5, %%xmm3 \n\t"\ - "palignr $2, %%xmm5, %%xmm4 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "movdqa %%xmm5, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm3 \n\t"\ - "palignr $8, %%xmm7, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm6 \n\t"\ - "palignr $10, %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "palignr $6, %%xmm7, %%xmm5 \n\t"\ - "palignr $4, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm5 \n\t"\ - \ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "psraw $6, %%xmm3 \n\t"\ - "packuswb %%xmm0, %%xmm3 \n\t"\ - OP(%%xmm3, (%1), %%xmm7, dqa)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }else{\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 16(%0), %%xmm1 \n\t"\ - "movdqa (%0), %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - 
"movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $10, %%xmm0, %%xmm5 \n\t"\ - "palignr $8, %%xmm0, %%xmm4 \n\t"\ - "palignr $6, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm0, %%xmm2 \n\t"\ - "palignr $2, %%xmm0, %%xmm1 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "packuswb %%xmm0, %%xmm0 \n\t"\ - OP(%%xmm0, (%1), %%xmm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }\ -} - -#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ -}\ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ -}\ - -#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 -#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 -#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 -#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 -#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 -#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 -#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 -#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 - -#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 -#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 -#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 -#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 -#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 -#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 -#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 -#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 - -#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 -#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 -#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 -#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 - -#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 -#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 -#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 -#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 - -#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 -#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 - -#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ -H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ - -static void 
put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ - put_pixels16_sse2(dst, src, stride, 16); -} -static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ - avg_pixels16_sse2(dst, src, stride, 16); -} -#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 -#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 - -#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ -}\ - -#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ -}\ - -#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ -}\ - -#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, 
uint16_t, temp[SIZE*(SIZE<8?12:24)]);\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ -}\ - -#define H264_MC_4816(MMX)\ -H264_MC(put_, 4, MMX, 8)\ -H264_MC(put_, 8, MMX, 8)\ -H264_MC(put_, 16,MMX, 8)\ -H264_MC(avg_, 4, MMX, 8)\ -H264_MC(avg_, 8, MMX, 8)\ -H264_MC(avg_, 16,MMX, 8)\ - -#define H264_MC_816(QPEL, XMM)\ -QPEL(put_, 8, XMM, 16)\ -QPEL(put_, 16,XMM, 16)\ -QPEL(avg_, 8, XMM, 16)\ -QPEL(avg_, 16,XMM, 16)\ - - -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMX2_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -#define PAVGB "pavgusb" -QPEL_H264(put_, PUT_OP, 3dnow) -QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) -#undef PAVGB -#define PAVGB "pavgb" -QPEL_H264(put_, PUT_OP, mmx2) -QPEL_H264(avg_, AVG_MMX2_OP, mmx2) -QPEL_H264_V_XMM(put_, PUT_OP, sse2) -QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) -QPEL_H264_HV_XMM(put_, PUT_OP, sse2) -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) -#ifdef HAVE_SSSE3 -QPEL_H264_H_XMM(put_, PUT_OP, ssse3) -QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) -QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) -QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) -#endif -#undef PAVGB - -H264_MC_4816(3dnow) -H264_MC_4816(mmx2) -H264_MC_816(H264_MC_V, sse2) -H264_MC_816(H264_MC_HV, sse2) -#ifdef HAVE_SSSE3 -H264_MC_816(H264_MC_H, ssse3) -H264_MC_816(H264_MC_HV, ssse3) 
-#endif - - -#define H264_CHROMA_OP(S,D) -#define H264_CHROMA_OP4(S,D,T) -#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx -#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx -#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 -#define H264_CHROMA_MC8_MV0 put_pixels8_mmx -#include "dsputil_h264_template_mmx.c" - -static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1); -} -static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0); -} - -#undef H264_CHROMA_OP -#undef H264_CHROMA_OP4 -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC2_TMPL -#undef H264_CHROMA_MC8_MV0 - -#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t" -#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ - "pavgb " #T ", " #D " \n\t" -#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2 -#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2 -#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2 -#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 -#include "dsputil_h264_template_mmx.c" -static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1); -} -#undef H264_CHROMA_OP -#undef H264_CHROMA_OP4 -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC2_TMPL -#undef H264_CHROMA_MC8_MV0 - -#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t" -#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ - "pavgusb " #T ", " #D " \n\t" -#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow -#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow -#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow -#include "dsputil_h264_template_mmx.c" -static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1); -} -#undef H264_CHROMA_OP -#undef H264_CHROMA_OP4 -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC8_MV0 - -#ifdef HAVE_SSSE3 -#define AVG_OP(X) -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3 -#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3 -#define H264_CHROMA_MC8_MV0 put_pixels8_mmx -#include "dsputil_h264_template_ssse3.c" -static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); -} -static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0); -} - -#undef AVG_OP -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC8_MV0 -#define AVG_OP(X) X -#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3 -#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3 -#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 -#include "dsputil_h264_template_ssse3.c" -static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); -} -#undef AVG_OP -#undef H264_CHROMA_MC8_TMPL -#undef 
H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC8_MV0 -#endif - -/***********************************/ -/* weighted prediction */ - -static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) -{ - int x, y; - offset <<= log2_denom; - offset += (1 << log2_denom) >> 1; - __asm__ volatile( - "movd %0, %%mm4 \n\t" - "movd %1, %%mm5 \n\t" - "movd %2, %%mm6 \n\t" - "pshufw $0, %%mm4, %%mm4 \n\t" - "pshufw $0, %%mm5, %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - :: "g"(weight), "g"(offset), "g"(log2_denom) - ); - for(y=0; y<h; y+=2){ - for(x=0; x<w; x+=4){ - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "movd %1, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pmullw %%mm4, %%mm0 \n\t" - "pmullw %%mm4, %%mm1 \n\t" - "paddsw %%mm5, %%mm0 \n\t" - "paddsw %%mm5, %%mm1 \n\t" - "psraw %%mm6, %%mm0 \n\t" - "psraw %%mm6, %%mm1 \n\t" - "packuswb %%mm7, %%mm0 \n\t" - "packuswb %%mm7, %%mm1 \n\t" - "movd %%mm0, %0 \n\t" - "movd %%mm1, %1 \n\t" - : "+m"(*(uint32_t*)(dst+x)), - "+m"(*(uint32_t*)(dst+x+stride)) - ); - } - dst += 2*stride; - } -} - -static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h) -{ - int x, y; - offset = ((offset + 1) | 1) << log2_denom; - __asm__ volatile( - "movd %0, %%mm3 \n\t" - "movd %1, %%mm4 \n\t" - "movd %2, %%mm5 \n\t" - "movd %3, %%mm6 \n\t" - "pshufw $0, %%mm3, %%mm3 \n\t" - "pshufw $0, %%mm4, %%mm4 \n\t" - "pshufw $0, %%mm5, %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1) - ); - for(y=0; y<h; y++){ - for(x=0; x<w; x+=4){ - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "movd %1, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pmullw %%mm3, %%mm0 \n\t" - "pmullw %%mm4, %%mm1 \n\t" - "paddsw %%mm1, %%mm0 \n\t" - "paddsw %%mm5, %%mm0 \n\t" - "psraw %%mm6, %%mm0 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "movd %%mm0, %0 \n\t" - : "+m"(*(uint32_t*)(dst+x)) - : "m"(*(uint32_t*)(src+x)) - ); - } - src += stride; - dst += stride; - } -} - -#define H264_WEIGHT(W,H) \ -static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ - ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ -} \ -static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ - ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \ -} - -H264_WEIGHT(16,16) -H264_WEIGHT(16, 8) -H264_WEIGHT( 8,16) -H264_WEIGHT( 8, 8) -H264_WEIGHT( 8, 4) -H264_WEIGHT( 4, 8) -H264_WEIGHT( 4, 4) -H264_WEIGHT( 4, 2) -
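
[Editor's note, not part of the changeset] The weighted-prediction kernels at the end of the hunk above (ff_h264_weight_WxH_mmx2 and its biweight counterpart) implement the H.264 explicit-weighting formula in MMX registers: scale, add a rounding-adjusted offset, arithmetic shift by log2_denom, saturate to 8 bits. A scalar sketch of the unidirectional case for orientation only; the helper name weight_wxh_c and the open-coded clamp (standing in for packuswb) are illustrative assumptions:

#include <stdint.h>

/* Scalar sketch of the per-pixel operation performed by ff_h264_weight_WxH_mmx2()
 * above (illustrative only, not part of the moved file). */
static void weight_wxh_c(uint8_t *dst, int stride, int log2_denom,
                         int weight, int offset, int w, int h)
{
    int x, y;
    offset <<= log2_denom;
    offset += (1 << log2_denom) >> 1;   /* same rounding setup as in the asm prologue */
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int v = (dst[x] * weight + offset) >> log2_denom;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;   /* packuswb-style saturation */
        }
        dst += stride;
    }
}
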
--- a/i386/idct_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,605 +0,0 @@ -/* - * idct_mmx.c - * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> - * - * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. - * See http://libmpeg2.sourceforge.net/ for updates. - * - * mpeg2dec is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * mpeg2dec is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with mpeg2dec; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/common.h" -#include "libavcodec/dsputil.h" - -#include "mmx.h" - -#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) - -#define ROW_SHIFT 11 -#define COL_SHIFT 6 - -#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) -#define rounder(bias) {round (bias), round (bias)} - - -#if 0 -/* C row IDCT - it is just here to document the MMXEXT and MMX versions */ -static inline void idct_row (int16_t * row, int offset, - int16_t * table, int32_t * rounder) -{ - int C1, C2, C3, C4, C5, C6, C7; - int a0, a1, a2, a3, b0, b1, b2, b3; - - row += offset; - - C1 = table[1]; - C2 = table[2]; - C3 = table[3]; - C4 = table[4]; - C5 = table[5]; - C6 = table[6]; - C7 = table[7]; - - a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder; - a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder; - a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder; - a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder; - - b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; - b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; - b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; - b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; - - row[0] = (a0 + b0) >> ROW_SHIFT; - row[1] = (a1 + b1) >> ROW_SHIFT; - row[2] = (a2 + b2) >> ROW_SHIFT; - row[3] = (a3 + b3) >> ROW_SHIFT; - row[4] = (a3 - b3) >> ROW_SHIFT; - row[5] = (a2 - b2) >> ROW_SHIFT; - row[6] = (a1 - b1) >> ROW_SHIFT; - row[7] = (a0 - b0) >> ROW_SHIFT; -} -#endif - - -/* MMXEXT row IDCT */ - -#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ - c4, c6, c4, c6, \ - c1, c3, -c1, -c5, \ - c5, c7, c3, -c7, \ - c4, -c6, c4, -c6, \ - -c4, c2, c4, -c2, \ - c5, -c1, c3, -c1, \ - c7, c3, c7, -c5 } - -static inline void mmxext_row_head (int16_t * const row, const int offset, - const int16_t * const table) -{ - movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - - movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ - movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - - movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ - movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - - movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ - pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ - - pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ -} - -static inline void mmxext_row (const int16_t * const table, - const int32_t * const rounder) -{ - movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ - pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 
C4*x4+C6*x6 */ - - pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ - pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ - - movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ - pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ - - paddd_m2r (*rounder, mm3); /* mm3 += rounder */ - pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ - - pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ - paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ - - pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ - movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ - - pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ - paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ - - paddd_m2r (*rounder, mm0); /* mm0 += rounder */ - psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ - - psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ - paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ - - paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ - psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ - - paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ - movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ - - paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ - psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ -} - -static inline void mmxext_row_tail (int16_t * const row, const int store) -{ - psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - - psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ - - packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - - packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ - - movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ - pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ - - /* slot */ - - movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ -} - -static inline void mmxext_row_mid (int16_t * const row, const int store, - const int offset, - const int16_t * const table) -{ - movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - - movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ - psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ - - packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - - packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ - movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - - movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ - pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ - - movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ - movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ - - pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ - - movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ - pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ -} - - -/* MMX row IDCT */ - -#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ - c4, c6, -c4, -c2, \ - c1, c3, c3, -c7, \ - c5, c7, -c1, -c5, \ - c4, -c6, c4, -c2, \ - -c4, c2, c4, -c6, \ - c5, -c1, c7, -c5, \ - c7, c3, c3, -c1 } - -static inline void mmx_row_head (int16_t * const row, const int offset, - const int16_t * const table) -{ - movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - - movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ - movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - - movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ - movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - - punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ - - movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ - pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ - - movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ - punpckhdq_r2r 
(mm2, mm2); /* mm2 = x6 x4 x6 x4 */ -} - -static inline void mmx_row (const int16_t * const table, - const int32_t * const rounder) -{ - pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ - punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ - - pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ - punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ - - movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ - pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ - - paddd_m2r (*rounder, mm3); /* mm3 += rounder */ - pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ - - pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ - paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ - - pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ - movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ - - pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ - paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ - - paddd_m2r (*rounder, mm0); /* mm0 += rounder */ - psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ - - psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ - paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ - - paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ - psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ - - paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ - movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ - - paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ - psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ -} - -static inline void mmx_row_tail (int16_t * const row, const int store) -{ - psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - - psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ - - packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - - packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ - - movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ - movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ - - pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ - - psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ - - por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ - - /* slot */ - - movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ -} - -static inline void mmx_row_mid (int16_t * const row, const int store, - const int offset, const int16_t * const table) -{ - movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - - movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ - psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ - - packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - - packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ - movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - - movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ - movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ - - punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ - psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ - - movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ - pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ - - movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ - por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ - - movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ - punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ - - movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ - pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ -} - - -#if 0 -/* C column IDCT - it is just here to document the MMXEXT and MMX versions */ -static inline void idct_col (int16_t * col, int offset) -{ -/* multiplication - as implemented on mmx */ -#define F(c,x) (((c) * (x)) >> 16) - -/* saturation - it helps us 
handle torture test cases */ -#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) - - int16_t x0, x1, x2, x3, x4, x5, x6, x7; - int16_t y0, y1, y2, y3, y4, y5, y6, y7; - int16_t a0, a1, a2, a3, b0, b1, b2, b3; - int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; - - col += offset; - - x0 = col[0*8]; - x1 = col[1*8]; - x2 = col[2*8]; - x3 = col[3*8]; - x4 = col[4*8]; - x5 = col[5*8]; - x6 = col[6*8]; - x7 = col[7*8]; - - u04 = S (x0 + x4); - v04 = S (x0 - x4); - u26 = S (F (T2, x6) + x2); - v26 = S (F (T2, x2) - x6); - - a0 = S (u04 + u26); - a1 = S (v04 + v26); - a2 = S (v04 - v26); - a3 = S (u04 - u26); - - u17 = S (F (T1, x7) + x1); - v17 = S (F (T1, x1) - x7); - u35 = S (F (T3, x5) + x3); - v35 = S (F (T3, x3) - x5); - - b0 = S (u17 + u35); - b3 = S (v17 - v35); - u12 = S (u17 - u35); - v12 = S (v17 + v35); - u12 = S (2 * F (C4, u12)); - v12 = S (2 * F (C4, v12)); - b1 = S (u12 + v12); - b2 = S (u12 - v12); - - y0 = S (a0 + b0) >> COL_SHIFT; - y1 = S (a1 + b1) >> COL_SHIFT; - y2 = S (a2 + b2) >> COL_SHIFT; - y3 = S (a3 + b3) >> COL_SHIFT; - - y4 = S (a3 - b3) >> COL_SHIFT; - y5 = S (a2 - b2) >> COL_SHIFT; - y6 = S (a1 - b1) >> COL_SHIFT; - y7 = S (a0 - b0) >> COL_SHIFT; - - col[0*8] = y0; - col[1*8] = y1; - col[2*8] = y2; - col[3*8] = y3; - col[4*8] = y4; - col[5*8] = y5; - col[6*8] = y6; - col[7*8] = y7; -} -#endif - - -/* MMX column IDCT */ -static inline void idct_col (int16_t * const col, const int offset) -{ -#define T1 13036 -#define T2 27146 -#define T3 43790 -#define C4 23170 - - static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; - static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; - static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; - static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; - - /* column code adapted from Peter Gubanov */ - /* http://www.elecard.com/peter/idct.shtml */ - - movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ - - movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ - movq_r2r (mm0, mm2); /* mm2 = T1 */ - - movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ - pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ - - movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ - pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ - - movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ - movq_r2r (mm5, mm7); /* mm7 = T3-1 */ - - movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ - psubsw_r2r (mm4, mm0); /* mm0 = v17 */ - - movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ - pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ - - paddsw_r2r (mm2, mm1); /* mm1 = u17 */ - pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ - - /* slot */ - - movq_r2r (mm4, mm2); /* mm2 = T2 */ - paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ - - pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ - paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ - - psubsw_r2r (mm6, mm5); /* mm5 = v35 */ - paddsw_r2r (mm3, mm7); /* mm7 = u35 */ - - movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ - movq_r2r (mm0, mm6); /* mm6 = v17 */ - - pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ - psubsw_r2r (mm5, mm0); /* mm0 = b3 */ - - psubsw_r2r (mm3, mm4); /* mm4 = v26 */ - paddsw_r2r (mm6, mm5); /* mm5 = v12 */ - - movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ - movq_r2r (mm1, mm6); /* mm6 = u17 */ - - paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ - paddsw_r2r (mm7, mm6); /* mm6 = b0 */ - - psubsw_r2r (mm7, mm1); /* mm1 = u12 */ - movq_r2r (mm1, mm7); /* mm7 = u12 */ - - movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ - paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ - - movq_m2r (*c4_vector, mm0); /* mm0 = 
C4/2 */ - psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ - - movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ - pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ - - movq_r2r (mm4, mm6); /* mm6 = v26 */ - pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ - - movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ - movq_r2r (mm3, mm0); /* mm0 = x0 */ - - psubsw_r2r (mm5, mm3); /* mm3 = v04 */ - paddsw_r2r (mm5, mm0); /* mm0 = u04 */ - - paddsw_r2r (mm3, mm4); /* mm4 = a1 */ - movq_r2r (mm0, mm5); /* mm5 = u04 */ - - psubsw_r2r (mm6, mm3); /* mm3 = a2 */ - paddsw_r2r (mm2, mm5); /* mm5 = a0 */ - - paddsw_r2r (mm1, mm1); /* mm1 = b1 */ - psubsw_r2r (mm2, mm0); /* mm0 = a3 */ - - paddsw_r2r (mm7, mm7); /* mm7 = b2 */ - movq_r2r (mm3, mm2); /* mm2 = a2 */ - - movq_r2r (mm4, mm6); /* mm6 = a1 */ - paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ - - psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ - paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ - - psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ - psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ - - movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ - psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ - - psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ - movq_r2r (mm5, mm7); /* mm7 = a0 */ - - movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ - psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ - - movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ - paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ - - movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ - psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ - - psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ - movq_r2r (mm0, mm3); /* mm3 = a3 */ - - movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ - psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ - - psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ - paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ - - movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ - psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ - - movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ - psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ - - movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ - - movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ - - movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ - -#undef T1 -#undef T2 -#undef T3 -#undef C4 -} - - -static const int32_t rounder0[] ATTR_ALIGN(8) = - rounder ((1 << (COL_SHIFT - 1)) - 0.5); -static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); -static const int32_t rounder1[] ATTR_ALIGN(8) = - rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ -static const int32_t rounder7[] ATTR_ALIGN(8) = - rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ -static const int32_t rounder2[] ATTR_ALIGN(8) = - rounder (0.60355339059); /* C2 * (C6+C2)/2 */ -static const int32_t rounder6[] ATTR_ALIGN(8) = - rounder (-0.25); /* C2 * (C6-C2)/2 */ -static const int32_t rounder3[] ATTR_ALIGN(8) = - rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ -static const int32_t rounder5[] ATTR_ALIGN(8) = - rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ - -#undef COL_SHIFT -#undef ROW_SHIFT - -#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ -void idct (int16_t * const block) \ -{ \ - static const int16_t table04[] ATTR_ALIGN(16) = \ - table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ - static const int16_t table17[] ATTR_ALIGN(16) = \ - table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ - static const int16_t table26[] ATTR_ALIGN(16) = \ - table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ - static const int16_t table35[] ATTR_ALIGN(16) = \ - table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ - \ - idct_row_head (block, 0*8, 
table04); \ - idct_row (table04, rounder0); \ - idct_row_mid (block, 0*8, 4*8, table04); \ - idct_row (table04, rounder4); \ - idct_row_mid (block, 4*8, 1*8, table17); \ - idct_row (table17, rounder1); \ - idct_row_mid (block, 1*8, 7*8, table17); \ - idct_row (table17, rounder7); \ - idct_row_mid (block, 7*8, 2*8, table26); \ - idct_row (table26, rounder2); \ - idct_row_mid (block, 2*8, 6*8, table26); \ - idct_row (table26, rounder6); \ - idct_row_mid (block, 6*8, 3*8, table35); \ - idct_row (table35, rounder3); \ - idct_row_mid (block, 3*8, 5*8, table35); \ - idct_row (table35, rounder5); \ - idct_row_tail (block, 5*8); \ - \ - idct_col (block, 0); \ - idct_col (block, 4); \ -} - -void ff_mmx_idct(DCTELEM *block); -void ff_mmxext_idct(DCTELEM *block); - -declare_idct (ff_mmxext_idct, mmxext_table, - mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) - -declare_idct (ff_mmx_idct, mmx_table, - mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) -
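
[Editor's note, not part of the changeset] On the column-pass constants in idct_col() just above: with F(c,x) = (c*x) >> 16 as defined in the #if 0 reference, the magic numbers appear to be fixed-point tangents/cosine of multiples of pi/16 -- the same values the Xvid IDCT in the next file labels tg_1_16, tg_2_16, tg_3_16. Restated as math (an interpretation, not text from the moved file):

\[
T_1 = 13036 \approx 2^{16}\tan\tfrac{\pi}{16},\quad
T_2 = 27146 \approx 2^{16}\tan\tfrac{2\pi}{16},\quad
T_3 = 43790 \approx 2^{16}\tan\tfrac{3\pi}{16},\quad
C_4 = 23170 \approx 2^{15}\cos\tfrac{\pi}{4},
\]
\[
u_{17} = x_1 + F(T_1, x_7),\qquad
v_{17} = F(T_1, x_1) - x_7,
\]
with u_{35}, v_{35} built analogously from T_3, and b_1, b_2 obtained by passing (u_{17}-u_{35}) and (v_{17}+v_{35}) through 2\,F(C_4,\cdot), exactly as in the reference code.
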
--- a/i386/idct_mmx_xvid.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,525 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - MMX and XMM forward discrete cosine transform - - * - * Copyright(C) 2001 Peter Ross <pross@xvid.org> - * - * Originally provided by Intel at AP-922 - * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm - * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm) - * but in a limited edition. - * New macro implements a column part for precise iDCT - * The routine precision now satisfies IEEE standard 1180-1990. - * - * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru> - * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org> - * - * http://www.elecard.com/peter/idct.html - * http://www.linuxvideo.org/mpeg2dec/ - * - * These examples contain code fragments for first stage iDCT 8x8 - * (for rows) and first stage DCT 8x8 (for columns) - * - * conversion to gcc syntax by Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with FFmpeg; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <inttypes.h> -#include "libavcodec/avcodec.h" - -//============================================================================= -// Macros and other preprocessor constants -//============================================================================= - -#define BITS_INV_ACC 5 // 4 or 5 for IEEE -#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11 -#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6 -#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) -#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) -#define RND_INV_CORR (RND_INV_COL - 1) - -#define BITS_FRW_ACC 3 // 2 or 3 for accuracy -#define SHIFT_FRW_COL BITS_FRW_ACC -#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) -#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1)) - - -//----------------------------------------------------------------------------- -// Various memory constants (trigonometric values or rounding values) -//----------------------------------------------------------------------------- - - -DECLARE_ALIGNED(8, static const int16_t, tg_1_16[4*4]) = { - 13036,13036,13036,13036, // tg * (2<<16) + 0.5 - 27146,27146,27146,27146, // tg * (2<<16) + 0.5 - -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 - 23170,23170,23170,23170}; // cos * (2<<15) + 0.5 - -DECLARE_ALIGNED(8, static const int32_t, rounder_0[2*8]) = { - 65536,65536, - 3597,3597, - 2260,2260, - 1203,1203, - 0,0, - 120,120, - 512,512, - 512,512}; - -//----------------------------------------------------------------------------- -// -// The first stage iDCT 8x8 - inverse DCTs of rows -// -//----------------------------------------------------------------------------- -// The 8-point inverse DCT direct algorithm -//----------------------------------------------------------------------------- -// -// static const short 
w[32] = { -// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16), -// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16), -// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16), -// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16), -// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16), -// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16), -// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16), -// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) }; -// -// #define DCT_8_INV_ROW(x, y) -// { -// int a0, a1, a2, a3, b0, b1, b2, b3; -// -// a0 =x[0]*w[0]+x[2]*w[1]+x[4]*w[2]+x[6]*w[3]; -// a1 =x[0]*w[4]+x[2]*w[5]+x[4]*w[6]+x[6]*w[7]; -// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11]; -// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15]; -// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19]; -// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23]; -// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27]; -// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31]; -// -// y[0] = SHIFT_ROUND ( a0 + b0 ); -// y[1] = SHIFT_ROUND ( a1 + b1 ); -// y[2] = SHIFT_ROUND ( a2 + b2 ); -// y[3] = SHIFT_ROUND ( a3 + b3 ); -// y[4] = SHIFT_ROUND ( a3 - b3 ); -// y[5] = SHIFT_ROUND ( a2 - b2 ); -// y[6] = SHIFT_ROUND ( a1 - b1 ); -// y[7] = SHIFT_ROUND ( a0 - b0 ); -// } -// -//----------------------------------------------------------------------------- -// -// In this implementation the outputs of the iDCT-1D are multiplied -// for rows 0,4 - by cos_4_16, -// for rows 1,7 - by cos_1_16, -// for rows 2,6 - by cos_2_16, -// for rows 3,5 - by cos_3_16 -// and are shifted to the left for better accuracy -// -// For the constants used, -// FIX(float_const) = (short) (float_const * (1<<15) + 0.5) -// -//----------------------------------------------------------------------------- - -//----------------------------------------------------------------------------- -// Tables for mmx processors -//----------------------------------------------------------------------------- - -// Table for rows 0,4 - constants are multiplied by cos_4_16 -DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx[32*4]) = { - 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 - 21407,8867,8867,-21407, // w07 w05 w03 w01 - 16384,-16384,16384,16384, // w14 w12 w10 w08 - -8867,21407,-21407,-8867, // w15 w13 w11 w09 - 22725,12873,19266,-22725, // w22 w20 w18 w16 - 19266,4520,-4520,-12873, // w23 w21 w19 w17 - 12873,4520,4520,19266, // w30 w28 w26 w24 - -22725,19266,-12873,-22725, // w31 w29 w27 w25 -// Table for rows 1,7 - constants are multiplied by cos_1_16 - 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00 - 29692,12299,12299,-29692, // w07 w05 w03 w01 - 22725,-22725,22725,22725, // w14 w12 w10 w08 - -12299,29692,-29692,-12299, // w15 w13 w11 w09 - 31521,17855,26722,-31521, // w22 w20 w18 w16 - 26722,6270,-6270,-17855, // w23 w21 w19 w17 - 17855,6270,6270,26722, // w30 w28 w26 w24 - -31521,26722,-17855,-31521, // w31 w29 w27 w25 -// Table for rows 2,6 - constants are multiplied by cos_2_16 - 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00 - 27969,11585,11585,-27969, // w07 w05 w03 w01 - 21407,-21407,21407,21407, // w14 w12 w10 w08 - -11585,27969,-27969,-11585, // w15 w13 w11 w09 - 29692,16819,25172,-29692, // w22 w20 w18 w16 - 25172,5906,-5906,-16819, // w23 w21 w19 w17 - 16819,5906,5906,25172, // w30 w28 w26 w24 - -29692,25172,-16819,-29692, // w31 w29 w27 w25 -// Table for rows 
3,5 - constants are multiplied by cos_3_16 - 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00 - 25172,10426,10426,-25172, // w07 w05 w03 w01 - 19266,-19266,19266,19266, // w14 w12 w10 w08 - -10426,25172,-25172,-10426, // w15 w13 w11 w09 - 26722,15137,22654,-26722, // w22 w20 w18 w16 - 22654,5315,-5315,-15137, // w23 w21 w19 w17 - 15137,5315,5315,22654, // w30 w28 w26 w24 - -26722,22654,-15137,-26722, // w31 w29 w27 w25 -}; -//----------------------------------------------------------------------------- -// Tables for xmm processors -//----------------------------------------------------------------------------- - -// %3 for rows 0,4 - constants are multiplied by cos_4_16 -DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm[32*4]) = { - 16384,21407,16384,8867, // movq-> w05 w04 w01 w00 - 16384,8867,-16384,-21407, // w07 w06 w03 w02 - 16384,-8867,16384,-21407, // w13 w12 w09 w08 - -16384,21407,16384,-8867, // w15 w14 w11 w10 - 22725,19266,19266,-4520, // w21 w20 w17 w16 - 12873,4520,-22725,-12873, // w23 w22 w19 w18 - 12873,-22725,4520,-12873, // w29 w28 w25 w24 - 4520,19266,19266,-22725, // w31 w30 w27 w26 -// %3 for rows 1,7 - constants are multiplied by cos_1_16 - 22725,29692,22725,12299, // movq-> w05 w04 w01 w00 - 22725,12299,-22725,-29692, // w07 w06 w03 w02 - 22725,-12299,22725,-29692, // w13 w12 w09 w08 - -22725,29692,22725,-12299, // w15 w14 w11 w10 - 31521,26722,26722,-6270, // w21 w20 w17 w16 - 17855,6270,-31521,-17855, // w23 w22 w19 w18 - 17855,-31521,6270,-17855, // w29 w28 w25 w24 - 6270,26722,26722,-31521, // w31 w30 w27 w26 -// %3 for rows 2,6 - constants are multiplied by cos_2_16 - 21407,27969,21407,11585, // movq-> w05 w04 w01 w00 - 21407,11585,-21407,-27969, // w07 w06 w03 w02 - 21407,-11585,21407,-27969, // w13 w12 w09 w08 - -21407,27969,21407,-11585, // w15 w14 w11 w10 - 29692,25172,25172,-5906, // w21 w20 w17 w16 - 16819,5906,-29692,-16819, // w23 w22 w19 w18 - 16819,-29692,5906,-16819, // w29 w28 w25 w24 - 5906,25172,25172,-29692, // w31 w30 w27 w26 -// %3 for rows 3,5 - constants are multiplied by cos_3_16 - 19266,25172,19266,10426, // movq-> w05 w04 w01 w00 - 19266,10426,-19266,-25172, // w07 w06 w03 w02 - 19266,-10426,19266,-25172, // w13 w12 w09 w08 - -19266,25172,19266,-10426, // w15 w14 w11 w10 - 26722,22654,22654,-5315, // w21 w20 w17 w16 - 15137,5315,-26722,-15137, // w23 w22 w19 w18 - 15137,-26722,5315,-15137, // w29 w28 w25 w24 - 5315,22654,22654,-26722, // w31 w30 w27 w26 -}; -//============================================================================= -// Helper macros for the code -//============================================================================= - -//----------------------------------------------------------------------------- -// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER -//----------------------------------------------------------------------------- - -#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\ - "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ - "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ - "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ - "movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\ - "punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\ - "movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\ - "punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\ - "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\ - "punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\ - "pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\ - "movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\ - "movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\ - "punpckldq %%mm2,%%mm2 \n\t"/* 
x6 x2 x6 x2*/\ - "pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\ - "punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\ - "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\ - "punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\ - "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\ - "pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\ - "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ - "pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\ - "pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\ - "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ - "pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\ - "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ - "pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\ - "paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ - "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ - "psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\ - "psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\ - "paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\ - "paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\ - "psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\ - "paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\ - "movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\ - "paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\ - "psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\ - "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ - "psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\ - "packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\ - "packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\ - "movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\ - "psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\ - "pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\ - "movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\ - "por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\ - "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ - - -//----------------------------------------------------------------------------- -// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER -//----------------------------------------------------------------------------- - -#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\ - "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ - "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ - "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ - "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\ - "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\ - "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\ - "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\ - "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ - "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\ - "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\ - "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ - "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\ - "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\ - "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ - "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\ - "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ - "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ - "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ - "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ - "pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\ - "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ - "pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\ - "paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ - "pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\ - "paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\ - "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ - "psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\ - "paddd %%mm1,%%mm0 \n\t"/* 1 ; 
a3=sum(even3) a2=sum(even2)*/\ - "psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\ - "movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\ - "paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\ - "paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\ - "psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\ - "psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\ - "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ - "psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\ - "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\ - "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\ - "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ - "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\ - "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ - - -//----------------------------------------------------------------------------- -// -// The first stage DCT 8x8 - forward DCTs of columns -// -// The %2puts are multiplied -// for rows 0,4 - on cos_4_16, -// for rows 1,7 - on cos_1_16, -// for rows 2,6 - on cos_2_16, -// for rows 3,5 - on cos_3_16 -// and are shifted to the left for rise of accuracy -// -//----------------------------------------------------------------------------- -// -// The 8-point scaled forward DCT algorithm (26a8m) -// -//----------------------------------------------------------------------------- -// -// #define DCT_8_FRW_COL(x, y) -//{ -// short t0, t1, t2, t3, t4, t5, t6, t7; -// short tp03, tm03, tp12, tm12, tp65, tm65; -// short tp465, tm465, tp765, tm765; -// -// t0 = LEFT_SHIFT ( x[0] + x[7] ); -// t1 = LEFT_SHIFT ( x[1] + x[6] ); -// t2 = LEFT_SHIFT ( x[2] + x[5] ); -// t3 = LEFT_SHIFT ( x[3] + x[4] ); -// t4 = LEFT_SHIFT ( x[3] - x[4] ); -// t5 = LEFT_SHIFT ( x[2] - x[5] ); -// t6 = LEFT_SHIFT ( x[1] - x[6] ); -// t7 = LEFT_SHIFT ( x[0] - x[7] ); -// -// tp03 = t0 + t3; -// tm03 = t0 - t3; -// tp12 = t1 + t2; -// tm12 = t1 - t2; -// -// y[0] = tp03 + tp12; -// y[4] = tp03 - tp12; -// -// y[2] = tm03 + tm12 * tg_2_16; -// y[6] = tm03 * tg_2_16 - tm12; -// -// tp65 =(t6 +t5 )*cos_4_16; -// tm65 =(t6 -t5 )*cos_4_16; -// -// tp765 = t7 + tp65; -// tm765 = t7 - tp65; -// tp465 = t4 + tm65; -// tm465 = t4 - tm65; -// -// y[1] = tp765 + tp465 * tg_1_16; -// y[7] = tp765 * tg_1_16 - tp465; -// y[5] = tm765 * tg_3_16 + tm465; -// y[3] = tm765 - tm465 * tg_3_16; -//} -// -//----------------------------------------------------------------------------- - -//----------------------------------------------------------------------------- -// DCT_8_INV_COL_4 INP,OUT -//----------------------------------------------------------------------------- - -#define DCT_8_INV_COL(A1,A2)\ - "movq 2*8(%3),%%mm0\n\t"\ - "movq 16*3+" #A1 ",%%mm3\n\t"\ - "movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\ - "movq 16*5+" #A1 ",%%mm5\n\t"\ - "pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\ - "movq (%3),%%mm4\n\t"\ - "pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\ - "movq 16*7+" #A1 ",%%mm7\n\t"\ - "movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\ - "movq 16*1+" #A1 ",%%mm6\n\t"\ - "pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\ - "paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\ - "pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\ - "paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\ - "psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\ - "movq 3*8(%3),%%mm3\n\t"\ - "paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\ - "paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\ - "psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\ - "movq %%mm4,%%mm5 \n\t"/* tp17*/\ - "movq %%mm2,%%mm6 \n\t"/* tm17*/\ - "paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\ - "psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\ - "psubsw %%mm1,%%mm4 \n\t"/* 
tp17-tp35 = t1*/\ - "paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\ - "movq 1*8(%3),%%mm7\n\t"\ - "movq %%mm4,%%mm1 \n\t"/* t1*/\ - "movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\ - "paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\ - "movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\ - "psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\ - "movq 2*16+" #A1 ",%%mm5\n\t"\ - "movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\ - "movq 6*16+" #A1 ",%%mm6\n\t"\ - "pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\ - "pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\ - "pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\ - "movq 0*16+" #A1 ",%%mm2\n\t"\ - "pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\ - "psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\ - "movq %%mm2,%%mm3 \n\t"/* x0*/\ - "movq 4*16+" #A1 ",%%mm6\n\t"\ - "paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\ - "paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\ - "psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\ - "movq %%mm2,%%mm5 \n\t"/* tp04*/\ - "movq %%mm3,%%mm6 \n\t"/* tm04*/\ - "psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\ - "paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\ - "paddsw %%mm1,%%mm1 \n\t"/* b1*/\ - "paddsw %%mm4,%%mm4 \n\t"/* b2*/\ - "paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\ - "psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\ - "movq %%mm3,%%mm7 \n\t"/* a1*/\ - "movq %%mm6,%%mm0 \n\t"/* a2*/\ - "paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\ - "paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\ - "psraw $6,%%mm3 \n\t"/* dst1*/\ - "psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\ - "psraw $6,%%mm6 \n\t"/* dst2*/\ - "psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\ - "movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\ - "psraw $6,%%mm7 \n\t"/* dst6*/\ - "movq %%mm5,%%mm4 \n\t"/* a0*/\ - "psraw $6,%%mm0 \n\t"/* dst5*/\ - "movq %%mm3,1*16+" #A2 "\n\t"\ - "paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\ - "movq %%mm6,2*16+" #A2 "\n\t"\ - "psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\ - "movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\ - "psraw $6,%%mm5 \n\t"/* dst0*/\ - "movq %%mm2,%%mm6 \n\t"/* a3*/\ - "psraw $6,%%mm4 \n\t"/* dst7*/\ - "movq %%mm0,5*16+" #A2 "\n\t"\ - "paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\ - "movq %%mm7,6*16+" #A2 "\n\t"\ - "psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\ - "movq %%mm5,0*16+" #A2 "\n\t"\ - "psraw $6,%%mm2 \n\t"/* dst3*/\ - "movq %%mm4,7*16+" #A2 "\n\t"\ - "psraw $6,%%mm6 \n\t"/* dst4*/\ - "movq %%mm2,3*16+" #A2 "\n\t"\ - "movq %%mm6,4*16+" #A2 "\n\t" - -//============================================================================= -// Code -//============================================================================= - -//----------------------------------------------------------------------------- -// void idct_mmx(uint16_t block[64]); -//----------------------------------------------------------------------------- - - -void ff_idct_xvid_mmx(short *block){ -__asm__ volatile( - //# Process each row - DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) - DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) - DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) - DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) - DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) - DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) - DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) - DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) - - //# Process the columns (4 at a time) - DCT_8_INV_COL(0(%0), 0(%0)) - DCT_8_INV_COL(8(%0), 8(%0)) - :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16)); -} - -//----------------------------------------------------------------------------- -// void idct_xmm(uint16_t block[64]); 
-//----------------------------------------------------------------------------- - - -void ff_idct_xvid_mmx2(short *block){ -__asm__ volatile( - //# Process each row - DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) - DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) - DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) - DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) - DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) - DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) - DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) - DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) - - //# Process the columns (4 at a time) - DCT_8_INV_COL(0(%0), 0(%0)) - DCT_8_INV_COL(8(%0), 8(%0)) - :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16)); -} -
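Editor's note: the two row macros removed above (DCT_8_INV_ROW_MMX / DCT_8_INV_ROW_XMM) compute, per 8-coefficient row, four even-part sums a0..a3 and four odd-part sums b0..b3 with pmaddwd against a 32-entry per-row weight table, then emit y0..y3 = (a+b+round)>>11 and y7..y4 = (a-b+round)>>11 before packing with saturation. The scalar sketch below mirrors that arithmetic under an assumed, non-interleaved weight layout (we/wo are hypothetical helpers; the real tables interleave weights so one pmaddwd forms two dot products at once).

#include <stdint.h>

/* Scalar sketch of one DCT_8_INV_ROW pass (assumed plain weight layout). */
static void idct_row_sketch(int16_t x[8], const int16_t we[16],
                            const int16_t wo[16], int32_t rounder)
{
    int32_t y[8];
    for (int i = 0; i < 4; i++) {
        int32_t a = rounder, b = 0;
        for (int j = 0; j < 4; j++) {
            a += we[4 * i + j] * x[2 * j];     /* even inputs x0,x2,x4,x6 */
            b += wo[4 * i + j] * x[2 * j + 1]; /* odd inputs  x1,x3,x5,x7 */
        }
        y[i]     = (a + b) >> 11;  /* y0..y3 = a+b, as in the asm comments */
        y[7 - i] = (a - b) >> 11;  /* y7..y4 = a-b */
    }
    for (int i = 0; i < 8; i++)
        x[i] = (int16_t)y[i];      /* the MMX code packs with signed saturation */
}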
--- a/i386/idct_sse2_xvid.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,394 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - SSE2 inverse discrete cosine transform - - * - * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net> - * - * Conversion to gcc syntax with modifications - * by Alexander Strange <astrange@ithinksw.com> - * - * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. - * - * This file is part of FFmpeg. - * - * Vertical pass is an implementation of the scheme: - * Loeffler C., Ligtenberg A., and Moschytz C.S.: - * Practical Fast 1D DCT Algorithm with Eleven Multiplications, - * Proc. ICASSP 1989, 988-991. - * - * Horizontal pass is a double 4x4 vector/matrix multiplication, - * (see also Intel's Application Note 922: - * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm - * Copyright (C) 1999 Intel Corporation) - * - * More details at http://skal.planet-d.net/coding/dct.html - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with FFmpeg; if not, write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/dsputil.h" -#include "idct_xvid.h" - -/*! - * @file idct_sse2_xvid.c - * @brief SSE2 idct compatible with xvidmmx - */ - -#define X8(x) x,x,x,x,x,x,x,x - -#define ROW_SHIFT 11 -#define COL_SHIFT 6 - -DECLARE_ASM_CONST(16, int16_t, tan1[]) = {X8(13036)}; // tan( pi/16) -DECLARE_ASM_CONST(16, int16_t, tan2[]) = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1 -DECLARE_ASM_CONST(16, int16_t, tan3[]) = {X8(43790)}; // tan(3pi/16)-1 -DECLARE_ASM_CONST(16, int16_t, sqrt2[])= {X8(23170)}; // 0.5/sqrt(2) -DECLARE_ASM_CONST(8, uint8_t, m127[]) = {X8(127)}; - -DECLARE_ASM_CONST(16, int16_t, iTab1[]) = { - 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, - 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, - 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, - 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b -}; - -DECLARE_ASM_CONST(16, int16_t, iTab2[]) = { - 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, - 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, - 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, - 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df -}; - -DECLARE_ASM_CONST(16, int16_t, iTab3[]) = { - 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, - 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, - 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, - 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 -}; - -DECLARE_ASM_CONST(16, int16_t, iTab4[]) = { - 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746, - 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, - 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, - 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e -}; - -DECLARE_ASM_CONST(16, 
int32_t, walkenIdctRounders[]) = { - 65536, 65536, 65536, 65536, - 3597, 3597, 3597, 3597, - 2260, 2260, 2260, 2260, - 1203, 1203, 1203, 1203, - 120, 120, 120, 120, - 512, 512, 512, 512 -}; - -// Temporary storage before the column pass -#define ROW1 "%%xmm6" -#define ROW3 "%%xmm4" -#define ROW5 "%%xmm5" -#define ROW7 "%%xmm7" - -#define CLEAR_ODD(r) "pxor "r","r" \n\t" -#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t" - -#ifdef ARCH_X86_64 - -# define ROW0 "%%xmm8" -# define REG0 ROW0 -# define ROW2 "%%xmm9" -# define REG2 ROW2 -# define ROW4 "%%xmm10" -# define REG4 ROW4 -# define ROW6 "%%xmm11" -# define REG6 ROW6 -# define CLEAR_EVEN(r) CLEAR_ODD(r) -# define PUT_EVEN(dst) PUT_ODD(dst) -# define XMMS "%%xmm12" -# define MOV_32_ONLY "#" -# define SREG2 REG2 -# define TAN3 "%%xmm13" -# define TAN1 "%%xmm14" - -#else - -# define ROW0 "(%0)" -# define REG0 "%%xmm4" -# define ROW2 "2*16(%0)" -# define REG2 "%%xmm4" -# define ROW4 "4*16(%0)" -# define REG4 "%%xmm6" -# define ROW6 "6*16(%0)" -# define REG6 "%%xmm6" -# define CLEAR_EVEN(r) -# define PUT_EVEN(dst) \ - "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \ - "movdqa %%xmm2, "dst" \n\t" -# define XMMS "%%xmm2" -# define MOV_32_ONLY "movdqa " -# define SREG2 "%%xmm7" -# define TAN3 "%%xmm0" -# define TAN1 "%%xmm2" - -#endif - -#define ROUND(x) "paddd "MANGLE(x) - -#define JZ(reg, to) \ - "testl "reg","reg" \n\t" \ - "jz "to" \n\t" - -#define JNZ(reg, to) \ - "testl "reg","reg" \n\t" \ - "jnz "to" \n\t" - -#define TEST_ONE_ROW(src, reg, clear) \ - clear \ - "movq "src", %%mm1 \n\t" \ - "por 8+"src", %%mm1 \n\t" \ - "paddusb %%mm0, %%mm1 \n\t" \ - "pmovmskb %%mm1, "reg" \n\t" - -#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ - clear1 \ - clear2 \ - "movq "row1", %%mm1 \n\t" \ - "por 8+"row1", %%mm1 \n\t" \ - "movq "row2", %%mm2 \n\t" \ - "por 8+"row2", %%mm2 \n\t" \ - "paddusb %%mm0, %%mm1 \n\t" \ - "paddusb %%mm0, %%mm2 \n\t" \ - "pmovmskb %%mm1, "reg1" \n\t" \ - "pmovmskb %%mm2, "reg2" \n\t" - -///IDCT pass on rows. -#define iMTX_MULT(src, table, rounder, put) \ - "movdqa "src", %%xmm3 \n\t" \ - "movdqa %%xmm3, %%xmm0 \n\t" \ - "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \ - "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \ - "pmaddwd "table", %%xmm0 \n\t" \ - "pmaddwd 16+"table", %%xmm1 \n\t" \ - "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \ - "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \ - "pmaddwd 32+"table", %%xmm2 \n\t" \ - "pmaddwd 48+"table", %%xmm3 \n\t" \ - "paddd %%xmm1, %%xmm0 \n\t" \ - "paddd %%xmm3, %%xmm2 \n\t" \ - rounder", %%xmm0 \n\t" \ - "movdqa %%xmm2, %%xmm3 \n\t" \ - "paddd %%xmm0, %%xmm2 \n\t" \ - "psubd %%xmm3, %%xmm0 \n\t" \ - "psrad $11, %%xmm2 \n\t" \ - "psrad $11, %%xmm0 \n\t" \ - "packssdw %%xmm0, %%xmm2 \n\t" \ - put \ - "1: \n\t" - -#define iLLM_HEAD \ - "movdqa "MANGLE(tan3)", "TAN3" \n\t" \ - "movdqa "MANGLE(tan1)", "TAN1" \n\t" \ - -///IDCT pass on columns. 
-#define iLLM_PASS(dct) \ - "movdqa "TAN3", %%xmm1 \n\t" \ - "movdqa "TAN1", %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "pmulhw %%xmm5, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN3" \n\t" \ - "paddsw %%xmm5, %%xmm1 \n\t" \ - "psubsw %%xmm5, "TAN3" \n\t" \ - "paddsw %%xmm4, %%xmm1 \n\t" \ - "pmulhw %%xmm7, %%xmm3 \n\t" \ - "pmulhw %%xmm6, "TAN1" \n\t" \ - "paddsw %%xmm6, %%xmm3 \n\t" \ - "psubsw %%xmm7, "TAN1" \n\t" \ - "movdqa %%xmm3, %%xmm7 \n\t" \ - "movdqa "TAN1", %%xmm6 \n\t" \ - "psubsw %%xmm1, %%xmm3 \n\t" \ - "psubsw "TAN3", "TAN1" \n\t" \ - "paddsw %%xmm7, %%xmm1 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa %%xmm3, %%xmm6 \n\t" \ - "psubsw "TAN3", %%xmm3 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ - "pmulhw %%xmm4, %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw "TAN3", "TAN3" \n\t" \ - "paddsw %%xmm3, %%xmm3 \n\t" \ - "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \ - MOV_32_ONLY ROW2", "REG2" \n\t" \ - MOV_32_ONLY ROW6", "REG6" \n\t" \ - "movdqa %%xmm7, %%xmm5 \n\t" \ - "pmulhw "REG6", %%xmm7 \n\t" \ - "pmulhw "REG2", %%xmm5 \n\t" \ - "paddsw "REG2", %%xmm7 \n\t" \ - "psubsw "REG6", %%xmm5 \n\t" \ - MOV_32_ONLY ROW0", "REG0" \n\t" \ - MOV_32_ONLY ROW4", "REG4" \n\t" \ - MOV_32_ONLY" "TAN1", (%0) \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw "REG4", "REG0" \n\t" \ - "paddsw "XMMS", "REG4" \n\t" \ - "movdqa "REG4", "XMMS" \n\t" \ - "psubsw %%xmm7, "REG4" \n\t" \ - "paddsw "XMMS", %%xmm7 \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm5, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm5 \n\t" \ - "movdqa %%xmm5, "XMMS" \n\t" \ - "psubsw "TAN3", %%xmm5 \n\t" \ - "paddsw "XMMS", "TAN3" \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm3, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm3 \n\t" \ - MOV_32_ONLY" (%0), "TAN1" \n\t" \ - "psraw $6, %%xmm5 \n\t" \ - "psraw $6, "REG0" \n\t" \ - "psraw $6, "TAN3" \n\t" \ - "psraw $6, %%xmm3 \n\t" \ - "movdqa "TAN3", 1*16("dct") \n\t" \ - "movdqa %%xmm3, 2*16("dct") \n\t" \ - "movdqa "REG0", 5*16("dct") \n\t" \ - "movdqa %%xmm5, 6*16("dct") \n\t" \ - "movdqa %%xmm7, %%xmm0 \n\t" \ - "movdqa "REG4", %%xmm4 \n\t" \ - "psubsw %%xmm1, %%xmm7 \n\t" \ - "psubsw "TAN1", "REG4" \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN1" \n\t" \ - "psraw $6, %%xmm1 \n\t" \ - "psraw $6, %%xmm7 \n\t" \ - "psraw $6, "TAN1" \n\t" \ - "psraw $6, "REG4" \n\t" \ - "movdqa %%xmm1, ("dct") \n\t" \ - "movdqa "TAN1", 3*16("dct") \n\t" \ - "movdqa "REG4", 4*16("dct") \n\t" \ - "movdqa %%xmm7, 7*16("dct") \n\t" - -///IDCT pass on columns, assuming rows 4-7 are zero. 
-#define iLLM_PASS_SPARSE(dct) \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw %%xmm4, "TAN3" \n\t" \ - "movdqa %%xmm6, %%xmm3 \n\t" \ - "pmulhw %%xmm6, "TAN1" \n\t" \ - "movdqa %%xmm4, %%xmm1 \n\t" \ - "psubsw %%xmm1, %%xmm3 \n\t" \ - "paddsw %%xmm6, %%xmm1 \n\t" \ - "movdqa "TAN1", %%xmm6 \n\t" \ - "psubsw "TAN3", "TAN1" \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa %%xmm3, %%xmm6 \n\t" \ - "psubsw "TAN3", %%xmm3 \n\t" \ - "paddsw %%xmm6, "TAN3" \n\t" \ - "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ - "pmulhw %%xmm4, %%xmm3 \n\t" \ - "pmulhw %%xmm4, "TAN3" \n\t" \ - "paddsw "TAN3", "TAN3" \n\t" \ - "paddsw %%xmm3, %%xmm3 \n\t" \ - "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \ - MOV_32_ONLY ROW2", "SREG2" \n\t" \ - "pmulhw "SREG2", %%xmm5 \n\t" \ - MOV_32_ONLY ROW0", "REG0" \n\t" \ - "movdqa "REG0", %%xmm6 \n\t" \ - "psubsw "SREG2", %%xmm6 \n\t" \ - "paddsw "REG0", "SREG2" \n\t" \ - MOV_32_ONLY" "TAN1", (%0) \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm5, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm5 \n\t" \ - "movdqa %%xmm5, "XMMS" \n\t" \ - "psubsw "TAN3", %%xmm5 \n\t" \ - "paddsw "XMMS", "TAN3" \n\t" \ - "movdqa "REG0", "XMMS" \n\t" \ - "psubsw %%xmm3, "REG0" \n\t" \ - "paddsw "XMMS", %%xmm3 \n\t" \ - MOV_32_ONLY" (%0), "TAN1" \n\t" \ - "psraw $6, %%xmm5 \n\t" \ - "psraw $6, "REG0" \n\t" \ - "psraw $6, "TAN3" \n\t" \ - "psraw $6, %%xmm3 \n\t" \ - "movdqa "TAN3", 1*16("dct") \n\t" \ - "movdqa %%xmm3, 2*16("dct") \n\t" \ - "movdqa "REG0", 5*16("dct") \n\t" \ - "movdqa %%xmm5, 6*16("dct") \n\t" \ - "movdqa "SREG2", %%xmm0 \n\t" \ - "movdqa %%xmm6, %%xmm4 \n\t" \ - "psubsw %%xmm1, "SREG2" \n\t" \ - "psubsw "TAN1", %%xmm6 \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "paddsw %%xmm4, "TAN1" \n\t" \ - "psraw $6, %%xmm1 \n\t" \ - "psraw $6, "SREG2" \n\t" \ - "psraw $6, "TAN1" \n\t" \ - "psraw $6, %%xmm6 \n\t" \ - "movdqa %%xmm1, ("dct") \n\t" \ - "movdqa "TAN1", 3*16("dct") \n\t" \ - "movdqa %%xmm6, 4*16("dct") \n\t" \ - "movdqa "SREG2", 7*16("dct") \n\t" - -inline void ff_idct_xvid_sse2(short *block) -{ - __asm__ volatile( - "movq "MANGLE(m127)", %%mm0 \n\t" - iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) - iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) - iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) - - TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) - JZ("%%eax", "1f") - iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) - - TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) - TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) - iLLM_HEAD - ASMALIGN(4) - JNZ("%%ecx", "2f") - JNZ("%%eax", "3f") - JNZ("%%edx", "4f") - JNZ("%%esi", "5f") - iLLM_PASS_SPARSE("%0") - "jmp 6f \n\t" - "2: \n\t" - iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) - "3: \n\t" - iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) - JZ("%%edx", "1f") - "4: \n\t" - iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) - JZ("%%esi", "1f") - "5: \n\t" - iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) -#ifndef ARCH_X86_64 - iLLM_HEAD -#endif - iLLM_PASS("%0") - "6: \n\t" - : "+r"(block) - : - : "%eax", "%ecx", "%edx", "%esi", "memory"); -} - -void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) -{ - ff_idct_xvid_sse2(block); - put_pixels_clamped_mmx(block, dest, line_size); -} - -void 
ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) -{ - ff_idct_xvid_sse2(block); - add_pixels_clamped_mmx(block, dest, line_size); -}
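Editor's note: the SSE2 variant removed above first tests each row for being all zero (TEST_ONE_ROW / TEST_TWO_ROWS: por the two halves, paddusb against m127, pmovmskb) and skips the row transform for empty rows; when rows 4-7 are all zero it also switches to the reduced column pass iLLM_PASS_SPARSE. A minimal scalar sketch of that dispatch decision only (function names here are illustrative, not from the source):

#include <stdint.h>

static int row_is_nonzero(const int16_t *row)
{
    for (int i = 0; i < 8; i++)
        if (row[i])
            return 1;
    return 0;
}

/* Returns 1 when the full column pass (iLLM_PASS) is needed, 0 when the
 * reduced variant (iLLM_PASS_SPARSE, rows 4-7 all zero) is sufficient. */
static int needs_full_column_pass(const int16_t block[64])
{
    for (int r = 4; r < 8; r++)
        if (row_is_nonzero(block + 8 * r))
            return 1;
    return 0;
}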
--- a/i386/idct_xvid.h Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -/* - * XVID MPEG-4 VIDEO CODEC - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/*! - * @file idct_xvid.h - * header for Xvid IDCT functions - */ - -#ifndef AVCODEC_I386_IDCT_XVID_H -#define AVCODEC_I386_IDCT_XVID_H - -#include <stdint.h> - -void ff_idct_xvid_mmx(short *block); -void ff_idct_xvid_mmx2(short *block); -void ff_idct_xvid_sse2(short *block); -void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block); -void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block); - -#endif /* AVCODEC_I386_IDCT_XVID_H */
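Editor's note: the header removed above declares the Xvid-compatible IDCT entry points; the transform runs in place on 64 int16_t coefficients, and the _put/_add wrappers (see idct_sse2_xvid.c above) run the SSE2 transform and then clamp or accumulate the result into a destination plane with the given line_size. A minimal, hedged usage sketch (the SSE2 variant additionally expects a 16-byte-aligned block, since it loads it with movdqa):

#include "idct_xvid.h"

static void idct_one_block(short block[64])
{
    ff_idct_xvid_mmx(block);  /* coefficients are replaced in place by spatial samples */
}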
--- a/i386/mathops.h Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_I386_MATHOPS_H -#define AVCODEC_I386_MATHOPS_H - -#define MULL(ra, rb, shift) \ - ({ int rt, dummy; __asm__ (\ - "imull %3 \n\t"\ - "shrdl %4, %%edx, %%eax \n\t"\ - : "=a"(rt), "=d"(dummy)\ - : "a" ((int)ra), "rm" ((int)rb), "i"(shift));\ - rt; }) - -#define MULH(ra, rb) \ - ({ int rt, dummy;\ - __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\ - rt; }) - -#define MUL64(ra, rb) \ - ({ int64_t rt;\ - __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\ - rt; }) - -#endif /* AVCODEC_I386_MATHOPS_H */
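Editor's note: the three inline-asm macros removed above have straightforward portable equivalents, which is essentially what the generic mathops.h falls back to on other architectures. Shown here as a reference for what the x86 versions compute:

#include <stdint.h>

static inline int MULL_c(int a, int b, unsigned shift)
{
    return (int)(((int64_t)a * b) >> shift);   /* imull + shrdl */
}

static inline int MULH_c(int a, int b)
{
    return (int)(((int64_t)a * b) >> 32);      /* high 32 bits of imull */
}

static inline int64_t MUL64_c(int a, int b)
{
    return (int64_t)a * b;                     /* full 64-bit product */
}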
--- a/i386/mmx.h Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,267 +0,0 @@ -/* - * mmx.h - * Copyright (C) 1997-2001 H. Dietz and R. Fisher - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef AVCODEC_I386_MMX_H -#define AVCODEC_I386_MMX_H - -#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected. - - -#define mmx_i2r(op,imm,reg) \ - __asm__ volatile (#op " %0, %%" #reg \ - : /* nothing */ \ - : "i" (imm) ) - -#define mmx_m2r(op,mem,reg) \ - __asm__ volatile (#op " %0, %%" #reg \ - : /* nothing */ \ - : "m" (mem)) - -#define mmx_r2m(op,reg,mem) \ - __asm__ volatile (#op " %%" #reg ", %0" \ - : "=m" (mem) \ - : /* nothing */ ) - -#define mmx_r2r(op,regs,regd) \ - __asm__ volatile (#op " %" #regs ", %" #regd) - - -#define emms() __asm__ volatile ("emms") - -#define movd_m2r(var,reg) mmx_m2r (movd, var, reg) -#define movd_r2m(reg,var) mmx_r2m (movd, reg, var) -#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd) - -#define movq_m2r(var,reg) mmx_m2r (movq, var, reg) -#define movq_r2m(reg,var) mmx_r2m (movq, reg, var) -#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd) - -#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg) -#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd) -#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg) -#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd) - -#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg) -#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd) - -#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg) -#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd) -#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg) -#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd) -#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg) -#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd) - -#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg) -#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd) -#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg) -#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd) - -#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg) -#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd) -#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg) -#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd) - -#define pand_m2r(var,reg) mmx_m2r (pand, var, reg) -#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd) - -#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg) -#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd) - -#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg) -#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd) -#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg) -#define pcmpeqd_r2r(regs,regd) mmx_r2r 
(pcmpeqd, regs, regd) -#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg) -#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd) - -#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg) -#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd) -#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg) -#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd) -#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg) -#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd) - -#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg) -#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd) - -#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg) -#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd) - -#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg) -#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd) - -#define por_m2r(var,reg) mmx_m2r (por, var, reg) -#define por_r2r(regs,regd) mmx_r2r (por, regs, regd) - -#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg) -#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg) -#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd) -#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg) -#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg) -#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd) -#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg) -#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg) -#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd) - -#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg) -#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg) -#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd) -#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg) -#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg) -#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd) - -#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg) -#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg) -#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd) -#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg) -#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg) -#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd) -#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg) -#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg) -#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd) - -#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg) -#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd) -#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg) -#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd) -#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg) -#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd) - -#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg) -#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd) -#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg) -#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd) - -#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg) -#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd) -#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg) -#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd) - -#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg) -#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd) -#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg) -#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd) -#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg) -#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd) - -#define punpcklbw_m2r(var,reg) mmx_m2r 
(punpcklbw, var, reg) -#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd) -#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg) -#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd) -#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg) -#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd) - -#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg) -#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd) - - -/* 3DNOW extensions */ - -#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg) -#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd) - - -/* AMD MMX extensions - also available in intel SSE */ - - -#define mmx_m2ri(op,mem,reg,imm) \ - __asm__ volatile (#op " %1, %0, %%" #reg \ - : /* nothing */ \ - : "m" (mem), "i" (imm)) -#define mmx_r2ri(op,regs,regd,imm) \ - __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \ - : /* nothing */ \ - : "i" (imm) ) - -#define mmx_fetch(mem,hint) \ - __asm__ volatile ("prefetch" #hint " %0" \ - : /* nothing */ \ - : "m" (mem)) - - -#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) - -#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) - -#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg) -#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd) -#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg) -#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd) - -#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm) - -#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm) - -#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg) -#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd) - -#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg) -#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd) - -#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg) -#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd) - -#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg) -#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd) - -#define pmovmskb(mmreg,reg) \ - __asm__ volatile ("movmskps %" #mmreg ", %" #reg) - -#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg) -#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd) - -#define prefetcht0(mem) mmx_fetch (mem, t0) -#define prefetcht1(mem) mmx_fetch (mem, t1) -#define prefetcht2(mem) mmx_fetch (mem, t2) -#define prefetchnta(mem) mmx_fetch (mem, nta) - -#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg) -#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd) - -#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm) -#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm) - -#define sfence() __asm__ volatile ("sfence\n\t") - -/* SSE2 */ -#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm) -#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm) -#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm) -#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm) - -#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm) - -#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg) -#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var) -#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd) -#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg) -#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var) -#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd) - -#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var) - -#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg) -#define 
psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg) - -#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd) -#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd) - - -#endif /* AVCODEC_I386_MMX_H */
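Editor's note: the header removed above is explicitly deprecated in favour of plain __asm__() statements. The hypothetical 8-byte copy below contrasts the two styles; the first function uses the wrapper macros exactly as they expand, the second writes the equivalent inline asm directly, which is what the #warning asks new code to do.

#include <stdint.h>
#include "mmx.h"   /* the deprecated header being removed here */

static void copy8_old_style(const uint64_t *src, uint64_t *dst)
{
    movq_m2r(*src, mm0);   /* expands to __asm__ volatile("movq %0, %%mm0" :: "m"(*src)) */
    movq_r2m(mm0, *dst);
    emms();
}

static void copy8_new_style(const uint64_t *src, uint64_t *dst)
{
    __asm__ volatile ("movq %1, %%mm0 \n\t"
                      "movq %%mm0, %0 \n\t"
                      : "=m" (*dst)
                      : "m" (*src));
    __asm__ volatile ("emms");
}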
--- a/i386/motion_est_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,461 +0,0 @@ -/* - * MMX optimized motion estimation - * Copyright (c) 2001 Fabrice Bellard. - * Copyright (c) 2002-2004 Michael Niedermayer - * - * mostly by Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" - -DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={ -0x0000000000000000ULL, -0x0001000100010001ULL, -0x0002000200020002ULL, -}; - -DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; - -static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(stride*h); - __asm__ volatile( - ASMALIGN(4) - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq (%2, %%"REG_a"), %%mm2 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - "add %3, %%"REG_a" \n\t" - "psubusb %%mm0, %%mm2 \n\t" - "psubusb %%mm4, %%mm0 \n\t" - "movq (%1, %%"REG_a"), %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm5 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm1, %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "add %3, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - __asm__ volatile( - ASMALIGN(4) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) -{ - int ret; - __asm__ volatile( - "pxor %%xmm6, %%xmm6 \n\t" - ASMALIGN(4) - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1, %3), %%xmm1 \n\t" - "psadbw (%2), %%xmm0 \n\t" - "psadbw (%2, %3), %%xmm1 \n\t" - "paddw %%xmm0, %%xmm6 \n\t" - "paddw %%xmm1, %%xmm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); - __asm__ volatile( - "movhlps %%xmm6, %%xmm0 \n\t" - "paddw %%xmm0, %%xmm6 \n\t" - "movd %%xmm6, %0 \n\t" - : "=r"(ret) - ); - return ret; -} - -static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
-{ - __asm__ volatile( - ASMALIGN(4) - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "pavgb 1(%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - __asm__ volatile( - "movq "MANGLE(bone)", %%mm5 \n\t" - "movq (%1), %%mm0 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1,%3), %%mm2 \n\t" - "pavgb 1(%1), %%mm1 \n\t" - "pavgb 1(%1,%3), %%mm2 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2,%3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(stride*h); - __asm__ volatile( - ASMALIGN(4) - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq (%2, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "movq (%3, %%"REG_a"), %%mm4 \n\t" - "movq (%3, %%"REG_a"), %%mm2 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "psrlw $1, %%mm1 \n\t" - "psrlw $1, %%mm3 \n\t" - "packuswb %%mm3, %%mm1 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm2, %%mm1 \n\t" - "por %%mm4, %%mm1 \n\t" - "movq %%mm1, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "add %4, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(stride*h); - __asm__ volatile( - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm3, %%mm1 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%2, %%"REG_a"), %%mm2 \n\t" - "movq 1(%2, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - 
"punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddw %%mm4, %%mm2 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm3, %%mm1 \n\t" - "paddw %%mm5, %%mm0 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "movq (%3, %%"REG_a"), %%mm4 \n\t" - "movq (%3, %%"REG_a"), %%mm5 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "psubusb %%mm0, %%mm4 \n\t" - "psubusb %%mm5, %%mm0 \n\t" - "por %%mm4, %%mm0 \n\t" - "movq %%mm0, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm4, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "movq %%mm3, %%mm1 \n\t" - "add %4, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline int sum_mmx(void) -{ - int ret; - __asm__ volatile( - "movq %%mm6, %%mm0 \n\t" - "psrlq $32, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psrlq $16, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "movd %%mm6, %0 \n\t" - : "=r" (ret) - ); - return ret&0xFFFF; -} - -static inline int sum_mmx2(void) -{ - int ret; - __asm__ volatile( - "movd %%mm6, %0 \n\t" - : "=r" (ret) - ); - return ret; -} - -static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - sad8_2_mmx(blk1, blk1+1, blk2, stride, h); -} -static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - sad8_2_mmx(blk1, blk1+stride, blk2, stride, h); -} - - -#define PIX_SAD(suf)\ -static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - assert(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t":);\ -\ - sad8_1_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - assert(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - assert(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - assert(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - ::);\ -\ - sad8_4_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t":);\ -\ - sad8_1_ ## suf(blk1 , blk2 , stride, h);\ - sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ - sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor 
%%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ - sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - ::);\ -\ - sad8_4_ ## suf(blk1 , blk2 , stride, h);\ - sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ - -PIX_SAD(mmx) -PIX_SAD(mmx2) - -void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) -{ - if (mm_flags & FF_MM_MMX) { - c->pix_abs[0][0] = sad16_mmx; - c->pix_abs[0][1] = sad16_x2_mmx; - c->pix_abs[0][2] = sad16_y2_mmx; - c->pix_abs[0][3] = sad16_xy2_mmx; - c->pix_abs[1][0] = sad8_mmx; - c->pix_abs[1][1] = sad8_x2_mmx; - c->pix_abs[1][2] = sad8_y2_mmx; - c->pix_abs[1][3] = sad8_xy2_mmx; - - c->sad[0]= sad16_mmx; - c->sad[1]= sad8_mmx; - } - if (mm_flags & FF_MM_MMXEXT) { - c->pix_abs[0][0] = sad16_mmx2; - c->pix_abs[1][0] = sad8_mmx2; - - c->sad[0]= sad16_mmx2; - c->sad[1]= sad8_mmx2; - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->pix_abs[0][1] = sad16_x2_mmx2; - c->pix_abs[0][2] = sad16_y2_mmx2; - c->pix_abs[0][3] = sad16_xy2_mmx2; - c->pix_abs[1][1] = sad8_x2_mmx2; - c->pix_abs[1][2] = sad8_y2_mmx2; - c->pix_abs[1][3] = sad8_xy2_mmx2; - } - } - if ((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)) { - c->sad[0]= sad16_sse2; - } -}
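Editor's note: the kernels removed above accelerate the sum of absolute differences used by motion estimation; pix_abs[1][0]/sad[1] compute exactly the 8-wide sum below, the x2/y2/xy2 variants first average blk1 with its right, lower, or diagonal neighbour (half-pel interpolation) before taking the SAD. Scalar reference for comparison:

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the 8xh SAD that sad8_1_mmx/sad8_1_mmx2 compute. */
static int sad8_c(const uint8_t *blk1, const uint8_t *blk2, int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            sum += abs(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}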
--- a/i386/mpegvideo_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,654 +0,0 @@ -/* - * The simplest mpeg encoder (well, it was the simplest!) - * Copyright (c) 2000,2001 Fabrice Bellard. - * - * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> - * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" -#include "dsputil_mmx.h" - -extern uint16_t inv_zigzag_direct16[64]; - - -static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - x86_reg level, qmul, qadd, nCoeffs; - - qmul = qscale << 1; - - assert(s->block_last_index[n]>=0 || s->h263_aic); - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level= block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; -//printf("%d %d ", qmul, qadd); -__asm__ volatile( - "movd %1, %%mm6 \n\t" //qmul - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "movd %2, %%mm5 \n\t" //qadd - "pxor %%mm7, %%mm7 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "psubw %%mm5, %%mm7 \n\t" - "pxor %%mm4, %%mm4 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %3), %%mm0 \n\t" - "movq 8(%0, %3), %%mm1 \n\t" - - "pmullw %%mm6, %%mm0 \n\t" - "pmullw %%mm6, %%mm1 \n\t" - - "movq (%0, %3), %%mm2 \n\t" - "movq 8(%0, %3), %%mm3 \n\t" - - "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - - "paddw %%mm7, %%mm0 \n\t" - "paddw %%mm7, %%mm1 \n\t" - - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - - "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 - - "pandn %%mm2, %%mm0 \n\t" - "pandn %%mm3, %%mm1 \n\t" - - "movq %%mm0, (%0, %3) \n\t" - "movq %%mm1, 8(%0, %3) \n\t" - - "add $16, %3 \n\t" - "jng 1b \n\t" - ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) - : "memory" - ); - block[0]= level; -} - - -static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - x86_reg qmul, qadd, nCoeffs; - - qmul = qscale << 1; - qadd = (qscale - 1) | 1; - - assert(s->block_last_index[n]>=0 || s->h263_aic); - - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; -//printf("%d %d ", qmul, qadd); -__asm__ volatile( - "movd %1, %%mm6 \n\t" //qmul - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "movd %2, %%mm5 \n\t" //qadd - "pxor %%mm7, %%mm7 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "psubw %%mm5, %%mm7 \n\t" - "pxor %%mm4, %%mm4 \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %3), %%mm0 \n\t" - "movq 8(%0, %3), %%mm1 \n\t" - - "pmullw %%mm6, %%mm0 \n\t" - "pmullw %%mm6, %%mm1 \n\t" - - "movq (%0, %3), %%mm2 \n\t" - "movq 8(%0, %3), %%mm3 \n\t" - - "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - - "paddw %%mm7, %%mm0 \n\t" - "paddw %%mm7, %%mm1 \n\t" - - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - - "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 - - "pandn %%mm2, %%mm0 \n\t" - "pandn %%mm3, %%mm1 \n\t" - - "movq %%mm0, (%0, %3) \n\t" - "movq %%mm1, 8(%0, %3) \n\t" - - "add $16, %3 \n\t" - "jng 1b \n\t" - ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) - : "memory" - ); -} - - -/* - NK: - Note: looking at PARANOID: - "enable all paranoid tests for rounding, overflows, etc..." - -#ifdef PARANOID - if (level < -2048 || level > 2047) - fprintf(stderr, "unquant error %d %d\n", i, level); -#endif - We can suppose that result of two multiplications can't be greater than 0xFFFF - i.e. is 16-bit, so we use here only PMULLW instruction and can avoid - a complex multiplication. 
-===================================================== - Full formula for multiplication of 2 integer numbers - which are represent as high:low words: - input: value1 = high1:low1 - value2 = high2:low2 - output: value3 = value1*value2 - value3=high3:low3 (on overflow: modulus 2^32 wrap-around) - this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 - but this algorithm will compute only 0x66cb0ce4 - this limited by 16-bit size of operands - --------------------------------- - tlow1 = high1*low2 - tlow2 = high2*low1 - tlow1 = tlow1 + tlow2 - high3:low3 = low1*low2 - high3 += tlow1 -*/ -static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - x86_reg nCoeffs; - const uint16_t *quant_matrix; - int block0; - - assert(s->block_last_index[n]>=0); - - nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - - if (n < 4) - block0 = block[0] * s->y_dc_scale; - else - block0 = block[0] * s->c_dc_scale; - /* XXX: only mpeg1 */ - quant_matrix = s->intra_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "mov %3, %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq 8(%0, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm4 \n\t" - "movq 8(%1, %%"REG_a"), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] - "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) - "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) - "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q - "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 - "psraw $3, %%mm0 \n\t" - "psraw $3, %%mm1 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psubw %%mm7, %%mm1 \n\t" - "por %%mm7, %%mm0 \n\t" - "por %%mm7, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%"REG_a") \n\t" - "movq %%mm5, 8(%0, %%"REG_a") \n\t" - - "add $16, %%"REG_a" \n\t" - "js 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) - : "%"REG_a, "memory" - ); - block[0]= block0; -} - -static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - x86_reg nCoeffs; - const uint16_t *quant_matrix; - - assert(s->block_last_index[n]>=0); - - nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - - quant_matrix = s->inter_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "mov %3, %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq 8(%0, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm4 \n\t" - "movq 8(%1, %%"REG_a"), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] - "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) - "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) - "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 - "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 - "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 - "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 - "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q - "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 - "psraw $4, %%mm0 \n\t" - "psraw $4, %%mm1 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psubw %%mm7, %%mm1 \n\t" - "por %%mm7, %%mm0 \n\t" - "por %%mm7, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%"REG_a") \n\t" - "movq %%mm5, 8(%0, %%"REG_a") \n\t" - - "add $16, %%"REG_a" \n\t" - "js 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) - : "%"REG_a, "memory" - ); -} - -static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - x86_reg nCoeffs; - const uint16_t *quant_matrix; - int block0; - - assert(s->block_last_index[n]>=0); - - if(s->alternate_scan) nCoeffs= 63; //FIXME - else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - - if (n < 4) - block0 = block[0] * s->y_dc_scale; - else - block0 = block[0] * s->c_dc_scale; - quant_matrix = s->intra_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "mov %3, %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq 8(%0, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm4 \n\t" - "movq 8(%1, %%"REG_a"), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] - "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) - "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) - "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q - "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 - "psraw $3, %%mm0 \n\t" - "psraw $3, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%"REG_a") \n\t" - "movq %%mm5, 8(%0, %%"REG_a") \n\t" - - "add $16, %%"REG_a" \n\t" - "jng 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) - : "%"REG_a, "memory" - ); - block[0]= block0; - //Note, we do not do mismatch control for intra as errors cannot accumulate -} - -static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - x86_reg nCoeffs; - const uint16_t *quant_matrix; - - assert(s->block_last_index[n]>=0); - - if(s->alternate_scan) nCoeffs= 63; //FIXME - else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - - quant_matrix = s->inter_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlq $48, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "mov %3, %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - "movq (%0, %%"REG_a"), %%mm0 \n\t" - "movq 8(%0, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm4 \n\t" - "movq 8(%1, %%"REG_a"), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] - "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 - "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) - "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) - "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 - "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 - "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q - "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q - "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q - "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 - "psrlw $4, %%mm0 \n\t" - "psrlw $4, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "pxor %%mm4, %%mm7 \n\t" - "pxor %%mm5, %%mm7 \n\t" - "movq %%mm4, (%0, %%"REG_a") \n\t" - "movq %%mm5, 8(%0, %%"REG_a") \n\t" - - "add $16, %%"REG_a" \n\t" - "jng 1b \n\t" - "movd 124(%0, %3), %%mm0 \n\t" - "movq %%mm7, %%mm6 \n\t" - "psrlq $32, %%mm7 \n\t" - "pxor %%mm6, %%mm7 \n\t" - "movq %%mm7, %%mm6 \n\t" - "psrlq $16, %%mm7 \n\t" - "pxor %%mm6, %%mm7 \n\t" - "pslld $31, %%mm7 \n\t" - "psrlq $15, %%mm7 \n\t" - "pxor %%mm7, %%mm0 \n\t" - "movd %%mm0, 124(%0, %3) \n\t" - - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) - : "%"REG_a, "memory" - ); -} - -static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ - const int intra= s->mb_intra; - int *sum= s->dct_error_sum[intra]; - uint16_t *offset= s->dct_offset[intra]; - - s->dct_count[intra]++; - - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "1: \n\t" - "pxor %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "movq (%0), %%mm2 \n\t" - "movq 8(%0), %%mm3 \n\t" - "pcmpgtw %%mm2, %%mm0 \n\t" - "pcmpgtw %%mm3, %%mm1 \n\t" - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - "psubw %%mm0, %%mm2 \n\t" - "psubw %%mm1, %%mm3 \n\t" - "movq %%mm2, %%mm4 \n\t" - "movq %%mm3, %%mm5 \n\t" - "psubusw (%2), %%mm2 \n\t" - "psubusw 8(%2), %%mm3 \n\t" - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - "psubw %%mm0, %%mm2 \n\t" - "psubw %%mm1, %%mm3 \n\t" - "movq %%mm2, (%0) \n\t" - "movq %%mm3, 8(%0) \n\t" - "movq %%mm4, %%mm2 \n\t" - "movq %%mm5, %%mm3 \n\t" - "punpcklwd %%mm7, %%mm4 \n\t" - "punpckhwd %%mm7, %%mm2 \n\t" - "punpcklwd %%mm7, %%mm5 \n\t" - "punpckhwd %%mm7, %%mm3 \n\t" - "paddd (%1), %%mm4 \n\t" - "paddd 8(%1), %%mm2 \n\t" - "paddd 16(%1), %%mm5 \n\t" - "paddd 24(%1), %%mm3 \n\t" - "movq %%mm4, (%1) \n\t" - "movq %%mm2, 8(%1) \n\t" - "movq %%mm5, 16(%1) \n\t" - "movq %%mm3, 24(%1) \n\t" - "add $16, %0 \n\t" - "add $32, %1 \n\t" - "add $16, %2 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (block), "+r" (sum), "+r" (offset) - : "r"(block+64) - ); -} - -static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ - const int intra= s->mb_intra; - int *sum= s->dct_error_sum[intra]; - uint16_t *offset= s->dct_offset[intra]; - - s->dct_count[intra]++; - - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "1: \n\t" - "pxor %%xmm0, %%xmm0 \n\t" - "pxor %%xmm1, %%xmm1 \n\t" - "movdqa (%0), %%xmm2 \n\t" - "movdqa 16(%0), %%xmm3 \n\t" - "pcmpgtw %%xmm2, %%xmm0 \n\t" - "pcmpgtw %%xmm3, %%xmm1 \n\t" - "pxor %%xmm0, %%xmm2 \n\t" - "pxor %%xmm1, %%xmm3 \n\t" - "psubw %%xmm0, %%xmm2 \n\t" - "psubw %%xmm1, %%xmm3 \n\t" - "movdqa %%xmm2, %%xmm4 \n\t" - "movdqa %%xmm3, %%xmm5 \n\t" - "psubusw (%2), %%xmm2 \n\t" - "psubusw 16(%2), %%xmm3 \n\t" - "pxor %%xmm0, %%xmm2 \n\t" - "pxor %%xmm1, %%xmm3 \n\t" - "psubw %%xmm0, %%xmm2 \n\t" - "psubw %%xmm1, %%xmm3 \n\t" - "movdqa %%xmm2, (%0) \n\t" - "movdqa %%xmm3, 16(%0) \n\t" - "movdqa %%xmm4, %%xmm6 \n\t" - "movdqa %%xmm5, %%xmm0 \n\t" - "punpcklwd %%xmm7, %%xmm4 \n\t" - "punpckhwd %%xmm7, %%xmm6 \n\t" - "punpcklwd %%xmm7, %%xmm5 \n\t" - "punpckhwd %%xmm7, %%xmm0 \n\t" - "paddd (%1), %%xmm4 \n\t" - "paddd 16(%1), %%xmm6 \n\t" - "paddd 32(%1), %%xmm5 \n\t" - "paddd 48(%1), %%xmm0 \n\t" - "movdqa %%xmm4, (%1) \n\t" - "movdqa %%xmm6, 16(%1) \n\t" - "movdqa %%xmm5, 32(%1) \n\t" - "movdqa %%xmm0, 48(%1) \n\t" - "add $32, %0 \n\t" - "add $64, 
%1 \n\t" - "add $32, %2 \n\t" - "cmp %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (block), "+r" (sum), "+r" (offset) - : "r"(block+64) - ); -} - -#ifdef HAVE_SSSE3 -#define HAVE_SSSE3_BAK -#endif -#undef HAVE_SSSE3 - -#undef HAVE_SSE2 -#undef HAVE_MMX2 -#define RENAME(a) a ## _MMX -#define RENAMEl(a) a ## _mmx -#include "mpegvideo_mmx_template.c" - -#define HAVE_MMX2 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _MMX2 -#define RENAMEl(a) a ## _mmx2 -#include "mpegvideo_mmx_template.c" - -#define HAVE_SSE2 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _SSE2 -#define RENAMEl(a) a ## _sse2 -#include "mpegvideo_mmx_template.c" - -#ifdef HAVE_SSSE3_BAK -#define HAVE_SSSE3 -#undef RENAME -#undef RENAMEl -#define RENAME(a) a ## _SSSE3 -#define RENAMEl(a) a ## _sse2 -#include "mpegvideo_mmx_template.c" -#endif - -void MPV_common_init_mmx(MpegEncContext *s) -{ - if (mm_flags & FF_MM_MMX) { - const int dct_algo = s->avctx->dct_algo; - - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; - s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; - s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; - s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; - if(!(s->flags & CODEC_FLAG_BITEXACT)) - s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; - s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; - - if (mm_flags & FF_MM_SSE2) { - s->denoise_dct= denoise_dct_sse2; - } else { - s->denoise_dct= denoise_dct_mmx; - } - - if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ -#ifdef HAVE_SSSE3 - if(mm_flags & FF_MM_SSSE3){ - s->dct_quantize= dct_quantize_SSSE3; - } else -#endif - if(mm_flags & FF_MM_SSE2){ - s->dct_quantize= dct_quantize_SSE2; - } else if(mm_flags & FF_MM_MMXEXT){ - s->dct_quantize= dct_quantize_MMX2; - } else { - s->dct_quantize= dct_quantize_MMX; - } - } - } -}
--- a/i386/mpegvideo_mmx_template.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,376 +0,0 @@ -/* - * MPEG video MMX templates - * - * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#undef MMREG_WIDTH -#undef MM -#undef MOVQ -#undef SPREADW -#undef PMAXW -#undef PMAX -#undef SAVE_SIGN -#undef RESTORE_SIGN - -#if defined(HAVE_SSE2) -#define MMREG_WIDTH "16" -#define MM "%%xmm" -#define MOVQ "movdqa" -#define SPREADW(a) \ - "pshuflw $0, "a", "a" \n\t"\ - "punpcklwd "a", "a" \n\t" -#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" -#define PMAX(a,b) \ - "movhlps "a", "b" \n\t"\ - PMAXW(b, a)\ - "pshuflw $0x0E, "a", "b" \n\t"\ - PMAXW(b, a)\ - "pshuflw $0x01, "a", "b" \n\t"\ - PMAXW(b, a) -#else -#define MMREG_WIDTH "8" -#define MM "%%mm" -#define MOVQ "movq" -#if defined(HAVE_MMX2) -#define SPREADW(a) "pshufw $0, "a", "a" \n\t" -#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" -#define PMAX(a,b) \ - "pshufw $0x0E, "a", "b" \n\t"\ - PMAXW(b, a)\ - "pshufw $0x01, "a", "b" \n\t"\ - PMAXW(b, a) -#else -#define SPREADW(a) \ - "punpcklwd "a", "a" \n\t"\ - "punpcklwd "a", "a" \n\t" -#define PMAXW(a,b) \ - "psubusw "a", "b" \n\t"\ - "paddw "a", "b" \n\t" -#define PMAX(a,b) \ - "movq "a", "b" \n\t"\ - "psrlq $32, "a" \n\t"\ - PMAXW(b, a)\ - "movq "a", "b" \n\t"\ - "psrlq $16, "a" \n\t"\ - PMAXW(b, a) - -#endif -#endif - -#ifdef HAVE_SSSE3 -#define SAVE_SIGN(a,b) \ - "movdqa "b", "a" \n\t"\ - "pabsw "b", "b" \n\t" -#define RESTORE_SIGN(a,b) \ - "psignw "a", "b" \n\t" -#else -#define SAVE_SIGN(a,b) \ - "pxor "a", "a" \n\t"\ - "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\ - "pxor "a", "b" \n\t"\ - "psubw "a", "b" \n\t" /* ABS(block[i]) */ -#define RESTORE_SIGN(a,b) \ - "pxor "a", "b" \n\t"\ - "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) -#endif - -static int RENAME(dct_quantize)(MpegEncContext *s, - DCTELEM *block, int n, - int qscale, int *overflow) -{ - x86_reg last_non_zero_p1; - int level=0, q; //=0 is because gcc says uninitialized ... - const uint16_t *qmat, *bias; - DECLARE_ALIGNED_16(int16_t, temp_block[64]); - - assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? - - //s->fdct (block); - RENAMEl(ff_fdct) (block); //cannot be anything else ... 
- - if(s->dct_error_sum) - s->denoise_dct(s, block); - - if (s->mb_intra) { - int dummy; - if (n < 4) - q = s->y_dc_scale; - else - q = s->c_dc_scale; - /* note: block[0] is assumed to be positive */ - if (!s->h263_aic) { -#if 1 - __asm__ volatile ( - "mul %%ecx \n\t" - : "=d" (level), "=a"(dummy) - : "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1]) - ); -#else - __asm__ volatile ( - "xorl %%edx, %%edx \n\t" - "divw %%cx \n\t" - "movzwl %%ax, %%eax \n\t" - : "=a" (level) - : "a" ((block[0]>>2) + q), "c" (q<<1) - : "%edx" - ); -#endif - } else - /* For AIC we skip quant/dequant of INTRADC */ - level = (block[0] + 4)>>3; - - block[0]=0; //avoid fake overflow -// temp_block[0] = (block[0] + (q >> 1)) / q; - last_non_zero_p1 = 1; - bias = s->q_intra_matrix16[qscale][1]; - qmat = s->q_intra_matrix16[qscale][0]; - } else { - last_non_zero_p1 = 0; - bias = s->q_inter_matrix16[qscale][1]; - qmat = s->q_inter_matrix16[qscale][0]; - } - - if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ - - __asm__ volatile( - "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 - SPREADW(MM"3") - "pxor "MM"7, "MM"7 \n\t" // 0 - "pxor "MM"4, "MM"4 \n\t" // 0 - MOVQ" (%2), "MM"5 \n\t" // qmat[0] - "pxor "MM"6, "MM"6 \n\t" - "psubw (%3), "MM"6 \n\t" // -bias[0] - "mov $-128, %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] - SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) - "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] - "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 - "por "MM"0, "MM"4 \n\t" - RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) - MOVQ" "MM"0, (%5, %%"REG_a") \n\t" - "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 - MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" - MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 - "pandn "MM"1, "MM"0 \n\t" - PMAXW(MM"0", MM"3") - "add $"MMREG_WIDTH", %%"REG_a" \n\t" - " js 1b \n\t" - PMAX(MM"3", MM"0") - "movd "MM"3, %%"REG_a" \n\t" - "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 - : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat), "r" (bias), - "r" (inv_zigzag_direct16+64), "r" (temp_block+64) - ); - }else{ // FMT_H263 - __asm__ volatile( - "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 - SPREADW(MM"3") - "pxor "MM"7, "MM"7 \n\t" // 0 - "pxor "MM"4, "MM"4 \n\t" // 0 - "mov $-128, %%"REG_a" \n\t" - ASMALIGN(4) - "1: \n\t" - MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] - SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) - MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0] - "paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] - MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i] - "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 - "por "MM"0, "MM"4 \n\t" - RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) - MOVQ" "MM"0, (%5, %%"REG_a") \n\t" - "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 
0xFF : 0x00 - MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" - MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 - "pandn "MM"1, "MM"0 \n\t" - PMAXW(MM"0", MM"3") - "add $"MMREG_WIDTH", %%"REG_a" \n\t" - " js 1b \n\t" - PMAX(MM"3", MM"0") - "movd "MM"3, %%"REG_a" \n\t" - "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 - : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat+64), "r" (bias+64), - "r" (inv_zigzag_direct16+64), "r" (temp_block+64) - ); - } - __asm__ volatile( - "movd %1, "MM"1 \n\t" // max_qcoeff - SPREADW(MM"1") - "psubusw "MM"1, "MM"4 \n\t" - "packuswb "MM"4, "MM"4 \n\t" -#ifdef HAVE_SSE2 - "packuswb "MM"4, "MM"4 \n\t" -#endif - "movd "MM"4, %0 \n\t" // *overflow - : "=g" (*overflow) - : "g" (s->max_qcoeff) - ); - - if(s->mb_intra) block[0]= level; - else block[0]= temp_block[0]; - - if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ - if(last_non_zero_p1 <= 1) goto end; - block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; - block[0x20] = temp_block[0x10]; - if(last_non_zero_p1 <= 4) goto end; - block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02]; - block[0x09] = temp_block[0x03]; - if(last_non_zero_p1 <= 7) goto end; - block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11]; - block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20]; - if(last_non_zero_p1 <= 11) goto end; - block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12]; - block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04]; - block[0x0C] = temp_block[0x05]; - if(last_non_zero_p1 <= 16) goto end; - block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13]; - block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21]; - block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30]; - block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22]; - if(last_non_zero_p1 <= 24) goto end; - block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14]; - block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06]; - block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E]; - block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C]; - if(last_non_zero_p1 <= 32) goto end; - block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A]; - block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38]; - block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32]; - block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24]; - if(last_non_zero_p1 <= 40) goto end; - block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16]; - block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17]; - block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25]; - block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33]; - if(last_non_zero_p1 <= 48) goto end; - block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; - block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D]; - block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; - block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E]; - if(last_non_zero_p1 <= 56) goto end; - block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C]; - block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; - block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; - block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; - }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){ - if(last_non_zero_p1 <= 1) goto end; - block[0x04] = temp_block[0x01]; - block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; - if(last_non_zero_p1 <= 4) goto 
end; - block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; - block[0x05] = temp_block[0x03]; - if(last_non_zero_p1 <= 7) goto end; - block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; - block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; - if(last_non_zero_p1 <= 11) goto end; - block[0x1C] = temp_block[0x19]; - block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; - block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; - if(last_non_zero_p1 <= 16) goto end; - block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; - block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; - block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; - block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; - if(last_non_zero_p1 <= 24) goto end; - block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; - block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; - block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; - block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C]; - if(last_non_zero_p1 <= 32) goto end; - block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; - block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; - block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32]; - block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; - if(last_non_zero_p1 <= 40) goto end; - block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; - block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; - block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; - block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; - if(last_non_zero_p1 <= 48) goto end; - block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; - block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; - block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; - block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; - if(last_non_zero_p1 <= 56) goto end; - block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; - block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; - block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; - block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; - }else{ - if(last_non_zero_p1 <= 1) goto end; - block[0x01] = temp_block[0x01]; - block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; - if(last_non_zero_p1 <= 4) goto end; - block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02]; - block[0x03] = temp_block[0x03]; - if(last_non_zero_p1 <= 7) goto end; - block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11]; - block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; - if(last_non_zero_p1 <= 11) goto end; - block[0x19] = temp_block[0x19]; - block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B]; - block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05]; - if(last_non_zero_p1 <= 16) goto end; - block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13]; - block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21]; - block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; - block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22]; - if(last_non_zero_p1 <= 24) goto end; - block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14]; - block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06]; - block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E]; - block[0x15] = temp_block[0x15]; block[0x1C] = 
temp_block[0x1C]; - if(last_non_zero_p1 <= 32) goto end; - block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A]; - block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38]; - block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32]; - block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24]; - if(last_non_zero_p1 <= 40) goto end; - block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16]; - block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; - block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25]; - block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33]; - if(last_non_zero_p1 <= 48) goto end; - block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; - block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D]; - block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; - block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E]; - if(last_non_zero_p1 <= 56) goto end; - block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C]; - block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36]; - block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; - block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; - } - end: -/* - for(i=0; i<last_non_zero_p1; i++) - { - int j= zigzag_direct_noperm[i]; - block[block_permute_op(j)]= temp_block[j]; - } -*/ - - return last_non_zero_p1 - 1; -}
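The template above is included four times from mpegvideo_mmx.c to produce dct_quantize_MMX, dct_quantize_MMX2, dct_quantize_SSE2 and dct_quantize_SSSE3. Stripped of the SIMD plumbing, each expansion does the same per-coefficient work as the scalar model below. This is an illustrative sketch, not code from this tree: plain arrays stand in for the MpegEncContext fields, the intra-DC special case handled before the main loops is ignored, the stored bias is assumed to carry the sign convention that makes the saturating paddusw/psubusw steps work, and inv_zigzag16[i] is assumed to hold the 1-based zigzag scan position of raster index i, which is what makes a running maximum over the non-zero outputs equal last_non_zero + 1.

#include <stdint.h>

static int dct_quantize_sketch(int16_t block[64], int16_t temp_block[64],
                               const uint16_t qmat[64], const int16_t bias[64],
                               const uint16_t inv_zigzag16[64],
                               int max_qcoeff, int intra, int *overflow)
{
    int i, max_level = 0, last_non_zero_p1 = intra ? 1 : 0;

    for (i = 0; i < 64; i++) {
        int level  = block[i];
        int alevel = level < 0 ? -level : level;             /* SAVE_SIGN         */
        int biased = alevel + bias[i];                       /* paddusw / psubusw */
        int q      = biased > 0 ? (biased * qmat[i]) >> 16 : 0;   /* pmulhw       */

        temp_block[i] = level < 0 ? -q : q;                  /* RESTORE_SIGN      */
        block[i]      = 0;               /* the asm zeroes block[] as it reads it */
        if (q > max_level)
            max_level = q;
        if (q && inv_zigzag16[i] > last_non_zero_p1)
            last_non_zero_p1 = inv_zigzag16[i];              /* the PMAXW trick   */
    }
    *overflow = max_level > max_qcoeff;      /* the asm approximates the max with
                                                a por accumulator                 */
    return last_non_zero_p1 - 1;             /* last_non_zero, as returned above  */
}

The unrolled block[...] = temp_block[...] sequences that follow the asm then walk the zigzag scan up to last_non_zero_p1 and copy each temp_block coefficient into block at its position under the selected IDCT permutation (FF_SIMPLE_IDCT_PERM, FF_LIBMPEG2_IDCT_PERM, or none) — the same operation as the commented-out reference loop kept at the end of the file.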
--- a/i386/simple_idct_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1294 +0,0 @@ -/* - * Simple IDCT MMX - * - * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "libavcodec/dsputil.h" -#include "libavcodec/simple_idct.h" - -/* -23170.475006 -22725.260826 -21406.727617 -19265.545870 -16384.000000 -12872.826198 -8866.956905 -4520.335430 -*/ -#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#if 0 -#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#else -#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 -#endif -#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - -#define ROW_SHIFT 11 -#define COL_SHIFT 20 // 6 - -DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; -DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; - -DECLARE_ALIGNED(8, static const int16_t, coeffs[])= { - 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, -// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, -// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), - 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, - // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) -// 0, 0, 0, 0, -// 0, 0, 0, 0, - - C4, C4, C4, C4, - C4, -C4, C4, -C4, - - C2, C6, C2, C6, - C6, -C2, C6, -C2, - - C1, C3, C1, C3, - C5, C7, C5, C7, - - C3, -C7, C3, -C7, --C1, -C5, -C1, -C5, - - C5, -C1, C5, -C1, - C7, C3, C7, C3, - - C7, -C5, C7, -C5, - C3, -C1, C3, -C1 -}; - -#if 0 -static void unused_var_killer(){ - int a= wm1010 + d40000; - temp[0]=a; -} - -static void inline idctCol (int16_t * col, int16_t *input) -{ -#undef C0 -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 - int a0, a1, a2, a3, b0, b1, b2, b3; - const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -/* - if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { - col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = - col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; - return; - }*/ - -col[8*0] = input[8*0 + 0]; -col[8*1] = input[8*2 + 0]; -col[8*2] = input[8*0 + 1]; 
-col[8*3] = input[8*2 + 1]; -col[8*4] = input[8*4 + 0]; -col[8*5] = input[8*6 + 0]; -col[8*6] = input[8*4 + 1]; -col[8*7] = input[8*6 + 1]; - - a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1)); - a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1)); - a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1)); - a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1)); - - b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7]; - b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7]; - b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7]; - b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7]; - - col[8*0] = (a0 + b0) >> COL_SHIFT; - col[8*1] = (a1 + b1) >> COL_SHIFT; - col[8*2] = (a2 + b2) >> COL_SHIFT; - col[8*3] = (a3 + b3) >> COL_SHIFT; - col[8*4] = (a3 - b3) >> COL_SHIFT; - col[8*5] = (a2 - b2) >> COL_SHIFT; - col[8*6] = (a1 - b1) >> COL_SHIFT; - col[8*7] = (a0 - b0) >> COL_SHIFT; -} - -static void inline idctRow (int16_t * output, int16_t * input) -{ - int16_t row[8]; - - int a0, a1, a2, a3, b0, b1, b2, b3; - const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - -row[0] = input[0]; -row[2] = input[1]; -row[4] = input[4]; -row[6] = input[5]; -row[1] = input[8]; -row[3] = input[9]; -row[5] = input[12]; -row[7] = input[13]; - - if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) { - row[0] = row[1] = row[2] = row[3] = row[4] = - row[5] = row[6] = row[7] = row[0]<<3; - output[0] = row[0]; - output[2] = row[1]; - output[4] = row[2]; - output[6] = row[3]; - output[8] = row[4]; - output[10] = row[5]; - output[12] = row[6]; - output[14] = row[7]; - return; - } - - a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1)); - a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1)); - a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1)); - a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1)); - - b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; - b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; - b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; - b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; - - row[0] = (a0 + b0) >> ROW_SHIFT; - row[1] = (a1 + b1) >> ROW_SHIFT; - row[2] = (a2 + b2) >> ROW_SHIFT; - row[3] = (a3 + b3) >> ROW_SHIFT; - row[4] = (a3 - b3) >> ROW_SHIFT; - row[5] = (a2 - b2) >> ROW_SHIFT; - row[6] = (a1 - b1) >> ROW_SHIFT; - row[7] = (a0 - b0) >> ROW_SHIFT; - - output[0] = row[0]; - output[2] = row[1]; - output[4] = row[2]; - output[6] = row[3]; - output[8] = row[4]; - output[10] = row[5]; - output[12] = row[6]; - output[14] = row[7]; -} -#endif - -static inline void idct(int16_t *block) -{ - DECLARE_ALIGNED(8, int64_t, align_tmp[16]); - int16_t * const temp= (int16_t*)align_tmp; - - __asm__ volatile( -#if 0 //Alternative, simpler variant - -#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 
\n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - -#define COL_IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - 
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t"\ - - -#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq "MANGLE(wm1010)", %%mm4 \n\t"\ - "pand %%mm0, %%mm4 \n\t"\ - "por %%mm1, %%mm4 \n\t"\ - "por %%mm2, %%mm4 \n\t"\ - "por %%mm3, %%mm4 \n\t"\ - "packssdw %%mm4,%%mm4 \n\t"\ - "movd %%mm4, %%eax \n\t"\ - "orl %%eax, %%eax \n\t"\ - "jz 1f \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 
\n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - "jmp 2f \n\t"\ - "1: \n\t"\ - "pslld $16, %%mm0 \n\t"\ - "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ - "psrad $13, %%mm0 \n\t"\ - "packssdw %%mm0, %%mm0 \n\t"\ - "movq %%mm0, " #dst " \n\t"\ - "movq %%mm0, 8+" #dst " \n\t"\ - "movq %%mm0, 16+" #dst " \n\t"\ - "movq %%mm0, 24+" #dst " \n\t"\ - "2: \n\t" - - -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) -/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) -ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) -ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ - -DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) -DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) -DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) - - -//IDCT( src0, src4, src1, src5, dst, shift) -COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -COL_IDCT( 8(%1), 72(%1), 40(%1), 
104(%1), 4(%0), 20) -COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - -#else - -#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq "MANGLE(wm1010)", %%mm4 \n\t"\ - "pand %%mm0, %%mm4 \n\t"\ - "por %%mm1, %%mm4 \n\t"\ - "por %%mm2, %%mm4 \n\t"\ - "por %%mm3, %%mm4 \n\t"\ - "packssdw %%mm4,%%mm4 \n\t"\ - "movd %%mm4, %%eax \n\t"\ - "orl %%eax, %%eax \n\t"\ - "jz 1f \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - "jmp 2f \n\t"\ - "1: \n\t"\ - "pslld $16, 
%%mm0 \n\t"\ - "paddd "MANGLE(d40000)", %%mm0 \n\t"\ - "psrad $13, %%mm0 \n\t"\ - "packssdw %%mm0, %%mm0 \n\t"\ - "movq %%mm0, " #dst " \n\t"\ - "movq %%mm0, 8+" #dst " \n\t"\ - "movq %%mm0, 16+" #dst " \n\t"\ - "movq %%mm0, 24+" #dst " \n\t"\ - "2: \n\t" - -#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq %%mm0, %%mm4 \n\t"\ - "por %%mm1, %%mm4 \n\t"\ - "por %%mm2, %%mm4 \n\t"\ - "por %%mm3, %%mm4 \n\t"\ - "packssdw %%mm4,%%mm4 \n\t"\ - "movd %%mm4, %%eax \n\t"\ - "orl %%eax, %%eax \n\t"\ - "jz " #bt " \n\t"\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 
a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - -#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - #rounder ", %%mm4 \n\t"\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - #rounder ", %%mm0 \n\t"\ - "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm0, %%mm0 \n\t" \ - "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ - "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ - "movq %%mm7, " #dst " \n\t"\ - "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "movq %%mm2, 24+" #dst " \n\t"\ - "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ - "movq %%mm2, 8+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ - "movq %%mm4, 16+" #dst " \n\t"\ - -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) -Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) -Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) -Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) - -#undef IDCT -#define IDCT(src0, src4, src1, 
src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 
56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "#" ASMALIGN(4) \ - "4: \n\t" -Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) -Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) - -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm1, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm1, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "#" ASMALIGN(4) \ - "6: \n\t" -Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) - -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 
16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm1, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm1, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "#" ASMALIGN(4) \ - "2: \n\t" -Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) - -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ - "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ - "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ - "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 
*/\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm2, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ - "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ - "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ - "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ - "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ - "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm2 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ - "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm2, 32+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "#" ASMALIGN(4) \ - "3: \n\t" -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 64(%2), %%mm3 \n\t"\ - "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ - "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ 
- "movd %%mm1, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ - "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm1, 32+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - "#" ASMALIGN(4) \ - "5: \n\t" -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ - "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ - "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ - "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ - "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ - "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ - "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ - "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ - "psrad $" #shift ", %%mm4 \n\t"\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm3 \n\t"\ - "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ - "movq %%mm4, " #dst " \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ - "movq %%mm0, 16+" #dst " \n\t"\ - "movq %%mm0, 96+" #dst " \n\t"\ - "movq %%mm4, 112+" #dst " \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "psrad $" #shift ", %%mm6 \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movq %%mm5, 32+" #dst " \n\t"\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 
*/\ - "movq %%mm6, 48+" #dst " \n\t"\ - "movq %%mm6, 64+" #dst " \n\t"\ - "movq %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - - "#" ASMALIGN(4) \ - "1: \n\t" -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ - "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ - "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ - "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ - "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ - "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ - "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ - "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ - "movq 64(%2), %%mm1 \n\t"\ - "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ - "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ - "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "psrad $" #shift ", %%mm7 \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ - "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ - "psrad $" #shift ", %%mm0 \n\t"\ - "psrad $" #shift ", %%mm3 \n\t"\ - "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ - "movd %%mm7, " #dst " \n\t"\ - "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ - "movd %%mm0, 16+" #dst " \n\t"\ - "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ - "movd %%mm3, 96+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ - "movd %%mm4, 112+" #dst " \n\t"\ - "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ - "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ - "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ - "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ - "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ - "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ - "psrad $" #shift ", %%mm3 \n\t"\ - "psrad $" #shift ", %%mm5 \n\t"\ - "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ - "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ - "psrad $" #shift ", %%mm6 \n\t"\ - "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ - "movd %%mm3, 32+" #dst " \n\t"\ - "psrad $" #shift ", %%mm4 \n\t"\ - "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ - "movd %%mm6, 48+" #dst " \n\t"\ - "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ - "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ - "movd %%mm4, 64+" #dst " \n\t"\ - "movd %%mm5, 80+" #dst " \n\t" - - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - "jmp 9f \n\t" - - - "#" ASMALIGN(4) - "7: \n\t" -#undef IDCT -#define IDCT(src0, src4, src1, src5, dst, shift) \ - "movq " #src0 
", %%mm0 \n\t" /* R4 R0 r4 r0 */\ - "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "psrad $" #shift ", %%mm4 \n\t"\ - "psrad $" #shift ", %%mm0 \n\t"\ - "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ - "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ - "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ - "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ - "psrad $" #shift ", %%mm1 \n\t"\ - "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ - "movq %%mm4, " #dst " \n\t"\ - "psrad $" #shift ", %%mm2 \n\t"\ - "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ - "movq %%mm0, 16+" #dst " \n\t"\ - "movq %%mm0, 96+" #dst " \n\t"\ - "movq %%mm4, 112+" #dst " \n\t"\ - "movq %%mm0, 32+" #dst " \n\t"\ - "movq %%mm4, 48+" #dst " \n\t"\ - "movq %%mm4, 64+" #dst " \n\t"\ - "movq %%mm0, 80+" #dst " \n\t" - -//IDCT( src0, src4, src1, src5, dst, shift) -IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) -//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) -//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) - - -#endif - -/* -Input - 00 40 04 44 20 60 24 64 - 10 30 14 34 50 70 54 74 - 01 41 03 43 21 61 23 63 - 11 31 13 33 51 71 53 73 - 02 42 06 46 22 62 26 66 - 12 32 16 36 52 72 56 76 - 05 45 07 47 25 65 27 67 - 15 35 17 37 55 75 57 77 - -Temp - 00 04 10 14 20 24 30 34 - 40 44 50 54 60 64 70 74 - 01 03 11 13 21 23 31 33 - 41 43 51 53 61 63 71 73 - 02 06 12 16 22 26 32 36 - 42 46 52 56 62 66 72 76 - 05 07 15 17 25 27 35 37 - 45 47 55 57 65 67 75 77 -*/ - -"9: \n\t" - :: "r" (block), "r" (temp), "r" (coeffs) - : "%eax" - ); -} - -void ff_simple_idct_mmx(int16_t *block) -{ - idct(block); -} - -//FIXME merge add/put into the idct - -void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) -{ - idct(block); - put_pixels_clamped_mmx(block, dest, line_size); -} -void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) -{ - idct(block); - add_pixels_clamped_mmx(block, dest, line_size); -}
--- a/i386/snowdsp_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,871 +0,0 @@ -/* - * MMX and SSE2 optimized snow DSP utils - * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/snow.h" - -void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){ - const int w2= (width+1)>>1; - DECLARE_ALIGNED_16(IDWTELEM, temp[width>>1]); - const int w_l= (width>>1); - const int w_r= w2 - 1; - int i; - - { // Lift 0 - IDWTELEM * const ref = b + w2 - 1; - IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice - // (the first time erroneously), we allow the SSE2 code to run an extra pass. - // The savings in code and time are well worth having to store this value and - // calculate b[0] correctly afterwards. - - i = 0; - __asm__ volatile( - "pcmpeqd %%xmm7, %%xmm7 \n\t" - "pcmpeqd %%xmm3, %%xmm3 \n\t" - "psllw $1, %%xmm3 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "psllw $13, %%xmm3 \n\t" - ::); - for(; i<w_l-15; i+=16){ - __asm__ volatile( - "movdqu (%1), %%xmm1 \n\t" - "movdqu 16(%1), %%xmm5 \n\t" - "movdqu 2(%1), %%xmm2 \n\t" - "movdqu 18(%1), %%xmm6 \n\t" - "paddw %%xmm1, %%xmm2 \n\t" - "paddw %%xmm5, %%xmm6 \n\t" - "paddw %%xmm7, %%xmm2 \n\t" - "paddw %%xmm7, %%xmm6 \n\t" - "pmulhw %%xmm3, %%xmm2 \n\t" - "pmulhw %%xmm3, %%xmm6 \n\t" - "paddw (%0), %%xmm2 \n\t" - "paddw 16(%0), %%xmm6 \n\t" - "movdqa %%xmm2, (%0) \n\t" - "movdqa %%xmm6, 16(%0) \n\t" - :: "r"(&b[i]), "r"(&ref[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); - b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); - } - - { // Lift 1 - IDWTELEM * const dst = b+w2; - - i = 0; - for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){ - dst[i] = dst[i] - (b[i] + b[i + 1]); - } - for(; i<w_r-15; i+=16){ - __asm__ volatile( - "movdqu (%1), %%xmm1 \n\t" - "movdqu 16(%1), %%xmm5 \n\t" - "movdqu 2(%1), %%xmm2 \n\t" - "movdqu 18(%1), %%xmm6 \n\t" - "paddw %%xmm1, %%xmm2 \n\t" - "paddw %%xmm5, %%xmm6 \n\t" - "movdqa (%0), %%xmm0 \n\t" - "movdqa 16(%0), %%xmm4 \n\t" - "psubw %%xmm2, %%xmm0 \n\t" - "psubw %%xmm6, %%xmm4 \n\t" - "movdqa %%xmm0, (%0) \n\t" - "movdqa %%xmm4, 16(%0) \n\t" - :: "r"(&dst[i]), "r"(&b[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); - } - - { // Lift 2 - IDWTELEM * const ref = b+w2 - 1; - IDWTELEM b_0 = b[0]; - - i = 0; - __asm__ volatile( - "psllw $15, %%xmm7 \n\t" - "pcmpeqw %%xmm6, %%xmm6 \n\t" - "psrlw $13, %%xmm6 \n\t" - "paddw %%xmm7, %%xmm6 \n\t" - ::); - for(; i<w_l-15; i+=16){ - __asm__ volatile( - "movdqu (%1), %%xmm0 \n\t" - "movdqu 16(%1), %%xmm4 \n\t" - "movdqu 
2(%1), %%xmm1 \n\t" - "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm4 \n\t" - "paddw %%xmm7, %%xmm1 \n\t" - "paddw %%xmm7, %%xmm5 \n\t" - "pavgw %%xmm1, %%xmm0 \n\t" - "pavgw %%xmm5, %%xmm4 \n\t" - "psubw %%xmm7, %%xmm0 \n\t" - "psubw %%xmm7, %%xmm4 \n\t" - "psraw $1, %%xmm0 \n\t" - "psraw $1, %%xmm4 \n\t" - "movdqa (%0), %%xmm1 \n\t" - "movdqa 16(%0), %%xmm5 \n\t" - "paddw %%xmm1, %%xmm0 \n\t" - "paddw %%xmm5, %%xmm4 \n\t" - "psraw $2, %%xmm0 \n\t" - "psraw $2, %%xmm4 \n\t" - "paddw %%xmm1, %%xmm0 \n\t" - "paddw %%xmm5, %%xmm4 \n\t" - "movdqa %%xmm0, (%0) \n\t" - "movdqa %%xmm4, 16(%0) \n\t" - :: "r"(&b[i]), "r"(&ref[i]) - : "memory" - ); - } - snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); - b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); - } - - { // Lift 3 - IDWTELEM * const src = b+w2; - - i = 0; - for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){ - temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); - } - for(; i<w_r-7; i+=8){ - __asm__ volatile( - "movdqu 2(%1), %%xmm2 \n\t" - "movdqu 18(%1), %%xmm6 \n\t" - "paddw (%1), %%xmm2 \n\t" - "paddw 16(%1), %%xmm6 \n\t" - "movdqu (%0), %%xmm0 \n\t" - "movdqu 16(%0), %%xmm4 \n\t" - "paddw %%xmm2, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm4 \n\t" - "psraw $1, %%xmm2 \n\t" - "psraw $1, %%xmm6 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "paddw %%xmm4, %%xmm6 \n\t" - "movdqa %%xmm2, (%2) \n\t" - "movdqa %%xmm6, 16(%2) \n\t" - :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); - } - - { - snow_interleave_line_header(&i, width, b, temp); - - for (; (i & 0x3E) != 0x3E; i-=2){ - b[i+1] = temp[i>>1]; - b[i] = b[i>>1]; - } - for (i-=62; i>=0; i-=64){ - __asm__ volatile( - "movdqa (%1), %%xmm0 \n\t" - "movdqa 16(%1), %%xmm2 \n\t" - "movdqa 32(%1), %%xmm4 \n\t" - "movdqa 48(%1), %%xmm6 \n\t" - "movdqa (%1), %%xmm1 \n\t" - "movdqa 16(%1), %%xmm3 \n\t" - "movdqa 32(%1), %%xmm5 \n\t" - "movdqa 48(%1), %%xmm7 \n\t" - "punpcklwd (%2), %%xmm0 \n\t" - "punpcklwd 16(%2), %%xmm2 \n\t" - "punpcklwd 32(%2), %%xmm4 \n\t" - "punpcklwd 48(%2), %%xmm6 \n\t" - "movdqa %%xmm0, (%0) \n\t" - "movdqa %%xmm2, 32(%0) \n\t" - "movdqa %%xmm4, 64(%0) \n\t" - "movdqa %%xmm6, 96(%0) \n\t" - "punpckhwd (%2), %%xmm1 \n\t" - "punpckhwd 16(%2), %%xmm3 \n\t" - "punpckhwd 32(%2), %%xmm5 \n\t" - "punpckhwd 48(%2), %%xmm7 \n\t" - "movdqa %%xmm1, 16(%0) \n\t" - "movdqa %%xmm3, 48(%0) \n\t" - "movdqa %%xmm5, 80(%0) \n\t" - "movdqa %%xmm7, 112(%0) \n\t" - :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) - : "memory" - ); - } - } -} - -void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){ - const int w2= (width+1)>>1; - IDWTELEM temp[width >> 1]; - const int w_l= (width>>1); - const int w_r= w2 - 1; - int i; - - { // Lift 0 - IDWTELEM * const ref = b + w2 - 1; - - i = 1; - b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); - __asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "pcmpeqw %%mm3, %%mm3 \n\t" - "psllw $1, %%mm3 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "psllw $13, %%mm3 \n\t" - ::); - for(; i<w_l-7; i+=8){ - __asm__ volatile( - "movq (%1), %%mm2 \n\t" - "movq 8(%1), %%mm6 \n\t" - "paddw 2(%1), %%mm2 \n\t" - "paddw 10(%1), %%mm6 \n\t" - "paddw %%mm7, %%mm2 \n\t" - "paddw %%mm7, %%mm6 \n\t" - "pmulhw %%mm3, %%mm2 \n\t" - "pmulhw %%mm3, %%mm6 \n\t" - "paddw (%0), %%mm2 \n\t" - "paddw 8(%0), %%mm6 \n\t" - "movq %%mm2, (%0) \n\t" - "movq %%mm6, 8(%0) \n\t" - :: "r"(&b[i]), "r"(&ref[i]) - : 
"memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); - } - - { // Lift 1 - IDWTELEM * const dst = b+w2; - - i = 0; - for(; i<w_r-7; i+=8){ - __asm__ volatile( - "movq (%1), %%mm2 \n\t" - "movq 8(%1), %%mm6 \n\t" - "paddw 2(%1), %%mm2 \n\t" - "paddw 10(%1), %%mm6 \n\t" - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm4 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm6, %%mm4 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm4, 8(%0) \n\t" - :: "r"(&dst[i]), "r"(&b[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); - } - - { // Lift 2 - IDWTELEM * const ref = b+w2 - 1; - - i = 1; - b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); - __asm__ volatile( - "psllw $15, %%mm7 \n\t" - "pcmpeqw %%mm6, %%mm6 \n\t" - "psrlw $13, %%mm6 \n\t" - "paddw %%mm7, %%mm6 \n\t" - ::); - for(; i<w_l-7; i+=8){ - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm4 \n\t" - "movq 2(%1), %%mm1 \n\t" - "movq 10(%1), %%mm5 \n\t" - "paddw %%mm6, %%mm0 \n\t" - "paddw %%mm6, %%mm4 \n\t" - "paddw %%mm7, %%mm1 \n\t" - "paddw %%mm7, %%mm5 \n\t" - "pavgw %%mm1, %%mm0 \n\t" - "pavgw %%mm5, %%mm4 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psubw %%mm7, %%mm4 \n\t" - "psraw $1, %%mm0 \n\t" - "psraw $1, %%mm4 \n\t" - "movq (%0), %%mm1 \n\t" - "movq 8(%0), %%mm5 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm5, %%mm4 \n\t" - "psraw $2, %%mm0 \n\t" - "psraw $2, %%mm4 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm5, %%mm4 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm4, 8(%0) \n\t" - :: "r"(&b[i]), "r"(&ref[i]) - : "memory" - ); - } - snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); - } - - { // Lift 3 - IDWTELEM * const src = b+w2; - i = 0; - - for(; i<w_r-7; i+=8){ - __asm__ volatile( - "movq 2(%1), %%mm2 \n\t" - "movq 10(%1), %%mm6 \n\t" - "paddw (%1), %%mm2 \n\t" - "paddw 8(%1), %%mm6 \n\t" - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm4 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm6, %%mm4 \n\t" - "psraw $1, %%mm2 \n\t" - "psraw $1, %%mm6 \n\t" - "paddw %%mm0, %%mm2 \n\t" - "paddw %%mm4, %%mm6 \n\t" - "movq %%mm2, (%2) \n\t" - "movq %%mm6, 8(%2) \n\t" - :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) - : "memory" - ); - } - snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); - } - - { - snow_interleave_line_header(&i, width, b, temp); - - for (; (i & 0x1E) != 0x1E; i-=2){ - b[i+1] = temp[i>>1]; - b[i] = b[i>>1]; - } - for (i-=30; i>=0; i-=32){ - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm6 \n\t" - "movq (%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm5 \n\t" - "movq 24(%1), %%mm7 \n\t" - "punpcklwd (%2), %%mm0 \n\t" - "punpcklwd 8(%2), %%mm2 \n\t" - "punpcklwd 16(%2), %%mm4 \n\t" - "punpcklwd 24(%2), %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, 16(%0) \n\t" - "movq %%mm4, 32(%0) \n\t" - "movq %%mm6, 48(%0) \n\t" - "punpckhwd (%2), %%mm1 \n\t" - "punpckhwd 8(%2), %%mm3 \n\t" - "punpckhwd 16(%2), %%mm5 \n\t" - "punpckhwd 24(%2), %%mm7 \n\t" - "movq %%mm1, 8(%0) \n\t" - "movq %%mm3, 24(%0) \n\t" - "movq %%mm5, 40(%0) \n\t" - "movq %%mm7, 56(%0) \n\t" - :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) - : "memory" - ); - } - } -} - -#ifdef HAVE_7REGS -#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ - ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ - ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ - ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ - ""op" 48("r",%%"REG_d"), %%"t3" 
\n\t" - -#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ - snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) - -#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ - snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) - -#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ - "psubw %%"s0", %%"t0" \n\t"\ - "psubw %%"s1", %%"t1" \n\t"\ - "psubw %%"s2", %%"t2" \n\t"\ - "psubw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ - "movdqa %%"s0", ("w",%%"REG_d") \n\t"\ - "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ - "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ - "movdqa %%"s3", 48("w",%%"REG_d") \n\t" - -#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ - "psraw $"n", %%"t0" \n\t"\ - "psraw $"n", %%"t1" \n\t"\ - "psraw $"n", %%"t2" \n\t"\ - "psraw $"n", %%"t3" \n\t" - -#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ - "paddw %%"s0", %%"t0" \n\t"\ - "paddw %%"s1", %%"t1" \n\t"\ - "paddw %%"s2", %%"t2" \n\t"\ - "paddw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ - "pmulhw %%"s0", %%"t0" \n\t"\ - "pmulhw %%"s1", %%"t1" \n\t"\ - "pmulhw %%"s2", %%"t2" \n\t"\ - "pmulhw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ - "movdqa %%"s0", %%"t0" \n\t"\ - "movdqa %%"s1", %%"t1" \n\t"\ - "movdqa %%"s2", %%"t2" \n\t"\ - "movdqa %%"s3", %%"t3" \n\t" - -void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ - x86_reg i = width; - - while(i & 0x1F) - { - i--; - b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; - b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; - b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; - b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; - } - i+=i; - - __asm__ volatile ( - "jmp 2f \n\t" - "1: \n\t" - snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") - - - "pcmpeqw %%xmm0, %%xmm0 \n\t" - "pcmpeqw %%xmm2, %%xmm2 \n\t" - "paddw %%xmm2, %%xmm2 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "psllw $13, %%xmm2 \n\t" - snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") - - "pcmpeqw %%xmm7, %%xmm7 \n\t" - "pcmpeqw %%xmm5, %%xmm5 \n\t" - "psllw $15, %%xmm7 \n\t" - "psrlw $13, %%xmm5 \n\t" - "paddw %%xmm7, %%xmm5 \n\t" - snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") - "movq (%2,%%"REG_d"), %%xmm1 \n\t" - "movq 8(%2,%%"REG_d"), %%xmm3 \n\t" - "paddw %%xmm7, %%xmm1 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "pavgw %%xmm1, %%xmm0 \n\t" - "pavgw %%xmm3, %%xmm2 \n\t" - "movq 16(%2,%%"REG_d"), %%xmm1 \n\t" - "movq 24(%2,%%"REG_d"), %%xmm3 \n\t" - "paddw %%xmm7, %%xmm1 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "pavgw %%xmm1, %%xmm4 \n\t" - "pavgw %%xmm3, %%xmm6 \n\t" - snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") - 
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") - - snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") - - "2: \n\t" - "sub $64, %%"REG_d" \n\t" - "jge 1b \n\t" - :"+d"(i) - :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); -} - -#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ - ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ - ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ - ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ - ""op" 24("r",%%"REG_d"), %%"t3" \n\t" - -#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ - snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) - -#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ - snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) - -#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ - "movq %%"s0", ("w",%%"REG_d") \n\t"\ - "movq %%"s1", 8("w",%%"REG_d") \n\t"\ - "movq %%"s2", 16("w",%%"REG_d") \n\t"\ - "movq %%"s3", 24("w",%%"REG_d") \n\t" - -#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ - "movq %%"s0", %%"t0" \n\t"\ - "movq %%"s1", %%"t1" \n\t"\ - "movq %%"s2", %%"t2" \n\t"\ - "movq %%"s3", %%"t3" \n\t" - - -void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ - x86_reg i = width; - while(i & 15) - { - i--; - b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; - b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; - b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; - b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; - } - i+=i; - __asm__ volatile( - "jmp 2f \n\t" - "1: \n\t" - - snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") - "pcmpeqw %%mm0, %%mm0 \n\t" - "pcmpeqw %%mm2, %%mm2 \n\t" - "paddw %%mm2, %%mm2 \n\t" - "paddw %%mm0, %%mm2 \n\t" - "psllw $13, %%mm2 \n\t" - snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") - snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") - snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") - "pcmpeqw %%mm7, %%mm7 \n\t" - "pcmpeqw %%mm5, %%mm5 \n\t" - "psllw $15, %%mm7 \n\t" - "psrlw $13, %%mm5 \n\t" - "paddw %%mm7, %%mm5 \n\t" - snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") - "movq (%2,%%"REG_d"), %%mm1 \n\t" - "movq 8(%2,%%"REG_d"), %%mm3 \n\t" - "paddw %%mm7, %%mm1 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "pavgw %%mm1, %%mm0 \n\t" - "pavgw %%mm3, %%mm2 \n\t" - "movq 16(%2,%%"REG_d"), %%mm1 \n\t" - "movq 24(%2,%%"REG_d"), %%mm3 \n\t" - "paddw %%mm7, %%mm1 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "pavgw %%mm1, %%mm4 \n\t" - "pavgw %%mm3, %%mm6 
\n\t" - snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") - - snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") - snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") - snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") - - "2: \n\t" - "sub $32, %%"REG_d" \n\t" - "jge 1b \n\t" - :"+d"(i) - :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); -} -#endif //HAVE_7REGS - -#define snow_inner_add_yblock_sse2_header \ - IDWTELEM * * dst_array = sb->line + src_y;\ - x86_reg tmp;\ - __asm__ volatile(\ - "mov %7, %%"REG_c" \n\t"\ - "mov %6, %2 \n\t"\ - "mov %4, %%"REG_S" \n\t"\ - "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ - "pcmpeqd %%xmm3, %%xmm3 \n\t"\ - "psllw $15, %%xmm3 \n\t"\ - "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ - "1: \n\t"\ - "mov %1, %%"REG_D" \n\t"\ - "mov (%%"REG_D"), %%"REG_D" \n\t"\ - "add %3, %%"REG_D" \n\t" - -#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movq (%%"REG_d"), %%"out_reg1" \n\t"\ - "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ - "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ - "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - "pmullw %%xmm0, %%"out_reg1" \n\t"\ - "pmullw %%xmm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movq (%%"REG_d"), %%"out_reg1" \n\t"\ - "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ - "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ - "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - "pmullw %%xmm0, %%"out_reg1" \n\t"\ - "pmullw %%xmm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ - snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ - "paddusw %%xmm2, %%xmm1 \n\t"\ - "paddusw %%xmm6, %%xmm5 \n\t" - -#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ - snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ - "paddusw %%xmm2, %%xmm1 \n\t"\ - "paddusw %%xmm6, %%xmm5 \n\t" - -#define snow_inner_add_yblock_sse2_end_common1\ - "add $32, %%"REG_S" \n\t"\ - "add %%"REG_c", %0 \n\t"\ - "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ - "add %%"REG_c", (%%"REG_a") \n\t" - -#define snow_inner_add_yblock_sse2_end_common2\ - "jnz 1b \n\t"\ - :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ - :\ - "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"((x86_reg)b_h),"m"((x86_reg)src_stride):\ - "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); - -#define snow_inner_add_yblock_sse2_end_8\ - "sal $1, 
%%"REG_c" \n\t"\ - "add $"PTR_SIZE"*2, %1 \n\t"\ - snow_inner_add_yblock_sse2_end_common1\ - "sar $1, %%"REG_c" \n\t"\ - "sub $2, %2 \n\t"\ - snow_inner_add_yblock_sse2_end_common2 - -#define snow_inner_add_yblock_sse2_end_16\ - "add $"PTR_SIZE"*1, %1 \n\t"\ - snow_inner_add_yblock_sse2_end_common1\ - "dec %2 \n\t"\ - snow_inner_add_yblock_sse2_end_common2 - -static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_sse2_header -snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") -snow_inner_add_yblock_sse2_accum_8("2", "8") -snow_inner_add_yblock_sse2_accum_8("1", "128") -snow_inner_add_yblock_sse2_accum_8("0", "136") - - "mov %0, %%"REG_d" \n\t" - "movdqa (%%"REG_D"), %%xmm0 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - - "punpckhwd %%xmm7, %%xmm1 \n\t" - "punpcklwd %%xmm7, %%xmm2 \n\t" - "paddd %%xmm2, %%xmm0 \n\t" - "movdqa 16(%%"REG_D"), %%xmm2 \n\t" - "paddd %%xmm1, %%xmm2 \n\t" - "paddd %%xmm3, %%xmm0 \n\t" - "paddd %%xmm3, %%xmm2 \n\t" - - "mov %1, %%"REG_D" \n\t" - "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" - "add %3, %%"REG_D" \n\t" - - "movdqa (%%"REG_D"), %%xmm4 \n\t" - "movdqa %%xmm5, %%xmm6 \n\t" - "punpckhwd %%xmm7, %%xmm5 \n\t" - "punpcklwd %%xmm7, %%xmm6 \n\t" - "paddd %%xmm6, %%xmm4 \n\t" - "movdqa 16(%%"REG_D"), %%xmm6 \n\t" - "paddd %%xmm5, %%xmm6 \n\t" - "paddd %%xmm3, %%xmm4 \n\t" - "paddd %%xmm3, %%xmm6 \n\t" - - "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ - "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ - "packssdw %%xmm2, %%xmm0 \n\t" - "packuswb %%xmm7, %%xmm0 \n\t" - "movq %%xmm0, (%%"REG_d") \n\t" - - "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ - "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ - "packssdw %%xmm6, %%xmm4 \n\t" - "packuswb %%xmm7, %%xmm4 \n\t" - "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" -snow_inner_add_yblock_sse2_end_8 -} - -static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_sse2_header -snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") -snow_inner_add_yblock_sse2_accum_16("2", "16") -snow_inner_add_yblock_sse2_accum_16("1", "512") -snow_inner_add_yblock_sse2_accum_16("0", "528") - - "mov %0, %%"REG_d" \n\t" - "psrlw $4, %%xmm1 \n\t" - "psrlw $4, %%xmm5 \n\t" - "paddw (%%"REG_D"), %%xmm1 \n\t" - "paddw 16(%%"REG_D"), %%xmm5 \n\t" - "paddw %%xmm3, %%xmm1 \n\t" - "paddw %%xmm3, %%xmm5 \n\t" - "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ - "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. 
*/ - "packuswb %%xmm5, %%xmm1 \n\t" - - "movdqu %%xmm1, (%%"REG_d") \n\t" - -snow_inner_add_yblock_sse2_end_16 -} - -#define snow_inner_add_yblock_mmx_header \ - IDWTELEM * * dst_array = sb->line + src_y;\ - x86_reg tmp;\ - __asm__ volatile(\ - "mov %7, %%"REG_c" \n\t"\ - "mov %6, %2 \n\t"\ - "mov %4, %%"REG_S" \n\t"\ - "pxor %%mm7, %%mm7 \n\t" /* 0 */\ - "pcmpeqd %%mm3, %%mm3 \n\t"\ - "psllw $15, %%mm3 \n\t"\ - "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ - "1: \n\t"\ - "mov %1, %%"REG_D" \n\t"\ - "mov (%%"REG_D"), %%"REG_D" \n\t"\ - "add %3, %%"REG_D" \n\t" - -#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ - "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ - "punpcklbw %%mm7, %%"out_reg1" \n\t"\ - "punpcklbw %%mm7, %%"out_reg2" \n\t"\ - "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ - "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "pmullw %%mm0, %%"out_reg1" \n\t"\ - "pmullw %%mm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ - snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ - "paddusw %%mm2, %%mm1 \n\t"\ - "paddusw %%mm6, %%mm5 \n\t" - -#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ - "mov %0, %%"REG_d" \n\t"\ - "psrlw $4, %%mm1 \n\t"\ - "psrlw $4, %%mm5 \n\t"\ - "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ - "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psraw $4, %%mm1 \n\t"\ - "psraw $4, %%mm5 \n\t"\ - "packuswb %%mm5, %%mm1 \n\t"\ - "movq %%mm1, "write_offset"(%%"REG_d") \n\t" - -#define snow_inner_add_yblock_mmx_end(s_step)\ - "add $"s_step", %%"REG_S" \n\t"\ - "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ - "add %%"REG_c", (%%"REG_a") \n\t"\ - "add $"PTR_SIZE"*1, %1 \n\t"\ - "add %%"REG_c", %0 \n\t"\ - "dec %2 \n\t"\ - "jnz 1b \n\t"\ - :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ - :\ - "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"((x86_reg)b_h),"m"((x86_reg)src_stride):\ - "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); - -static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_mmx_header -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") -snow_inner_add_yblock_mmx_accum("2", "8", "0") -snow_inner_add_yblock_mmx_accum("1", "128", "0") -snow_inner_add_yblock_mmx_accum("0", "136", "0") -snow_inner_add_yblock_mmx_mix("0", "0") -snow_inner_add_yblock_mmx_end("16") -} - -static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_mmx_header -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") -snow_inner_add_yblock_mmx_accum("2", "16", "0") -snow_inner_add_yblock_mmx_accum("1", "512", "0") -snow_inner_add_yblock_mmx_accum("0", "528", "0") -snow_inner_add_yblock_mmx_mix("0", "0") - -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") -snow_inner_add_yblock_mmx_accum("2", "24", "8") 
-snow_inner_add_yblock_mmx_accum("1", "520", "8") -snow_inner_add_yblock_mmx_accum("0", "536", "8") -snow_inner_add_yblock_mmx_mix("16", "8") -snow_inner_add_yblock_mmx_end("32") -} - -void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ - - if (b_w == 16) - inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else if (b_w == 8 && obmc_stride == 16) { - if (!(b_h & 1)) - inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else - inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - } else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); -} - -void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ - if (b_w == 16) - inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else if (b_w == 8 && obmc_stride == 16) - inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); -}
--- a/i386/vc1dsp_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,490 +0,0 @@ -/* - * VC-1 and WMV3 - DSP functions MMX-optimized - * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" - -/** Add rounder from mm7 to mm3 and pack result at destination */ -#define NORMALIZE_MMX(SHIFT) \ - "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \ - "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \ - "psraw "SHIFT", %%mm3 \n\t" \ - "psraw "SHIFT", %%mm4 \n\t" - -#define TRANSFER_DO_PACK \ - "packuswb %%mm4, %%mm3 \n\t" \ - "movq %%mm3, (%2) \n\t" - -#define TRANSFER_DONT_PACK \ - "movq %%mm3, 0(%2) \n\t" \ - "movq %%mm4, 8(%2) \n\t" - -/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */ -#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t" -#define DONT_UNPACK(reg) - -/** Compute the rounder 32-r or 8-r and unpacks it to mm7 */ -#define LOAD_ROUNDER_MMX(ROUND) \ - "movd "ROUND", %%mm7 \n\t" \ - "punpcklwd %%mm7, %%mm7 \n\t" \ - "punpckldq %%mm7, %%mm7 \n\t" - -#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \ - "paddw %%mm"#R2", %%mm"#R1" \n\t" \ - "movd (%0,%3), %%mm"#R0" \n\t" \ - "pmullw %%mm6, %%mm"#R1" \n\t" \ - "punpcklbw %%mm0, %%mm"#R0" \n\t" \ - "movd (%0,%2), %%mm"#R3" \n\t" \ - "psubw %%mm"#R0", %%mm"#R1" \n\t" \ - "punpcklbw %%mm0, %%mm"#R3" \n\t" \ - "paddw %%mm7, %%mm"#R1" \n\t" \ - "psubw %%mm"#R3", %%mm"#R1" \n\t" \ - "psraw %4, %%mm"#R1" \n\t" \ - "movq %%mm"#R1", "#OFF"(%1) \n\t" \ - "add %2, %0 \n\t" - -DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL; - -/** Sacrifying mm6 allows to pipeline loads from src */ -static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, - const uint8_t *src, x86_reg stride, - int rnd, int64_t shift) -{ - __asm__ volatile( - "mov $3, %%"REG_c" \n\t" - LOAD_ROUNDER_MMX("%5") - "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" - "1: \n\t" - "movd (%0), %%mm2 \n\t" - "add %2, %0 \n\t" - "movd (%0), %%mm3 \n\t" - "punpcklbw %%mm0, %%mm2 \n\t" - "punpcklbw %%mm0, %%mm3 \n\t" - SHIFT2_LINE( 0, 1, 2, 3, 4) - SHIFT2_LINE( 24, 2, 3, 4, 1) - SHIFT2_LINE( 48, 3, 4, 1, 2) - SHIFT2_LINE( 72, 4, 1, 2, 3) - SHIFT2_LINE( 96, 1, 2, 3, 4) - SHIFT2_LINE(120, 2, 3, 4, 1) - SHIFT2_LINE(144, 3, 4, 1, 2) - SHIFT2_LINE(168, 4, 1, 2, 3) - "sub %6, %0 \n\t" - "add $8, %1 \n\t" - "dec %%"REG_c" \n\t" - "jnz 1b \n\t" - : "+r"(src), "+r"(dst) - : "r"(stride), 
"r"(-2*stride), - "m"(shift), "m"(rnd), "r"(9*stride-4) - : "%"REG_c, "memory" - ); -} - -/** - * Data is already unpacked, so some operations can directly be made from - * memory. - */ -static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride, - const int16_t *src, int rnd) -{ - int h = 8; - - src -= 1; - rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ - __asm__ volatile( - LOAD_ROUNDER_MMX("%4") - "movq "MANGLE(ff_pw_128)", %%mm6\n\t" - "movq "MANGLE(ff_pw_9)", %%mm5 \n\t" - "1: \n\t" - "movq 2*0+0(%1), %%mm1 \n\t" - "movq 2*0+8(%1), %%mm2 \n\t" - "movq 2*1+0(%1), %%mm3 \n\t" - "movq 2*1+8(%1), %%mm4 \n\t" - "paddw 2*3+0(%1), %%mm1 \n\t" - "paddw 2*3+8(%1), %%mm2 \n\t" - "paddw 2*2+0(%1), %%mm3 \n\t" - "paddw 2*2+8(%1), %%mm4 \n\t" - "pmullw %%mm5, %%mm3 \n\t" - "pmullw %%mm5, %%mm4 \n\t" - "psubw %%mm1, %%mm3 \n\t" - "psubw %%mm2, %%mm4 \n\t" - NORMALIZE_MMX("$7") - /* Remove bias */ - "paddw %%mm6, %%mm3 \n\t" - "paddw %%mm6, %%mm4 \n\t" - TRANSFER_DO_PACK - "add $24, %1 \n\t" - "add %3, %2 \n\t" - "decl %0 \n\t" - "jnz 1b \n\t" - : "+r"(h), "+r" (src), "+r" (dst) - : "r"(stride), "m"(rnd) - : "memory" - ); -} - - -/** - * Purely vertical or horizontal 1/2 shift interpolation. - * Sacrify mm6 for *9 factor. - */ -static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src, - x86_reg stride, int rnd, x86_reg offset) -{ - rnd = 8-rnd; - __asm__ volatile( - "mov $8, %%"REG_c" \n\t" - LOAD_ROUNDER_MMX("%5") - "movq "MANGLE(ff_pw_9)", %%mm6\n\t" - "1: \n\t" - "movd 0(%0 ), %%mm3 \n\t" - "movd 4(%0 ), %%mm4 \n\t" - "movd 0(%0,%2), %%mm1 \n\t" - "movd 4(%0,%2), %%mm2 \n\t" - "add %2, %0 \n\t" - "punpcklbw %%mm0, %%mm3 \n\t" - "punpcklbw %%mm0, %%mm4 \n\t" - "punpcklbw %%mm0, %%mm1 \n\t" - "punpcklbw %%mm0, %%mm2 \n\t" - "paddw %%mm1, %%mm3 \n\t" - "paddw %%mm2, %%mm4 \n\t" - "movd 0(%0,%3), %%mm1 \n\t" - "movd 4(%0,%3), %%mm2 \n\t" - "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/ - "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/ - "punpcklbw %%mm0, %%mm1 \n\t" - "punpcklbw %%mm0, %%mm2 \n\t" - "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/ - "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/ - "movd 0(%0,%2), %%mm1 \n\t" - "movd 4(%0,%2), %%mm2 \n\t" - "punpcklbw %%mm0, %%mm1 \n\t" - "punpcklbw %%mm0, %%mm2 \n\t" - "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/ - "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/ - NORMALIZE_MMX("$4") - "packuswb %%mm4, %%mm3 \n\t" - "movq %%mm3, (%1) \n\t" - "add %6, %0 \n\t" - "add %4, %1 \n\t" - "dec %%"REG_c" \n\t" - "jnz 1b \n\t" - : "+r"(src), "+r"(dst) - : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd), - "g"(stride-offset) - : "%"REG_c, "memory" - ); -} - -/** - * Filter coefficients made global to allow access by all 1 or 3 quarter shift - * interpolation functions. - */ -DECLARE_ASM_CONST(16, uint64_t, ff_pw_53) = 0x0035003500350035ULL; -DECLARE_ASM_CONST(16, uint64_t, ff_pw_18) = 0x0012001200120012ULL; - -/** - * Core of the 1/4 and 3/4 shift bicubic interpolation. - * - * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty). - * @param MOVQ "movd 1" or "movq 2", if data read is already unpacked. - * @param A1 Address of 1st tap (beware of unpacked/packed). 
- * @param A2 Address of 2nd tap - * @param A3 Address of 3rd tap - * @param A4 Address of 4th tap - */ -#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \ - MOVQ "*0+"A1", %%mm1 \n\t" \ - MOVQ "*4+"A1", %%mm2 \n\t" \ - UNPACK("%%mm1") \ - UNPACK("%%mm2") \ - "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \ - "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \ - MOVQ "*0+"A2", %%mm3 \n\t" \ - MOVQ "*4+"A2", %%mm4 \n\t" \ - UNPACK("%%mm3") \ - UNPACK("%%mm4") \ - "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ - "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \ - "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \ - "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \ - MOVQ "*0+"A4", %%mm1 \n\t" \ - MOVQ "*4+"A4", %%mm2 \n\t" \ - UNPACK("%%mm1") \ - UNPACK("%%mm2") \ - "psllw $2, %%mm1 \n\t" /* 4* */ \ - "psllw $2, %%mm2 \n\t" /* 4* */ \ - "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \ - "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \ - MOVQ "*0+"A3", %%mm1 \n\t" \ - MOVQ "*4+"A3", %%mm2 \n\t" \ - UNPACK("%%mm1") \ - UNPACK("%%mm2") \ - "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ - "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \ - "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \ - "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */ - -/** - * Macro to build the vertical 16bits version of vc1_put_shift[13]. - * Here, offset=src_stride. Parameters passed A1 to A4 must use - * %3 (src_stride) and %4 (3*src_stride). - * - * @param NAME Either 1 or 3 - * @see MSPEL_FILTER13_CORE for information on A1->A4 - */ -#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ -static void \ -vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \ - x86_reg src_stride, \ - int rnd, int64_t shift) \ -{ \ - int h = 8; \ - src -= src_stride; \ - __asm__ volatile( \ - LOAD_ROUNDER_MMX("%5") \ - "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \ - "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ - ASMALIGN(3) \ - "1: \n\t" \ - MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ - NORMALIZE_MMX("%6") \ - TRANSFER_DONT_PACK \ - /* Last 3 (in fact 4) bytes on the line */ \ - "movd 8+"A1", %%mm1 \n\t" \ - DO_UNPACK("%%mm1") \ - "movq %%mm1, %%mm3 \n\t" \ - "paddw %%mm1, %%mm1 \n\t" \ - "paddw %%mm3, %%mm1 \n\t" /* 3* */ \ - "movd 8+"A2", %%mm3 \n\t" \ - DO_UNPACK("%%mm3") \ - "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ - "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \ - "movd 8+"A3", %%mm1 \n\t" \ - DO_UNPACK("%%mm1") \ - "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ - "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \ - "movd 8+"A4", %%mm1 \n\t" \ - DO_UNPACK("%%mm1") \ - "psllw $2, %%mm1 \n\t" /* 4* */ \ - "psubw %%mm1, %%mm3 \n\t" \ - "paddw %%mm7, %%mm3 \n\t" \ - "psraw %6, %%mm3 \n\t" \ - "movq %%mm3, 16(%2) \n\t" \ - "add %3, %1 \n\t" \ - "add $24, %2 \n\t" \ - "decl %0 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(h), "+r" (src), "+r" (dst) \ - : "r"(src_stride), "r"(3*src_stride), \ - "m"(rnd), "m"(shift) \ - : "memory" \ - ); \ -} - -/** - * Macro to build the horizontal 16bits version of vc1_put_shift[13]. - * Here, offset=16bits, so parameters passed A1 to A4 should be simple. 
- * - * @param NAME Either 1 or 3 - * @see MSPEL_FILTER13_CORE for information on A1->A4 - */ -#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4) \ -static void \ -vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ - const int16_t *src, int rnd) \ -{ \ - int h = 8; \ - src -= 1; \ - rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ - __asm__ volatile( \ - LOAD_ROUNDER_MMX("%4") \ - "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ - "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ - ASMALIGN(3) \ - "1: \n\t" \ - MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ - NORMALIZE_MMX("$7") \ - /* Remove bias */ \ - "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ - "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ - TRANSFER_DO_PACK \ - "add $24, %1 \n\t" \ - "add %3, %2 \n\t" \ - "decl %0 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(h), "+r" (src), "+r" (dst) \ - : "r"(stride), "m"(rnd) \ - : "memory" \ - ); \ -} - -/** - * Macro to build the 8bits, any direction, version of vc1_put_shift[13]. - * Here, offset=src_stride. Parameters passed A1 to A4 must use - * %3 (offset) and %4 (3*offset). - * - * @param NAME Either 1 or 3 - * @see MSPEL_FILTER13_CORE for information on A1->A4 - */ -#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4) \ -static void \ -vc1_put_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ - x86_reg stride, int rnd, x86_reg offset) \ -{ \ - int h = 8; \ - src -= offset; \ - rnd = 32-rnd; \ - __asm__ volatile ( \ - LOAD_ROUNDER_MMX("%6") \ - "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ - "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ - ASMALIGN(3) \ - "1: \n\t" \ - MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ - NORMALIZE_MMX("$6") \ - TRANSFER_DO_PACK \ - "add %5, %1 \n\t" \ - "add %5, %2 \n\t" \ - "decl %0 \n\t" \ - "jnz 1b \n\t" \ - : "+r"(h), "+r" (src), "+r" (dst) \ - : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ - : "memory" \ - ); \ -} - -/** 1/4 shift bicubic interpolation */ -MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") -MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") -MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)") - -/** 3/4 shift bicubic interpolation */ -MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") -MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") -MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)") - -typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift); -typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd); -typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset); - -/** - * Interpolates fractional pel values by applying proper vertical then - * horizontal filter. - * - * @param dst Destination buffer for interpolated pels. - * @param src Source buffer. - * @param stride Stride for both src and dst buffers. - * @param hmode Horizontal filter (expressed in quarter pixels shift). - * @param hmode Vertical filter. - * @param rnd Rounding bias. 
- */ -static void vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride, - int hmode, int vmode, int rnd) -{ - static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] = - { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx }; - static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] = - { NULL, vc1_put_hor_16b_shift1_mmx, vc1_put_hor_16b_shift2_mmx, vc1_put_hor_16b_shift3_mmx }; - static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = - { NULL, vc1_put_shift1_mmx, vc1_put_shift2_mmx, vc1_put_shift3_mmx }; - - __asm__ volatile( - "pxor %%mm0, %%mm0 \n\t" - ::: "memory" - ); - - if (vmode) { /* Vertical filter to apply */ - if (hmode) { /* Horizontal filter to apply, output to tmp */ - static const int shift_value[] = { 0, 5, 1, 5 }; - int shift = (shift_value[hmode]+shift_value[vmode])>>1; - int r; - DECLARE_ALIGNED_16(int16_t, tmp[12*8]); - - r = (1<<(shift-1)) + rnd-1; - vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); - - vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); - return; - } - else { /* No horizontal filter, output 8 lines to dst */ - vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); - return; - } - } - - /* Horizontal mode with no vertical mode */ - vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); -} - -void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); - -/** Macro to ease bicubic filter interpolation functions declarations */ -#define DECLARE_FUNCTION(a, b) \ -static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ - vc1_mspel_mc(dst, src, stride, a, b, rnd); \ -} - -DECLARE_FUNCTION(0, 1) -DECLARE_FUNCTION(0, 2) -DECLARE_FUNCTION(0, 3) - -DECLARE_FUNCTION(1, 0) -DECLARE_FUNCTION(1, 1) -DECLARE_FUNCTION(1, 2) -DECLARE_FUNCTION(1, 3) - -DECLARE_FUNCTION(2, 0) -DECLARE_FUNCTION(2, 1) -DECLARE_FUNCTION(2, 2) -DECLARE_FUNCTION(2, 3) - -DECLARE_FUNCTION(3, 0) -DECLARE_FUNCTION(3, 1) -DECLARE_FUNCTION(3, 2) -DECLARE_FUNCTION(3, 3) - -void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { - dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; - dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; - dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; - dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; - dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; - dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; - dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; - dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; - dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; - dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; - - dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; - dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; - dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; - dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; -}
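
The init function at the end of the removed i386/vc1dsp_mmx.c fills a 16-entry function table; reading the assignments, entry hmode + 4*vmode holds the interpolator for a horizontal quarter-pel shift hmode and a vertical quarter-pel shift vmode. A hedged sketch of that indexing follows; only the table layout is taken from the file, while the selection helper and its name are hypothetical:

    #include <stdint.h>

    /* Signature shared by the entries that ff_vc1dsp_init_mmx() installs in
     * put_vc1_mspel_pixels_tab[]. */
    typedef void (*vc1_mspel_mc_fn)(uint8_t *dst, const uint8_t *src, int stride, int rnd);

    /* Hypothetical helper: pick the interpolator for the given quarter-pel shifts.
     * tab[0] is the plain copy case (mc00); tab[hmode + 4*vmode] the filtered cases. */
    static vc1_mspel_mc_fn select_vc1_mspel_mc(vc1_mspel_mc_fn tab[16], int hmode, int vmode)
    {
        return tab[(hmode & 3) + 4 * (vmode & 3)];
    }

Internally, each filtered entry goes through vc1_mspel_mc(), which either runs the vertical 16-bit filter into a temporary buffer followed by the horizontal filter, or a single 8-bit pass when only one direction needs filtering.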
--- a/i386/vp3dsp_mmx.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,396 +0,0 @@ -/* - * Copyright (C) 2004 the ffmpeg project - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file vp3dsp_mmx.c - * MMX-optimized functions cribbed from the original VP3 source code. - */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" - -extern const uint16_t ff_vp3_idct_data[]; - -// this is off by one or two for some cases when filter_limit is greater than 63 -// in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 -// out: p1 in mm4, p2 in mm3 -#define VP3_LOOP_FILTER(flim) \ - "movq %%mm6, %%mm7 \n\t" \ - "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \ - "psrlw $3, %%mm7 \n\t" \ - "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \ - "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \ - "pxor %%mm4, %%mm2 \n\t" \ - "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \ - "movq %%mm2, %%mm5 \n\t" \ - "paddb %%mm2, %%mm2 \n\t" \ - "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \ - "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \ - "pcmpeqb %%mm0, %%mm0 \n\t" \ - "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \ - "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \ - "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \ - "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \ - "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \ - "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \ - "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \ - "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \ - "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \ - "psubusb %%mm7, %%mm6 \n\t" \ - "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \ -\ - "movq "#flim", %%mm5 \n\t" \ - "pminub %%mm5, %%mm6 \n\t" \ - "pminub %%mm5, %%mm7 \n\t" \ - "movq %%mm6, %%mm0 \n\t" \ - "movq %%mm7, %%mm1 \n\t" \ - "paddb %%mm6, %%mm6 \n\t" \ - "paddb %%mm7, %%mm7 \n\t" \ - "pminub %%mm5, %%mm6 \n\t" \ - "pminub %%mm5, %%mm7 \n\t" \ - "psubb %%mm0, %%mm6 \n\t" \ - "psubb %%mm1, %%mm7 \n\t" \ - "paddusb %%mm7, %%mm4 \n\t" \ - "psubusb %%mm6, %%mm4 \n\t" \ - "psubusb %%mm7, %%mm3 \n\t" \ - "paddusb %%mm6, %%mm3 \n\t" - -#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \ - "movd "#mm", %0 \n\t" \ - "movw %w0, -1"#dst0" \n\t" \ - "psrlq $32, "#mm" \n\t" \ - "shr $16, %0 \n\t" \ - "movw %w0, -1"#dst1" \n\t" \ - "movd "#mm", %0 \n\t" \ - "movw %w0, -1"#dst2" \n\t" \ - "shr $16, %0 \n\t" \ - "movw %w0, -1"#dst3" \n\t" - -void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) -{ - __asm__ volatile( - "movq %0, %%mm6 \n\t" - "movq %1, %%mm4 \n\t" - "movq %2, %%mm2 \n\t" - "movq %3, %%mm1 \n\t" - - VP3_LOOP_FILTER(%4) - - "movq %%mm4, %1 \n\t" - "movq %%mm3, %2 \n\t" - - : "+m" (*(uint64_t*)(src - 2*stride)), - "+m" (*(uint64_t*)(src - 1*stride)), - "+m" 
(*(uint64_t*)(src + 0*stride)), - "+m" (*(uint64_t*)(src + 1*stride)) - : "m"(*(uint64_t*)(bounding_values+129)) - ); -} - -void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) -{ - x86_reg tmp; - - __asm__ volatile( - "movd -2(%1), %%mm6 \n\t" - "movd -2(%1,%3), %%mm0 \n\t" - "movd -2(%1,%3,2), %%mm1 \n\t" - "movd -2(%1,%4), %%mm4 \n\t" - - TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) - VP3_LOOP_FILTER(%5) - SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) - - STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) - STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) - - : "=&r"(tmp) - : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), - "m"(*(uint64_t*)(bounding_values+129)) - : "memory" - ); -} - -/* from original comments: The Macro does IDct on 4 1-D Dcts */ -#define BeginIDCT() \ - "movq "I(3)", %%mm2 \n\t" \ - "movq "C(3)", %%mm6 \n\t" \ - "movq %%mm2, %%mm4 \n\t" \ - "movq "J(5)", %%mm7 \n\t" \ - "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \ - "movq "C(5)", %%mm1 \n\t" \ - "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \ - "movq %%mm1, %%mm5 \n\t" \ - "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \ - "movq "I(1)", %%mm3 \n\t" \ - "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \ - "movq "C(1)", %%mm0 \n\t" \ - "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \ - "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \ - "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \ - "movq "J(7)", %%mm1 \n\t" \ - "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \ - "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \ - "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \ - "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \ - "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \ - "movq "C(7)", %%mm7 \n\t" \ - "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \ - "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \ - "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \ - "movq "I(2)", %%mm2 \n\t" \ - "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \ - "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \ - "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \ - "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \ - "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \ - "movq "J(6)", %%mm5 \n\t" \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \ - "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \ - "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \ - "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \ - "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \ - "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \ - "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \ - "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \ - "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \ - "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \ - "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \ - "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \ - "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \ - "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \ - "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \ - "movq "C(4)", %%mm4 \n\t" \ - "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \ - "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ - "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ - "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \ - "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \ - "movq "I(0)", %%mm6 \n\t" \ - "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \ - "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \ - "movq "J(4)", %%mm3 \n\t" \ - "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. 
- H */ \ - "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \ - "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \ - "movq %%mm6, %%mm0 \n\t" \ - "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \ - "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \ - "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \ - "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \ - "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \ - "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \ - "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \ - "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \ - "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \ - "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \ - "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \ - "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */ - -/* RowIDCT gets ready to transpose */ -#define RowIDCT() \ - BeginIDCT() \ - "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ - "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ - "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ - "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ - "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \ - "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ - "paddsw %%mm3, %%mm3 \n\t" \ - "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ - "paddsw %%mm5, %%mm5 \n\t" \ - "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ - "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ - "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ - "paddsw %%mm0, %%mm0 \n\t" \ - "movq %%mm1, "I(1)"\n\t" /* save R1 */ \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ - -/* Column IDCT normalizes and stores final results */ -#define ColumnIDCT() \ - BeginIDCT() \ - "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ - "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ - "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \ - "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ - "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \ - "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ - "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ - "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \ - "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ - "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \ - "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ - "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \ - "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \ - "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ - "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \ - "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ - "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \ - "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \ - "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \ - "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ - "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \ - "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \ - "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \ - "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \ - "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ - "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \ - "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. 
*/ \ - "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \ - "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \ - "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \ - "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \ - "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \ - "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */ - -/* Following macro does two 4x4 transposes in place. - - At entry (we assume): - - r0 = a3 a2 a1 a0 - I(1) = b3 b2 b1 b0 - r2 = c3 c2 c1 c0 - r3 = d3 d2 d1 d0 - - r4 = e3 e2 e1 e0 - r5 = f3 f2 f1 f0 - r6 = g3 g2 g1 g0 - r7 = h3 h2 h1 h0 - - At exit, we have: - - I(0) = d0 c0 b0 a0 - I(1) = d1 c1 b1 a1 - I(2) = d2 c2 b2 a2 - I(3) = d3 c3 b3 a3 - - J(4) = h0 g0 f0 e0 - J(5) = h1 g1 f1 e1 - J(6) = h2 g2 f2 e2 - J(7) = h3 g3 f3 e3 - - I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. - J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. - - Since r1 is free at entry, we calculate the Js first. */ -#define Transpose() \ - "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \ - "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \ - "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \ - "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \ - "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \ - "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \ - "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \ - "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \ - "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \ - "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \ - "movq %%mm4, "J(4)"\n\t" \ - "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \ - "movq %%mm5, "J(5)"\n\t" \ - "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \ - "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \ - "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \ - "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \ - "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \ - "movq %%mm6, "J(7)"\n\t" \ - "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \ - "movq %%mm1, "J(6)"\n\t" \ - "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \ - "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \ - "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \ - "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \ - "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \ - "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \ - "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \ - "movq %%mm0, "I(0)"\n\t" \ - "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \ - "movq %%mm1, "I(1)"\n\t" \ - "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \ - "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \ - "movq %%mm4, "I(3)"\n\t" \ - "movq %%mm2, "I(2)"\n\t" - -void ff_vp3_idct_mmx(int16_t *output_data) -{ - /* eax = quantized input - * ebx = dequantizer matrix - * ecx = IDCT constants - * M(I) = ecx + MaskOffset(0) + I * 8 - * C(I) = ecx + CosineOffset(32) + (I-1) * 8 - * edx = output - * r0..r7 = mm0..mm7 - */ - -#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" -#define OC_8 "%2" - - /* at this point, function has completed dequantization + dezigzag + - * partial transposition; now do the idct itself */ -#define I(x) AV_STRINGIFY(16* x )"(%0)" -#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" - - __asm__ volatile ( - RowIDCT() - Transpose() - -#undef I -#undef J -#define I(x) AV_STRINGIFY(16* x + 64)"(%0)" -#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" - - RowIDCT() - Transpose() - -#undef I -#undef J -#define I(x) AV_STRINGIFY(16*x)"(%0)" -#define J(x) AV_STRINGIFY(16*x)"(%0)" - - ColumnIDCT() - -#undef I -#undef J 
-#define I(x) AV_STRINGIFY(16*x + 8)"(%0)" -#define J(x) AV_STRINGIFY(16*x + 8)"(%0)" - - ColumnIDCT() - :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) - ); -#undef I -#undef J - -} - -void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_mmx(block); - put_signed_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_mmx(block); - add_pixels_clamped_mmx(block, dest, line_size); -}
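ff_vp3_idct_put_mmx() and ff_vp3_idct_add_mmx() above differ only in the final store: the MMX transform leaves signed coefficients in block, and the dsputil helpers then either bias them by 128 ("put") or add them to the pixels already in dest ("add"), saturating to 8 bits. A scalar sketch of those two stores, for orientation only and not part of the changeset:

    #include <stdint.h>

    static uint8_t clamp_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* Plain-C counterpart of put_signed_pixels_clamped_mmx(). */
    static void put_signed_pixels_clamped_c(const int16_t *block, uint8_t *dest, int line_size)
    {
        for (int y = 0; y < 8; y++, dest += line_size, block += 8)
            for (int x = 0; x < 8; x++)
                dest[x] = clamp_u8(block[x] + 128);
    }

    /* Plain-C counterpart of add_pixels_clamped_mmx(). */
    static void add_pixels_clamped_c(const int16_t *block, uint8_t *dest, int line_size)
    {
        for (int y = 0; y < 8; y++, dest += line_size, block += 8)
            for (int x = 0; x < 8; x++)
                dest[x] = clamp_u8(dest[x] + block[x]);
    }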
--- a/i386/vp3dsp_mmx.h Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -/* - * vp3dsp MMX function declarations - * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_I386_VP3DSP_MMX_H -#define AVCODEC_I386_VP3DSP_MMX_H - -#include <stdint.h> -#include "libavcodec/dsputil.h" - -void ff_vp3_idct_mmx(int16_t *data); -void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); - -void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); -void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); - -#endif /* AVCODEC_I386_VP3DSP_MMX_H */
--- a/i386/vp3dsp_sse2.c Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,186 +0,0 @@ -/* - * Copyright (C) 2004 the ffmpeg project - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file vp3dsp_sse2.c - * SSE2-optimized functions cribbed from the original VP3 source code. - */ - -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" - -DECLARE_ALIGNED_16(const uint16_t, ff_vp3_idct_data[7 * 8]) = -{ - 64277,64277,64277,64277,64277,64277,64277,64277, - 60547,60547,60547,60547,60547,60547,60547,60547, - 54491,54491,54491,54491,54491,54491,54491,54491, - 46341,46341,46341,46341,46341,46341,46341,46341, - 36410,36410,36410,36410,36410,36410,36410,36410, - 25080,25080,25080,25080,25080,25080,25080,25080, - 12785,12785,12785,12785,12785,12785,12785,12785 -}; - - -#define VP3_1D_IDCT_SSE2(ADD, SHIFT) \ - "movdqa "I(3)", %%xmm2 \n\t" /* xmm2 = i3 */ \ - "movdqa "C(3)", %%xmm6 \n\t" /* xmm6 = c3 */ \ - "movdqa %%xmm2, %%xmm4 \n\t" /* xmm4 = i3 */ \ - "movdqa "I(5)", %%xmm7 \n\t" /* xmm7 = i5 */ \ - "pmulhw %%xmm6, %%xmm4 \n\t" /* xmm4 = c3 * i3 - i3 */ \ - "movdqa "C(5)", %%xmm1 \n\t" /* xmm1 = c5 */ \ - "pmulhw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 - i5 */ \ - "movdqa %%xmm1, %%xmm5 \n\t" /* xmm5 = c5 */ \ - "pmulhw %%xmm2, %%xmm1 \n\t" /* xmm1 = c5 * i3 - i3 */ \ - "movdqa "I(1)", %%xmm3 \n\t" /* xmm3 = i1 */ \ - "pmulhw %%xmm7, %%xmm5 \n\t" /* xmm5 = c5 * i5 - i5 */ \ - "movdqa "C(1)", %%xmm0 \n\t" /* xmm0 = c1 */ \ - "paddw %%xmm2, %%xmm4 \n\t" /* xmm4 = c3 * i3 */ \ - "paddw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 */ \ - "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = c5 * i3 */ \ - "movdqa "I(7)", %%xmm1 \n\t" /* xmm1 = i7 */ \ - "paddw %%xmm5, %%xmm7 \n\t" /* xmm7 = c5 * i5 */ \ - "movdqa %%xmm0, %%xmm5 \n\t" /* xmm5 = c1 */ \ - "pmulhw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 - i1 */ \ - "paddsw %%xmm7, %%xmm4 \n\t" /* xmm4 = c3 * i3 + c5 * i5 = C */ \ - "pmulhw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 - i7 */ \ - "movdqa "C(7)", %%xmm7 \n\t" /* xmm7 = c7 */ \ - "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = c3 * i5 - c5 * i3 = D */ \ - "paddw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 */ \ - "pmulhw %%xmm7, %%xmm3 \n\t" /* xmm3 = c7 * i1 */ \ - "movdqa "I(2)", %%xmm2 \n\t" /* xmm2 = i2 */ \ - "pmulhw %%xmm1, %%xmm7 \n\t" /* xmm7 = c7 * i7 */ \ - "paddw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 */ \ - "movdqa %%xmm2, %%xmm1 \n\t" /* xmm1 = i2 */ \ - "pmulhw "C(2)", %%xmm2 \n\t" /* xmm2 = i2 * c2 -i2 */ \ - "psubsw %%xmm5, %%xmm3 \n\t" /* xmm3 = c7 * i1 - c1 * i7 = B */ \ - "movdqa "I(6)", %%xmm5 \n\t" /* xmm5 = i6 */ \ - "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = c1 * i1 + c7 * i7 = A */ \ - "movdqa %%xmm5, %%xmm7 \n\t" /* xmm7 = i6 */ \ - "psubsw %%xmm4, %%xmm0 \n\t" /* xmm0 = A - C */ \ - "pmulhw "C(2)", %%xmm5 \n\t" /* xmm5 = c2 * i6 - i6 */ \ - "paddw 
%%xmm1, %%xmm2 \n\t" /* xmm2 = i2 * c2 */ \ - "pmulhw "C(6)", %%xmm1 \n\t" /* xmm1 = c6 * i2 */ \ - "paddsw %%xmm4, %%xmm4 \n\t" /* xmm4 = C + C */ \ - "paddsw %%xmm0, %%xmm4 \n\t" /* xmm4 = A + C = C. */ \ - "psubsw %%xmm6, %%xmm3 \n\t" /* xmm3 = B - D */ \ - "paddw %%xmm7, %%xmm5 \n\t" /* xmm5 = c2 * i6 */ \ - "paddsw %%xmm6, %%xmm6 \n\t" /* xmm6 = D + D */ \ - "pmulhw "C(6)", %%xmm7 \n\t" /* xmm7 = c6 * i6 */ \ - "paddsw %%xmm3, %%xmm6 \n\t" /* xmm6 = B + D = D. */ \ - "movdqa %%xmm4, "I(1)" \n\t" /* Save C. at I(1) */ \ - "psubsw %%xmm5, %%xmm1 \n\t" /* xmm1 = c6 * i2 - c2 * i6 = H */ \ - "movdqa "C(4)", %%xmm4 \n\t" /* xmm4 = c4 */ \ - "movdqa %%xmm3, %%xmm5 \n\t" /* xmm5 = B - D */ \ - "pmulhw %%xmm4, %%xmm3 \n\t" /* xmm3 = ( c4 -1 ) * ( B - D ) */ \ - "paddsw %%xmm2, %%xmm7 \n\t" /* xmm7 = c2 * i2 + c6 * i6 = G */ \ - "movdqa %%xmm6, "I(2)" \n\t" /* Save D. at I(2) */ \ - "movdqa %%xmm0, %%xmm2 \n\t" /* xmm2 = A - C */ \ - "movdqa "I(0)", %%xmm6 \n\t" /* xmm6 = i0 */ \ - "pmulhw %%xmm4, %%xmm0 \n\t" /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \ - "paddw %%xmm3, %%xmm5 \n\t" /* xmm5 = c4 * ( B - D ) = B. */ \ - "movdqa "I(4)", %%xmm3 \n\t" /* xmm3 = i4 */ \ - "psubsw %%xmm1, %%xmm5 \n\t" /* xmm5 = B. - H = B.. */ \ - "paddw %%xmm0, %%xmm2 \n\t" /* xmm2 = c4 * ( A - C) = A. */ \ - "psubsw %%xmm3, %%xmm6 \n\t" /* xmm6 = i0 - i4 */ \ - "movdqa %%xmm6, %%xmm0 \n\t" /* xmm0 = i0 - i4 */ \ - "pmulhw %%xmm4, %%xmm6 \n\t" /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \ - "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = i4 + i4 */ \ - "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H + H */ \ - "paddsw %%xmm0, %%xmm3 \n\t" /* xmm3 = i0 + i4 */ \ - "paddsw %%xmm5, %%xmm1 \n\t" /* xmm1 = B. + H = H. */ \ - "pmulhw %%xmm3, %%xmm4 \n\t" /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ - "paddw %%xmm0, %%xmm6 \n\t" /* xmm6 = c4 * ( i0 - i4 ) */ \ - "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = F - A. = F. */ \ - "paddsw %%xmm2, %%xmm2 \n\t" /* xmm2 = A. + A. */ \ - "movdqa "I(1)", %%xmm0 \n\t" /* Load C. from I(1) */ \ - "paddsw %%xmm6, %%xmm2 \n\t" /* xmm2 = F + A. = A.. */ \ - "paddw %%xmm3, %%xmm4 \n\t" /* xmm4 = c4 * ( i0 + i4 ) = 3 */ \ - "psubsw %%xmm1, %%xmm2 \n\t" /* xmm2 = A.. - H. = R2 */ \ - ADD(%%xmm2) /* Adjust R2 and R1 before shifting */ \ - "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H. + H. */ \ - "paddsw %%xmm2, %%xmm1 \n\t" /* xmm1 = A.. + H. = R1 */ \ - SHIFT(%%xmm2) /* xmm2 = op2 */ \ - "psubsw %%xmm7, %%xmm4 \n\t" /* xmm4 = E - G = E. */ \ - SHIFT(%%xmm1) /* xmm1 = op1 */ \ - "movdqa "I(2)", %%xmm3 \n\t" /* Load D. from I(2) */ \ - "paddsw %%xmm7, %%xmm7 \n\t" /* xmm7 = G + G */ \ - "paddsw %%xmm4, %%xmm7 \n\t" /* xmm7 = E + G = G. */ \ - "psubsw %%xmm3, %%xmm4 \n\t" /* xmm4 = E. - D. = R4 */ \ - ADD(%%xmm4) /* Adjust R4 and R3 before shifting */ \ - "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = D. + D. */ \ - "paddsw %%xmm4, %%xmm3 \n\t" /* xmm3 = E. + D. = R3 */ \ - SHIFT(%%xmm4) /* xmm4 = op4 */ \ - "psubsw %%xmm5, %%xmm6 \n\t" /* xmm6 = F. - B..= R6 */ \ - SHIFT(%%xmm3) /* xmm3 = op3 */ \ - ADD(%%xmm6) /* Adjust R6 and R5 before shifting */ \ - "paddsw %%xmm5, %%xmm5 \n\t" /* xmm5 = B.. + B.. */ \ - "paddsw %%xmm6, %%xmm5 \n\t" /* xmm5 = F. + B.. = R5 */ \ - SHIFT(%%xmm6) /* xmm6 = op6 */ \ - SHIFT(%%xmm5) /* xmm5 = op5 */ \ - "psubsw %%xmm0, %%xmm7 \n\t" /* xmm7 = G. - C. = R7 */ \ - ADD(%%xmm7) /* Adjust R7 and R0 before shifting */ \ - "paddsw %%xmm0, %%xmm0 \n\t" /* xmm0 = C. + C. */ \ - "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = G. + C. 
*/ \ - SHIFT(%%xmm7) /* xmm7 = op7 */ \ - SHIFT(%%xmm0) /* xmm0 = op0 */ - -#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) \ - "movdqa " #r0 ", " O(0) "\n\t" \ - "movdqa " #r1 ", " O(1) "\n\t" \ - "movdqa " #r2 ", " O(2) "\n\t" \ - "movdqa " #r3 ", " O(3) "\n\t" \ - "movdqa " #r4 ", " O(4) "\n\t" \ - "movdqa " #r5 ", " O(5) "\n\t" \ - "movdqa " #r6 ", " O(6) "\n\t" \ - "movdqa " #r7 ", " O(7) "\n\t" - -#define NOP(xmm) -#define SHIFT4(xmm) "psraw $4, "#xmm"\n\t" -#define ADD8(xmm) "paddsw %2, "#xmm"\n\t" - -void ff_vp3_idct_sse2(int16_t *input_data) -{ -#define I(x) AV_STRINGIFY(16*x)"(%0)" -#define O(x) I(x) -#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" - - __asm__ volatile ( - VP3_1D_IDCT_SSE2(NOP, NOP) - - TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0)) - PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1) - - VP3_1D_IDCT_SSE2(ADD8, SHIFT4) - PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) - :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) - ); -} - -void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_sse2(block); - put_signed_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_sse2(block); - add_pixels_clamped_mmx(block, dest, line_size); -}
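The ff_vp3_idct_data[] table defined at the top of this file is round(cos(k*pi/16) * 65536) for k = 1..7. The first five values exceed 0x7FFF, so pmulhw (a signed 16x16 multiply keeping the high half) effectively computes c*x - x for them, which is why each such pmulhw is followed by a paddw that adds the input back (see the "c3*i3 - i3" style comments in both the MMX and SSE2 versions). A small stand-alone check of the constants, illustrative and not part of the changeset:

    /* Prints 64277 60547 54491 46341 36410 25080 12785,
     * matching ff_vp3_idct_data[] above. */
    #include <math.h>
    #include <stdio.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    int main(void)
    {
        for (int k = 1; k <= 7; k++)
            printf("%.0f\n", floor(cos(k * M_PI / 16.0) * 65536.0 + 0.5));
        return 0;
    }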
--- a/i386/vp3dsp_sse2.h Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -/* - * vp3dsp SSE2 function declarations - * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_I386_VP3DSP_SSE2_H -#define AVCODEC_I386_VP3DSP_SSE2_H - -#include "libavcodec/dsputil.h" - -void ff_vp3_idct_sse2(int16_t *input_data); -void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); - -#endif /* AVCODEC_I386_VP3DSP_SSE2_H */
--- a/i386/x86inc.asm Mon Dec 22 06:50:18 2008 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,540 +0,0 @@ -;***************************************************************************** -;* x86inc.asm -;***************************************************************************** -;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;***************************************************************************** - -; FIXME: All of the 64bit asm functions that take a stride as an argument -; via register, assume that the high dword of that register is filled with 0. -; This is true in practice (since we never do any 64bit arithmetic on strides, -; and x264's strides are all positive), but is not guaranteed by the ABI. - -; Name of the .rodata section. -; Kludge: Something on OS X fails to align .rodata even given an align attribute, -; so use a different read-only section. -%macro SECTION_RODATA 0 - %ifidn __OUTPUT_FORMAT__,macho64 - SECTION .text align=16 - %elifidn __OUTPUT_FORMAT__,macho - SECTION .text align=16 - fakegot: - %else - SECTION .rodata align=16 - %endif -%endmacro - -; PIC support macros. All these macros are totally harmless when PIC is -; not defined but can ruin everything if misused in PIC mode. On x86_32, shared -; objects cannot directly access global variables by address, they need to -; go through the GOT (global offset table). Most OSes do not care about it -; and let you load non-shared .so objects (Linux, Win32...). However, OS X -; requires PIC code in its .dylib objects. -; -; - GLOBAL should be used as a suffix for global addressing, eg. -; picgetgot ebx -; mov eax, [foo GLOBAL] -; instead of -; mov eax, [foo] -; -; - picgetgot computes the GOT address into the given register in PIC -; mode, otherwise does nothing. You need to do this before using GLOBAL. -; Before in both execution order and compiled code order (so GLOBAL knows -; which register the GOT is in). - -%ifndef PIC - %define GLOBAL - %macro picgetgot 1 - %endmacro -%elifdef ARCH_X86_64 - %define PIC64 - %define GLOBAL wrt rip - %macro picgetgot 1 - %endmacro -%else - %define PIC32 - %ifidn __OUTPUT_FORMAT__,macho - ; There is no real global offset table on OS X, but we still - ; need to reference our variables by offset. 
- %macro picgetgot 1 - call %%getgot - %%getgot: - pop %1 - add %1, $$ - %%getgot - %undef GLOBAL - %define GLOBAL + %1 - fakegot - %endmacro - %else ; elf - extern _GLOBAL_OFFSET_TABLE_ - %macro picgetgot 1 - call %%getgot - %%getgot: - pop %1 - add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc - %undef GLOBAL - %define GLOBAL + %1 wrt ..gotoff - %endmacro - %endif -%endif - -; Macros to eliminate most code duplication between x86_32 and x86_64: -; Currently this works only for leaf functions which load all their arguments -; into registers at the start, and make no other use of the stack. Luckily that -; covers most of x264's asm. - -; PROLOGUE: -; %1 = number of arguments. loads them from stack if needed. -; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed. -; %3 = whether global constants are used in this function. inits x86_32 PIC if needed. -; %4 = list of names to define to registers -; PROLOGUE can also be invoked by adding the same options to cglobal - -; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals - -; TODO Some functions can use some args directly from the stack. If they're the -; last args then you can just not declare them, but if they're in the middle -; we need more flexible macro. - -; RET: -; Pops anything that was pushed by PROLOGUE - -; REP_RET: -; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons -; which are slow when a normal ret follows a branch. - -%macro DECLARE_REG 6 - %define r%1q %2 - %define r%1d %3 - %define r%1w %4 - %define r%1b %5 - %define r%1m %6 - %define r%1 %2 -%endmacro - -%macro DECLARE_REG_SIZE 2 - %define r%1q r%1 - %define e%1q r%1 - %define r%1d e%1 - %define e%1d e%1 - %define r%1w %1 - %define e%1w %1 - %define r%1b %2 - %define e%1b %2 -%ifndef ARCH_X86_64 - %define r%1 e%1 -%endif -%endmacro - -DECLARE_REG_SIZE ax, al -DECLARE_REG_SIZE bx, bl -DECLARE_REG_SIZE cx, cl -DECLARE_REG_SIZE dx, dl -DECLARE_REG_SIZE si, sil -DECLARE_REG_SIZE di, dil -DECLARE_REG_SIZE bp, bpl - -%ifdef ARCH_X86_64 - %define gprsize 8 -%else - %define gprsize 4 -%endif - -%macro PUSH 1 - push %1 - %assign stack_offset stack_offset+gprsize -%endmacro - -%macro POP 1 - pop %1 - %assign stack_offset stack_offset-gprsize -%endmacro - -%macro SUB 2 - sub %1, %2 - %ifidn %1, rsp - %assign stack_offset stack_offset+(%2) - %endif -%endmacro - -%macro ADD 2 - add %1, %2 - %ifidn %1, rsp - %assign stack_offset stack_offset-(%2) - %endif -%endmacro - -%macro movifnidn 2 - %ifnidn %1, %2 - mov %1, %2 - %endif -%endmacro - -%macro movsxdifnidn 2 - %ifnidn %1, %2 - movsxd %1, %2 - %endif -%endmacro - -%macro ASSERT 1 - %if (%1) == 0 - %error assert failed - %endif -%endmacro - -%macro DEFINE_ARGS 0-* - %ifdef n_arg_names - %assign %%i 0 - %rep n_arg_names - CAT_UNDEF arg_name %+ %%i, q - CAT_UNDEF arg_name %+ %%i, d - CAT_UNDEF arg_name %+ %%i, w - CAT_UNDEF arg_name %+ %%i, b - CAT_UNDEF arg_name, %%i - %assign %%i %%i+1 - %endrep - %endif - - %assign %%i 0 - %rep %0 - %xdefine %1q r %+ %%i %+ q - %xdefine %1d r %+ %%i %+ d - %xdefine %1w r %+ %%i %+ w - %xdefine %1b r %+ %%i %+ b - CAT_XDEFINE arg_name, %%i, %1 - %assign %%i %%i+1 - %rotate 1 - %endrep - %assign n_arg_names %%i -%endmacro - -%ifdef ARCH_X86_64 ;========================================================== -%ifidn __OUTPUT_FORMAT__,win32 - -DECLARE_REG 0, rcx, ecx, cx, cl, ecx -DECLARE_REG 1, rdx, edx, dx, dl, edx -DECLARE_REG 2, r8, r8d, r8w, 
r8b, r8d -DECLARE_REG 3, r9, r9d, r9w, r9b, r9d -DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40] -DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48] -DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] -%define r7m [rsp + stack_offset + 64] -%define r8m [rsp + stack_offset + 72] - -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [rsp + 8 + %1*8] - %endif -%endmacro - -%else ;======================================================================= - -DECLARE_REG 0, rdi, edi, di, dil, edi -DECLARE_REG 1, rsi, esi, si, sil, esi -DECLARE_REG 2, rdx, edx, dx, dl, edx -DECLARE_REG 3, rcx, ecx, cx, cl, ecx -DECLARE_REG 4, r8, r8d, r8w, r8b, r8d -DECLARE_REG 5, r9, r9d, r9w, r9b, r9d -DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] -%define r7m [rsp + stack_offset + 16] -%define r8m [rsp + stack_offset + 24] - -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [rsp - 40 + %1*8] - %endif -%endmacro - -%endif ; !WIN64 - -%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... - ASSERT %2 >= %1 - ASSERT %2 <= 7 - %assign stack_offset 0 -%ifidn __OUTPUT_FORMAT__,win32 - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 -%endif - LOAD_IF_USED 6, %1 - DEFINE_ARGS %4 -%endmacro - -%macro RET 0 - ret -%endmacro - -%macro REP_RET 0 - rep ret -%endmacro - -%else ; X86_32 ;============================================================== - -DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4] -DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8] -DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12] -DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16] -DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20] -DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24] -DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] -%define r7m [esp + stack_offset + 32] -%define r8m [esp + stack_offset + 36] -%define rsp esp - -%macro PUSH_IF_USED 1 ; reg_id - %if %1 < regs_used - push r%1 - %assign stack_offset stack_offset+4 - %endif -%endmacro - -%macro POP_IF_USED 1 ; reg_id - %if %1 < regs_used - pop r%1 - %endif -%endmacro - -%macro LOAD_IF_USED 2 ; reg_id, number_of_args - %if %1 < %2 - mov r%1, [esp + stack_offset + 4 + %1*4] - %endif -%endmacro - -%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... 
- ASSERT %2 >= %1 - %assign stack_offset 0 - %assign regs_used %2 - %ifdef PIC - %if %3 - %assign regs_used regs_used+1 - %endif - %endif - ASSERT regs_used <= 7 - PUSH_IF_USED 3 - PUSH_IF_USED 4 - PUSH_IF_USED 5 - PUSH_IF_USED 6 - LOAD_IF_USED 0, %1 - LOAD_IF_USED 1, %1 - LOAD_IF_USED 2, %1 - LOAD_IF_USED 3, %1 - LOAD_IF_USED 4, %1 - LOAD_IF_USED 5, %1 - LOAD_IF_USED 6, %1 - %if %3 - picgetgot r%2 - %endif - DEFINE_ARGS %4 -%endmacro - -%macro RET 0 - POP_IF_USED 6 - POP_IF_USED 5 - POP_IF_USED 4 - POP_IF_USED 3 - ret -%endmacro - -%macro REP_RET 0 - %if regs_used > 3 - RET - %else - rep ret - %endif -%endmacro - -%endif ;====================================================================== - - - -;============================================================================= -; arch-independent part -;============================================================================= - -%assign function_align 16 - -; Symbol prefix for C linkage -%macro cglobal 1-2+ - %xdefine %1 ff_%1 - %ifdef PREFIX - %xdefine %1 _ %+ %1 - %endif - %ifidn __OUTPUT_FORMAT__,elf - global %1:function hidden - %else - global %1 - %endif - align function_align - %1: - RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer - %if %0 > 1 - PROLOGUE %2 - %endif -%endmacro - -%macro cextern 1 - %ifdef PREFIX - extern _%1 - %define %1 _%1 - %else - extern %1 - %endif -%endmacro - -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits -%endif - -%assign FENC_STRIDE 16 -%assign FDEC_STRIDE 32 - -; merge mmx and sse* - -%macro CAT_XDEFINE 3 - %xdefine %1%2 %3 -%endmacro - -%macro CAT_UNDEF 2 - %undef %1%2 -%endmacro - -%macro INIT_MMX 0 - %define RESET_MM_PERMUTATION INIT_MMX - %define mmsize 8 - %define num_mmregs 8 - %define mova movq - %define movu movq - %define movh movd - %define movnt movntq - %assign %%i 0 - %rep 8 - CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i - %assign %%i %%i+1 - %endrep - %rep 8 - CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro INIT_XMM 0 - %define RESET_MM_PERMUTATION INIT_XMM - %define mmsize 16 - %define num_mmregs 8 - %ifdef ARCH_X86_64 - %define num_mmregs 16 - %endif - %define mova movdqa - %define movu movdqu - %define movh movq - %define movnt movntdq - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i - %assign %%i %%i+1 - %endrep -%endmacro - -INIT_MMX - -; I often want to use macros that permute their arguments. e.g. there's no -; efficient way to implement butterfly or transpose or dct without swapping some -; arguments. -; -; I would like to not have to manually keep track of the permutations: -; If I insert a permutation in the middle of a function, it should automatically -; change everything that follows. For more complex macros I may also have multiple -; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. -; -; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that -; permutes its arguments. It's equivalent to exchanging the contents of the -; registers, except that this way you exchange the register names instead, so it -; doesn't cost any cycles. 
- -%macro PERMUTE 2-* ; takes a list of pairs to swap -%rep %0/2 - %xdefine tmp%2 m%2 - %xdefine ntmp%2 nm%2 - %rotate 2 -%endrep -%rep %0/2 - %xdefine m%1 tmp%2 - %xdefine nm%1 ntmp%2 - %undef tmp%2 - %undef ntmp%2 - %rotate 2 -%endrep -%endmacro - -%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) -%rep %0-1 -%ifdef m%1 - %xdefine tmp m%1 - %xdefine m%1 m%2 - %xdefine m%2 tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 -%else - ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. - ; Be careful using this mode in nested macros though, as in some cases there may be - ; other copies of m# that have already been dereferenced and don't get updated correctly. - %xdefine %%n1 n %+ %1 - %xdefine %%n2 n %+ %2 - %xdefine tmp m %+ %%n1 - CAT_XDEFINE m, %%n1, m %+ %%n2 - CAT_XDEFINE m, %%n2, tmp - CAT_XDEFINE n, m %+ %%n1, %%n1 - CAT_XDEFINE n, m %+ %%n2, %%n2 -%endif - %undef tmp - %rotate 1 -%endrep -%endmacro - -%macro SAVE_MM_PERMUTATION 1 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE %1_m, %%i, m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro LOAD_MM_PERMUTATION 1 - %assign %%i 0 - %rep num_mmregs - CAT_XDEFINE m, %%i, %1_m %+ %%i - %assign %%i %%i+1 - %endrep -%endmacro - -%macro call 1 - call %1 - %ifdef %1_m0 - LOAD_MM_PERMUTATION %1 - %endif -%endmacro - -; substitutions which are functionally identical but reduce code size -%define movdqa movaps -%define movdqu movups -
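Most of x86inc.asm is the cglobal/PROLOGUE machinery commented above. From the C side nothing special is needed: cglobal prepends ff_ to the symbol (and a leading underscore when PREFIX is defined) and, on ELF, marks it hidden, so a plain prototype is enough. Taking the file's own example "cglobal foo, 2,3,0, dst, src, tmp", a caller would look roughly like this; foo and its argument types are purely illustrative:

    #include <stdint.h>

    /* For a routine assembled as "cglobal foo, 2,3,0, dst, src, tmp",
     * x86inc emits the symbol as ff_foo (hypothetical name and signature). */
    void ff_foo(uint8_t *dst, const uint8_t *src);

    static void call_foo(uint8_t *dst, const uint8_t *src)
    {
        ff_foo(dst, src);
    }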
--- a/imgconvert.c Mon Dec 22 06:50:18 2008 +0000
+++ b/imgconvert.c Mon Dec 22 09:12:42 2008 +0000
@@ -35,8 +35,8 @@
 #include "colorspace.h"
 
 #ifdef HAVE_MMX
-#include "i386/mmx.h"
-#include "i386/dsputil_mmx.h"
+#include "x86/mmx.h"
+#include "x86/dsputil_mmx.h"
 #endif
 
 #define xglue(x, y) x ## y
--- a/imgresample.c Mon Dec 22 06:50:18 2008 +0000
+++ b/imgresample.c Mon Dec 22 09:12:42 2008 +0000
@@ -155,7 +155,7 @@
 
 #ifdef HAVE_MMX
 
-#include "i386/mmx.h"
+#include "x86/mmx.h"
 
 #define FILTER4(reg) \
 {\
--- a/mathops.h Mon Dec 22 06:50:18 2008 +0000
+++ b/mathops.h Mon Dec 22 09:12:42 2008 +0000
@@ -26,7 +26,7 @@
 
 #ifdef ARCH_X86_32
 
-#include "i386/mathops.h"
+#include "x86/mathops.h"
 
 #elif defined(ARCH_ARM)
 
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/cavsdsp_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,497 @@ +/* + * Chinese AVS video (AVS1-P2, JiZhun profile) decoder. + * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de> + * + * MMX-optimized DSP functions, based on H.264 optimizations by + * Michael Niedermayer and Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/common.h" +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" +#include "dsputil_mmx.h" + +/***************************************************************************** + * + * inverse transform + * + ****************************************************************************/ + +static inline void cavs_idct8_1d(int16_t *block, uint64_t bias) +{ + __asm__ volatile( + "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */ + "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */ + "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */ + "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */ + "movq %%mm4, %%mm0 \n\t" + "movq %%mm5, %%mm3 \n\t" + "movq %%mm2, %%mm6 \n\t" + "movq %%mm7, %%mm1 \n\t" + + "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */ + "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */ + "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */ + "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */ + "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */ + "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */ + "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */ + "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */ + "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */ + "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */ + "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */ + "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */ + + "movq %%mm5, %%mm4 \n\t" + "movq %%mm7, %%mm6 \n\t" + "movq %%mm3, %%mm0 \n\t" + "movq %%mm1, %%mm2 \n\t" + SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */ + "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */ + "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */ + "paddw %%mm7, %%mm7 \n\t" + "paddw %%mm5, %%mm5 \n\t" + "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */ + "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */ + + SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */ + "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */ + "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */ + "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */ + "paddw %%mm1, %%mm1 \n\t" + "paddw %%mm3, %%mm3 \n\t" + "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */ + "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */ + + "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */ + "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */ + "movq %%mm2, %%mm4 \n\t" + "movq %%mm6, %%mm0 \n\t" + "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */ + "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */ + "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */ + "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 
*/ + "paddw %%mm2, %%mm2 \n\t" + "paddw %%mm0, %%mm0 \n\t" + "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */ + "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */ + + "movq (%0), %%mm2 \n\t" /* mm2 = src0 */ + "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */ + SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */ + "psllw $3, %%mm0 \n\t" + "psllw $3, %%mm2 \n\t" + "paddw %1, %%mm0 \n\t" /* add rounding bias */ + "paddw %1, %%mm2 \n\t" /* add rounding bias */ + + SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */ + SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */ + SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */ + SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */ + SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */ + SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */ + :: "r"(block), "m"(bias) + ); +} + +static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) +{ + int i; + DECLARE_ALIGNED_8(int16_t, b2[64]); + + for(i=0; i<2; i++){ + DECLARE_ALIGNED_8(uint64_t, tmp); + + cavs_idct8_1d(block+4*i, ff_pw_4); + + __asm__ volatile( + "psraw $3, %%mm7 \n\t" + "psraw $3, %%mm6 \n\t" + "psraw $3, %%mm5 \n\t" + "psraw $3, %%mm4 \n\t" + "psraw $3, %%mm3 \n\t" + "psraw $3, %%mm2 \n\t" + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm0 \n\t" + "movq %%mm7, %0 \n\t" + TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) + "movq %%mm0, 8(%1) \n\t" + "movq %%mm6, 24(%1) \n\t" + "movq %%mm7, 40(%1) \n\t" + "movq %%mm4, 56(%1) \n\t" + "movq %0, %%mm7 \n\t" + TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) + "movq %%mm7, (%1) \n\t" + "movq %%mm1, 16(%1) \n\t" + "movq %%mm0, 32(%1) \n\t" + "movq %%mm3, 48(%1) \n\t" + : "=m"(tmp) + : "r"(b2+32*i) + : "memory" + ); + } + + for(i=0; i<2; i++){ + cavs_idct8_1d(b2+4*i, ff_pw_64); + + __asm__ volatile( + "psraw $7, %%mm7 \n\t" + "psraw $7, %%mm6 \n\t" + "psraw $7, %%mm5 \n\t" + "psraw $7, %%mm4 \n\t" + "psraw $7, %%mm3 \n\t" + "psraw $7, %%mm2 \n\t" + "psraw $7, %%mm1 \n\t" + "psraw $7, %%mm0 \n\t" + "movq %%mm7, (%0) \n\t" + "movq %%mm5, 16(%0) \n\t" + "movq %%mm3, 32(%0) \n\t" + "movq %%mm1, 48(%0) \n\t" + "movq %%mm0, 64(%0) \n\t" + "movq %%mm2, 80(%0) \n\t" + "movq %%mm4, 96(%0) \n\t" + "movq %%mm6, 112(%0) \n\t" + :: "r"(b2+4*i) + : "memory" + ); + } + + add_pixels_clamped_mmx(b2, dst, stride); + + /* clear block */ + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "movq %%mm7, (%0) \n\t" + "movq %%mm7, 8(%0) \n\t" + "movq %%mm7, 16(%0) \n\t" + "movq %%mm7, 24(%0) \n\t" + "movq %%mm7, 32(%0) \n\t" + "movq %%mm7, 40(%0) \n\t" + "movq %%mm7, 48(%0) \n\t" + "movq %%mm7, 56(%0) \n\t" + "movq %%mm7, 64(%0) \n\t" + "movq %%mm7, 72(%0) \n\t" + "movq %%mm7, 80(%0) \n\t" + "movq %%mm7, 88(%0) \n\t" + "movq %%mm7, 96(%0) \n\t" + "movq %%mm7, 104(%0) \n\t" + "movq %%mm7, 112(%0) \n\t" + "movq %%mm7, 120(%0) \n\t" + :: "r" (block) + ); +} + +/***************************************************************************** + * + * motion compensation + * + ****************************************************************************/ + +/* vertical filter [-1 -2 96 42 -7 0] */ +#define QPEL_CAVSV1(A,B,C,D,E,F,OP) \ + "movd (%0), "#F" \n\t"\ + "movq "#C", %%mm6 \n\t"\ + "pmullw %5, %%mm6 \n\t"\ + "movq "#D", %%mm7 \n\t"\ + "pmullw %6, %%mm7 \n\t"\ + "psllw $3, "#E" \n\t"\ + "psubw "#E", %%mm6 \n\t"\ + "psraw $3, "#E" \n\t"\ + "paddw %%mm7, %%mm6 \n\t"\ + "paddw "#E", %%mm6 \n\t"\ + "paddw "#B", "#B" \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, "#F" \n\t"\ + "psubw "#B", %%mm6 \n\t"\ + 
"psraw $1, "#B" \n\t"\ + "psubw "#A", %%mm6 \n\t"\ + "paddw %4, %%mm6 \n\t"\ + "psraw $7, %%mm6 \n\t"\ + "packuswb %%mm6, %%mm6 \n\t"\ + OP(%%mm6, (%1), A, d) \ + "add %3, %1 \n\t" + +/* vertical filter [ 0 -1 5 5 -1 0] */ +#define QPEL_CAVSV2(A,B,C,D,E,F,OP) \ + "movd (%0), "#F" \n\t"\ + "movq "#C", %%mm6 \n\t"\ + "paddw "#D", %%mm6 \n\t"\ + "pmullw %5, %%mm6 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, "#F" \n\t"\ + "psubw "#B", %%mm6 \n\t"\ + "psubw "#E", %%mm6 \n\t"\ + "paddw %4, %%mm6 \n\t"\ + "psraw $3, %%mm6 \n\t"\ + "packuswb %%mm6, %%mm6 \n\t"\ + OP(%%mm6, (%1), A, d) \ + "add %3, %1 \n\t" + +/* vertical filter [ 0 -7 42 96 -2 -1] */ +#define QPEL_CAVSV3(A,B,C,D,E,F,OP) \ + "movd (%0), "#F" \n\t"\ + "movq "#C", %%mm6 \n\t"\ + "pmullw %6, %%mm6 \n\t"\ + "movq "#D", %%mm7 \n\t"\ + "pmullw %5, %%mm7 \n\t"\ + "psllw $3, "#B" \n\t"\ + "psubw "#B", %%mm6 \n\t"\ + "psraw $3, "#B" \n\t"\ + "paddw %%mm7, %%mm6 \n\t"\ + "paddw "#B", %%mm6 \n\t"\ + "paddw "#E", "#E" \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, "#F" \n\t"\ + "psubw "#E", %%mm6 \n\t"\ + "psraw $1, "#E" \n\t"\ + "psubw "#F", %%mm6 \n\t"\ + "paddw %4, %%mm6 \n\t"\ + "psraw $7, %%mm6 \n\t"\ + "packuswb %%mm6, %%mm6 \n\t"\ + OP(%%mm6, (%1), A, d) \ + "add %3, %1 \n\t" + + +#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ + int w= 2;\ + src -= 2*srcStride;\ + \ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ + VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\ + : "memory"\ + );\ + if(h==16){\ + __asm__ volatile(\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ + VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\ + : "memory"\ + );\ + }\ + src += 4-(h+5)*srcStride;\ + dst += 4-h*dstStride;\ + } + +#define QPEL_CAVS(OPNAME, OP, MMX)\ +static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %5, %%mm6 \n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw 
%%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "movq %6, %%mm5 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm5, %%mm1 \n\t"\ + "psraw $3, %%mm0 \n\t"\ + "psraw $3, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q) \ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+m"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ + : "memory"\ + );\ +}\ +\ +static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ +}\ +\ +static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ +}\ +\ +static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ +}\ +\ +static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ + +#define CAVS_MC(OPNAME, SIZE, MMX) \ +static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## 
MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\ +}\ + +#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" +#define AVG_3DNOW_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgusb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" +#define AVG_MMX2_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" + +QPEL_CAVS(put_, PUT_OP, 3dnow) +QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow) +QPEL_CAVS(put_, PUT_OP, mmx2) +QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2) + +CAVS_MC(put_, 8, 3dnow) +CAVS_MC(put_, 16,3dnow) +CAVS_MC(avg_, 8, 3dnow) +CAVS_MC(avg_, 16,3dnow) +CAVS_MC(put_, 8, mmx2) +CAVS_MC(put_, 16,mmx2) +CAVS_MC(avg_, 8, mmx2) +CAVS_MC(avg_, 16,mmx2) + +void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); +void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); +void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); +void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); + +void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx) { +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \ + c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \ + + dspfunc(put_cavs_qpel, 0, 16); + dspfunc(put_cavs_qpel, 1, 8); + dspfunc(avg_cavs_qpel, 0, 16); + dspfunc(avg_cavs_qpel, 1, 8); +#undef dspfunc + c->cavs_idct8_add = cavs_idct8_add_mmx; +} + +void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx) { +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \ + c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \ + + dspfunc(put_cavs_qpel, 0, 16); + dspfunc(put_cavs_qpel, 1, 8); + dspfunc(avg_cavs_qpel, 0, 16); + dspfunc(avg_cavs_qpel, 1, 8); +#undef dspfunc + c->cavs_idct8_add = cavs_idct8_add_mmx; +}
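The horizontal routine generated by QPEL_CAVS above applies the half-pel taps (-1, 5, 5, -1) with a +4 rounding term and a 3-bit shift, while the three QPEL_CAVSV* macros apply the vertical tap sets named in their comments; the avg_ variants additionally average the result with the destination. A scalar reference of the horizontal case, added only to make the asm easier to follow (not part of the changeset):

    #include <stdint.h>

    /* Plain-C equivalent of the cavs_qpel8_h routine above:
     * dst[x] = clip((5*(src[x] + src[x+1]) - (src[x-1] + src[x+2]) + 4) >> 3) */
    static void cavs_qpel8_h_c(uint8_t *dst, const uint8_t *src,
                               int dstStride, int srcStride)
    {
        for (int y = 0; y < 8; y++, dst += dstStride, src += srcStride)
            for (int x = 0; x < 8; x++) {
                int v = (5 * (src[x] + src[x + 1]) - (src[x - 1] + src[x + 2]) + 4) >> 3;
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
    }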
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/cpuid.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,134 @@ +/* + * CPU detection code, extracted from mmx.h + * (c)1997-99 by H. Dietz and R. Fisher + * Converted to C and improved by Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdlib.h> +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" + +#undef printf + +/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ +#define cpuid(index,eax,ebx,ecx,edx)\ + __asm__ volatile\ + ("mov %%"REG_b", %%"REG_S"\n\t"\ + "cpuid\n\t"\ + "xchg %%"REG_b", %%"REG_S\ + : "=a" (eax), "=S" (ebx),\ + "=c" (ecx), "=d" (edx)\ + : "0" (index)); + +/* Function to test if multimedia instructions are supported... */ +int mm_support(void) +{ + int rval = 0; + int eax, ebx, ecx, edx; + int max_std_level, max_ext_level, std_caps=0, ext_caps=0; + x86_reg a, c; + +#ifdef ARCH_X86_64 +#define PUSHF "pushfq\n\t" +#define POPF "popfq\n\t" +#else +#define PUSHF "pushfl\n\t" +#define POPF "popfl\n\t" +#endif + __asm__ volatile ( + /* See if CPUID instruction is supported ... */ + /* ... Get copies of EFLAGS into eax and ecx */ + PUSHF + "pop %0\n\t" + "mov %0, %1\n\t" + + /* ... Toggle the ID bit in one copy and store */ + /* to the EFLAGS reg */ + "xor $0x200000, %0\n\t" + "push %0\n\t" + POPF + + /* ... Get the (hopefully modified) EFLAGS */ + PUSHF + "pop %0\n\t" + : "=a" (a), "=c" (c) + : + : "cc" + ); + + if (a == c) + return 0; /* CPUID not supported */ + + cpuid(0, max_std_level, ebx, ecx, edx); + + if(max_std_level >= 1){ + cpuid(1, eax, ebx, ecx, std_caps); + if (std_caps & (1<<23)) + rval |= FF_MM_MMX; + if (std_caps & (1<<25)) + rval |= FF_MM_MMXEXT +#if !defined(__GNUC__) || __GNUC__ > 2 + | FF_MM_SSE; + if (std_caps & (1<<26)) + rval |= FF_MM_SSE2; + if (ecx & 1) + rval |= FF_MM_SSE3; + if (ecx & 0x00000200 ) + rval |= FF_MM_SSSE3 +#endif + ; + } + + cpuid(0x80000000, max_ext_level, ebx, ecx, edx); + + if(max_ext_level >= 0x80000001){ + cpuid(0x80000001, eax, ebx, ecx, ext_caps); + if (ext_caps & (1<<31)) + rval |= FF_MM_3DNOW; + if (ext_caps & (1<<30)) + rval |= FF_MM_3DNOWEXT; + if (ext_caps & (1<<23)) + rval |= FF_MM_MMX; + if (ext_caps & (1<<22)) + rval |= FF_MM_MMXEXT; + } + +#if 0 + av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s\n", + (rval&FF_MM_MMX) ? "MMX ":"", + (rval&FF_MM_MMXEXT) ? "MMX2 ":"", + (rval&FF_MM_SSE) ? "SSE ":"", + (rval&FF_MM_SSE2) ? "SSE2 ":"", + (rval&FF_MM_SSE3) ? "SSE3 ":"", + (rval&FF_MM_SSSE3) ? "SSSE3 ":"", + (rval&FF_MM_3DNOW) ? "3DNow ":"", + (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":""); +#endif + return rval; +} + +#ifdef TEST +int main ( void ) +{ + int mm_flags; + mm_flags = mm_support(); + printf("mm_support = 0x%08X\n",mm_flags); + return 0; +} +#endif
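mm_support() above returns a bit mask of FF_MM_* flags: standard CPUID feature bits 23, 25 and 26 map to MMX, MMXEXT/SSE and SSE2, ECX bits 0 and 9 to SSE3 and SSSE3, and extended leaf 0x80000001 supplies the 3DNow! bits. Callers probe it once and branch when installing function pointers, as x86/dnxhd_mmx.c below does when it tests mm_flags & FF_MM_SSE2. A minimal sketch of that pattern; the init function and its comments are illustrative, not part of the changeset:

    #include "libavcodec/dsputil.h"

    /* Hypothetical init routine: query the CPU once, then pick
     * implementations based on the FF_MM_* bits. */
    void example_dsputil_init_x86(DSPContext *c)
    {
        int flags = mm_support();

        if (flags & FF_MM_MMX) {
            /* install MMX function pointers here */
        }
        if (flags & FF_MM_SSE2) {
            /* override the hottest paths with SSE2 versions here */
        }
    }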
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dnxhd_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,58 @@ +/* + * VC3/DNxHD SIMD functions + * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com> + * + * VC-3 encoder funded by the British Broadcasting Corporation + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dnxhdenc.h" + +static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int line_size) +{ + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "movq (%0), %%xmm0 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm1 \n\t" + "movq (%0, %2), %%xmm2 \n\t" + "movq (%0, %2,2), %%xmm3 \n\t" + "punpcklbw %%xmm7, %%xmm0 \n\t" + "punpcklbw %%xmm7, %%xmm1 \n\t" + "punpcklbw %%xmm7, %%xmm2 \n\t" + "punpcklbw %%xmm7, %%xmm3 \n\t" + "movdqa %%xmm0, (%1) \n\t" + "movdqa %%xmm1, 16(%1) \n\t" + "movdqa %%xmm2, 32(%1) \n\t" + "movdqa %%xmm3, 48(%1) \n\t" + "movdqa %%xmm3 , 64(%1) \n\t" + "movdqa %%xmm2 , 80(%1) \n\t" + "movdqa %%xmm1 , 96(%1) \n\t" + "movdqa %%xmm0, 112(%1) \n\t" + : "+r" (pixels) + : "r" (block), "r" ((x86_reg)line_size) + ); +} + +void ff_dnxhd_init_mmx(DNXHDEncContext *ctx) +{ + if (mm_flags & FF_MM_SSE2) { + ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2; + } +}
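get_pixels_8x4_sym_sse2() above reads four 8-pixel rows, widens them to 16-bit words, and stores them twice: in order at byte offsets 0..48 and mirrored at 64..112, producing the vertically symmetric 8x8 block the DNxHD encoder feeds to its DCT. A plain C reference of that layout, assuming DCTELEM is int16_t:

/* C reference of the symmetric 8x4 block layout built by
 * get_pixels_8x4_sym_sse2(): rows 0..3 stored normally, rows 7..4 mirror them. */
#include <stdint.h>
#include <stdio.h>

typedef int16_t DCTELEM;

static void get_pixels_8x4_sym_c(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 8; x++) {
            DCTELEM v = pixels[y * line_size + x];
            block[y * 8 + x]       = v;   /* rows 0..3 in order   */
            block[(7 - y) * 8 + x] = v;   /* rows 7..4 mirrored   */
        }
    }
}

int main(void)
{
    uint8_t pix[4 * 8];
    DCTELEM block[64];
    for (int i = 0; i < 32; i++) pix[i] = (uint8_t)(i + 1);
    get_pixels_8x4_sym_c(block, pix, 8);
    printf("block[0]=%d block[56]=%d (mirror of row 0)\n", block[0], block[56]);
    return 0;
}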
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputil_h264_template_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, + * Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * MMX optimized version of (put|avg)_h264_chroma_mc8. + * H264_CHROMA_MC8_TMPL must be defined to the desired function name + * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg + * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function + */ +static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) +{ + const uint64_t *rnd_reg; + DECLARE_ALIGNED_8(uint64_t, AA); + DECLARE_ALIGNED_8(uint64_t, DD); + int i; + + if(y==0 && x==0) { + /* no filter needed */ + H264_CHROMA_MC8_MV0(dst, src, stride, h); + return; + } + + assert(x<8 && y<8 && x>=0 && y>=0); + + if(y==0 || x==0) + { + /* 1 dimensional filter only */ + const int dxy = x ? 1 : stride; + + rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3; + + __asm__ volatile( + "movd %0, %%mm5\n\t" + "movq %1, %%mm4\n\t" + "movq %2, %%mm6\n\t" /* mm6 = rnd */ + "punpcklwd %%mm5, %%mm5\n\t" + "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ + "pxor %%mm7, %%mm7\n\t" + "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */ + :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg)); + + for(i=0; i<h; i++) { + __asm__ volatile( + /* mm0 = src[0..7], mm1 = src[1..8] */ + "movq %0, %%mm0\n\t" + "movq %1, %%mm2\n\t" + :: "m"(src[0]), "m"(src[dxy])); + + __asm__ volatile( + /* [mm0,mm1] = A * src[0..7] */ + /* [mm2,mm3] = B * src[1..8] */ + "movq %%mm0, %%mm1\n\t" + "movq %%mm2, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpckhbw %%mm7, %%mm1\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "pmullw %%mm4, %%mm0\n\t" + "pmullw %%mm4, %%mm1\n\t" + "pmullw %%mm5, %%mm2\n\t" + "pmullw %%mm5, %%mm3\n\t" + + /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */ + "paddw %%mm6, %%mm0\n\t" + "paddw %%mm6, %%mm1\n\t" + "paddw %%mm2, %%mm0\n\t" + "paddw %%mm3, %%mm1\n\t" + "psrlw $3, %%mm0\n\t" + "psrlw $3, %%mm1\n\t" + "packuswb %%mm1, %%mm0\n\t" + H264_CHROMA_OP(%0, %%mm0) + "movq %%mm0, %0\n\t" + : "=m" (dst[0])); + + src += stride; + dst += stride; + } + return; + } + + /* general case, bilinear */ + rnd_reg = rnd ? 
&ff_pw_32.a : &ff_pw_28.a; + __asm__ volatile("movd %2, %%mm4\n\t" + "movd %3, %%mm6\n\t" + "punpcklwd %%mm4, %%mm4\n\t" + "punpcklwd %%mm6, %%mm6\n\t" + "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ + "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ + "movq %%mm4, %%mm5\n\t" + "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ + "psllw $3, %%mm5\n\t" + "psllw $3, %%mm6\n\t" + "movq %%mm5, %%mm7\n\t" + "paddw %%mm6, %%mm7\n\t" + "movq %%mm4, %1\n\t" /* DD = x * y */ + "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ + "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ + "paddw %4, %%mm4\n\t" + "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ + "pxor %%mm7, %%mm7\n\t" + "movq %%mm4, %0\n\t" + : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); + + __asm__ volatile( + /* mm0 = src[0..7], mm1 = src[1..8] */ + "movq %0, %%mm0\n\t" + "movq %1, %%mm1\n\t" + : : "m" (src[0]), "m" (src[1])); + + for(i=0; i<h; i++) { + src += stride; + + __asm__ volatile( + /* mm2 = A * src[0..3] + B * src[1..4] */ + /* mm3 = A * src[4..7] + B * src[5..8] */ + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpckhbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "pmullw %0, %%mm0\n\t" + "pmullw %0, %%mm2\n\t" + "pmullw %%mm5, %%mm1\n\t" + "pmullw %%mm5, %%mm3\n\t" + "paddw %%mm1, %%mm2\n\t" + "paddw %%mm0, %%mm3\n\t" + : : "m" (AA)); + + __asm__ volatile( + /* [mm2,mm3] += C * src[0..7] */ + "movq %0, %%mm0\n\t" + "movq %%mm0, %%mm1\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpckhbw %%mm7, %%mm1\n\t" + "pmullw %%mm6, %%mm0\n\t" + "pmullw %%mm6, %%mm1\n\t" + "paddw %%mm0, %%mm2\n\t" + "paddw %%mm1, %%mm3\n\t" + : : "m" (src[0])); + + __asm__ volatile( + /* [mm2,mm3] += D * src[1..8] */ + "movq %1, %%mm1\n\t" + "movq %%mm1, %%mm0\n\t" + "movq %%mm1, %%mm4\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpckhbw %%mm7, %%mm4\n\t" + "pmullw %2, %%mm0\n\t" + "pmullw %2, %%mm4\n\t" + "paddw %%mm0, %%mm2\n\t" + "paddw %%mm4, %%mm3\n\t" + "movq %0, %%mm0\n\t" + : : "m" (src[0]), "m" (src[1]), "m" (DD)); + + __asm__ volatile( + /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */ + "paddw %1, %%mm2\n\t" + "paddw %1, %%mm3\n\t" + "psrlw $6, %%mm2\n\t" + "psrlw $6, %%mm3\n\t" + "packuswb %%mm3, %%mm2\n\t" + H264_CHROMA_OP(%0, %%mm2) + "movq %%mm2, %0\n\t" + : "=m" (dst[0]) : "m" (*rnd_reg)); + dst+= stride; + } +} + +static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "movd %5, %%mm2 \n\t" + "movd %6, %%mm3 \n\t" + "movq "MANGLE(ff_pw_8)", %%mm4\n\t" + "movq "MANGLE(ff_pw_8)", %%mm5\n\t" + "punpcklwd %%mm2, %%mm2 \n\t" + "punpcklwd %%mm3, %%mm3 \n\t" + "punpcklwd %%mm2, %%mm2 \n\t" + "punpcklwd %%mm3, %%mm3 \n\t" + "psubw %%mm2, %%mm4 \n\t" + "psubw %%mm3, %%mm5 \n\t" + + "movd (%1), %%mm0 \n\t" + "movd 1(%1), %%mm6 \n\t" + "add %3, %1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + "pmullw %%mm4, %%mm0 \n\t" + "pmullw %%mm2, %%mm6 \n\t" + "paddw %%mm0, %%mm6 \n\t" + + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "movd 1(%1), %%mm1 \n\t" + "add %3, %1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pmullw %%mm4, %%mm0 \n\t" + "pmullw %%mm2, %%mm1 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "movq %%mm1, %%mm0 \n\t" + "pmullw %%mm5, %%mm6 \n\t" + "pmullw %%mm3, %%mm1 \n\t" + "paddw %4, %%mm6 \n\t" + "paddw %%mm6, %%mm1 \n\t" + "psrlw $6, %%mm1 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + H264_CHROMA_OP4((%0), %%mm1, %%mm6) + 
"movd %%mm1, (%0) \n\t" + "add %3, %0 \n\t" + "movd (%1), %%mm6 \n\t" + "movd 1(%1), %%mm1 \n\t" + "add %3, %1 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pmullw %%mm4, %%mm6 \n\t" + "pmullw %%mm2, %%mm1 \n\t" + "paddw %%mm6, %%mm1 \n\t" + "movq %%mm1, %%mm6 \n\t" + "pmullw %%mm5, %%mm0 \n\t" + "pmullw %%mm3, %%mm1 \n\t" + "paddw %4, %%mm0 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "psrlw $6, %%mm1 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + H264_CHROMA_OP4((%0), %%mm1, %%mm0) + "movd %%mm1, (%0) \n\t" + "add %3, %0 \n\t" + "sub $2, %2 \n\t" + "jnz 1b \n\t" + : "+r"(dst), "+r"(src), "+r"(h) + : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y) + ); +} + +#ifdef H264_CHROMA_MC2_TMPL +static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + int tmp = ((1<<16)-1)*x + 8; + int CD= tmp*y; + int AB= (tmp<<3) - CD; + __asm__ volatile( + /* mm5 = {A,B,A,B} */ + /* mm6 = {C,D,C,D} */ + "movd %0, %%mm5\n\t" + "movd %1, %%mm6\n\t" + "punpckldq %%mm5, %%mm5\n\t" + "punpckldq %%mm6, %%mm6\n\t" + "pxor %%mm7, %%mm7\n\t" + /* mm0 = src[0,1,1,2] */ + "movd %2, %%mm2\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "pshufw $0x94, %%mm2, %%mm2\n\t" + :: "r"(AB), "r"(CD), "m"(src[0])); + + + __asm__ volatile( + "1:\n\t" + "add %4, %1\n\t" + /* mm1 = A * src[0,1] + B * src[1,2] */ + "movq %%mm2, %%mm1\n\t" + "pmaddwd %%mm5, %%mm1\n\t" + /* mm0 = src[0,1,1,2] */ + "movd (%1), %%mm0\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "pshufw $0x94, %%mm0, %%mm0\n\t" + /* mm1 += C * src[0,1] + D * src[1,2] */ + "movq %%mm0, %%mm2\n\t" + "pmaddwd %%mm6, %%mm0\n\t" + "paddw %3, %%mm1\n\t" + "paddw %%mm0, %%mm1\n\t" + /* dst[0,1] = pack((mm1 + 32) >> 6) */ + "psrlw $6, %%mm1\n\t" + "packssdw %%mm7, %%mm1\n\t" + "packuswb %%mm7, %%mm1\n\t" + H264_CHROMA_OP4((%0), %%mm1, %%mm3) + "movd %%mm1, %%esi\n\t" + "movw %%si, (%0)\n\t" + "add %4, %0\n\t" + "sub $1, %2\n\t" + "jnz 1b\n\t" + : "+r" (dst), "+r"(src), "+r"(h) + : "m" (ff_pw_32), "r"((x86_reg)stride) + : "%esi"); + +} +#endif +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputil_h264_template_ssse3.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2008 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. + * H264_CHROMA_MC8_TMPL must be defined to the desired function name + * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function + * AVG_OP must be defined to empty for put and the identify for avg + */ +static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) +{ + if(y==0 && x==0) { + /* no filter needed */ + H264_CHROMA_MC8_MV0(dst, src, stride, h); + return; + } + + assert(x<8 && y<8 && x>=0 && y>=0); + + if(y==0 || x==0) + { + /* 1 dimensional filter only */ + __asm__ volatile( + "movd %0, %%xmm7 \n\t" + "movq %1, %%xmm6 \n\t" + "pshuflw $0, %%xmm7, %%xmm7 \n\t" + "movlhps %%xmm6, %%xmm6 \n\t" + "movlhps %%xmm7, %%xmm7 \n\t" + :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3)) + ); + + if(x) { + __asm__ volatile( + "1: \n\t" + "movq (%1), %%xmm0 \n\t" + "movq 1(%1), %%xmm1 \n\t" + "movq (%1,%3), %%xmm2 \n\t" + "movq 1(%1,%3), %%xmm3 \n\t" + "punpcklbw %%xmm1, %%xmm0 \n\t" + "punpcklbw %%xmm3, %%xmm2 \n\t" + "pmaddubsw %%xmm7, %%xmm0 \n\t" + "pmaddubsw %%xmm7, %%xmm2 \n\t" + AVG_OP("movq (%0), %%xmm4 \n\t") + AVG_OP("movhps (%0,%3), %%xmm4 \n\t") + "paddw %%xmm6, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm2 \n\t" + "psrlw $3, %%xmm0 \n\t" + "psrlw $3, %%xmm2 \n\t" + "packuswb %%xmm2, %%xmm0 \n\t" + AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") + "movq %%xmm0, (%0) \n\t" + "movhps %%xmm0, (%0,%3) \n\t" + "sub $2, %2 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%0,%3,2), %0 \n\t" + "jg 1b \n\t" + :"+r"(dst), "+r"(src), "+r"(h) + :"r"((x86_reg)stride) + ); + } else { + __asm__ volatile( + "1: \n\t" + "movq (%1), %%xmm0 \n\t" + "movq (%1,%3), %%xmm1 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + "movq (%1,%3,2), %%xmm3 \n\t" + "punpcklbw %%xmm1, %%xmm0 \n\t" + "punpcklbw %%xmm3, %%xmm2 \n\t" + "pmaddubsw %%xmm7, %%xmm0 \n\t" + "pmaddubsw %%xmm7, %%xmm2 \n\t" + AVG_OP("movq (%0), %%xmm4 \n\t") + AVG_OP("movhps (%0,%3), %%xmm4 \n\t") + "paddw %%xmm6, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm2 \n\t" + "psrlw $3, %%xmm0 \n\t" + "psrlw $3, %%xmm2 \n\t" + "packuswb %%xmm2, %%xmm0 \n\t" + AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") + "movq %%xmm0, (%0) \n\t" + "movhps %%xmm0, (%0,%3) \n\t" + "sub $2, %2 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%0,%3,2), %0 \n\t" + "jg 1b \n\t" + :"+r"(dst), "+r"(src), "+r"(h) + :"r"((x86_reg)stride) + ); + } + return; + } + + /* general case, bilinear */ + __asm__ volatile( + "movd %0, %%xmm7 \n\t" + "movd %1, %%xmm6 \n\t" + "movdqa %2, %%xmm5 \n\t" + "pshuflw $0, %%xmm7, %%xmm7 \n\t" + "pshuflw $0, %%xmm6, %%xmm6 \n\t" + 
"movlhps %%xmm7, %%xmm7 \n\t" + "movlhps %%xmm6, %%xmm6 \n\t" + :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) + ); + + __asm__ volatile( + "movq (%1), %%xmm0 \n\t" + "movq 1(%1), %%xmm1 \n\t" + "punpcklbw %%xmm1, %%xmm0 \n\t" + "add %3, %1 \n\t" + "1: \n\t" + "movq (%1), %%xmm1 \n\t" + "movq 1(%1), %%xmm2 \n\t" + "movq (%1,%3), %%xmm3 \n\t" + "movq 1(%1,%3), %%xmm4 \n\t" + "lea (%1,%3,2), %1 \n\t" + "punpcklbw %%xmm2, %%xmm1 \n\t" + "punpcklbw %%xmm4, %%xmm3 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + "movdqa %%xmm3, %%xmm4 \n\t" + "pmaddubsw %%xmm7, %%xmm0 \n\t" + "pmaddubsw %%xmm6, %%xmm1 \n\t" + "pmaddubsw %%xmm7, %%xmm2 \n\t" + "pmaddubsw %%xmm6, %%xmm3 \n\t" + "paddw %%xmm5, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm2 \n\t" + "paddw %%xmm0, %%xmm1 \n\t" + "paddw %%xmm2, %%xmm3 \n\t" + "movdqa %%xmm4, %%xmm0 \n\t" + "psrlw $6, %%xmm1 \n\t" + "psrlw $6, %%xmm3 \n\t" + AVG_OP("movq (%0), %%xmm2 \n\t") + AVG_OP("movhps (%0,%3), %%xmm2 \n\t") + "packuswb %%xmm3, %%xmm1 \n\t" + AVG_OP("pavgb %%xmm2, %%xmm1 \n\t") + "movq %%xmm1, (%0)\n\t" + "movhps %%xmm1, (%0,%3)\n\t" + "sub $2, %2 \n\t" + "lea (%0,%3,2), %0 \n\t" + "jg 1b \n\t" + :"+r"(dst), "+r"(src), "+r"(h) + :"r"((x86_reg)stride) + ); +} + +static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + __asm__ volatile( + "movd %0, %%mm7 \n\t" + "movd %1, %%mm6 \n\t" + "movq %2, %%mm5 \n\t" + "pshufw $0, %%mm7, %%mm7 \n\t" + "pshufw $0, %%mm6, %%mm6 \n\t" + :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) + ); + + __asm__ volatile( + "movd (%1), %%mm0 \n\t" + "punpcklbw 1(%1), %%mm0 \n\t" + "add %3, %1 \n\t" + "1: \n\t" + "movd (%1), %%mm1 \n\t" + "movd (%1,%3), %%mm3 \n\t" + "punpcklbw 1(%1), %%mm1 \n\t" + "punpcklbw 1(%1,%3), %%mm3 \n\t" + "lea (%1,%3,2), %1 \n\t" + "movq %%mm1, %%mm2 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pmaddubsw %%mm7, %%mm0 \n\t" + "pmaddubsw %%mm6, %%mm1 \n\t" + "pmaddubsw %%mm7, %%mm2 \n\t" + "pmaddubsw %%mm6, %%mm3 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm5, %%mm2 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "paddw %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm0 \n\t" + "psrlw $6, %%mm1 \n\t" + "psrlw $6, %%mm3 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + "packuswb %%mm3, %%mm3 \n\t" + AVG_OP("pavgb (%0), %%mm1 \n\t") + AVG_OP("pavgb (%0,%3), %%mm3 \n\t") + "movd %%mm1, (%0)\n\t" + "movd %%mm3, (%0,%3)\n\t" + "sub $2, %2 \n\t" + "lea (%0,%3,2), %0 \n\t" + "jg 1b \n\t" + :"+r"(dst), "+r"(src), "+r"(h) + :"r"((x86_reg)stride) + ); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputil_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,2976 @@ +/* + * MMX optimized DSP utils + * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/h263.h" +#include "libavcodec/mpegvideo.h" +#include "libavcodec/simple_idct.h" +#include "dsputil_mmx.h" +#include "mmx.h" +#include "vp3dsp_mmx.h" +#include "vp3dsp_sse2.h" +#include "idct_xvid.h" + +//#undef NDEBUG +//#include <assert.h> + +int mm_flags; /* multimedia extension flags */ + +/* pixel operations */ +DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL; + +DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) = +{0x8000000080000000ULL, 0x8000000080000000ULL}; + +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; +DECLARE_ALIGNED_16(const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; +DECLARE_ALIGNED_16(const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; +DECLARE_ALIGNED_16(const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; +DECLARE_ALIGNED_16(const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; +DECLARE_ALIGNED_16(const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; + +DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; +DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; + +DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 }; +DECLARE_ALIGNED_16(const 
double, ff_pd_2[2]) = { 2.0, 2.0 }; + +#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) +#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) + +#define MOVQ_BFE(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ + "paddb %%" #regd ", %%" #regd " \n\t" ::) + +#ifndef PIC +#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) +#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) +#else +// for shared library it's better to use this way for accessing constants +// pcmpeqd -> -1 +#define MOVQ_BONE(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ + "psrlw $15, %%" #regd " \n\t" \ + "packuswb %%" #regd ", %%" #regd " \n\t" ::) + +#define MOVQ_WTWO(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ + "psrlw $15, %%" #regd " \n\t" \ + "psllw $1, %%" #regd " \n\t"::) + +#endif + +// using regr as temporary and for the output result +// first argument is unmodifed and second is trashed +// regfe is supposed to contain 0xfefefefefefefefe +#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ + "movq " #rega ", " #regr " \n\t"\ + "pand " #regb ", " #regr " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pand " #regfe "," #regb " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "paddb " #regb ", " #regr " \n\t" + +#define PAVGB_MMX(rega, regb, regr, regfe) \ + "movq " #rega ", " #regr " \n\t"\ + "por " #regb ", " #regr " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pand " #regfe "," #regb " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psubb " #regb ", " #regr " \n\t" + +// mm6 is supposed to contain 0xfefefefefefefefe +#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ + "movq " #rega ", " #regr " \n\t"\ + "movq " #regc ", " #regp " \n\t"\ + "pand " #regb ", " #regr " \n\t"\ + "pand " #regd ", " #regp " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pxor " #regc ", " #regd " \n\t"\ + "pand %%mm6, " #regb " \n\t"\ + "pand %%mm6, " #regd " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psrlq $1, " #regd " \n\t"\ + "paddb " #regb ", " #regr " \n\t"\ + "paddb " #regd ", " #regp " \n\t" + +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ + "movq " #rega ", " #regr " \n\t"\ + "movq " #regc ", " #regp " \n\t"\ + "por " #regb ", " #regr " \n\t"\ + "por " #regd ", " #regp " \n\t"\ + "pxor " #rega ", " #regb " \n\t"\ + "pxor " #regc ", " #regd " \n\t"\ + "pand %%mm6, " #regb " \n\t"\ + "pand %%mm6, " #regd " \n\t"\ + "psrlq $1, " #regd " \n\t"\ + "psrlq $1, " #regb " \n\t"\ + "psubb " #regb ", " #regr " \n\t"\ + "psubb " #regd ", " #regp " \n\t" + +/***********************************/ +/* MMX no rounding */ +#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx +#define SET_RND MOVQ_WONE +#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) +#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) + +#include "dsputil_mmx_rnd_template.c" + +#undef DEF +#undef SET_RND +#undef PAVGBP +#undef PAVGB +/***********************************/ +/* MMX rounding */ + +#define DEF(x, y) x ## _ ## y ##_mmx +#define SET_RND MOVQ_WTWO +#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) +#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) + +#include "dsputil_mmx_rnd_template.c" + +#undef DEF +#undef SET_RND +#undef PAVGBP +#undef PAVGB + +/***********************************/ +/* 3Dnow specific */ + +#define DEF(x) x ## _3dnow +#define PAVGB "pavgusb" + +#include "dsputil_mmx_avg_template.c" + +#undef DEF +#undef PAVGB + +/***********************************/ 
+/* MMX2 specific */ + +#define DEF(x) x ## _mmx2 + +/* Introduced only in MMX2 set */ +#define PAVGB "pavgb" + +#include "dsputil_mmx_avg_template.c" + +#undef DEF +#undef PAVGB + +#define put_no_rnd_pixels16_mmx put_pixels16_mmx +#define put_no_rnd_pixels8_mmx put_pixels8_mmx +#define put_pixels16_mmx2 put_pixels16_mmx +#define put_pixels8_mmx2 put_pixels8_mmx +#define put_pixels4_mmx2 put_pixels4_mmx +#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx +#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx +#define put_pixels16_3dnow put_pixels16_mmx +#define put_pixels8_3dnow put_pixels8_mmx +#define put_pixels4_3dnow put_pixels4_mmx +#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx +#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx + +/***********************************/ +/* standard MMX */ + +void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + const DCTELEM *p; + uint8_t *pix; + + /* read the pixels */ + p = block; + pix = pixels; + /* unrolled loop */ + __asm__ volatile( + "movq %3, %%mm0 \n\t" + "movq 8%3, %%mm1 \n\t" + "movq 16%3, %%mm2 \n\t" + "movq 24%3, %%mm3 \n\t" + "movq 32%3, %%mm4 \n\t" + "movq 40%3, %%mm5 \n\t" + "movq 48%3, %%mm6 \n\t" + "movq 56%3, %%mm7 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "packuswb %%mm7, %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, (%0, %1) \n\t" + "movq %%mm4, (%0, %1, 2) \n\t" + "movq %%mm6, (%0, %2) \n\t" + ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) + :"memory"); + pix += line_size*4; + p += 32; + + // if here would be an exact copy of the code above + // compiler would generate some very strange code + // thus using "r" + __asm__ volatile( + "movq (%3), %%mm0 \n\t" + "movq 8(%3), %%mm1 \n\t" + "movq 16(%3), %%mm2 \n\t" + "movq 24(%3), %%mm3 \n\t" + "movq 32(%3), %%mm4 \n\t" + "movq 40(%3), %%mm5 \n\t" + "movq 48(%3), %%mm6 \n\t" + "movq 56(%3), %%mm7 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "packuswb %%mm7, %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, (%0, %1) \n\t" + "movq %%mm4, (%0, %1, 2) \n\t" + "movq %%mm6, (%0, %2) \n\t" + ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) + :"memory"); +} + +static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) = + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; + +void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + int i; + + movq_m2r(*vector128, mm1); + for (i = 0; i < 8; i++) { + movq_m2r(*(block), mm0); + packsswb_m2r(*(block + 4), mm0); + block += 8; + paddb_r2r(mm1, mm0); + movq_r2m(mm0, *pixels); + pixels += line_size; + } +} + +void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + const DCTELEM *p; + uint8_t *pix; + int i; + + /* read the pixels */ + p = block; + pix = pixels; + MOVQ_ZERO(mm7); + i = 4; + do { + __asm__ volatile( + "movq (%2), %%mm0 \n\t" + "movq 8(%2), %%mm1 \n\t" + "movq 16(%2), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "movq %0, %%mm4 \n\t" + "movq %1, %%mm6 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddsw %%mm4, %%mm0 \n\t" + "paddsw %%mm5, %%mm1 \n\t" + "movq %%mm6, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddsw %%mm6, %%mm2 \n\t" + "paddsw %%mm5, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, %0 \n\t" + "movq 
%%mm2, %1 \n\t" + :"+m"(*pix), "+m"(*(pix+line_size)) + :"r"(p) + :"memory"); + pix += line_size*2; + p += 16; + } while (--i); +} + +static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + ASMALIGN(3) + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%1, %3), %%mm1 \n\t" + "movd %%mm0, (%2) \n\t" + "movd %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%1, %3), %%mm1 \n\t" + "movd %%mm0, (%2) \n\t" + "movd %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"((x86_reg)line_size) + : "%"REG_a, "memory" + ); +} + +static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"((x86_reg)line_size) + : "%"REG_a, "memory" + ); +} + +static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm4 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1, %3), %%mm5 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm4, 8(%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm4 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1, %3), %%mm5 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm4, 8(%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"((x86_reg)line_size) + : "%"REG_a, "memory" + ); +} + +static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "1: \n\t" + "movdqu (%1), %%xmm0 \n\t" + "movdqu (%1,%3), %%xmm1 \n\t" + "movdqu (%1,%3,2), %%xmm2 \n\t" + "movdqu (%1,%4), %%xmm3 \n\t" + "movdqa %%xmm0, (%2) \n\t" + "movdqa %%xmm1, (%2,%3) \n\t" + "movdqa %%xmm2, (%2,%3,2) \n\t" + "movdqa %%xmm3, (%2,%4) \n\t" + "subl $4, %0 \n\t" + "lea (%1,%3,4), %1 \n\t" + "lea (%2,%3,4), %2 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) + : "memory" + ); +} + +static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "1: \n\t" + "movdqu (%1), %%xmm0 \n\t" + "movdqu (%1,%3), %%xmm1 \n\t" + "movdqu (%1,%3,2), %%xmm2 \n\t" + "movdqu (%1,%4), %%xmm3 \n\t" + "pavgb (%2), %%xmm0 \n\t" + "pavgb (%2,%3), %%xmm1 \n\t" + "pavgb (%2,%3,2), %%xmm2 \n\t" + "pavgb (%2,%4), %%xmm3 \n\t" + "movdqa %%xmm0, (%2) \n\t" + "movdqa %%xmm1, (%2,%3) \n\t" + "movdqa %%xmm2, (%2,%3,2) \n\t" + "movdqa %%xmm3, (%2,%4) \n\t" + "subl $4, %0 \n\t" + "lea (%1,%3,4), %1 \n\t" + "lea (%2,%3,4), %2 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"((x86_reg)line_size), 
"r"((x86_reg)3L*line_size) + : "memory" + ); +} + +#define CLEAR_BLOCKS(name,n) \ +static void name(DCTELEM *blocks)\ +{\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "mov %1, %%"REG_a" \n\t"\ + "1: \n\t"\ + "movq %%mm7, (%0, %%"REG_a") \n\t"\ + "movq %%mm7, 8(%0, %%"REG_a") \n\t"\ + "movq %%mm7, 16(%0, %%"REG_a") \n\t"\ + "movq %%mm7, 24(%0, %%"REG_a") \n\t"\ + "add $32, %%"REG_a" \n\t"\ + " js 1b \n\t"\ + : : "r" (((uint8_t *)blocks)+128*n),\ + "i" (-128*n)\ + : "%"REG_a\ + );\ +} +CLEAR_BLOCKS(clear_blocks_mmx, 6) +CLEAR_BLOCKS(clear_block_mmx, 1) + +static void clear_block_sse(DCTELEM *block) +{ + __asm__ volatile( + "xorps %%xmm0, %%xmm0 \n" + "movaps %%xmm0, (%0) \n" + "movaps %%xmm0, 16(%0) \n" + "movaps %%xmm0, 32(%0) \n" + "movaps %%xmm0, 48(%0) \n" + "movaps %%xmm0, 64(%0) \n" + "movaps %%xmm0, 80(%0) \n" + "movaps %%xmm0, 96(%0) \n" + "movaps %%xmm0, 112(%0) \n" + :: "r"(block) + : "memory" + ); +} + +static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ + x86_reg i=0; + __asm__ volatile( + "jmp 2f \n\t" + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq (%2, %0), %%mm1 \n\t" + "paddb %%mm0, %%mm1 \n\t" + "movq %%mm1, (%2, %0) \n\t" + "movq 8(%1, %0), %%mm0 \n\t" + "movq 8(%2, %0), %%mm1 \n\t" + "paddb %%mm0, %%mm1 \n\t" + "movq %%mm1, 8(%2, %0) \n\t" + "add $16, %0 \n\t" + "2: \n\t" + "cmp %3, %0 \n\t" + " js 1b \n\t" + : "+r" (i) + : "r"(src), "r"(dst), "r"((x86_reg)w-15) + ); + for(; i<w; i++) + dst[i+0] += src[i+0]; +} + +static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ + x86_reg i=0; + __asm__ volatile( + "jmp 2f \n\t" + "1: \n\t" + "movq (%2, %0), %%mm0 \n\t" + "movq 8(%2, %0), %%mm1 \n\t" + "paddb (%3, %0), %%mm0 \n\t" + "paddb 8(%3, %0), %%mm1 \n\t" + "movq %%mm0, (%1, %0) \n\t" + "movq %%mm1, 8(%1, %0) \n\t" + "add $16, %0 \n\t" + "2: \n\t" + "cmp %4, %0 \n\t" + " js 1b \n\t" + : "+r" (i) + : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) + ); + for(; i<w; i++) + dst[i] = src1[i] + src2[i]; +} + +#define H263_LOOP_FILTER \ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %0, %%mm0 \n\t"\ + "movq %0, %%mm1 \n\t"\ + "movq %3, %%mm2 \n\t"\ + "movq %3, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm3, %%mm1 \n\t"\ + "movq %1, %%mm2 \n\t"\ + "movq %1, %%mm3 \n\t"\ + "movq %2, %%mm4 \n\t"\ + "movq %2, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "psubw %%mm2, %%mm4 \n\t"\ + "psubw %%mm3, %%mm5 \n\t"\ + "psllw $2, %%mm4 \n\t"\ + "psllw $2, %%mm5 \n\t"\ + "paddw %%mm0, %%mm4 \n\t"\ + "paddw %%mm1, %%mm5 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "pcmpgtw %%mm4, %%mm6 \n\t"\ + "pcmpgtw %%mm5, %%mm7 \n\t"\ + "pxor %%mm6, %%mm4 \n\t"\ + "pxor %%mm7, %%mm5 \n\t"\ + "psubw %%mm6, %%mm4 \n\t"\ + "psubw %%mm7, %%mm5 \n\t"\ + "psrlw $3, %%mm4 \n\t"\ + "psrlw $3, %%mm5 \n\t"\ + "packuswb %%mm5, %%mm4 \n\t"\ + "packsswb %%mm7, %%mm6 \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd %4, %%mm2 \n\t"\ + "punpcklbw %%mm2, %%mm2 \n\t"\ + "punpcklbw %%mm2, %%mm2 \n\t"\ + "punpcklbw %%mm2, %%mm2 \n\t"\ + "psubusb %%mm4, %%mm2 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "psubusb %%mm4, %%mm3 \n\t"\ + "psubb %%mm3, %%mm2 \n\t"\ + "movq %1, %%mm3 \n\t"\ + "movq %2, %%mm4 \n\t"\ + "pxor %%mm6, %%mm3 \n\t"\ + "pxor %%mm6, %%mm4 \n\t"\ + "paddusb %%mm2, %%mm3 \n\t"\ + "psubusb %%mm2, %%mm4 \n\t"\ + "pxor %%mm6, %%mm3 \n\t"\ + "pxor %%mm6, %%mm4 
\n\t"\ + "paddusb %%mm2, %%mm2 \n\t"\ + "packsswb %%mm1, %%mm0 \n\t"\ + "pcmpgtb %%mm0, %%mm7 \n\t"\ + "pxor %%mm7, %%mm0 \n\t"\ + "psubb %%mm7, %%mm0 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "psubusb %%mm2, %%mm0 \n\t"\ + "psubb %%mm0, %%mm1 \n\t"\ + "pand %5, %%mm1 \n\t"\ + "psrlw $2, %%mm1 \n\t"\ + "pxor %%mm7, %%mm1 \n\t"\ + "psubb %%mm7, %%mm1 \n\t"\ + "movq %0, %%mm5 \n\t"\ + "movq %3, %%mm6 \n\t"\ + "psubb %%mm1, %%mm5 \n\t"\ + "paddb %%mm1, %%mm6 \n\t" + +static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ + if(ENABLE_ANY_H263) { + const int strength= ff_h263_loop_filter_strength[qscale]; + + __asm__ volatile( + + H263_LOOP_FILTER + + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %0 \n\t" + "movq %%mm6, %3 \n\t" + : "+m" (*(uint64_t*)(src - 2*stride)), + "+m" (*(uint64_t*)(src - 1*stride)), + "+m" (*(uint64_t*)(src + 0*stride)), + "+m" (*(uint64_t*)(src + 1*stride)) + : "g" (2*strength), "m"(ff_pb_FC) + ); + } +} + +static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ + __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... + "movd %4, %%mm0 \n\t" + "movd %5, %%mm1 \n\t" + "movd %6, %%mm2 \n\t" + "movd %7, %%mm3 \n\t" + "punpcklbw %%mm1, %%mm0 \n\t" + "punpcklbw %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "movd %%mm0, %0 \n\t" + "punpckhdq %%mm0, %%mm0 \n\t" + "movd %%mm0, %1 \n\t" + "movd %%mm1, %2 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, %3 \n\t" + + : "=m" (*(uint32_t*)(dst + 0*dst_stride)), + "=m" (*(uint32_t*)(dst + 1*dst_stride)), + "=m" (*(uint32_t*)(dst + 2*dst_stride)), + "=m" (*(uint32_t*)(dst + 3*dst_stride)) + : "m" (*(uint32_t*)(src + 0*src_stride)), + "m" (*(uint32_t*)(src + 1*src_stride)), + "m" (*(uint32_t*)(src + 2*src_stride)), + "m" (*(uint32_t*)(src + 3*src_stride)) + ); +} + +static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ + if(ENABLE_ANY_H263) { + const int strength= ff_h263_loop_filter_strength[qscale]; + DECLARE_ALIGNED(8, uint64_t, temp[4]); + uint8_t *btemp= (uint8_t*)temp; + + src -= 2; + + transpose4x4(btemp , src , 8, stride); + transpose4x4(btemp+4, src + 4*stride, 8, stride); + __asm__ volatile( + H263_LOOP_FILTER // 5 3 4 6 + + : "+m" (temp[0]), + "+m" (temp[1]), + "+m" (temp[2]), + "+m" (temp[3]) + : "g" (2*strength), "m"(ff_pb_FC) + ); + + __asm__ volatile( + "movq %%mm5, %%mm1 \n\t" + "movq %%mm4, %%mm0 \n\t" + "punpcklbw %%mm3, %%mm5 \n\t" + "punpcklbw %%mm6, %%mm4 \n\t" + "punpckhbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm6, %%mm0 \n\t" + "movq %%mm5, %%mm3 \n\t" + "movq %%mm1, %%mm6 \n\t" + "punpcklwd %%mm4, %%mm5 \n\t" + "punpcklwd %%mm0, %%mm1 \n\t" + "punpckhwd %%mm4, %%mm3 \n\t" + "punpckhwd %%mm0, %%mm6 \n\t" + "movd %%mm5, (%0) \n\t" + "punpckhdq %%mm5, %%mm5 \n\t" + "movd %%mm5, (%0,%2) \n\t" + "movd %%mm3, (%0,%2,2) \n\t" + "punpckhdq %%mm3, %%mm3 \n\t" + "movd %%mm3, (%0,%3) \n\t" + "movd %%mm1, (%1) \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, (%1,%2) \n\t" + "movd %%mm6, (%1,%2,2) \n\t" + "punpckhdq %%mm6, %%mm6 \n\t" + "movd %%mm6, (%1,%3) \n\t" + :: "r" (src), + "r" (src + 4*stride), + "r" ((x86_reg) stride ), + "r" ((x86_reg)(3*stride)) + ); + } +} + +/* draw the edges of width 'w' of an image of size width, height + this mmx version can only handle w==8 || w==16 */ +static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) +{ + uint8_t *ptr, *last_line; + int i; + + last_line = buf + (height - 1) * wrap; + /* 
left and right */ + ptr = buf; + if(w==8) + { + __asm__ volatile( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "punpckldq %%mm0, %%mm0 \n\t" + "movq %%mm0, -8(%0) \n\t" + "movq -8(%0, %2), %%mm1 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movq %%mm1, (%0, %2) \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (ptr) + : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) + ); + } + else + { + __asm__ volatile( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "punpckldq %%mm0, %%mm0 \n\t" + "movq %%mm0, -8(%0) \n\t" + "movq %%mm0, -16(%0) \n\t" + "movq -8(%0, %2), %%mm1 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movq %%mm1, (%0, %2) \n\t" + "movq %%mm1, 8(%0, %2) \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (ptr) + : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) + ); + } + + for(i=0;i<w;i+=4) { + /* top and bottom (and hopefully also the corners) */ + ptr= buf - (i + 1) * wrap - w; + __asm__ volatile( + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm0, (%0, %2) \n\t" + "movq %%mm0, (%0, %2, 2) \n\t" + "movq %%mm0, (%0, %3) \n\t" + "add $8, %0 \n\t" + "cmp %4, %0 \n\t" + " jb 1b \n\t" + : "+r" (ptr) + : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) + ); + ptr= last_line + (i + 1) * wrap - w; + __asm__ volatile( + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm0, (%0, %2) \n\t" + "movq %%mm0, (%0, %2, 2) \n\t" + "movq %%mm0, (%0, %3) \n\t" + "add $8, %0 \n\t" + "cmp %4, %0 \n\t" + " jb 1b \n\t" + : "+r" (ptr) + : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) + ); + } +} + +#define PAETH(cpu, abs3)\ +static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ +{\ + x86_reg i = -bpp;\ + x86_reg end = w-3;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n"\ + "movd (%1,%0), %%mm0 \n"\ + "movd (%2,%0), %%mm1 \n"\ + "punpcklbw %%mm7, %%mm0 \n"\ + "punpcklbw %%mm7, %%mm1 \n"\ + "add %4, %0 \n"\ + "1: \n"\ + "movq %%mm1, %%mm2 \n"\ + "movd (%2,%0), %%mm1 \n"\ + "movq %%mm2, %%mm3 \n"\ + "punpcklbw %%mm7, %%mm1 \n"\ + "movq %%mm2, %%mm4 \n"\ + "psubw %%mm1, %%mm3 \n"\ + "psubw %%mm0, %%mm4 \n"\ + "movq %%mm3, %%mm5 \n"\ + "paddw %%mm4, %%mm5 \n"\ + abs3\ + "movq %%mm4, %%mm6 \n"\ + "pminsw %%mm5, %%mm6 \n"\ + "pcmpgtw %%mm6, %%mm3 \n"\ + "pcmpgtw %%mm5, %%mm4 \n"\ + "movq %%mm4, %%mm6 \n"\ + "pand %%mm3, %%mm4 \n"\ + "pandn %%mm3, %%mm6 \n"\ + "pandn %%mm0, %%mm3 \n"\ + "movd (%3,%0), %%mm0 \n"\ + "pand %%mm1, %%mm6 \n"\ + "pand %%mm4, %%mm2 \n"\ + "punpcklbw %%mm7, %%mm0 \n"\ + "movq %6, %%mm5 \n"\ + "paddw %%mm6, %%mm0 \n"\ + "paddw %%mm2, %%mm3 \n"\ + "paddw %%mm3, %%mm0 \n"\ + "pand %%mm5, %%mm0 \n"\ + "movq %%mm0, %%mm3 \n"\ + "packuswb %%mm3, %%mm3 \n"\ + "movd %%mm3, (%1,%0) \n"\ + "add %4, %0 \n"\ + "cmp %5, %0 \n"\ + "jle 1b \n"\ + :"+r"(i)\ + :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\ + "m"(ff_pw_255)\ + :"memory"\ + );\ +} + +#define ABS3_MMX2\ + "psubw %%mm5, %%mm7 \n"\ + "pmaxsw %%mm7, %%mm5 \n"\ + "pxor %%mm6, %%mm6 \n"\ + "pxor %%mm7, %%mm7 \n"\ + "psubw %%mm3, %%mm6 \n"\ + "psubw %%mm4, %%mm7 \n"\ + "pmaxsw %%mm6, %%mm3 \n"\ + "pmaxsw 
%%mm7, %%mm4 \n"\ + "pxor %%mm7, %%mm7 \n" + +#define ABS3_SSSE3\ + "pabsw %%mm3, %%mm3 \n"\ + "pabsw %%mm4, %%mm4 \n"\ + "pabsw %%mm5, %%mm5 \n" + +PAETH(mmx2, ABS3_MMX2) +#ifdef HAVE_SSSE3 +PAETH(ssse3, ABS3_SSSE3) +#endif + +#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ + "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ + "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ + "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ + "movq "#in7", " #m3 " \n\t" /* d */\ + "movq "#in0", %%mm5 \n\t" /* D */\ + "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ + "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ + "movq "#in1", %%mm5 \n\t" /* C */\ + "movq "#in2", %%mm6 \n\t" /* B */\ + "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ + "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ + "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ + "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ + "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ + "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ + "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ + "psraw $5, %%mm5 \n\t"\ + "packuswb %%mm5, %%mm5 \n\t"\ + OP(%%mm5, out, %%mm7, d) + +#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ +static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + uint64_t temp;\ +\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ + "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ + "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ + "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ + "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ + "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ + "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ + "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ + "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ + "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ + "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ + "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ + "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ + "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ + "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ + "paddw %%mm3, %%mm5 \n\t" /* b */\ + "paddw %%mm2, %%mm6 \n\t" /* c */\ + "paddw %%mm5, %%mm5 \n\t" /* 2b */\ + "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ + "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ + "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ + "paddw %%mm4, %%mm0 \n\t" /* a */\ + "paddw %%mm1, %%mm5 \n\t" /* d */\ + "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ + "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ + "paddw %6, %%mm6 \n\t"\ + "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ + "psraw $5, %%mm0 \n\t"\ + "movq %%mm0, %5 \n\t"\ + /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ + \ + "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ + "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ + "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ + "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ + "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ + "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ + "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ + "paddw %%mm0, %%mm2 \n\t" /* b */\ + "paddw %%mm5, %%mm3 \n\t" /* c */\ + "paddw %%mm2, %%mm2 \n\t" /* 2b */\ + "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ + "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ + "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ + "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ + "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ + "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ + "paddw %%mm2, %%mm1 \n\t" /* a */\ + "paddw %%mm6, %%mm4 \n\t" /* d */\ + "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ + "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ + "paddw %6, %%mm1 \n\t"\ + 
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ + "psraw $5, %%mm3 \n\t"\ + "movq %5, %%mm1 \n\t"\ + "packuswb %%mm3, %%mm1 \n\t"\ + OP_MMX2(%%mm1, (%1),%%mm4, q)\ + /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ + \ + "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ + "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ + "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ + "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ + "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ + "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ + "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ + "paddw %%mm1, %%mm5 \n\t" /* b */\ + "paddw %%mm4, %%mm0 \n\t" /* c */\ + "paddw %%mm5, %%mm5 \n\t" /* 2b */\ + "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ + "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ + "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ + "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ + "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ + "paddw %%mm3, %%mm2 \n\t" /* d */\ + "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ + "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ + "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ + "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ + "paddw %%mm2, %%mm6 \n\t" /* a */\ + "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ + "paddw %6, %%mm0 \n\t"\ + "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ + "psraw $5, %%mm0 \n\t"\ + /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ + \ + "paddw %%mm5, %%mm3 \n\t" /* a */\ + "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ + "paddw %%mm4, %%mm6 \n\t" /* b */\ + "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ + "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ + "paddw %%mm1, %%mm4 \n\t" /* c */\ + "paddw %%mm2, %%mm5 \n\t" /* d */\ + "paddw %%mm6, %%mm6 \n\t" /* 2b */\ + "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ + "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ + "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ + "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ + "paddw %6, %%mm4 \n\t"\ + "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ + "psraw $5, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm0 \n\t"\ + OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ + \ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+D"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ + : "memory"\ + );\ +}\ +\ +static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + int i;\ + int16_t temp[16];\ + /* quick HACK, XXX FIXME MUST be optimized */\ + for(i=0; i<h; i++)\ + {\ + temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ + temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ + temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ + temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ + temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ + temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ + temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ + temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ + temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ + temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ + 
temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ + temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ + temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ + temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ + temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ + temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ + __asm__ volatile(\ + "movq (%0), %%mm0 \n\t"\ + "movq 8(%0), %%mm1 \n\t"\ + "paddw %2, %%mm0 \n\t"\ + "paddw %2, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP_3DNOW(%%mm0, (%1), %%mm1, q)\ + "movq 16(%0), %%mm0 \n\t"\ + "movq 24(%0), %%mm1 \n\t"\ + "paddw %2, %%mm0 \n\t"\ + "paddw %2, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ + :: "r"(temp), "r"(dst), "m"(ROUNDER)\ + : "memory"\ + );\ + dst+=dstStride;\ + src+=srcStride;\ + }\ +}\ +\ +static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ + "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ + "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ + "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ + "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ + "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ + "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ + "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ + "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ + "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ + "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ + "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ + "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ + "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ + "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ + "paddw %%mm3, %%mm5 \n\t" /* b */\ + "paddw %%mm2, %%mm6 \n\t" /* c */\ + "paddw %%mm5, %%mm5 \n\t" /* 2b */\ + "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ + "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ + "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ + "paddw %%mm4, %%mm0 \n\t" /* a */\ + "paddw %%mm1, %%mm5 \n\t" /* d */\ + "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ + "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ + "paddw %5, %%mm6 \n\t"\ + "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ + "psraw $5, %%mm0 \n\t"\ + /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ + \ + "movd 5(%0), %%mm5 \n\t" /* FGHI */\ + "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ + "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ + "paddw %%mm5, %%mm1 \n\t" /* a */\ + "paddw %%mm6, %%mm2 \n\t" /* b */\ + "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ + "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ + "paddw %%mm6, %%mm3 \n\t" /* c */\ + "paddw %%mm5, %%mm4 \n\t" /* d */\ + "paddw %%mm2, %%mm2 \n\t" /* 2b */\ + "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ + "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ + "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ + "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ + "paddw %5, %%mm1 \n\t"\ + "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ + "psraw $5, %%mm3 \n\t"\ + "packuswb %%mm3, %%mm0 \n\t"\ + OP_MMX2(%%mm0, (%1), %%mm4, q)\ + \ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+d"(h)\ + 
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\ + : "memory"\ + );\ +}\ +\ +static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + int i;\ + int16_t temp[8];\ + /* quick HACK, XXX FIXME MUST be optimized */\ + for(i=0; i<h; i++)\ + {\ + temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ + temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ + temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ + temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ + temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ + temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ + temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ + temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ + __asm__ volatile(\ + "movq (%0), %%mm0 \n\t"\ + "movq 8(%0), %%mm1 \n\t"\ + "paddw %2, %%mm0 \n\t"\ + "paddw %2, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP_3DNOW(%%mm0, (%1), %%mm1, q)\ + :: "r"(temp), "r"(dst), "m"(ROUNDER)\ + :"memory"\ + );\ + dst+=dstStride;\ + src+=srcStride;\ + }\ +} + +#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ +\ +static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + uint64_t temp[17*4];\ + uint64_t *temp_ptr= temp;\ + int count= 17;\ +\ + /*FIXME unroll */\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq (%0), %%mm1 \n\t"\ + "movq 8(%0), %%mm2 \n\t"\ + "movq 8(%0), %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "movq %%mm0, (%1) \n\t"\ + "movq %%mm1, 17*8(%1) \n\t"\ + "movq %%mm2, 2*17*8(%1) \n\t"\ + "movq %%mm3, 3*17*8(%1) \n\t"\ + "add $8, %1 \n\t"\ + "add %3, %0 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+r" (src), "+r" (temp_ptr), "+r"(count)\ + : "r" ((x86_reg)srcStride)\ + : "memory"\ + );\ + \ + temp_ptr= temp;\ + count=4;\ + \ +/*FIXME reorder for speed */\ + __asm__ volatile(\ + /*"pxor %%mm7, %%mm7 \n\t"*/\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 8(%0), %%mm1 \n\t"\ + "movq 16(%0), %%mm2 \n\t"\ + "movq 24(%0), %%mm3 \n\t"\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ + \ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), 
OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ + \ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ + "add %4, %1 \n\t" \ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ + \ + "add $136, %0 \n\t"\ + "add %6, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + \ + : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ + : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\ + :"memory"\ + );\ +}\ +\ +static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + uint64_t temp[9*2];\ + uint64_t *temp_ptr= temp;\ + int count= 9;\ +\ + /*FIXME unroll */\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq (%0), %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "movq %%mm0, (%1) \n\t"\ + "movq %%mm1, 9*8(%1) \n\t"\ + "add $8, %1 \n\t"\ + "add %3, %0 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+r" (src), "+r" (temp_ptr), "+r"(count)\ + : "r" ((x86_reg)srcStride)\ + : "memory"\ + );\ + \ + temp_ptr= temp;\ + count=2;\ + \ +/*FIXME reorder for speed */\ + __asm__ volatile(\ + /*"pxor %%mm7, %%mm7 \n\t"*/\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 8(%0), %%mm1 \n\t"\ + "movq 16(%0), %%mm2 \n\t"\ + "movq 24(%0), %%mm3 \n\t"\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ + \ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ + \ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ + "add %4, %1 \n\t"\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ + \ + "add $72, %0 \n\t"\ + "add %6, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + \ + : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ + : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +\ +static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[8];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, 
int stride){\ + OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[8];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[8];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ + OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ +}\ +\ +static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[8];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ + OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ +}\ +static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc23_ ## 
MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half) + 64;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ + OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ +}\ +static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ +}\ +static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[8 + 9];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ +}\ +static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[9];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ + OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ +}\ +static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[32];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[32];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[32];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ + OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ +}\ +\ +static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t temp[32];\ + uint8_t * const half= (uint8_t*)temp;\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ + OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ +}\ +static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## 
MMX(dst, halfH, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[16*2 + 17*2];\ + uint8_t * const halfH= ((uint8_t*)half) + 256;\ + uint8_t * const halfHV= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ + OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ +}\ +static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[17*2];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ +}\ +static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[17*2];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ +}\ +static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + uint64_t half[17*2];\ + uint8_t * const halfH= ((uint8_t*)half);\ + put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ + OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ +} + +#define PUT_OP(a,b,temp, size) "mov" #size " " 
#a ", " #b " \n\t" +#define AVG_3DNOW_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgusb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" +#define AVG_MMX2_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" + +QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) +QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) +QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) +QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) +QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) +QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) +QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) +QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) +QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) + +/***********************************/ +/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */ + +#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\ +static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\ +} +#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\ +static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\ +} + +#define QPEL_2TAP(OPNAME, SIZE, MMX)\ +QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\ +QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\ +QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\ +static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\ + OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\ +static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\ + OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\ +static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\ + OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\ +static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\ +}\ +static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\ +}\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\ + +QPEL_2TAP(put_, 16, mmx2) +QPEL_2TAP(avg_, 16, mmx2) +QPEL_2TAP(put_, 8, mmx2) +QPEL_2TAP(avg_, 8, mmx2) +QPEL_2TAP(put_, 16, 3dnow) +QPEL_2TAP(avg_, 16, 3dnow) +QPEL_2TAP(put_, 8, 3dnow) +QPEL_2TAP(avg_, 8, 3dnow) + + +#if 0 +static void just_return() { return; } +#endif + +static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){ + const int w = 8; + const int ix = ox>>(16+shift); + const int iy = oy>>(16+shift); + const int oxs = ox>>4; + const int oys = oy>>4; + const int dxxs = dxx>>4; + const int dxys = dxy>>4; + const int dyxs = dyx>>4; + const int dyys = dyy>>4; + const uint16_t r4[4] = {r,r,r,r}; + const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; + const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; + const 
uint64_t shift2 = 2*shift; + uint8_t edge_buf[(h+1)*stride]; + int x, y; + + const int dxw = (dxx-(1<<(16+shift)))*(w-1); + const int dyh = (dyy-(1<<(16+shift)))*(h-1); + const int dxh = dxy*(h-1); + const int dyw = dyx*(w-1); + if( // non-constant fullpel offset (3% of blocks) + ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) | + (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift) + // uses more than 16 bits of subpel mv (only at huge resolution) + || (dxx|dxy|dyx|dyy)&15 ) + { + //FIXME could still use mmx for some of the rows + ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); + return; + } + + src += ix + iy*stride; + if( (unsigned)ix >= width-w || + (unsigned)iy >= height-h ) + { + ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); + src = edge_buf; + } + + __asm__ volatile( + "movd %0, %%mm6 \n\t" + "pxor %%mm7, %%mm7 \n\t" + "punpcklwd %%mm6, %%mm6 \n\t" + "punpcklwd %%mm6, %%mm6 \n\t" + :: "r"(1<<shift) + ); + + for(x=0; x<w; x+=4){ + uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0), + oxs - dxys + dxxs*(x+1), + oxs - dxys + dxxs*(x+2), + oxs - dxys + dxxs*(x+3) }; + uint16_t dy4[4] = { oys - dyys + dyxs*(x+0), + oys - dyys + dyxs*(x+1), + oys - dyys + dyxs*(x+2), + oys - dyys + dyxs*(x+3) }; + + for(y=0; y<h; y++){ + __asm__ volatile( + "movq %0, %%mm4 \n\t" + "movq %1, %%mm5 \n\t" + "paddw %2, %%mm4 \n\t" + "paddw %3, %%mm5 \n\t" + "movq %%mm4, %0 \n\t" + "movq %%mm5, %1 \n\t" + "psrlw $12, %%mm4 \n\t" + "psrlw $12, %%mm5 \n\t" + : "+m"(*dx4), "+m"(*dy4) + : "m"(*dxy4), "m"(*dyy4) + ); + + __asm__ volatile( + "movq %%mm6, %%mm2 \n\t" + "movq %%mm6, %%mm1 \n\t" + "psubw %%mm4, %%mm2 \n\t" + "psubw %%mm5, %%mm1 \n\t" + "movq %%mm2, %%mm0 \n\t" + "movq %%mm4, %%mm3 \n\t" + "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy) + "pmullw %%mm5, %%mm3 \n\t" // dx*dy + "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy + "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy) + + "movd %4, %%mm5 \n\t" + "movd %3, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy + "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy + + "movd %2, %%mm5 \n\t" + "movd %1, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy) + "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy) + "paddw %5, %%mm1 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm2, %%mm0 \n\t" + + "psrlw %6, %%mm0 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "movd %%mm0, %0 \n\t" + + : "=m"(dst[x+y*stride]) + : "m"(src[0]), "m"(src[1]), + "m"(src[stride]), "m"(src[stride+1]), + "m"(*r4), "m"(shift2) + ); + src += stride; + } + src += 4-h*stride; + } +} + +#define PREFETCH(name, op) \ +static void name(void *mem, int stride, int h){\ + const uint8_t *p= mem;\ + do{\ + __asm__ volatile(#op" %0" :: "m"(*p));\ + p+= stride;\ + }while(--h);\ +} +PREFETCH(prefetch_mmx2, prefetcht0) +PREFETCH(prefetch_3dnow, prefetch) +#undef PREFETCH + +#include "h264dsp_mmx.c" + +/* CAVS specific */ +void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx); +void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx); + +void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { + put_pixels8_mmx(dst, src, stride, 8); +} +void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { + avg_pixels8_mmx(dst, src, stride, 8); +} +void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { + 
put_pixels16_mmx(dst, src, stride, 16); +} +void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { + avg_pixels16_mmx(dst, src, stride, 16); +} + +/* VC1 specific */ +void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); + +void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { + put_pixels8_mmx(dst, src, stride, 8); +} + +/* external functions, from idct_mmx.c */ +void ff_mmx_idct(DCTELEM *block); +void ff_mmxext_idct(DCTELEM *block); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + converted */ +#ifdef CONFIG_GPL +static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_mmx_idct (block); + put_pixels_clamped_mmx(block, dest, line_size); +} +static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_mmx_idct (block); + add_pixels_clamped_mmx(block, dest, line_size); +} +static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_mmxext_idct (block); + put_pixels_clamped_mmx(block, dest, line_size); +} +static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_mmxext_idct (block); + add_pixels_clamped_mmx(block, dest, line_size); +} +#endif +static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_idct_xvid_mmx (block); + put_pixels_clamped_mmx(block, dest, line_size); +} +static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_idct_xvid_mmx (block); + add_pixels_clamped_mmx(block, dest, line_size); +} +static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_idct_xvid_mmx2 (block); + put_pixels_clamped_mmx(block, dest, line_size); +} +static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_idct_xvid_mmx2 (block); + add_pixels_clamped_mmx(block, dest, line_size); +} + +static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) +{ + int i; + __asm__ volatile("pxor %%mm7, %%mm7":); + for(i=0; i<blocksize; i+=2) { + __asm__ volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 + "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 + "pslld $31, %%mm2 \n\t" // keep only the sign bit + "pxor %%mm2, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm1, %%mm3 \n\t" + "pandn %%mm1, %%mm4 \n\t" + "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) + "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) + "movq %%mm3, %1 \n\t" + "movq %%mm0, %0 \n\t" + :"+m"(mag[i]), "+m"(ang[i]) + ::"memory" + ); + } + __asm__ volatile("femms"); +} +static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) +{ + int i; + + __asm__ volatile( + "movaps %0, %%xmm5 \n\t" + ::"m"(ff_pdw_80000000[0]) + ); + for(i=0; i<blocksize; i+=4) { + __asm__ volatile( + "movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" + "xorps %%xmm2, %%xmm2 \n\t" + "xorps %%xmm3, %%xmm3 \n\t" + "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 + "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 + "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit + "xorps %%xmm2, %%xmm1 \n\t" + "movaps %%xmm3, %%xmm4 \n\t" + "andps %%xmm1, %%xmm3 \n\t" + "andnps %%xmm1, %%xmm4 \n\t" + "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) + "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) + "movaps %%xmm3, %1 \n\t" + "movaps %%xmm0, %0 \n\t" + :"+m"(mag[i]), "+m"(ang[i]) + 
::"memory" + ); + } +} + +#define IF1(x) x +#define IF0(x) + +#define MIX5(mono,stereo)\ + __asm__ volatile(\ + "movss 0(%2), %%xmm5 \n"\ + "movss 8(%2), %%xmm6 \n"\ + "movss 24(%2), %%xmm7 \n"\ + "shufps $0, %%xmm5, %%xmm5 \n"\ + "shufps $0, %%xmm6, %%xmm6 \n"\ + "shufps $0, %%xmm7, %%xmm7 \n"\ + "1: \n"\ + "movaps (%0,%1), %%xmm0 \n"\ + "movaps 0x400(%0,%1), %%xmm1 \n"\ + "movaps 0x800(%0,%1), %%xmm2 \n"\ + "movaps 0xc00(%0,%1), %%xmm3 \n"\ + "movaps 0x1000(%0,%1), %%xmm4 \n"\ + "mulps %%xmm5, %%xmm0 \n"\ + "mulps %%xmm6, %%xmm1 \n"\ + "mulps %%xmm5, %%xmm2 \n"\ + "mulps %%xmm7, %%xmm3 \n"\ + "mulps %%xmm7, %%xmm4 \n"\ + stereo("addps %%xmm1, %%xmm0 \n")\ + "addps %%xmm1, %%xmm2 \n"\ + "addps %%xmm3, %%xmm0 \n"\ + "addps %%xmm4, %%xmm2 \n"\ + mono("addps %%xmm2, %%xmm0 \n")\ + "movaps %%xmm0, (%0,%1) \n"\ + stereo("movaps %%xmm2, 0x400(%0,%1) \n")\ + "add $16, %0 \n"\ + "jl 1b \n"\ + :"+&r"(i)\ + :"r"(samples[0]+len), "r"(matrix)\ + :"memory"\ + ); + +#define MIX_MISC(stereo)\ + __asm__ volatile(\ + "1: \n"\ + "movaps (%3,%0), %%xmm0 \n"\ + stereo("movaps %%xmm0, %%xmm1 \n")\ + "mulps %%xmm6, %%xmm0 \n"\ + stereo("mulps %%xmm7, %%xmm1 \n")\ + "lea 1024(%3,%0), %1 \n"\ + "mov %5, %2 \n"\ + "2: \n"\ + "movaps (%1), %%xmm2 \n"\ + stereo("movaps %%xmm2, %%xmm3 \n")\ + "mulps (%4,%2), %%xmm2 \n"\ + stereo("mulps 16(%4,%2), %%xmm3 \n")\ + "addps %%xmm2, %%xmm0 \n"\ + stereo("addps %%xmm3, %%xmm1 \n")\ + "add $1024, %1 \n"\ + "add $32, %2 \n"\ + "jl 2b \n"\ + "movaps %%xmm0, (%3,%0) \n"\ + stereo("movaps %%xmm1, 1024(%3,%0) \n")\ + "add $16, %0 \n"\ + "jl 1b \n"\ + :"+&r"(i), "=&r"(j), "=&r"(k)\ + :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\ + :"memory"\ + ); + +static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len) +{ + int (*matrix_cmp)[2] = (int(*)[2])matrix; + intptr_t i,j,k; + + i = -len*sizeof(float); + if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) { + MIX5(IF0,IF1); + } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { + MIX5(IF1,IF0); + } else { + DECLARE_ALIGNED_16(float, matrix_simd[in_ch][2][4]); + j = 2*in_ch*sizeof(float); + __asm__ volatile( + "1: \n" + "sub $8, %0 \n" + "movss (%2,%0), %%xmm6 \n" + "movss 4(%2,%0), %%xmm7 \n" + "shufps $0, %%xmm6, %%xmm6 \n" + "shufps $0, %%xmm7, %%xmm7 \n" + "movaps %%xmm6, (%1,%0,4) \n" + "movaps %%xmm7, 16(%1,%0,4) \n" + "jg 1b \n" + :"+&r"(j) + :"r"(matrix_simd), "r"(matrix) + :"memory" + ); + if(out_ch == 2) { + MIX_MISC(IF1); + } else { + MIX_MISC(IF0); + } + } +} + +static void vector_fmul_3dnow(float *dst, const float *src, int len){ + x86_reg i = (len-4)*4; + __asm__ volatile( + "1: \n\t" + "movq (%1,%0), %%mm0 \n\t" + "movq 8(%1,%0), %%mm1 \n\t" + "pfmul (%2,%0), %%mm0 \n\t" + "pfmul 8(%2,%0), %%mm1 \n\t" + "movq %%mm0, (%1,%0) \n\t" + "movq %%mm1, 8(%1,%0) \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + "femms \n\t" + :"+r"(i) + :"r"(dst), "r"(src) + :"memory" + ); +} +static void vector_fmul_sse(float *dst, const float *src, int len){ + x86_reg i = (len-8)*4; + __asm__ volatile( + "1: \n\t" + "movaps (%1,%0), %%xmm0 \n\t" + "movaps 16(%1,%0), %%xmm1 \n\t" + "mulps (%2,%0), %%xmm0 \n\t" + "mulps 16(%2,%0), %%xmm1 \n\t" + "movaps %%xmm0, (%1,%0) \n\t" + "movaps %%xmm1, 16(%1,%0) \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i) + :"r"(dst), "r"(src) + :"memory" + ); +} 
+ +static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ + x86_reg i = len*4-16; + __asm__ volatile( + "1: \n\t" + "pswapd 8(%1), %%mm0 \n\t" + "pswapd (%1), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" + "movq %%mm0, (%2,%0) \n\t" + "movq %%mm1, 8(%2,%0) \n\t" + "add $16, %1 \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(src1) + :"r"(dst), "r"(src0) + ); + __asm__ volatile("femms"); +} +static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){ + x86_reg i = len*4-32; + __asm__ volatile( + "1: \n\t" + "movaps 16(%1), %%xmm0 \n\t" + "movaps (%1), %%xmm1 \n\t" + "shufps $0x1b, %%xmm0, %%xmm0 \n\t" + "shufps $0x1b, %%xmm1, %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "movaps %%xmm0, (%2,%0) \n\t" + "movaps %%xmm1, 16(%2,%0) \n\t" + "add $32, %1 \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(src1) + :"r"(dst), "r"(src0) + ); +} + +static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, + const float *src2, int src3, int len, int step){ + x86_reg i = (len-4)*4; + if(step == 2 && src3 == 0){ + dst += (len-4)*2; + __asm__ volatile( + "1: \n\t" + "movq (%2,%0), %%mm0 \n\t" + "movq 8(%2,%0), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" + "pfadd (%4,%0), %%mm0 \n\t" + "pfadd 8(%4,%0), %%mm1 \n\t" + "movd %%mm0, (%1) \n\t" + "movd %%mm1, 16(%1) \n\t" + "psrlq $32, %%mm0 \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm0, 8(%1) \n\t" + "movd %%mm1, 24(%1) \n\t" + "sub $32, %1 \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(dst) + :"r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else if(step == 1 && src3 == 0){ + __asm__ volatile( + "1: \n\t" + "movq (%2,%0), %%mm0 \n\t" + "movq 8(%2,%0), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" + "pfadd (%4,%0), %%mm0 \n\t" + "pfadd 8(%4,%0), %%mm1 \n\t" + "movq %%mm0, (%1,%0) \n\t" + "movq %%mm1, 8(%1,%0) \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i) + :"r"(dst), "r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else + ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); + __asm__ volatile("femms"); +} +static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1, + const float *src2, int src3, int len, int step){ + x86_reg i = (len-8)*4; + if(step == 2 && src3 == 0){ + dst += (len-8)*2; + __asm__ volatile( + "1: \n\t" + "movaps (%2,%0), %%xmm0 \n\t" + "movaps 16(%2,%0), %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "addps (%4,%0), %%xmm0 \n\t" + "addps 16(%4,%0), %%xmm1 \n\t" + "movss %%xmm0, (%1) \n\t" + "movss %%xmm1, 32(%1) \n\t" + "movhlps %%xmm0, %%xmm2 \n\t" + "movhlps %%xmm1, %%xmm3 \n\t" + "movss %%xmm2, 16(%1) \n\t" + "movss %%xmm3, 48(%1) \n\t" + "shufps $0xb1, %%xmm0, %%xmm0 \n\t" + "shufps $0xb1, %%xmm1, %%xmm1 \n\t" + "movss %%xmm0, 8(%1) \n\t" + "movss %%xmm1, 40(%1) \n\t" + "movhlps %%xmm0, %%xmm2 \n\t" + "movhlps %%xmm1, %%xmm3 \n\t" + "movss %%xmm2, 24(%1) \n\t" + "movss %%xmm3, 56(%1) \n\t" + "sub $64, %1 \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(dst) + :"r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else if(step == 1 && src3 == 0){ + __asm__ volatile( + "1: \n\t" + "movaps (%2,%0), %%xmm0 \n\t" + "movaps 16(%2,%0), %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "addps (%4,%0), %%xmm0 \n\t" + "addps 16(%4,%0), %%xmm1 \n\t" + "movaps 
%%xmm0, (%1,%0) \n\t" + "movaps %%xmm1, 16(%1,%0) \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i) + :"r"(dst), "r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else + ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); +} + +static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, + const float *win, float add_bias, int len){ +#ifdef HAVE_6REGS + if(add_bias == 0){ + x86_reg i = -len*4; + x86_reg j = len*4-8; + __asm__ volatile( + "1: \n" + "pswapd (%5,%1), %%mm1 \n" + "movq (%5,%0), %%mm0 \n" + "pswapd (%4,%1), %%mm5 \n" + "movq (%3,%0), %%mm4 \n" + "movq %%mm0, %%mm2 \n" + "movq %%mm1, %%mm3 \n" + "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] + "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] + "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] + "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] + "pfadd %%mm3, %%mm2 \n" + "pfsub %%mm0, %%mm1 \n" + "pswapd %%mm2, %%mm2 \n" + "movq %%mm1, (%2,%0) \n" + "movq %%mm2, (%2,%1) \n" + "sub $8, %1 \n" + "add $8, %0 \n" + "jl 1b \n" + "femms \n" + :"+r"(i), "+r"(j) + :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) + ); + }else +#endif + ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); +} + +static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, + const float *win, float add_bias, int len){ +#ifdef HAVE_6REGS + if(add_bias == 0){ + x86_reg i = -len*4; + x86_reg j = len*4-16; + __asm__ volatile( + "1: \n" + "movaps (%5,%1), %%xmm1 \n" + "movaps (%5,%0), %%xmm0 \n" + "movaps (%4,%1), %%xmm5 \n" + "movaps (%3,%0), %%xmm4 \n" + "shufps $0x1b, %%xmm1, %%xmm1 \n" + "shufps $0x1b, %%xmm5, %%xmm5 \n" + "movaps %%xmm0, %%xmm2 \n" + "movaps %%xmm1, %%xmm3 \n" + "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] + "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] + "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] + "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] + "addps %%xmm3, %%xmm2 \n" + "subps %%xmm0, %%xmm1 \n" + "shufps $0x1b, %%xmm2, %%xmm2 \n" + "movaps %%xmm1, (%2,%0) \n" + "movaps %%xmm2, (%2,%1) \n" + "sub $16, %1 \n" + "add $16, %0 \n" + "jl 1b \n" + :"+r"(i), "+r"(j) + :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) + ); + }else +#endif + ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); +} + +static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) +{ + x86_reg i = -4*len; + __asm__ volatile( + "movss %3, %%xmm4 \n" + "shufps $0, %%xmm4, %%xmm4 \n" + "1: \n" + "cvtpi2ps (%2,%0), %%xmm0 \n" + "cvtpi2ps 8(%2,%0), %%xmm1 \n" + "cvtpi2ps 16(%2,%0), %%xmm2 \n" + "cvtpi2ps 24(%2,%0), %%xmm3 \n" + "movlhps %%xmm1, %%xmm0 \n" + "movlhps %%xmm3, %%xmm2 \n" + "mulps %%xmm4, %%xmm0 \n" + "mulps %%xmm4, %%xmm2 \n" + "movaps %%xmm0, (%1,%0) \n" + "movaps %%xmm2, 16(%1,%0) \n" + "add $32, %0 \n" + "jl 1b \n" + :"+r"(i) + :"r"(dst+len), "r"(src+len), "m"(mul) + ); +} + +static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) +{ + x86_reg i = -4*len; + __asm__ volatile( + "movss %3, %%xmm4 \n" + "shufps $0, %%xmm4, %%xmm4 \n" + "1: \n" + "cvtdq2ps (%2,%0), %%xmm0 \n" + "cvtdq2ps 16(%2,%0), %%xmm1 \n" + "mulps %%xmm4, %%xmm0 \n" + "mulps %%xmm4, %%xmm1 \n" + "movaps %%xmm0, (%1,%0) \n" + "movaps %%xmm1, 16(%1,%0) \n" + "add $32, %0 \n" + "jl 1b \n" + :"+r"(i) + :"r"(dst+len), "r"(src+len), "m"(mul) + ); +} + +static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ + x86_reg reglen = len; + // not bit-exact: pf2id uses different rounding than C and 
SSE + __asm__ volatile( + "add %0 , %0 \n\t" + "lea (%2,%0,2) , %2 \n\t" + "add %0 , %1 \n\t" + "neg %0 \n\t" + "1: \n\t" + "pf2id (%2,%0,2) , %%mm0 \n\t" + "pf2id 8(%2,%0,2) , %%mm1 \n\t" + "pf2id 16(%2,%0,2) , %%mm2 \n\t" + "pf2id 24(%2,%0,2) , %%mm3 \n\t" + "packssdw %%mm1 , %%mm0 \n\t" + "packssdw %%mm3 , %%mm2 \n\t" + "movq %%mm0 , (%1,%0) \n\t" + "movq %%mm2 , 8(%1,%0) \n\t" + "add $16 , %0 \n\t" + " js 1b \n\t" + "femms \n\t" + :"+r"(reglen), "+r"(dst), "+r"(src) + ); +} +static void float_to_int16_sse(int16_t *dst, const float *src, long len){ + x86_reg reglen = len; + __asm__ volatile( + "add %0 , %0 \n\t" + "lea (%2,%0,2) , %2 \n\t" + "add %0 , %1 \n\t" + "neg %0 \n\t" + "1: \n\t" + "cvtps2pi (%2,%0,2) , %%mm0 \n\t" + "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t" + "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" + "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" + "packssdw %%mm1 , %%mm0 \n\t" + "packssdw %%mm3 , %%mm2 \n\t" + "movq %%mm0 , (%1,%0) \n\t" + "movq %%mm2 , 8(%1,%0) \n\t" + "add $16 , %0 \n\t" + " js 1b \n\t" + "emms \n\t" + :"+r"(reglen), "+r"(dst), "+r"(src) + ); +} + +static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ + x86_reg reglen = len; + __asm__ volatile( + "add %0 , %0 \n\t" + "lea (%2,%0,2) , %2 \n\t" + "add %0 , %1 \n\t" + "neg %0 \n\t" + "1: \n\t" + "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" + "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" + "packssdw %%xmm1 , %%xmm0 \n\t" + "movdqa %%xmm0 , (%1,%0) \n\t" + "add $16 , %0 \n\t" + " js 1b \n\t" + :"+r"(reglen), "+r"(dst), "+r"(src) + ); +} + +#ifdef HAVE_YASM +void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); +void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); +void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); +void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); +void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); +void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); +static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) +{ + ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); + ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); +} +void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); +void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); +#else +#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) +#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) +#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) +#endif +#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse + +#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ +/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ +static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ + DECLARE_ALIGNED_16(int16_t, tmp[len]);\ + int i,j,c;\ + for(c=0; c<channels; c++){\ + float_to_int16_##cpu(tmp, src[c], len);\ + for(i=0, j=c; i<len; i++, j+=channels)\ + dst[j] = tmp[i];\ + }\ +}\ +\ +static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ + 
if(channels==1)\ + float_to_int16_##cpu(dst, src[0], len);\ + else if(channels==2){\ + x86_reg reglen = len; \ + const float *src0 = src[0];\ + const float *src1 = src[1];\ + __asm__ volatile(\ + "shl $2, %0 \n"\ + "add %0, %1 \n"\ + "add %0, %2 \n"\ + "add %0, %3 \n"\ + "neg %0 \n"\ + body\ + :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ + );\ + }else if(channels==6){\ + ff_float_to_int16_interleave6_##cpu(dst, src, len);\ + }else\ + float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ +} + +FLOAT_TO_INT16_INTERLEAVE(3dnow, + "1: \n" + "pf2id (%2,%0), %%mm0 \n" + "pf2id 8(%2,%0), %%mm1 \n" + "pf2id (%3,%0), %%mm2 \n" + "pf2id 8(%3,%0), %%mm3 \n" + "packssdw %%mm1, %%mm0 \n" + "packssdw %%mm3, %%mm2 \n" + "movq %%mm0, %%mm1 \n" + "punpcklwd %%mm2, %%mm0 \n" + "punpckhwd %%mm2, %%mm1 \n" + "movq %%mm0, (%1,%0)\n" + "movq %%mm1, 8(%1,%0)\n" + "add $16, %0 \n" + "js 1b \n" + "femms \n" +) + +FLOAT_TO_INT16_INTERLEAVE(sse, + "1: \n" + "cvtps2pi (%2,%0), %%mm0 \n" + "cvtps2pi 8(%2,%0), %%mm1 \n" + "cvtps2pi (%3,%0), %%mm2 \n" + "cvtps2pi 8(%3,%0), %%mm3 \n" + "packssdw %%mm1, %%mm0 \n" + "packssdw %%mm3, %%mm2 \n" + "movq %%mm0, %%mm1 \n" + "punpcklwd %%mm2, %%mm0 \n" + "punpckhwd %%mm2, %%mm1 \n" + "movq %%mm0, (%1,%0)\n" + "movq %%mm1, 8(%1,%0)\n" + "add $16, %0 \n" + "js 1b \n" + "emms \n" +) + +FLOAT_TO_INT16_INTERLEAVE(sse2, + "1: \n" + "cvtps2dq (%2,%0), %%xmm0 \n" + "cvtps2dq (%3,%0), %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "movhlps %%xmm0, %%xmm1 \n" + "punpcklwd %%xmm1, %%xmm0 \n" + "movdqa %%xmm0, (%1,%0) \n" + "add $16, %0 \n" + "js 1b \n" +) + +static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ + if(channels==6) + ff_float_to_int16_interleave6_3dn2(dst, src, len); + else + float_to_int16_interleave_3dnow(dst, src, len, channels); +} + + +void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width); +void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width); +void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); +void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); +void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); +void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); + + +static void add_int16_sse2(int16_t * v1, int16_t * v2, int order) +{ + x86_reg o = -(order << 1); + v1 += order; + v2 += order; + __asm__ volatile( + "1: \n\t" + "movdqu (%1,%2), %%xmm0 \n\t" + "movdqu 16(%1,%2), %%xmm1 \n\t" + "paddw (%0,%2), %%xmm0 \n\t" + "paddw 16(%0,%2), %%xmm1 \n\t" + "movdqa %%xmm0, (%0,%2) \n\t" + "movdqa %%xmm1, 16(%0,%2) \n\t" + "add $32, %2 \n\t" + "js 1b \n\t" + : "+r"(v1), "+r"(v2), "+r"(o) + ); +} + +static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order) +{ + x86_reg o = -(order << 1); + v1 += order; + v2 += order; + __asm__ volatile( + "1: \n\t" + "movdqa (%0,%2), %%xmm0 \n\t" + "movdqa 16(%0,%2), %%xmm2 \n\t" + "movdqu (%1,%2), %%xmm1 \n\t" + "movdqu 16(%1,%2), %%xmm3 \n\t" + "psubw %%xmm1, %%xmm0 \n\t" + "psubw %%xmm3, %%xmm2 \n\t" + "movdqa %%xmm0, (%0,%2) \n\t" + "movdqa %%xmm2, 16(%0,%2) \n\t" + "add $32, %2 \n\t" + "js 1b \n\t" + : "+r"(v1), 
"+r"(v2), "+r"(o) + ); +} + +static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift) +{ + int res = 0; + DECLARE_ALIGNED_16(int64_t, sh); + x86_reg o = -(order << 1); + + v1 += order; + v2 += order; + sh = shift; + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "1: \n\t" + "movdqu (%0,%3), %%xmm0 \n\t" + "movdqu 16(%0,%3), %%xmm1 \n\t" + "pmaddwd (%1,%3), %%xmm0 \n\t" + "pmaddwd 16(%1,%3), %%xmm1 \n\t" + "paddd %%xmm0, %%xmm7 \n\t" + "paddd %%xmm1, %%xmm7 \n\t" + "add $32, %3 \n\t" + "js 1b \n\t" + "movhlps %%xmm7, %%xmm2 \n\t" + "paddd %%xmm2, %%xmm7 \n\t" + "psrad %4, %%xmm7 \n\t" + "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t" + "paddd %%xmm2, %%xmm7 \n\t" + "movd %%xmm7, %2 \n\t" + : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o) + : "m"(sh) + ); + return res; +} + +void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) +{ + mm_flags = mm_support(); + + if (avctx->dsp_mask) { + if (avctx->dsp_mask & FF_MM_FORCE) + mm_flags |= (avctx->dsp_mask & 0xffff); + else + mm_flags &= ~(avctx->dsp_mask & 0xffff); + } + +#if 0 + av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); + if (mm_flags & FF_MM_MMX) + av_log(avctx, AV_LOG_INFO, " mmx"); + if (mm_flags & FF_MM_MMXEXT) + av_log(avctx, AV_LOG_INFO, " mmxext"); + if (mm_flags & FF_MM_3DNOW) + av_log(avctx, AV_LOG_INFO, " 3dnow"); + if (mm_flags & FF_MM_SSE) + av_log(avctx, AV_LOG_INFO, " sse"); + if (mm_flags & FF_MM_SSE2) + av_log(avctx, AV_LOG_INFO, " sse2"); + av_log(avctx, AV_LOG_INFO, "\n"); +#endif + + if (mm_flags & FF_MM_MMX) { + const int idct_algo= avctx->idct_algo; + + if(avctx->lowres==0){ + if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ + c->idct_put= ff_simple_idct_put_mmx; + c->idct_add= ff_simple_idct_add_mmx; + c->idct = ff_simple_idct_mmx; + c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; +#ifdef CONFIG_GPL + }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ + if(mm_flags & FF_MM_MMXEXT){ + c->idct_put= ff_libmpeg2mmx2_idct_put; + c->idct_add= ff_libmpeg2mmx2_idct_add; + c->idct = ff_mmxext_idct; + }else{ + c->idct_put= ff_libmpeg2mmx_idct_put; + c->idct_add= ff_libmpeg2mmx_idct_add; + c->idct = ff_mmx_idct; + } + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; +#endif + }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER) && + idct_algo==FF_IDCT_VP3){ + if(mm_flags & FF_MM_SSE2){ + c->idct_put= ff_vp3_idct_put_sse2; + c->idct_add= ff_vp3_idct_add_sse2; + c->idct = ff_vp3_idct_sse2; + c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; + }else{ + c->idct_put= ff_vp3_idct_put_mmx; + c->idct_add= ff_vp3_idct_add_mmx; + c->idct = ff_vp3_idct_mmx; + c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; + } + }else if(idct_algo==FF_IDCT_CAVS){ + c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; + }else if(idct_algo==FF_IDCT_XVIDMMX){ + if(mm_flags & FF_MM_SSE2){ + c->idct_put= ff_idct_xvid_sse2_put; + c->idct_add= ff_idct_xvid_sse2_add; + c->idct = ff_idct_xvid_sse2; + c->idct_permutation_type= FF_SSE2_IDCT_PERM; + }else if(mm_flags & FF_MM_MMXEXT){ + c->idct_put= ff_idct_xvid_mmx2_put; + c->idct_add= ff_idct_xvid_mmx2_add; + c->idct = ff_idct_xvid_mmx2; + }else{ + c->idct_put= ff_idct_xvid_mmx_put; + c->idct_add= ff_idct_xvid_mmx_add; + c->idct = ff_idct_xvid_mmx; + } + } + } + + c->put_pixels_clamped = put_pixels_clamped_mmx; + c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; + c->add_pixels_clamped = add_pixels_clamped_mmx; + c->clear_block = clear_block_mmx; + c->clear_blocks = clear_blocks_mmx; + if (mm_flags & FF_MM_SSE) + c->clear_block = 
clear_block_sse; + +#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ + c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ + c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU + + SET_HPEL_FUNCS(put, 0, 16, mmx); + SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx); + SET_HPEL_FUNCS(avg, 0, 16, mmx); + SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx); + SET_HPEL_FUNCS(put, 1, 8, mmx); + SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); + SET_HPEL_FUNCS(avg, 1, 8, mmx); + SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); + + c->gmc= gmc_mmx; + + c->add_bytes= add_bytes_mmx; + c->add_bytes_l2= add_bytes_l2_mmx; + + c->draw_edges = draw_edges_mmx; + + if (ENABLE_ANY_H263) { + c->h263_v_loop_filter= h263_v_loop_filter_mmx; + c->h263_h_loop_filter= h263_h_loop_filter_mmx; + } + c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd; + c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; + c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd; + + c->h264_idct_dc_add= + c->h264_idct_add= ff_h264_idct_add_mmx; + c->h264_idct8_dc_add= + c->h264_idct8_add= ff_h264_idct8_add_mmx; + + c->h264_idct_add16 = ff_h264_idct_add16_mmx; + c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; + c->h264_idct_add8 = ff_h264_idct_add8_mmx; + c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; + + if (mm_flags & FF_MM_MMXEXT) { + c->prefetch = prefetch_mmx2; + + c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; + c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; + + c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; + + c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; + c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; + + c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; + c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; + c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; + + c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; + c->h264_idct_add16 = ff_h264_idct_add16_mmx2; + c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; + c->h264_idct_add8 = ff_h264_idct_add8_mmx2; + c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; + + if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) { + c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2; + c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2; + } + } + +#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \ + c->PFX ## 
_pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \ + c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU + + SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2); + SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2); + SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2); + SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2); + SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2); + SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2); + + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2); + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2); + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2); + + SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2); + SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2); + SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2); + SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2); + + c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd; + c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; + c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2; + c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2; + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; + c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; + c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; + c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; + c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; + c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; + + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; + c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; + c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; + c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; + + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; + c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; + c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; + c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; + + if (ENABLE_CAVS_DECODER) + ff_cavsdsp_init_mmx2(c, avctx); + + if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER) + ff_vc1dsp_init_mmx(c, avctx); + + c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; + } else if (mm_flags & FF_MM_3DNOW) { + c->prefetch = prefetch_3dnow; + + c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; + c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; + + c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; + c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; + c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; + + c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; + c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; + + c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; + 
c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; + c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; + } + + SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow); + SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow); + SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow); + SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow); + SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow); + SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow); + + SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow); + SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow); + SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow); + SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow); + SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow); + SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow); + + SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow); + SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow); + SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow); + SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); + + c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd; + c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; + + if (ENABLE_CAVS_DECODER) + ff_cavsdsp_init_3dnow(c, avctx); + } + + +#define H264_QPEL_FUNCS(x, y, CPU)\ + c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ + c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ + c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ + c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; + if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){ + // these functions are slower than mmx on AMD, but faster on Intel +/* FIXME works in most codecs, but crashes svq1 due to unaligned chroma + c->put_pixels_tab[0][0] = put_pixels16_sse2; + c->avg_pixels_tab[0][0] = avg_pixels16_sse2; +*/ + H264_QPEL_FUNCS(0, 0, sse2); + } + if(mm_flags & FF_MM_SSE2){ + c->h264_idct8_add = ff_h264_idct8_add_sse2; + c->h264_idct8_add4= ff_h264_idct8_add4_sse2; + + H264_QPEL_FUNCS(0, 1, sse2); + H264_QPEL_FUNCS(0, 2, sse2); + H264_QPEL_FUNCS(0, 3, sse2); + H264_QPEL_FUNCS(1, 1, sse2); + H264_QPEL_FUNCS(1, 2, sse2); + H264_QPEL_FUNCS(1, 3, sse2); + H264_QPEL_FUNCS(2, 1, sse2); + H264_QPEL_FUNCS(2, 2, sse2); + H264_QPEL_FUNCS(2, 3, sse2); + H264_QPEL_FUNCS(3, 1, sse2); + H264_QPEL_FUNCS(3, 2, sse2); + H264_QPEL_FUNCS(3, 3, sse2); + } +#ifdef HAVE_SSSE3 + if(mm_flags & FF_MM_SSSE3){ + H264_QPEL_FUNCS(1, 0, ssse3); + H264_QPEL_FUNCS(1, 1, ssse3); + H264_QPEL_FUNCS(1, 2, ssse3); + H264_QPEL_FUNCS(1, 3, ssse3); + H264_QPEL_FUNCS(2, 0, ssse3); + H264_QPEL_FUNCS(2, 1, ssse3); + H264_QPEL_FUNCS(2, 2, ssse3); + H264_QPEL_FUNCS(2, 3, ssse3); + H264_QPEL_FUNCS(3, 0, ssse3); + H264_QPEL_FUNCS(3, 1, ssse3); + H264_QPEL_FUNCS(3, 2, ssse3); + H264_QPEL_FUNCS(3, 3, ssse3); + c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_nornd; + c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; + c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; + c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; + c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; + c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; + } +#endif + +#if defined(CONFIG_GPL) && defined(HAVE_YASM) + if( mm_flags&FF_MM_MMXEXT ){ 
+#ifdef ARCH_X86_32 + c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; + c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; +#endif + if( mm_flags&FF_MM_SSE2 ){ + c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; + c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; + c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; + c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; + } + } +#endif + +#ifdef CONFIG_SNOW_DECODER + if(mm_flags & FF_MM_SSE2 & 0){ + c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; +#ifdef HAVE_7REGS + c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; +#endif + c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; + } + else{ + if(mm_flags & FF_MM_MMXEXT){ + c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; +#ifdef HAVE_7REGS + c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; +#endif + } + c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; + } +#endif + + if(mm_flags & FF_MM_3DNOW){ + c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; + c->vector_fmul = vector_fmul_3dnow; + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->float_to_int16 = float_to_int16_3dnow; + c->float_to_int16_interleave = float_to_int16_interleave_3dnow; + } + } + if(mm_flags & FF_MM_3DNOWEXT){ + c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; + c->vector_fmul_window = vector_fmul_window_3dnow2; + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->float_to_int16_interleave = float_to_int16_interleave_3dn2; + } + } + if(mm_flags & FF_MM_SSE){ + c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; + c->ac3_downmix = ac3_downmix_sse; + c->vector_fmul = vector_fmul_sse; + c->vector_fmul_reverse = vector_fmul_reverse_sse; + c->vector_fmul_add_add = vector_fmul_add_add_sse; + c->vector_fmul_window = vector_fmul_window_sse; + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; + c->float_to_int16 = float_to_int16_sse; + c->float_to_int16_interleave = float_to_int16_interleave_sse; + } + if(mm_flags & FF_MM_3DNOW) + c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse + if(mm_flags & FF_MM_SSE2){ + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; + c->float_to_int16 = float_to_int16_sse2; + c->float_to_int16_interleave = float_to_int16_interleave_sse2; + c->add_int16 = add_int16_sse2; + c->sub_int16 = sub_int16_sse2; + c->scalarproduct_int16 = scalarproduct_int16_sse2; + } + } + + if (ENABLE_ENCODERS) + dsputilenc_init_mmx(c, avctx); + +#if 0 + // for speed testing + get_pixels = just_return; + put_pixels_clamped = just_return; + add_pixels_clamped = just_return; + + pix_abs16x16 = just_return; + pix_abs16x16_x2 = just_return; + pix_abs16x16_y2 = just_return; + pix_abs16x16_xy2 = just_return; + + put_pixels_tab[0] = just_return; + put_pixels_tab[1] = just_return; + put_pixels_tab[2] = just_return; + put_pixels_tab[3] = just_return; + + put_no_rnd_pixels_tab[0] = just_return; + put_no_rnd_pixels_tab[1] = just_return; + put_no_rnd_pixels_tab[2] = just_return; + put_no_rnd_pixels_tab[3] = just_return; + + avg_pixels_tab[0] = just_return; + avg_pixels_tab[1] = just_return; + avg_pixels_tab[2] = just_return; + avg_pixels_tab[3] = just_return; + + avg_no_rnd_pixels_tab[0] = just_return; + avg_no_rnd_pixels_tab[1] = just_return; + avg_no_rnd_pixels_tab[2] = just_return; + avg_no_rnd_pixels_tab[3] = just_return; + + //av_fdct = just_return; + //ff_idct = just_return; +#endif +}
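
The dsputil_init_mmx() hunk above is, at its core, a runtime-dispatch table fill: probe the CPU once, let avctx->dsp_mask force or clear feature bits, then overwrite the portable C function pointers with the best MMX/MMX2/3DNow!/SSE2 variants, later checks overriding earlier ones. A stripped-down, self-contained sketch of that pattern follows; every name in it is hypothetical, not lavc API.

#define MY_MM_MMXEXT  0x0002
#define MY_MM_SSE2    0x0010
#define MY_MM_FORCE   0x80000000u

typedef struct {
    void (*idct_add)(unsigned char *dest, int line_size, short *block);
} MyDSPContext;

static void idct_add_c(unsigned char *d, int ls, short *b)    { /* portable fallback */ }
static void idct_add_mmx2(unsigned char *d, int ls, short *b) { /* MMX2 variant      */ }
static void idct_add_sse2(unsigned char *d, int ls, short *b) { /* SSE2 variant      */ }

static void my_dsp_init(MyDSPContext *c, unsigned cpu_flags, unsigned dsp_mask)
{
    if (dsp_mask & MY_MM_FORCE)          /* force-enable the requested bits, */
        cpu_flags |= dsp_mask & 0xffff;  /* mirroring the FF_MM_FORCE logic  */
    else                                 /* otherwise mask them out          */
        cpu_flags &= ~(dsp_mask & 0xffff);

    c->idct_add = idct_add_c;            /* always set a safe default first  */
    if (cpu_flags & MY_MM_MMXEXT)
        c->idct_add = idct_add_mmx2;
    if (cpu_flags & MY_MM_SSE2)          /* later checks override earlier ones */
        c->idct_add = idct_add_sse2;
}
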
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputil_mmx.h Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,154 @@ +/* + * MMX optimized DSP utils + * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_DSPUTIL_MMX_H +#define AVCODEC_X86_DSPUTIL_MMX_H + +#include <stdint.h> +#include "libavcodec/dsputil.h" + +typedef struct { uint64_t a, b; } xmm_reg; + +extern const uint64_t ff_bone; +extern const uint64_t ff_wtwo; + +extern const uint64_t ff_pdw_80000000[2]; + +extern const uint64_t ff_pw_3; +extern const uint64_t ff_pw_4; +extern const xmm_reg ff_pw_5; +extern const xmm_reg ff_pw_8; +extern const uint64_t ff_pw_15; +extern const xmm_reg ff_pw_16; +extern const uint64_t ff_pw_20; +extern const xmm_reg ff_pw_28; +extern const xmm_reg ff_pw_32; +extern const uint64_t ff_pw_42; +extern const uint64_t ff_pw_64; +extern const uint64_t ff_pw_96; +extern const uint64_t ff_pw_128; +extern const uint64_t ff_pw_255; + +extern const uint64_t ff_pb_1; +extern const uint64_t ff_pb_3; +extern const uint64_t ff_pb_7; +extern const uint64_t ff_pb_1F; +extern const uint64_t ff_pb_3F; +extern const uint64_t ff_pb_81; +extern const uint64_t ff_pb_A1; +extern const uint64_t ff_pb_FC; + +extern const double ff_pd_1[2]; +extern const double ff_pd_2[2]; + +#define LOAD4(stride,in,a,b,c,d)\ + "movq 0*"#stride"+"#in", "#a"\n\t"\ + "movq 1*"#stride"+"#in", "#b"\n\t"\ + "movq 2*"#stride"+"#in", "#c"\n\t"\ + "movq 3*"#stride"+"#in", "#d"\n\t" + +#define STORE4(stride,out,a,b,c,d)\ + "movq "#a", 0*"#stride"+"#out"\n\t"\ + "movq "#b", 1*"#stride"+"#out"\n\t"\ + "movq "#c", 2*"#stride"+"#out"\n\t"\ + "movq "#d", 3*"#stride"+"#out"\n\t" + +/* in/out: mma=mma+mmb, mmb=mmb-mma */ +#define SUMSUB_BA( a, b ) \ + "paddw "#b", "#a" \n\t"\ + "paddw "#b", "#b" \n\t"\ + "psubw "#a", "#b" \n\t" + +#define SBUTTERFLY(a,b,t,n,m)\ + "mov" #m " " #a ", " #t " \n\t" /* abcd */\ + "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ + "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ + +#define TRANSPOSE4(a,b,c,d,t)\ + SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ + SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ + SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ + SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ + +// e,f,g,h can be memory +// out: a,d,t,c +#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\ + "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\ + "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\ + "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\ + "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\ + SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\ + /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\ + SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\ + /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\ + SBUTTERFLY(a, c, 
d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\ + /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\ + SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\ + /* c= a3 b3 c3 d3 e3 f3 g3 h3 */ + +#ifdef ARCH_X86_64 +// permutes 01234567 -> 05736421 +#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ + SBUTTERFLY(a,b,%%xmm8,wd,dqa)\ + SBUTTERFLY(c,d,b,wd,dqa)\ + SBUTTERFLY(e,f,d,wd,dqa)\ + SBUTTERFLY(g,h,f,wd,dqa)\ + SBUTTERFLY(a,c,h,dq,dqa)\ + SBUTTERFLY(%%xmm8,b,c,dq,dqa)\ + SBUTTERFLY(e,g,b,dq,dqa)\ + SBUTTERFLY(d,f,g,dq,dqa)\ + SBUTTERFLY(a,e,f,qdq,dqa)\ + SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\ + SBUTTERFLY(h,b,d,qdq,dqa)\ + SBUTTERFLY(c,g,b,qdq,dqa)\ + "movdqa %%xmm8, "#g" \n\t" +#else +#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ + "movdqa "#h", "#t" \n\t"\ + SBUTTERFLY(a,b,h,wd,dqa)\ + "movdqa "#h", 16"#t" \n\t"\ + "movdqa "#t", "#h" \n\t"\ + SBUTTERFLY(c,d,b,wd,dqa)\ + SBUTTERFLY(e,f,d,wd,dqa)\ + SBUTTERFLY(g,h,f,wd,dqa)\ + SBUTTERFLY(a,c,h,dq,dqa)\ + "movdqa "#h", "#t" \n\t"\ + "movdqa 16"#t", "#h" \n\t"\ + SBUTTERFLY(h,b,c,dq,dqa)\ + SBUTTERFLY(e,g,b,dq,dqa)\ + SBUTTERFLY(d,f,g,dq,dqa)\ + SBUTTERFLY(a,e,f,qdq,dqa)\ + SBUTTERFLY(h,d,e,qdq,dqa)\ + "movdqa "#h", 16"#t" \n\t"\ + "movdqa "#t", "#h" \n\t"\ + SBUTTERFLY(h,b,d,qdq,dqa)\ + SBUTTERFLY(c,g,b,qdq,dqa)\ + "movdqa 16"#t", "#g" \n\t" +#endif + +#define MOVQ_WONE(regd) \ + __asm__ volatile ( \ + "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ + "psrlw $15, %%" #regd ::) + +void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx); + +#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
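
The header above also defines the small asm building blocks (LOAD4/STORE4, SUMSUB_BA, the SBUTTERFLY/TRANSPOSE family) shared by the x86 DSP code. For orientation, SUMSUB_BA is an in-place sum/difference butterfly done with two packed adds and one subtract so no scratch register is needed. A minimal scalar sketch of one 16-bit lane follows (illustrative only, not part of the changeset; the function and variable names are made up for the example):

    #include <stdint.h>
    #include <stdio.h>

    /* One 16-bit lane of SUMSUB_BA, mirroring the three MMX instructions
     * (paddw; paddw; psubw): on exit a holds the sum and b the difference
     * of the original inputs. */
    static void sumsub_ba_lane(int16_t *a, int16_t *b)
    {
        *a = (int16_t)(*a + *b);   /* a = a0 + b0                        */
        *b = (int16_t)(*b + *b);   /* b = 2*b0                           */
        *b = (int16_t)(*b - *a);   /* b = 2*b0 - (a0 + b0) = b0 - a0     */
    }

    int main(void)
    {
        int16_t a = 7, b = 3;
        sumsub_ba_lane(&a, &b);
        printf("sum=%d diff=%d\n", a, b);   /* prints sum=10 diff=-4 */
        return 0;
    }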
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputil_mmx_avg_template.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,896 @@ +/* + * DSP utils : average functions are compiled twice for 3dnow/mmx2 + * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2002-2004 Michael Niedermayer + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> + * and improved by Zdenek Kabelac <kabi@users.sf.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm + clobber bug - now it will work with 2.95.2 and also with -fPIC + */ +static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + "add %%"REG_a", %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $4, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "movd (%2), %%mm2 \n\t" + "movd 4(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movd %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movd (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "movd 8(%2), %%mm2 \n\t" + "movd 12(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movd %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $16, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +} + + +static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + 
__asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... +/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "pcmpeqb %%mm6, %%mm6 \n\t" + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq (%2), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq 16(%2), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... 
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movd (%1), %%mm0 \n\t" + "movd (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $4, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movd (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 4(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movd %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movd (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movd (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 8(%2), %%mm0 \n\t" + PAVGB" 12(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movd %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movd %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $16, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +} + + +static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + "movq %%mm0, (%3) \n\t" + "add %5, %3 \n\t" + PAVGB" (%3), %%mm1 \n\t" + "movq %%mm1, (%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... 
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%1, %3), %%mm3 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 9(%1), %%mm2 \n\t" + PAVGB" 9(%1, %3), %%mm3 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm2, 8(%2) \n\t" + "movq %%mm3, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%1, %3), %%mm3 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 9(%1), %%mm2 \n\t" + PAVGB" 9(%1, %3), %%mm3 \n\t" + "add %%"REG_a", %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm2, 8(%2) \n\t" + "movq %%mm3, 8(%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... 
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" + PAVGB" (%3), %%mm0 \n\t" + PAVGB" 8(%3), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + PAVGB" 8(%3), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + PAVGB" (%3), %%mm0 \n\t" + PAVGB" 8(%3), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... +/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm__ volatile( + "pcmpeqb %%mm6, %%mm6 \n\t" + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "movq (%2), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq (%2), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq 16(%2), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "pxor %%mm6, %%mm2 \n\t" + "pxor %%mm6, %%mm3 \n\t" + PAVGB" %%mm2, %%mm0 \n\t" + PAVGB" %%mm3, %%mm1 \n\t" + "pxor %%mm6, %%mm0 \n\t" + "pxor %%mm6, %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +//the following should be used, though better not with gcc ... 
+/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory");*/ +} + +/* GL: this function does incorrect rounding if overflow */ +static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BONE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + "add %%"REG_a", %1 \n\t" + "psubusb %%mm6, %%mm0 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + "psubusb %%mm6, %%mm0 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + "sub %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D" (block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +/* GL: this function does incorrect rounding if overflow */ +static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BONE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + "sub %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + "psubusb %%mm6, %%mm1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + "psubusb %%mm6, %%mm1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D" (block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%2), %%mm0 \n\t" + "movq (%2, %3), %%mm1 \n\t" + PAVGB" (%1), %%mm0 \n\t" + PAVGB" (%1, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%2), %%mm0 \n\t" + "movq (%2, %3), %%mm1 \n\t" + PAVGB" (%1), %%mm0 \n\t" + PAVGB" (%1, %3), %%mm1 \n\t" + "add %%"REG_a", %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add 
%%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm2 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" (%2, %3), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm2 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm2 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" (%2, %3), %%mm2 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + "sub %3, %2 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + "movq (%2, %3), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + PAVGB" %%mm3, %%mm0 \n\t" + PAVGB" %%mm4, %%mm1 \n\t" + "movq %%mm0, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + "movq (%2, %3), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + PAVGB" %%mm3, %%mm2 \n\t" + PAVGB" %%mm4, %%mm1 \n\t" + "movq %%mm2, (%2, %3) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +/* Note this is not correctly rounded, but this function is only + * used for B-frames so it does not matter. 
*/ +static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BONE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "psubusb %%mm6, %%mm2 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm0 \n\t" + PAVGB" %%mm2, %%mm1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" (%2, %3), %%mm1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" + PAVGB" %%mm1, %%mm2 \n\t" + PAVGB" %%mm0, %%mm1 \n\t" + PAVGB" (%2), %%mm2 \n\t" + PAVGB" (%2, %3), %%mm1 \n\t" + "movq %%mm2, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" ((x86_reg)line_size) + :"%"REG_a, "memory"); +} + +static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + do { + __asm__ volatile( + "movd (%1), %%mm0 \n\t" + "movd (%1, %2), %%mm1 \n\t" + "movd (%1, %2, 2), %%mm2 \n\t" + "movd (%1, %3), %%mm3 \n\t" + PAVGB" (%0), %%mm0 \n\t" + PAVGB" (%0, %2), %%mm1 \n\t" + PAVGB" (%0, %2, 2), %%mm2 \n\t" + PAVGB" (%0, %3), %%mm3 \n\t" + "movd %%mm0, (%1) \n\t" + "movd %%mm1, (%1, %2) \n\t" + "movd %%mm2, (%1, %2, 2) \n\t" + "movd %%mm3, (%1, %3) \n\t" + ::"S"(pixels), "D"(block), + "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size) + :"memory"); + block += 4*line_size; + pixels += 4*line_size; + h -= 4; + } while(h > 0); +} + +//FIXME the following could be optimized too ... 
+static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); + DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); +} +static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put_pixels8_y2)(block , pixels , line_size, h); + DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); + DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg_pixels8)(block , pixels , line_size, h); + DEF(avg_pixels8)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg_pixels8_x2)(block , pixels , line_size, h); + DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg_pixels8_y2)(block , pixels , line_size, h); + DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg_pixels8_xy2)(block , pixels , line_size, h); + DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); +} + +#define QPEL_2TAP_L3(OPNAME) \ +static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%1,%2), %%mm0 \n\t"\ + "movq 8(%1,%2), %%mm1 \n\t"\ + PAVGB" (%1,%3), %%mm0 \n\t"\ + PAVGB" 8(%1,%3), %%mm1 \n\t"\ + PAVGB" (%1), %%mm0 \n\t"\ + PAVGB" 8(%1), %%mm1 \n\t"\ + STORE_OP( (%1,%4),%%mm0)\ + STORE_OP(8(%1,%4),%%mm1)\ + "movq %%mm0, (%1,%4) \n\t"\ + "movq %%mm1, 8(%1,%4) \n\t"\ + "add %5, %1 \n\t"\ + "decl %0 \n\t"\ + "jnz 1b \n\t"\ + :"+g"(h), "+r"(src)\ + :"r"((x86_reg)off1), "r"((x86_reg)off2),\ + "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ + :"memory"\ + );\ +}\ +static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%1,%2), %%mm0 \n\t"\ + PAVGB" (%1,%3), %%mm0 \n\t"\ + PAVGB" (%1), %%mm0 \n\t"\ + STORE_OP((%1,%4),%%mm0)\ + "movq %%mm0, (%1,%4) \n\t"\ + "add %5, %1 \n\t"\ + "decl %0 \n\t"\ + "jnz 1b \n\t"\ + :"+g"(h), "+r"(src)\ + :"r"((x86_reg)off1), "r"((x86_reg)off2),\ + "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\ + :"memory"\ + );\ +} + +#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" +QPEL_2TAP_L3(avg_) +#undef STORE_OP +#define STORE_OP(a,b) +QPEL_2TAP_L3(put_) +#undef STORE_OP +#undef QPEL_2TAP_L3
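
The template above is built once per CPU flavour, with DEF() adding the suffix and PAVGB mapping to that flavour's rounded byte-average instruction. Assuming PAVGB(a, b) == (a + b + 1) >> 1, the put_pixels8_x2 kernel is plain horizontal half-pel interpolation; a scalar model is sketched below (illustrative only; the _scalar name is not part of the changeset). The no_rnd variants bias the inputs first, either by subtracting a byte of 1 with unsigned saturation (hence the "incorrect rounding if overflow" comments) or by averaging the complements via the pxor pairs, so the same PAVGB effectively rounds down instead of up.

    #include <stdint.h>

    /* Scalar model of the put_pixels8_x2 kernel, assuming
     * PAVGB(a, b) == (a + b + 1) >> 1 (rounded byte average, as pavgb /
     * pavgusb provide): every output byte is the average of a source byte
     * and its right-hand neighbour, i.e. horizontal half-pel interpolation. */
    static void put_pixels8_x2_scalar(uint8_t *block, const uint8_t *pixels,
                                      int line_size, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = (uint8_t)((pixels[x] + pixels[x + 1] + 1) >> 1);
            block  += line_size;
            pixels += line_size;
        }
    }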
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputil_mmx_qns_template.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,101 @@ +/* + * DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3 + * Copyright (c) 2004 Michael Niedermayer + * + * MMX optimization by Michael Niedermayer <michaelni@gmx.at> + * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0)) + +static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale) +{ + x86_reg i=0; + + assert(FFABS(scale) < MAX_ABS); + scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; + + SET_RND(mm6); + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "movd %4, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq 8(%1, %0), %%mm1 \n\t" + PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) + "paddw (%2, %0), %%mm0 \n\t" + "paddw 8(%2, %0), %%mm1 \n\t" + "psraw $6, %%mm0 \n\t" + "psraw $6, %%mm1 \n\t" + "pmullw (%3, %0), %%mm0 \n\t" + "pmullw 8(%3, %0), %%mm1 \n\t" + "pmaddwd %%mm0, %%mm0 \n\t" + "pmaddwd %%mm1, %%mm1 \n\t" + "paddd %%mm1, %%mm0 \n\t" + "psrld $4, %%mm0 \n\t" + "paddd %%mm0, %%mm7 \n\t" + "add $16, %0 \n\t" + "cmp $128, %0 \n\t" //FIXME optimize & bench + " jb 1b \n\t" + PHADDD(%%mm7, %%mm6) + "psrld $2, %%mm7 \n\t" + "movd %%mm7, %0 \n\t" + + : "+r" (i) + : "r"(basis), "r"(rem), "r"(weight), "g"(scale) + ); + return i; +} + +static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale) +{ + x86_reg i=0; + + if(FFABS(scale) < MAX_ABS){ + scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; + SET_RND(mm6); + __asm__ volatile( + "movd %3, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + "punpcklwd %%mm5, %%mm5 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%1, %0), %%mm0 \n\t" + "movq 8(%1, %0), %%mm1 \n\t" + PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) + "paddw (%2, %0), %%mm0 \n\t" + "paddw 8(%2, %0), %%mm1 \n\t" + "movq %%mm0, (%2, %0) \n\t" + "movq %%mm1, 8(%2, %0) \n\t" + "add $16, %0 \n\t" + "cmp $128, %0 \n\t" // FIXME optimize & bench + " jb 1b \n\t" + + : "+r" (i) + : "r"(basis), "r"(rem), "g"(scale) + ); + }else{ + for(i=0; i<8*8; i++){ + rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); + } + } +}
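
This QNS template is likewise compiled once per instantiation, with PMULHRW and SET_RND supplied by the including file. Per signed 16-bit lane, the PMULHRW step is expected to behave like the 3DNow! pmulhrw instruction, i.e. the rounded high half of a 16x16 multiply; the MMX and SSSE3 instantiations reach a comparable result with slightly different shifts, which appears to be what the SCALE_OFFSET term compensates for. A scalar sketch of that per-lane operation (illustrative only; the function name is made up):

    #include <stdint.h>

    /* Expected per-lane behaviour of the PMULHRW step: rounded high half
     * of a signed 16x16 -> 32 multiply, as 3DNow! pmulhrw defines it. */
    static int16_t pmulhrw_lane(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + 0x8000) >> 16);
    }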
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputil_mmx_rnd_template.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,590 @@ +/* + * DSP utils mmx functions are compiled twice for rnd/no_rnd + * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> + * and improved by Zdenek Kabelac <kabi@users.sf.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +// put_pixels +static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :REG_a, "memory"); +} + +static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" + PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) + "movq %%mm4, (%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm5, (%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 16(%2), %%mm1 \n\t" + "add %4, %1 \n\t" + "movq (%1), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + "add $32, %2 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "add %5, %3 \n\t" + "movq %%mm5, (%3) \n\t" + "add %5, %3 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +} + +static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t 
*pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :REG_a, "memory"); +} + +static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "testl $1, %0 \n\t" + " jz 1f \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "add %5, %3 \n\t" + "decl %0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "add %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 16(%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "add %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride) + :"memory"); +} + +static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"),%%mm2 \n\t" + PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"),%%mm0 \n\t" + PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :REG_a, "memory"); +} + +static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 
for rnd and =1 for no_rnd version + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" + "add %3, %1 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddusw %%mm2, %%mm0 \n\t" + "paddusw %%mm3, %%mm1 \n\t" + "paddusw %%mm6, %%mm4 \n\t" + "paddusw %%mm6, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm5 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "movq %%mm4, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" + + "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%"REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm3, %%mm5 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm6, %%mm1 \n\t" + "paddusw %%mm4, %%mm0 \n\t" + "paddusw %%mm5, %%mm1 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "movq %%mm0, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" + + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels) + :"D"(block), "r"((x86_reg)line_size) + :REG_a, "memory"); +} + +// avg_pixels +static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movd %%mm2, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +// in case more speed is needed - unroling would certainly help +static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, %0 \n\t" + "movq 8%0, %%mm0 \n\t" + "movq 8%1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, 8%0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %1, %%mm0 \n\t" + "movq 1%1, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } while (--h); +} + +static av_unused void DEF(avg, 
pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %1, %%mm0 \n\t" + "movq %2, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + :"+m"(*dst) + :"m"(*src1), "m"(*src2) + :"memory"); + dst += dstStride; + src1 += src1Stride; + src2 += 8; + } while (--h); +} + +static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %1, %%mm0 \n\t" + "movq 1%1, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + "movq 8%1, %%mm0 \n\t" + "movq 9%1, %%mm1 \n\t" + "movq 8%0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, 8%0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } while (--h); +} + +static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm__ volatile( + "movq %1, %%mm0 \n\t" + "movq %2, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + "movq 8%1, %%mm0 \n\t" + "movq 8%2, %%mm1 \n\t" + "movq 8%0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, 8%0 \n\t" + :"+m"(*dst) + :"m"(*src1), "m"(*src2) + :"memory"); + dst += dstStride; + src1 += src1Stride; + src2 += 16; + } while (--h); +} + +static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm2, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" + + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :REG_a, "memory"); +} + +// this routine is 'slightly' suboptimal but mostly unused +static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" + "add %3, %1 \n\t" + ASMALIGN(3) + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + 
"punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddusw %%mm2, %%mm0 \n\t" + "paddusw %%mm3, %%mm1 \n\t" + "paddusw %%mm6, %%mm4 \n\t" + "paddusw %%mm6, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm5 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) + "movq %%mm5, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" + + "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%"REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm3, %%mm5 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm6, %%mm1 \n\t" + "paddusw %%mm4, %%mm0 \n\t" + "paddusw %%mm5, %%mm1 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" + + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels) + :"D"(block), "r"((x86_reg)line_size) + :REG_a, "memory"); +} + +//FIXME optimize +static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put, pixels8_y2)(block , pixels , line_size, h); + DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); +} + +static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(put, pixels8_xy2)(block , pixels , line_size, h); + DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); +} + +static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg, pixels8_y2)(block , pixels , line_size, h); + DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); +} + +static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ + DEF(avg, pixels8_xy2)(block , pixels , line_size, h); + DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputil_yasm.asm Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,92 @@ +;****************************************************************************** +;* MMX optimized DSP utils +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +section .text align=16 + +%macro PSWAPD_SSE 2 + pshufw %1, %2, 0x4e +%endmacro +%macro PSWAPD_3DN1 2 + movq %1, %2 + psrlq %1, 32 + punpckldq %1, %2 +%endmacro + +%macro FLOAT_TO_INT16_INTERLEAVE6 1 +; void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) +cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5 +%ifdef ARCH_X86_64 + %define lend r10d + mov lend, r2d +%else + %define lend dword r2m +%endif + mov src1q, [srcq+1*gprsize] + mov src2q, [srcq+2*gprsize] + mov src3q, [srcq+3*gprsize] + mov src4q, [srcq+4*gprsize] + mov src5q, [srcq+5*gprsize] + mov srcq, [srcq] + sub src1q, srcq + sub src2q, srcq + sub src3q, srcq + sub src4q, srcq + sub src5q, srcq +.loop: + cvtps2pi mm0, [srcq] + cvtps2pi mm1, [srcq+src1q] + cvtps2pi mm2, [srcq+src2q] + cvtps2pi mm3, [srcq+src3q] + cvtps2pi mm4, [srcq+src4q] + cvtps2pi mm5, [srcq+src5q] + packssdw mm0, mm3 + packssdw mm1, mm4 + packssdw mm2, mm5 + pswapd mm3, mm0 + punpcklwd mm0, mm1 + punpckhwd mm1, mm2 + punpcklwd mm2, mm3 + pswapd mm3, mm0 + punpckldq mm0, mm2 + punpckhdq mm2, mm1 + punpckldq mm1, mm3 + movq [dstq ], mm0 + movq [dstq+16], mm2 + movq [dstq+ 8], mm1 + add srcq, 8 + add dstq, 24 + sub lend, 2 + jg .loop + emms + RET +%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 + +%define pswapd PSWAPD_SSE +FLOAT_TO_INT16_INTERLEAVE6 sse +%define cvtps2pi pf2id +%define pswapd PSWAPD_3DN1 +FLOAT_TO_INT16_INTERLEAVE6 3dnow +%undef pswapd +FLOAT_TO_INT16_INTERLEAVE6 3dn2 +%undef cvtps2pi +
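
The yasm file above implements float_to_int16_interleave6 for SSE, 3DNow! and 3DNow!Ext. A scalar model of what it is expected to produce is sketched below (illustrative only; the _scalar name is made up): one sample from each of six planar float channels is converted and written interleaved. Conversion is round-to-nearest in the SSE build (cvtps2pi under the default rounding mode) and truncating in the 3DNow! builds (pf2id), and packssdw saturates to the int16 range.

    #include <math.h>
    #include <stdint.h>

    /* Scalar model of float_to_int16_interleave6: convert len samples from
     * each of six planar float channels and write them interleaved as
     * saturated signed 16-bit values. */
    static void float_to_int16_interleave6_scalar(int16_t *dst,
                                                  const float **src, int len)
    {
        for (int i = 0; i < len; i++)
            for (int ch = 0; ch < 6; ch++) {
                long v = lrintf(src[ch][i]);   /* round to nearest (SSE path) */
                if (v >  32767) v =  32767;    /* packssdw saturation */
                if (v < -32768) v = -32768;
                dst[6 * i + ch] = (int16_t)v;
            }
    }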
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/dsputilenc_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,1441 @@ +/* + * MMX optimized DSP utils + * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegvideo.h" +#include "dsputil_mmx.h" + + +static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) +{ + __asm__ volatile( + "mov $-128, %%"REG_a" \n\t" + "pxor %%mm7, %%mm7 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0), %%mm0 \n\t" + "movq (%0, %2), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "movq %%mm0, (%1, %%"REG_a") \n\t" + "movq %%mm1, 8(%1, %%"REG_a") \n\t" + "movq %%mm2, 16(%1, %%"REG_a") \n\t" + "movq %%mm3, 24(%1, %%"REG_a") \n\t" + "add %3, %0 \n\t" + "add $32, %%"REG_a" \n\t" + "js 1b \n\t" + : "+r" (pixels) + : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2) + : "%"REG_a + ); +} + +static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size) +{ + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "movq (%0), %%xmm0 \n\t" + "movq (%0, %2), %%xmm1 \n\t" + "movq (%0, %2,2), %%xmm2 \n\t" + "movq (%0, %3), %%xmm3 \n\t" + "lea (%0,%2,4), %0 \n\t" + "punpcklbw %%xmm7, %%xmm0 \n\t" + "punpcklbw %%xmm7, %%xmm1 \n\t" + "punpcklbw %%xmm7, %%xmm2 \n\t" + "punpcklbw %%xmm7, %%xmm3 \n\t" + "movdqa %%xmm0, (%1) \n\t" + "movdqa %%xmm1, 16(%1) \n\t" + "movdqa %%xmm2, 32(%1) \n\t" + "movdqa %%xmm3, 48(%1) \n\t" + "movq (%0), %%xmm0 \n\t" + "movq (%0, %2), %%xmm1 \n\t" + "movq (%0, %2,2), %%xmm2 \n\t" + "movq (%0, %3), %%xmm3 \n\t" + "punpcklbw %%xmm7, %%xmm0 \n\t" + "punpcklbw %%xmm7, %%xmm1 \n\t" + "punpcklbw %%xmm7, %%xmm2 \n\t" + "punpcklbw %%xmm7, %%xmm3 \n\t" + "movdqa %%xmm0, 64(%1) \n\t" + "movdqa %%xmm1, 80(%1) \n\t" + "movdqa %%xmm2, 96(%1) \n\t" + "movdqa %%xmm3, 112(%1) \n\t" + : "+r" (pixels) + : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3) + ); +} + +static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) +{ + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "mov $-128, %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0), %%mm0 \n\t" + "movq (%1), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "movq %%mm0, (%2, %%"REG_a") \n\t" + 
"movq %%mm1, 8(%2, %%"REG_a") \n\t" + "add %3, %0 \n\t" + "add %3, %1 \n\t" + "add $16, %%"REG_a" \n\t" + "jnz 1b \n\t" + : "+r" (s1), "+r" (s2) + : "r" (block+64), "r" ((x86_reg)stride) + : "%"REG_a + ); +} + +static int pix_sum16_mmx(uint8_t * pix, int line_size){ + const int h=16; + int sum; + x86_reg index= -line_size*h; + + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" + "1: \n\t" + "movq (%2, %1), %%mm0 \n\t" + "movq (%2, %1), %%mm1 \n\t" + "movq 8(%2, %1), %%mm2 \n\t" + "movq 8(%2, %1), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "paddw %%mm2, %%mm3 \n\t" + "paddw %%mm1, %%mm3 \n\t" + "paddw %%mm3, %%mm6 \n\t" + "add %3, %1 \n\t" + " js 1b \n\t" + "movq %%mm6, %%mm5 \n\t" + "psrlq $32, %%mm6 \n\t" + "paddw %%mm5, %%mm6 \n\t" + "movq %%mm6, %%mm5 \n\t" + "psrlq $16, %%mm6 \n\t" + "paddw %%mm5, %%mm6 \n\t" + "movd %%mm6, %0 \n\t" + "andl $0xFFFF, %0 \n\t" + : "=&r" (sum), "+r" (index) + : "r" (pix - index), "r" ((x86_reg)line_size) + ); + + return sum; +} + +static int pix_norm1_mmx(uint8_t *pix, int line_size) { + int tmp; + __asm__ volatile ( + "movl $16,%%ecx\n" + "pxor %%mm0,%%mm0\n" + "pxor %%mm7,%%mm7\n" + "1:\n" + "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */ + "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */ + + "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */ + + "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */ + "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */ + + "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */ + "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */ + "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */ + + "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ + "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ + + "pmaddwd %%mm3,%%mm3\n" + "pmaddwd %%mm4,%%mm4\n" + + "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, + pix2^2+pix3^2+pix6^2+pix7^2) */ + "paddd %%mm3,%%mm4\n" + "paddd %%mm2,%%mm7\n" + + "add %2, %0\n" + "paddd %%mm4,%%mm7\n" + "dec %%ecx\n" + "jnz 1b\n" + + "movq %%mm7,%%mm1\n" + "psrlq $32, %%mm7\n" /* shift hi dword to lo */ + "paddd %%mm7,%%mm1\n" + "movd %%mm1,%1\n" + : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); + return tmp; +} + +static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + int tmp; + __asm__ volatile ( + "movl %4,%%ecx\n" + "shr $1,%%ecx\n" + "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ + "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ + "1:\n" + "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ + "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ + "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ + "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ + + /* todo: mm1-mm2, mm3-mm4 */ + /* algo: subtract mm1 from mm2 with saturation and vice versa */ + /* OR the results to get absolute difference */ + "movq %%mm1,%%mm5\n" + "movq %%mm3,%%mm6\n" + "psubusb %%mm2,%%mm1\n" + "psubusb %%mm4,%%mm3\n" + "psubusb %%mm5,%%mm2\n" + "psubusb %%mm6,%%mm4\n" + + "por %%mm1,%%mm2\n" + "por %%mm3,%%mm4\n" + + /* now convert to 16-bit vectors so we can square them */ + "movq %%mm2,%%mm1\n" + "movq %%mm4,%%mm3\n" + + "punpckhbw %%mm0,%%mm2\n" + "punpckhbw %%mm0,%%mm4\n" + "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ + "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ + + "pmaddwd %%mm2,%%mm2\n" + "pmaddwd %%mm4,%%mm4\n" + "pmaddwd %%mm1,%%mm1\n" + "pmaddwd %%mm3,%%mm3\n" + + "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ + "lea 
(%1,%3,2), %1\n" /* pix2 += 2*line_size */ + + "paddd %%mm2,%%mm1\n" + "paddd %%mm4,%%mm3\n" + "paddd %%mm1,%%mm7\n" + "paddd %%mm3,%%mm7\n" + + "decl %%ecx\n" + "jnz 1b\n" + + "movq %%mm7,%%mm1\n" + "psrlq $32, %%mm7\n" /* shift hi dword to lo */ + "paddd %%mm7,%%mm1\n" + "movd %%mm1,%2\n" + : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "r" ((x86_reg)line_size) , "m" (h) + : "%ecx"); + return tmp; +} + +static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + int tmp; + __asm__ volatile ( + "movl %4,%%ecx\n" + "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ + "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ + "1:\n" + "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ + "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ + "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ + "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ + + /* todo: mm1-mm2, mm3-mm4 */ + /* algo: subtract mm1 from mm2 with saturation and vice versa */ + /* OR the results to get absolute difference */ + "movq %%mm1,%%mm5\n" + "movq %%mm3,%%mm6\n" + "psubusb %%mm2,%%mm1\n" + "psubusb %%mm4,%%mm3\n" + "psubusb %%mm5,%%mm2\n" + "psubusb %%mm6,%%mm4\n" + + "por %%mm1,%%mm2\n" + "por %%mm3,%%mm4\n" + + /* now convert to 16-bit vectors so we can square them */ + "movq %%mm2,%%mm1\n" + "movq %%mm4,%%mm3\n" + + "punpckhbw %%mm0,%%mm2\n" + "punpckhbw %%mm0,%%mm4\n" + "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ + "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ + + "pmaddwd %%mm2,%%mm2\n" + "pmaddwd %%mm4,%%mm4\n" + "pmaddwd %%mm1,%%mm1\n" + "pmaddwd %%mm3,%%mm3\n" + + "add %3,%0\n" + "add %3,%1\n" + + "paddd %%mm2,%%mm1\n" + "paddd %%mm4,%%mm3\n" + "paddd %%mm1,%%mm7\n" + "paddd %%mm3,%%mm7\n" + + "decl %%ecx\n" + "jnz 1b\n" + + "movq %%mm7,%%mm1\n" + "psrlq $32, %%mm7\n" /* shift hi dword to lo */ + "paddd %%mm7,%%mm1\n" + "movd %%mm1,%2\n" + : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "r" ((x86_reg)line_size) , "m" (h) + : "%ecx"); + return tmp; +} + +static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + int tmp; + __asm__ volatile ( + "shr $1,%2\n" + "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ + "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ + "1:\n" + "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */ + "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */ + "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */ + "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */ + + /* todo: mm1-mm2, mm3-mm4 */ + /* algo: subtract mm1 from mm2 with saturation and vice versa */ + /* OR the results to get absolute difference */ + "movdqa %%xmm1,%%xmm5\n" + "movdqa %%xmm3,%%xmm6\n" + "psubusb %%xmm2,%%xmm1\n" + "psubusb %%xmm4,%%xmm3\n" + "psubusb %%xmm5,%%xmm2\n" + "psubusb %%xmm6,%%xmm4\n" + + "por %%xmm1,%%xmm2\n" + "por %%xmm3,%%xmm4\n" + + /* now convert to 16-bit vectors so we can square them */ + "movdqa %%xmm2,%%xmm1\n" + "movdqa %%xmm4,%%xmm3\n" + + "punpckhbw %%xmm0,%%xmm2\n" + "punpckhbw %%xmm0,%%xmm4\n" + "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ + "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ + + "pmaddwd %%xmm2,%%xmm2\n" + "pmaddwd %%xmm4,%%xmm4\n" + "pmaddwd %%xmm1,%%xmm1\n" + "pmaddwd %%xmm3,%%xmm3\n" + + "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ + "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ + + "paddd %%xmm2,%%xmm1\n" + "paddd %%xmm4,%%xmm3\n" + "paddd %%xmm1,%%xmm7\n" + "paddd %%xmm3,%%xmm7\n" + + "decl %2\n" + "jnz 1b\n" + + "movdqa %%xmm7,%%xmm1\n" + "psrldq $8, %%xmm7\n" /* shift hi qword to lo */ + "paddd %%xmm1,%%xmm7\n" + "movdqa 
%%xmm7,%%xmm1\n" + "psrldq $4, %%xmm7\n" /* shift hi dword to lo */ + "paddd %%xmm1,%%xmm7\n" + "movd %%xmm7,%3\n" + : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) + : "r" ((x86_reg)line_size)); + return tmp; +} + +static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { + int tmp; + __asm__ volatile ( + "movl %3,%%ecx\n" + "pxor %%mm7,%%mm7\n" + "pxor %%mm6,%%mm6\n" + + "movq (%0),%%mm0\n" + "movq %%mm0, %%mm1\n" + "psllq $8, %%mm0\n" + "psrlq $8, %%mm1\n" + "psrlq $8, %%mm0\n" + "movq %%mm0, %%mm2\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7,%%mm0\n" + "punpcklbw %%mm7,%%mm1\n" + "punpckhbw %%mm7,%%mm2\n" + "punpckhbw %%mm7,%%mm3\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + + "add %2,%0\n" + + "movq (%0),%%mm4\n" + "movq %%mm4, %%mm1\n" + "psllq $8, %%mm4\n" + "psrlq $8, %%mm1\n" + "psrlq $8, %%mm4\n" + "movq %%mm4, %%mm5\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7,%%mm4\n" + "punpcklbw %%mm7,%%mm1\n" + "punpckhbw %%mm7,%%mm5\n" + "punpckhbw %%mm7,%%mm3\n" + "psubw %%mm1, %%mm4\n" + "psubw %%mm3, %%mm5\n" + "psubw %%mm4, %%mm0\n" + "psubw %%mm5, %%mm2\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm0, %%mm3\n\t" + "pcmpgtw %%mm2, %%mm1\n\t" + "pxor %%mm3, %%mm0\n" + "pxor %%mm1, %%mm2\n" + "psubw %%mm3, %%mm0\n" + "psubw %%mm1, %%mm2\n" + "paddw %%mm0, %%mm2\n" + "paddw %%mm2, %%mm6\n" + + "add %2,%0\n" + "1:\n" + + "movq (%0),%%mm0\n" + "movq %%mm0, %%mm1\n" + "psllq $8, %%mm0\n" + "psrlq $8, %%mm1\n" + "psrlq $8, %%mm0\n" + "movq %%mm0, %%mm2\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7,%%mm0\n" + "punpcklbw %%mm7,%%mm1\n" + "punpckhbw %%mm7,%%mm2\n" + "punpckhbw %%mm7,%%mm3\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + "psubw %%mm0, %%mm4\n" + "psubw %%mm2, %%mm5\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm4, %%mm3\n\t" + "pcmpgtw %%mm5, %%mm1\n\t" + "pxor %%mm3, %%mm4\n" + "pxor %%mm1, %%mm5\n" + "psubw %%mm3, %%mm4\n" + "psubw %%mm1, %%mm5\n" + "paddw %%mm4, %%mm5\n" + "paddw %%mm5, %%mm6\n" + + "add %2,%0\n" + + "movq (%0),%%mm4\n" + "movq %%mm4, %%mm1\n" + "psllq $8, %%mm4\n" + "psrlq $8, %%mm1\n" + "psrlq $8, %%mm4\n" + "movq %%mm4, %%mm5\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7,%%mm4\n" + "punpcklbw %%mm7,%%mm1\n" + "punpckhbw %%mm7,%%mm5\n" + "punpckhbw %%mm7,%%mm3\n" + "psubw %%mm1, %%mm4\n" + "psubw %%mm3, %%mm5\n" + "psubw %%mm4, %%mm0\n" + "psubw %%mm5, %%mm2\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm0, %%mm3\n\t" + "pcmpgtw %%mm2, %%mm1\n\t" + "pxor %%mm3, %%mm0\n" + "pxor %%mm1, %%mm2\n" + "psubw %%mm3, %%mm0\n" + "psubw %%mm1, %%mm2\n" + "paddw %%mm0, %%mm2\n" + "paddw %%mm2, %%mm6\n" + + "add %2,%0\n" + "subl $2, %%ecx\n" + " jnz 1b\n" + + "movq %%mm6, %%mm0\n" + "punpcklwd %%mm7,%%mm0\n" + "punpckhwd %%mm7,%%mm6\n" + "paddd %%mm0, %%mm6\n" + + "movq %%mm6,%%mm0\n" + "psrlq $32, %%mm6\n" + "paddd %%mm6,%%mm0\n" + "movd %%mm0,%1\n" + : "+r" (pix1), "=r"(tmp) + : "r" ((x86_reg)line_size) , "g" (h-2) + : "%ecx"); + return tmp; +} + +static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { + int tmp; + uint8_t * pix= pix1; + __asm__ volatile ( + "movl %3,%%ecx\n" + "pxor %%mm7,%%mm7\n" + "pxor %%mm6,%%mm6\n" + + "movq (%0),%%mm0\n" + "movq 1(%0),%%mm1\n" + "movq %%mm0, %%mm2\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7,%%mm0\n" + "punpcklbw %%mm7,%%mm1\n" + "punpckhbw %%mm7,%%mm2\n" + "punpckhbw %%mm7,%%mm3\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + + "add %2,%0\n" + + "movq (%0),%%mm4\n" + "movq 1(%0),%%mm1\n" + "movq %%mm4, %%mm5\n" + "movq %%mm1, 
%%mm3\n" + "punpcklbw %%mm7,%%mm4\n" + "punpcklbw %%mm7,%%mm1\n" + "punpckhbw %%mm7,%%mm5\n" + "punpckhbw %%mm7,%%mm3\n" + "psubw %%mm1, %%mm4\n" + "psubw %%mm3, %%mm5\n" + "psubw %%mm4, %%mm0\n" + "psubw %%mm5, %%mm2\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm0, %%mm3\n\t" + "pcmpgtw %%mm2, %%mm1\n\t" + "pxor %%mm3, %%mm0\n" + "pxor %%mm1, %%mm2\n" + "psubw %%mm3, %%mm0\n" + "psubw %%mm1, %%mm2\n" + "paddw %%mm0, %%mm2\n" + "paddw %%mm2, %%mm6\n" + + "add %2,%0\n" + "1:\n" + + "movq (%0),%%mm0\n" + "movq 1(%0),%%mm1\n" + "movq %%mm0, %%mm2\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7,%%mm0\n" + "punpcklbw %%mm7,%%mm1\n" + "punpckhbw %%mm7,%%mm2\n" + "punpckhbw %%mm7,%%mm3\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + "psubw %%mm0, %%mm4\n" + "psubw %%mm2, %%mm5\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm4, %%mm3\n\t" + "pcmpgtw %%mm5, %%mm1\n\t" + "pxor %%mm3, %%mm4\n" + "pxor %%mm1, %%mm5\n" + "psubw %%mm3, %%mm4\n" + "psubw %%mm1, %%mm5\n" + "paddw %%mm4, %%mm5\n" + "paddw %%mm5, %%mm6\n" + + "add %2,%0\n" + + "movq (%0),%%mm4\n" + "movq 1(%0),%%mm1\n" + "movq %%mm4, %%mm5\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7,%%mm4\n" + "punpcklbw %%mm7,%%mm1\n" + "punpckhbw %%mm7,%%mm5\n" + "punpckhbw %%mm7,%%mm3\n" + "psubw %%mm1, %%mm4\n" + "psubw %%mm3, %%mm5\n" + "psubw %%mm4, %%mm0\n" + "psubw %%mm5, %%mm2\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm0, %%mm3\n\t" + "pcmpgtw %%mm2, %%mm1\n\t" + "pxor %%mm3, %%mm0\n" + "pxor %%mm1, %%mm2\n" + "psubw %%mm3, %%mm0\n" + "psubw %%mm1, %%mm2\n" + "paddw %%mm0, %%mm2\n" + "paddw %%mm2, %%mm6\n" + + "add %2,%0\n" + "subl $2, %%ecx\n" + " jnz 1b\n" + + "movq %%mm6, %%mm0\n" + "punpcklwd %%mm7,%%mm0\n" + "punpckhwd %%mm7,%%mm6\n" + "paddd %%mm0, %%mm6\n" + + "movq %%mm6,%%mm0\n" + "psrlq $32, %%mm6\n" + "paddd %%mm6,%%mm0\n" + "movd %%mm0,%1\n" + : "+r" (pix1), "=r"(tmp) + : "r" ((x86_reg)line_size) , "g" (h-2) + : "%ecx"); + return tmp + hf_noise8_mmx(pix+8, line_size, h); +} + +static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + MpegEncContext *c = p; + int score1, score2; + + if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); + else score1 = sse16_mmx(c, pix1, pix2, line_size, h); + score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); + + if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; + else return score1 + FFABS(score2)*8; +} + +static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + MpegEncContext *c = p; + int score1= sse8_mmx(c, pix1, pix2, line_size, h); + int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); + + if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; + else return score1 + FFABS(score2)*8; +} + +static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { + int tmp; + + assert( (((int)pix) & 7) == 0); + assert((line_size &7) ==0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0), %%mm2\n"\ + "movq 8(%0), %%mm3\n"\ + "add %2,%0\n"\ + "movq %%mm2, " #out0 "\n"\ + "movq %%mm3, " #out1 "\n"\ + "psubusb " #in0 ", %%mm2\n"\ + "psubusb " #in1 ", %%mm3\n"\ + "psubusb " #out0 ", " #in0 "\n"\ + "psubusb " #out1 ", " #in1 "\n"\ + "por %%mm2, " #in0 "\n"\ + "por %%mm3, " #in1 "\n"\ + "movq " #in0 ", %%mm2\n"\ + "movq " #in1 ", %%mm3\n"\ + "punpcklbw %%mm7, " #in0 "\n"\ + "punpcklbw %%mm7, " #in1 "\n"\ + "punpckhbw %%mm7, %%mm2\n"\ + "punpckhbw %%mm7, %%mm3\n"\ + "paddw " #in1 ", 
" #in0 "\n"\ + "paddw %%mm3, %%mm2\n"\ + "paddw %%mm2, " #in0 "\n"\ + "paddw " #in0 ", %%mm6\n" + + + __asm__ volatile ( + "movl %3,%%ecx\n" + "pxor %%mm6,%%mm6\n" + "pxor %%mm7,%%mm7\n" + "movq (%0),%%mm0\n" + "movq 8(%0),%%mm1\n" + "add %2,%0\n" + "jmp 2f\n" + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + "2:\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movq %%mm6,%%mm0\n" + "psrlq $32, %%mm6\n" + "paddw %%mm6,%%mm0\n" + "movq %%mm0,%%mm6\n" + "psrlq $16, %%mm0\n" + "paddw %%mm6,%%mm0\n" + "movd %%mm0,%1\n" + : "+r" (pix), "=r"(tmp) + : "r" ((x86_reg)line_size) , "m" (h) + : "%ecx"); + return tmp & 0xFFFF; +} +#undef SUM + +static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { + int tmp; + + assert( (((int)pix) & 7) == 0); + assert((line_size &7) ==0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0), " #out0 "\n"\ + "movq 8(%0), " #out1 "\n"\ + "add %2,%0\n"\ + "psadbw " #out0 ", " #in0 "\n"\ + "psadbw " #out1 ", " #in1 "\n"\ + "paddw " #in1 ", " #in0 "\n"\ + "paddw " #in0 ", %%mm6\n" + + __asm__ volatile ( + "movl %3,%%ecx\n" + "pxor %%mm6,%%mm6\n" + "pxor %%mm7,%%mm7\n" + "movq (%0),%%mm0\n" + "movq 8(%0),%%mm1\n" + "add %2,%0\n" + "jmp 2f\n" + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + "2:\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movd %%mm6,%1\n" + : "+r" (pix), "=r"(tmp) + : "r" ((x86_reg)line_size) , "m" (h) + : "%ecx"); + return tmp; +} +#undef SUM + +static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + int tmp; + + assert( (((int)pix1) & 7) == 0); + assert( (((int)pix2) & 7) == 0); + assert((line_size &7) ==0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0),%%mm2\n"\ + "movq (%1)," #out0 "\n"\ + "movq 8(%0),%%mm3\n"\ + "movq 8(%1)," #out1 "\n"\ + "add %3,%0\n"\ + "add %3,%1\n"\ + "psubb " #out0 ", %%mm2\n"\ + "psubb " #out1 ", %%mm3\n"\ + "pxor %%mm7, %%mm2\n"\ + "pxor %%mm7, %%mm3\n"\ + "movq %%mm2, " #out0 "\n"\ + "movq %%mm3, " #out1 "\n"\ + "psubusb " #in0 ", %%mm2\n"\ + "psubusb " #in1 ", %%mm3\n"\ + "psubusb " #out0 ", " #in0 "\n"\ + "psubusb " #out1 ", " #in1 "\n"\ + "por %%mm2, " #in0 "\n"\ + "por %%mm3, " #in1 "\n"\ + "movq " #in0 ", %%mm2\n"\ + "movq " #in1 ", %%mm3\n"\ + "punpcklbw %%mm7, " #in0 "\n"\ + "punpcklbw %%mm7, " #in1 "\n"\ + "punpckhbw %%mm7, %%mm2\n"\ + "punpckhbw %%mm7, %%mm3\n"\ + "paddw " #in1 ", " #in0 "\n"\ + "paddw %%mm3, %%mm2\n"\ + "paddw %%mm2, " #in0 "\n"\ + "paddw " #in0 ", %%mm6\n" + + + __asm__ volatile ( + "movl %4,%%ecx\n" + "pxor %%mm6,%%mm6\n" + "pcmpeqw %%mm7,%%mm7\n" + "psllw $15, %%mm7\n" + "packsswb %%mm7, %%mm7\n" + "movq (%0),%%mm0\n" + "movq (%1),%%mm2\n" + "movq 8(%0),%%mm1\n" + "movq 8(%1),%%mm3\n" + "add %3,%0\n" + "add %3,%1\n" + "psubb %%mm2, %%mm0\n" + "psubb %%mm3, %%mm1\n" + "pxor %%mm7, %%mm0\n" + "pxor %%mm7, %%mm1\n" + "jmp 2f\n" + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + "2:\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movq %%mm6,%%mm0\n" + "psrlq $32, %%mm6\n" + "paddw %%mm6,%%mm0\n" + "movq %%mm0,%%mm6\n" + "psrlq $16, %%mm0\n" + "paddw %%mm6,%%mm0\n" + "movd %%mm0,%2\n" + : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "r" ((x86_reg)line_size) , "m" (h) + : "%ecx"); + return tmp & 0x7FFF; +} +#undef SUM + +static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + int tmp; + + assert( (((int)pix1) & 7) == 0); + assert( (((int)pix2) & 7) == 0); + assert((line_size &7) ==0); + +#define SUM(in0, in1, out0, 
out1) \ + "movq (%0)," #out0 "\n"\ + "movq (%1),%%mm2\n"\ + "movq 8(%0)," #out1 "\n"\ + "movq 8(%1),%%mm3\n"\ + "add %3,%0\n"\ + "add %3,%1\n"\ + "psubb %%mm2, " #out0 "\n"\ + "psubb %%mm3, " #out1 "\n"\ + "pxor %%mm7, " #out0 "\n"\ + "pxor %%mm7, " #out1 "\n"\ + "psadbw " #out0 ", " #in0 "\n"\ + "psadbw " #out1 ", " #in1 "\n"\ + "paddw " #in1 ", " #in0 "\n"\ + "paddw " #in0 ", %%mm6\n" + + __asm__ volatile ( + "movl %4,%%ecx\n" + "pxor %%mm6,%%mm6\n" + "pcmpeqw %%mm7,%%mm7\n" + "psllw $15, %%mm7\n" + "packsswb %%mm7, %%mm7\n" + "movq (%0),%%mm0\n" + "movq (%1),%%mm2\n" + "movq 8(%0),%%mm1\n" + "movq 8(%1),%%mm3\n" + "add %3,%0\n" + "add %3,%1\n" + "psubb %%mm2, %%mm0\n" + "psubb %%mm3, %%mm1\n" + "pxor %%mm7, %%mm0\n" + "pxor %%mm7, %%mm1\n" + "jmp 2f\n" + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + "2:\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movd %%mm6,%2\n" + : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "r" ((x86_reg)line_size) , "m" (h) + : "%ecx"); + return tmp; +} +#undef SUM + +static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ + x86_reg i=0; + __asm__ volatile( + "1: \n\t" + "movq (%2, %0), %%mm0 \n\t" + "movq (%1, %0), %%mm1 \n\t" + "psubb %%mm0, %%mm1 \n\t" + "movq %%mm1, (%3, %0) \n\t" + "movq 8(%2, %0), %%mm0 \n\t" + "movq 8(%1, %0), %%mm1 \n\t" + "psubb %%mm0, %%mm1 \n\t" + "movq %%mm1, 8(%3, %0) \n\t" + "add $16, %0 \n\t" + "cmp %4, %0 \n\t" + " jb 1b \n\t" + : "+r" (i) + : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15) + ); + for(; i<w; i++) + dst[i+0] = src1[i+0]-src2[i+0]; +} + +static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ + x86_reg i=0; + uint8_t l, lt; + + __asm__ volatile( + "1: \n\t" + "movq -1(%1, %0), %%mm0 \n\t" // LT + "movq (%1, %0), %%mm1 \n\t" // T + "movq -1(%2, %0), %%mm2 \n\t" // L + "movq (%2, %0), %%mm3 \n\t" // X + "movq %%mm2, %%mm4 \n\t" // L + "psubb %%mm0, %%mm2 \n\t" + "paddb %%mm1, %%mm2 \n\t" // L + T - LT + "movq %%mm4, %%mm5 \n\t" // L + "pmaxub %%mm1, %%mm4 \n\t" // max(T, L) + "pminub %%mm5, %%mm1 \n\t" // min(T, L) + "pminub %%mm2, %%mm4 \n\t" + "pmaxub %%mm1, %%mm4 \n\t" + "psubb %%mm4, %%mm3 \n\t" // dst - pred + "movq %%mm3, (%3, %0) \n\t" + "add $8, %0 \n\t" + "cmp %4, %0 \n\t" + " jb 1b \n\t" + : "+r" (i) + : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) + ); + + l= *left; + lt= *left_top; + + dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); + + *left_top= src1[w-1]; + *left = src2[w-1]; +} + +#define DIFF_PIXELS_1(m,a,t,p1,p2)\ + "mov"#m" "#p1", "#a" \n\t"\ + "mov"#m" "#p2", "#t" \n\t"\ + "punpcklbw "#a", "#t" \n\t"\ + "punpcklbw "#a", "#a" \n\t"\ + "psubw "#t", "#a" \n\t"\ + +#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\ + uint8_t *p1b=p1, *p2b=p2;\ + __asm__ volatile(\ + DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\ + DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\ + DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\ + "add %4, %1 \n\t"\ + "add %4, %2 \n\t"\ + DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\ + DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\ + DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\ + DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\ + "mov"#m1" "#mm"0, %0 \n\t"\ + DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\ + "mov"#m1" %0, "#mm"0 \n\t"\ + : "+m"(temp), "+r"(p1b), "+r"(p2b)\ + : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\ + );\ +} + //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp) + +#define 
DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp) +#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp) + +#define LBUTTERFLY2(a1,b1,a2,b2)\ + "paddw " #b1 ", " #a1 " \n\t"\ + "paddw " #b2 ", " #a2 " \n\t"\ + "paddw " #b1 ", " #b1 " \n\t"\ + "paddw " #b2 ", " #b2 " \n\t"\ + "psubw " #a1 ", " #b1 " \n\t"\ + "psubw " #a2 ", " #b2 " \n\t" + +#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\ + LBUTTERFLY2(m0, m1, m2, m3)\ + LBUTTERFLY2(m4, m5, m6, m7)\ + LBUTTERFLY2(m0, m2, m1, m3)\ + LBUTTERFLY2(m4, m6, m5, m7)\ + LBUTTERFLY2(m0, m4, m1, m5)\ + LBUTTERFLY2(m2, m6, m3, m7)\ + +#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7) + +#define MMABS_MMX(a,z)\ + "pxor " #z ", " #z " \n\t"\ + "pcmpgtw " #a ", " #z " \n\t"\ + "pxor " #z ", " #a " \n\t"\ + "psubw " #z ", " #a " \n\t" + +#define MMABS_MMX2(a,z)\ + "pxor " #z ", " #z " \n\t"\ + "psubw " #a ", " #z " \n\t"\ + "pmaxsw " #z ", " #a " \n\t" + +#define MMABS_SSSE3(a,z)\ + "pabsw " #a ", " #a " \n\t" + +#define MMABS_SUM(a,z, sum)\ + MMABS(a,z)\ + "paddusw " #a ", " #sum " \n\t" + +#define MMABS_SUM_8x8_NOSPILL\ + MMABS(%%xmm0, %%xmm8)\ + MMABS(%%xmm1, %%xmm9)\ + MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\ + MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\ + MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\ + MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\ + MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\ + MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\ + "paddusw %%xmm1, %%xmm0 \n\t" + +#ifdef ARCH_X86_64 +#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL +#else +#define MMABS_SUM_8x8_SSE2\ + "movdqa %%xmm7, (%1) \n\t"\ + MMABS(%%xmm0, %%xmm7)\ + MMABS(%%xmm1, %%xmm7)\ + MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\ + MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\ + MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\ + MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\ + MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\ + "movdqa (%1), %%xmm2 \n\t"\ + MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ + "paddusw %%xmm1, %%xmm0 \n\t" +#endif + +/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to + * about 100k on extreme inputs. But that's very unlikely to occur in natural video, + * and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
*/ +#define HSUM_MMX(a, t, dst)\ + "movq "#a", "#t" \n\t"\ + "psrlq $32, "#a" \n\t"\ + "paddusw "#t", "#a" \n\t"\ + "movq "#a", "#t" \n\t"\ + "psrlq $16, "#a" \n\t"\ + "paddusw "#t", "#a" \n\t"\ + "movd "#a", "#dst" \n\t"\ + +#define HSUM_MMX2(a, t, dst)\ + "pshufw $0x0E, "#a", "#t" \n\t"\ + "paddusw "#t", "#a" \n\t"\ + "pshufw $0x01, "#a", "#t" \n\t"\ + "paddusw "#t", "#a" \n\t"\ + "movd "#a", "#dst" \n\t"\ + +#define HSUM_SSE2(a, t, dst)\ + "movhlps "#a", "#t" \n\t"\ + "paddusw "#t", "#a" \n\t"\ + "pshuflw $0x0E, "#a", "#t" \n\t"\ + "paddusw "#t", "#a" \n\t"\ + "pshuflw $0x01, "#a", "#t" \n\t"\ + "paddusw "#t", "#a" \n\t"\ + "movd "#a", "#dst" \n\t"\ + +#define HADAMARD8_DIFF_MMX(cpu) \ +static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ + DECLARE_ALIGNED_8(uint64_t, temp[13]);\ + int sum;\ +\ + assert(h==8);\ +\ + DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ +\ + __asm__ volatile(\ + HADAMARD48\ +\ + "movq %%mm7, 96(%1) \n\t"\ +\ + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ + STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ +\ + "movq 96(%1), %%mm7 \n\t"\ + TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ + STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\ +\ + : "=r" (sum)\ + : "r"(temp)\ + );\ +\ + DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\ +\ + __asm__ volatile(\ + HADAMARD48\ +\ + "movq %%mm7, 96(%1) \n\t"\ +\ + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ + STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\ +\ + "movq 96(%1), %%mm7 \n\t"\ + TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ + "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ + "movq %%mm6, %%mm7 \n\t"\ + "movq %%mm0, %%mm6 \n\t"\ +\ + LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ +\ + HADAMARD48\ + "movq %%mm7, 64(%1) \n\t"\ + MMABS(%%mm0, %%mm7)\ + MMABS(%%mm1, %%mm7)\ + MMABS_SUM(%%mm2, %%mm7, %%mm0)\ + MMABS_SUM(%%mm3, %%mm7, %%mm1)\ + MMABS_SUM(%%mm4, %%mm7, %%mm0)\ + MMABS_SUM(%%mm5, %%mm7, %%mm1)\ + MMABS_SUM(%%mm6, %%mm7, %%mm0)\ + "movq 64(%1), %%mm2 \n\t"\ + MMABS_SUM(%%mm2, %%mm7, %%mm1)\ + "paddusw %%mm1, %%mm0 \n\t"\ + "movq %%mm0, 64(%1) \n\t"\ +\ + LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\ + LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\ +\ + HADAMARD48\ + "movq %%mm7, (%1) \n\t"\ + MMABS(%%mm0, %%mm7)\ + MMABS(%%mm1, %%mm7)\ + MMABS_SUM(%%mm2, %%mm7, %%mm0)\ + MMABS_SUM(%%mm3, %%mm7, %%mm1)\ + MMABS_SUM(%%mm4, %%mm7, %%mm0)\ + MMABS_SUM(%%mm5, %%mm7, %%mm1)\ + MMABS_SUM(%%mm6, %%mm7, %%mm0)\ + "movq (%1), %%mm2 \n\t"\ + MMABS_SUM(%%mm2, %%mm7, %%mm1)\ + "paddusw 64(%1), %%mm0 \n\t"\ + "paddusw %%mm1, %%mm0 \n\t"\ +\ + HSUM(%%mm0, %%mm1, %0)\ +\ + : "=r" (sum)\ + : "r"(temp)\ + );\ + return sum&0xFFFF;\ +}\ +WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) + +#define HADAMARD8_DIFF_SSE2(cpu) \ +static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ + DECLARE_ALIGNED_16(uint64_t, temp[4]);\ + int sum;\ +\ + assert(h==8);\ +\ + DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\ +\ + __asm__ volatile(\ + HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\ + TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\ + HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\ + MMABS_SUM_8x8\ + HSUM_SSE2(%%xmm0, %%xmm1, %0)\ + : "=r" (sum)\ + : "r"(temp)\ + );\ + return sum&0xFFFF;\ +}\ +WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) + +#define MMABS(a,z) MMABS_MMX(a,z) +#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) +HADAMARD8_DIFF_MMX(mmx) +#undef MMABS 
+#undef HSUM + +#define MMABS(a,z) MMABS_MMX2(a,z) +#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 +#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) +HADAMARD8_DIFF_MMX(mmx2) +HADAMARD8_DIFF_SSE2(sse2) +#undef MMABS +#undef MMABS_SUM_8x8 +#undef HSUM + +#ifdef HAVE_SSSE3 +#define MMABS(a,z) MMABS_SSSE3(a,z) +#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL +HADAMARD8_DIFF_SSE2(ssse3) +#undef MMABS +#undef MMABS_SUM_8x8 +#endif + +#define DCT_SAD4(m,mm,o)\ + "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ + "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ + "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ + "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ + MMABS_SUM(mm##2, mm##6, mm##0)\ + MMABS_SUM(mm##3, mm##7, mm##1)\ + MMABS_SUM(mm##4, mm##6, mm##0)\ + MMABS_SUM(mm##5, mm##7, mm##1)\ + +#define DCT_SAD_MMX\ + "pxor %%mm0, %%mm0 \n\t"\ + "pxor %%mm1, %%mm1 \n\t"\ + DCT_SAD4(q, %%mm, 0)\ + DCT_SAD4(q, %%mm, 8)\ + DCT_SAD4(q, %%mm, 64)\ + DCT_SAD4(q, %%mm, 72)\ + "paddusw %%mm1, %%mm0 \n\t"\ + HSUM(%%mm0, %%mm1, %0) + +#define DCT_SAD_SSE2\ + "pxor %%xmm0, %%xmm0 \n\t"\ + "pxor %%xmm1, %%xmm1 \n\t"\ + DCT_SAD4(dqa, %%xmm, 0)\ + DCT_SAD4(dqa, %%xmm, 64)\ + "paddusw %%xmm1, %%xmm0 \n\t"\ + HSUM(%%xmm0, %%xmm1, %0) + +#define DCT_SAD_FUNC(cpu) \ +static int sum_abs_dctelem_##cpu(DCTELEM *block){\ + int sum;\ + __asm__ volatile(\ + DCT_SAD\ + :"=r"(sum)\ + :"r"(block)\ + );\ + return sum&0xFFFF;\ +} + +#define DCT_SAD DCT_SAD_MMX +#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) +#define MMABS(a,z) MMABS_MMX(a,z) +DCT_SAD_FUNC(mmx) +#undef MMABS +#undef HSUM + +#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) +#define MMABS(a,z) MMABS_MMX2(a,z) +DCT_SAD_FUNC(mmx2) +#undef HSUM +#undef DCT_SAD + +#define DCT_SAD DCT_SAD_SSE2 +#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) +DCT_SAD_FUNC(sse2) +#undef MMABS + +#ifdef HAVE_SSSE3 +#define MMABS(a,z) MMABS_SSSE3(a,z) +DCT_SAD_FUNC(ssse3) +#undef MMABS +#endif +#undef HSUM +#undef DCT_SAD + +static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ + int sum; + x86_reg i=size; + __asm__ volatile( + "pxor %%mm4, %%mm4 \n" + "1: \n" + "sub $8, %0 \n" + "movq (%2,%0), %%mm2 \n" + "movq (%3,%0,2), %%mm0 \n" + "movq 8(%3,%0,2), %%mm1 \n" + "punpckhbw %%mm2, %%mm3 \n" + "punpcklbw %%mm2, %%mm2 \n" + "psraw $8, %%mm3 \n" + "psraw $8, %%mm2 \n" + "psubw %%mm3, %%mm1 \n" + "psubw %%mm2, %%mm0 \n" + "pmaddwd %%mm1, %%mm1 \n" + "pmaddwd %%mm0, %%mm0 \n" + "paddd %%mm1, %%mm4 \n" + "paddd %%mm0, %%mm4 \n" + "jg 1b \n" + "movq %%mm4, %%mm3 \n" + "psrlq $32, %%mm3 \n" + "paddd %%mm3, %%mm4 \n" + "movd %%mm4, %1 \n" + :"+r"(i), "=r"(sum) + :"r"(pix1), "r"(pix2) + ); + return sum; +} + +#define PHADDD(a, t)\ + "movq "#a", "#t" \n\t"\ + "psrlq $32, "#a" \n\t"\ + "paddd "#t", "#a" \n\t" +/* + pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31] + pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31] + pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30] + */ +#define PMULHRW(x, y, s, o)\ + "pmulhw " #s ", "#x " \n\t"\ + "pmulhw " #s ", "#y " \n\t"\ + "paddw " #o ", "#x " \n\t"\ + "paddw " #o ", "#y " \n\t"\ + "psraw $1, "#x " \n\t"\ + "psraw $1, "#y " \n\t" +#define DEF(x) x ## _mmx +#define SET_RND MOVQ_WONE +#define SCALE_OFFSET 1 + +#include "dsputil_mmx_qns_template.c" + +#undef DEF +#undef SET_RND +#undef SCALE_OFFSET +#undef PMULHRW + +#define DEF(x) x ## _3dnow +#define SET_RND(x) +#define SCALE_OFFSET 0 +#define PMULHRW(x, y, s, o)\ + "pmulhrw " #s ", "#x " \n\t"\ + "pmulhrw " #s ", "#y " \n\t" + +#include "dsputil_mmx_qns_template.c" + +#undef DEF +#undef SET_RND +#undef SCALE_OFFSET +#undef PMULHRW + +#ifdef HAVE_SSSE3 
+#undef PHADDD +#define DEF(x) x ## _ssse3 +#define SET_RND(x) +#define SCALE_OFFSET -1 +#define PHADDD(a, t)\ + "pshufw $0x0E, "#a", "#t" \n\t"\ + "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ +#define PMULHRW(x, y, s, o)\ + "pmulhrsw " #s ", "#x " \n\t"\ + "pmulhrsw " #s ", "#y " \n\t" + +#include "dsputil_mmx_qns_template.c" + +#undef DEF +#undef SET_RND +#undef SCALE_OFFSET +#undef PMULHRW +#undef PHADDD +#endif //HAVE_SSSE3 + + +/* FLAC specific */ +void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag, + double *autoc); + + +void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) +{ + if (mm_flags & FF_MM_MMX) { + const int dct_algo = avctx->dct_algo; + if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ + if(mm_flags & FF_MM_SSE2){ + c->fdct = ff_fdct_sse2; + }else if(mm_flags & FF_MM_MMXEXT){ + c->fdct = ff_fdct_mmx2; + }else{ + c->fdct = ff_fdct_mmx; + } + } + + c->get_pixels = get_pixels_mmx; + c->diff_pixels = diff_pixels_mmx; + c->pix_sum = pix_sum16_mmx; + + c->diff_bytes= diff_bytes_mmx; + c->sum_abs_dctelem= sum_abs_dctelem_mmx; + + c->hadamard8_diff[0]= hadamard8_diff16_mmx; + c->hadamard8_diff[1]= hadamard8_diff_mmx; + + c->pix_norm1 = pix_norm1_mmx; + c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx; + c->sse[1] = sse8_mmx; + c->vsad[4]= vsad_intra16_mmx; + + c->nsse[0] = nsse16_mmx; + c->nsse[1] = nsse8_mmx; + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->vsad[0] = vsad16_mmx; + } + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->try_8x8basis= try_8x8basis_mmx; + } + c->add_8x8basis= add_8x8basis_mmx; + + c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; + + + if (mm_flags & FF_MM_MMXEXT) { + c->sum_abs_dctelem= sum_abs_dctelem_mmx2; + c->hadamard8_diff[0]= hadamard8_diff16_mmx2; + c->hadamard8_diff[1]= hadamard8_diff_mmx2; + c->vsad[4]= vsad_intra16_mmx2; + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->vsad[0] = vsad16_mmx2; + } + + c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; + } + + if(mm_flags & FF_MM_SSE2){ + c->get_pixels = get_pixels_sse2; + c->sum_abs_dctelem= sum_abs_dctelem_sse2; + c->hadamard8_diff[0]= hadamard8_diff16_sse2; + c->hadamard8_diff[1]= hadamard8_diff_sse2; + if (ENABLE_FLAC_ENCODER) + c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2; + } + +#ifdef HAVE_SSSE3 + if(mm_flags & FF_MM_SSSE3){ + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->try_8x8basis= try_8x8basis_ssse3; + } + c->add_8x8basis= add_8x8basis_ssse3; + c->sum_abs_dctelem= sum_abs_dctelem_ssse3; + c->hadamard8_diff[0]= hadamard8_diff16_ssse3; + c->hadamard8_diff[1]= hadamard8_diff_ssse3; + } +#endif + + if(mm_flags & FF_MM_3DNOW){ + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->try_8x8basis= try_8x8basis_3dnow; + } + c->add_8x8basis= add_8x8basis_3dnow; + } + } + + dsputil_init_pix_mmx(c, avctx); +}
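
The sse8/sse16 kernels above reduce to a plain sum of squared byte differences: the pair of saturating subtractions OR'd together yields |a-b|, and pmaddwd then squares and accumulates, which gives the same total since |a-b|^2 == (a-b)^2. A minimal scalar sketch of the metric, for reference only (the function name is illustrative, not part of the tree):

#include <stdint.h>

/* Scalar reference for what sse16_mmx/sse16_sse2 compute above:
 * sum of squared differences over a 16-pixel-wide block of height h. */
static int sse16_scalar_ref(const uint8_t *pix1, const uint8_t *pix2,
                            int line_size, int h)
{
    int sum = 0;
    int x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++) {
            int d = pix1[x] - pix2[x];
            sum += d*d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
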
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/fdct_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,580 @@ +/* + * MMX optimized forward DCT + * The gcc porting is Copyright (c) 2001 Fabrice Bellard. + * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. + * + * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT + * + * Intel Application Note AP-922 - fast, precise implementation of DCT + * http://developer.intel.com/vtune/cbts/appnotes.htm + * + * Also of inspiration: + * a page about fdct at http://www.geocities.com/ssavekar/dct.htm + * Skal's fdct at http://skal.planet-d.net/coding/dct.html + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/common.h" +#include "libavcodec/dsputil.h" + +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) + +////////////////////////////////////////////////////////////////////// +// +// constants for the forward DCT +// ----------------------------- +// +// Be sure to check that your compiler is aligning all constants to QWORD +// (8-byte) memory boundaries! Otherwise the unaligned memory access will +// severely stall MMX execution. 
+// +////////////////////////////////////////////////////////////////////// + +#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy +#define SHIFT_FRW_COL BITS_FRW_ACC +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) +#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) +//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) + +#define X8(x) x,x,x,x,x,x,x,x + +//concatenated table, for forward DCT transformation +static const int16_t fdct_tg_all_16[24] ATTR_ALIGN(16) = { + X8(13036), // tg * (2<<16) + 0.5 + X8(27146), // tg * (2<<16) + 0.5 + X8(-21746) // tg * (2<<16) + 0.5 +}; + +static const int16_t ocos_4_16[8] ATTR_ALIGN(16) = { + X8(23170) //cos * (2<<15) + 0.5 +}; + +static const int16_t fdct_one_corr[8] ATTR_ALIGN(16) = { X8(1) }; + +static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; + +static struct +{ + const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16); +} fdct_r_row_sse2 ATTR_ALIGN(16)= +{{ + RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW +}}; +//static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; + +static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table + 16384, 16384, 22725, 19266, + 16384, 16384, 12873, 4520, + 21407, 8867, 19266, -4520, + -8867, -21407, -22725, -12873, + 16384, -16384, 12873, -22725, + -16384, 16384, 4520, 19266, + 8867, -21407, 4520, -12873, + 21407, -8867, 19266, -22725, + + 22725, 22725, 31521, 26722, + 22725, 22725, 17855, 6270, + 29692, 12299, 26722, -6270, + -12299, -29692, -31521, -17855, + 22725, -22725, 17855, -31521, + -22725, 22725, 6270, 26722, + 12299, -29692, 6270, -17855, + 29692, -12299, 26722, -31521, + + 21407, 21407, 29692, 25172, + 21407, 21407, 16819, 5906, + 27969, 11585, 25172, -5906, + -11585, -27969, -29692, -16819, + 21407, -21407, 16819, -29692, + -21407, 21407, 5906, 25172, + 11585, -27969, 5906, -16819, + 27969, -11585, 25172, -29692, + + 19266, 19266, 26722, 22654, + 19266, 19266, 15137, 5315, + 25172, 10426, 22654, -5315, + -10426, -25172, -26722, -15137, + 19266, -19266, 15137, -26722, + -19266, 19266, 5315, 22654, + 10426, -25172, 5315, -15137, + 25172, -10426, 22654, -26722, + + 16384, 16384, 22725, 19266, + 16384, 16384, 12873, 4520, + 21407, 8867, 19266, -4520, + -8867, -21407, -22725, -12873, + 16384, -16384, 12873, -22725, + -16384, 16384, 4520, 19266, + 8867, -21407, 4520, -12873, + 21407, -8867, 19266, -22725, + + 19266, 19266, 26722, 22654, + 19266, 19266, 15137, 5315, + 25172, 10426, 22654, -5315, + -10426, -25172, -26722, -15137, + 19266, -19266, 15137, -26722, + -19266, 19266, 5315, 22654, + 10426, -25172, 5315, -15137, + 25172, -10426, 22654, -26722, + + 21407, 21407, 29692, 25172, + 21407, 21407, 16819, 5906, + 27969, 11585, 25172, -5906, + -11585, -27969, -29692, -16819, + 21407, -21407, 16819, -29692, + -21407, 21407, 5906, 25172, + 11585, -27969, 5906, -16819, + 27969, -11585, 25172, -29692, + + 22725, 22725, 31521, 26722, + 22725, 22725, 17855, 6270, + 29692, 12299, 26722, -6270, + -12299, -29692, -31521, -17855, + 22725, -22725, 17855, -31521, + -22725, 22725, 6270, 26722, + 12299, -29692, 6270, -17855, + 29692, -12299, 26722, -31521, +}; + +static struct +{ + const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16); +} tab_frw_01234567_sse2 ATTR_ALIGN(16) = +{{ +//static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table +#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ + C4, C4, C5, C7, C2, C6, C3, -C7, \ + -C4, C4, C7, C3, C6, -C2, C7, -C5, \ + C4, -C4, C5, -C1, C2, -C6, C3, 
-C1, +// c1..c7 * cos(pi/4) * 2^15 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 +}}; + +#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long + +#define FDCT_COL(cpu, mm, mov)\ +static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ +{\ + __asm__ volatile (\ + #mov" 16(%0), %%"#mm"0 \n\t" \ + #mov" 96(%0), %%"#mm"1 \n\t" \ + #mov" %%"#mm"0, %%"#mm"2 \n\t" \ + #mov" 32(%0), %%"#mm"3 \n\t" \ + "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ + #mov" 80(%0), %%"#mm"4 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ + #mov" (%0), %%"#mm"5 \n\t" \ + "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ + "paddsw 112(%0), %%"#mm"5 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ + #mov" %%"#mm"0, %%"#mm"6 \n\t" \ + "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ + #mov" 16(%1), %%"#mm"1 \n\t" \ + "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ + #mov" 48(%0), %%"#mm"7 \n\t" \ + "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ + "paddsw 64(%0), %%"#mm"7 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ + "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ + #mov" %%"#mm"5, %%"#mm"4 \n\t" \ + "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ + "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ + "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ + "por (%2), %%"#mm"1 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ + "pmulhw 16(%1), %%"#mm"5 \n\t" \ + #mov" %%"#mm"4, %%"#mm"7 \n\t" \ + "psubsw 80(%0), %%"#mm"3 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ + #mov" %%"#mm"1, 32(%3) \n\t" \ + "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ + #mov" 48(%0), %%"#mm"1 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ + "psubsw 64(%0), %%"#mm"1 \n\t" \ + #mov" %%"#mm"2, %%"#mm"6 \n\t" \ + #mov" %%"#mm"4, 64(%3) \n\t" \ + "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ + "pmulhw (%4), %%"#mm"2 \n\t" \ + "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ + "pmulhw (%4), %%"#mm"6 \n\t" \ + "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ + "por (%2), %%"#mm"5 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ + 
"por (%2), %%"#mm"2 \n\t" \ + #mov" %%"#mm"1, %%"#mm"4 \n\t" \ + #mov" (%0), %%"#mm"3 \n\t" \ + "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ + "psubsw 112(%0), %%"#mm"3 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ + #mov" (%1), %%"#mm"0 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ + #mov" 32(%1), %%"#mm"6 \n\t" \ + "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ + #mov" %%"#mm"7, (%3) \n\t" \ + "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ + #mov" %%"#mm"5, 96(%3) \n\t" \ + #mov" %%"#mm"3, %%"#mm"7 \n\t" \ + #mov" 32(%1), %%"#mm"5 \n\t" \ + "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ + "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ + "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ + "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ + "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ + "pmulhw (%1), %%"#mm"3 \n\t" \ + "por (%2), %%"#mm"0 \n\t" \ + "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ + #mov" %%"#mm"0, 16(%3) \n\t" \ + "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ + #mov" %%"#mm"7, 48(%3) \n\t" \ + "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ + #mov" %%"#mm"5, 80(%3) \n\t" \ + #mov" %%"#mm"3, 112(%3) \n\t" \ + : \ + : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ + "r" (out + offset), "r" (ocos_4_16)); \ +} + +FDCT_COL(mmx, mm, movq) +FDCT_COL(sse2, xmm, movdqa) + +static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) +{ + __asm__ volatile( +#define FDCT_ROW_SSE2_H1(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" \ + "movdqa " #t "(%1), %%xmm4 \n\t" \ + "movdqa " #t "+16(%1), %%xmm5 \n\t" + +#define FDCT_ROW_SSE2_H2(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" + +#define FDCT_ROW_SSE2(i) \ + "movq %%xmm2, %%xmm1 \n\t" \ + "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ + "paddsw %%xmm0, %%xmm1 \n\t" \ + "psubsw %%xmm0, %%xmm2 \n\t" \ + "punpckldq %%xmm2, %%xmm1 \n\t" \ + "pshufd $78, %%xmm1, %%xmm2 \n\t" \ + "pmaddwd %%xmm2, %%xmm3 \n\t" \ + "pmaddwd %%xmm1, %%xmm7 \n\t" \ + "pmaddwd %%xmm5, %%xmm2 \n\t" \ + "pmaddwd %%xmm4, %%xmm1 \n\t" \ + "paddd %%xmm7, %%xmm3 \n\t" \ + "paddd %%xmm2, %%xmm1 \n\t" \ + "paddd %%xmm6, %%xmm3 \n\t" \ + "paddd %%xmm6, %%xmm1 \n\t" \ + "psrad %3, %%xmm3 \n\t" \ + "psrad %3, %%xmm1 \n\t" \ + "packssdw %%xmm3, %%xmm1 \n\t" \ + "movdqa %%xmm1, " #i "(%4) \n\t" + + "movdqa (%2), %%xmm6 \n\t" + FDCT_ROW_SSE2_H1(0,0) + FDCT_ROW_SSE2(0) + FDCT_ROW_SSE2_H2(64,0) + FDCT_ROW_SSE2(64) + + FDCT_ROW_SSE2_H1(16,64) + FDCT_ROW_SSE2(16) + FDCT_ROW_SSE2_H2(112,64) + FDCT_ROW_SSE2(112) + + FDCT_ROW_SSE2_H1(32,128) + FDCT_ROW_SSE2(32) + FDCT_ROW_SSE2_H2(96,128) + FDCT_ROW_SSE2(96) + + FDCT_ROW_SSE2_H1(48,192) + FDCT_ROW_SSE2(48) + FDCT_ROW_SSE2_H2(80,192) + FDCT_ROW_SSE2(80) + : + : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) + ); +} + +static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) +{ + __asm__ volatile ( + "pshufw $0x1B, 8(%0), %%mm5 \n\t" + "movq (%0), %%mm0 \n\t" + "movq %%mm0, %%mm1 \n\t" + "paddsw %%mm5, %%mm0 \n\t" + "psubsw %%mm5, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "punpckldq %%mm1, %%mm0 \n\t" + "punpckhdq %%mm1, %%mm2 \n\t" + "movq (%1), %%mm1 \n\t" + "movq 8(%1), %%mm3 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm5 \n\t" + "movq 32(%1), %%mm6 \n\t" + "movq 40(%1), %%mm7 \n\t" + "pmaddwd %%mm0, %%mm1 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "pmaddwd 
%%mm0, %%mm4 \n\t" + "pmaddwd %%mm2, %%mm5 \n\t" + "pmaddwd %%mm0, %%mm6 \n\t" + "pmaddwd %%mm2, %%mm7 \n\t" + "pmaddwd 48(%1), %%mm0 \n\t" + "pmaddwd 56(%1), %%mm2 \n\t" + "paddd %%mm1, %%mm3 \n\t" + "paddd %%mm4, %%mm5 \n\t" + "paddd %%mm6, %%mm7 \n\t" + "paddd %%mm0, %%mm2 \n\t" + "movq (%2), %%mm0 \n\t" + "paddd %%mm0, %%mm3 \n\t" + "paddd %%mm0, %%mm5 \n\t" + "paddd %%mm0, %%mm7 \n\t" + "paddd %%mm0, %%mm2 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" + "packssdw %%mm5, %%mm3 \n\t" + "packssdw %%mm2, %%mm7 \n\t" + "movq %%mm3, (%3) \n\t" + "movq %%mm7, 8(%3) \n\t" + : + : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); +} + +static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) +{ + //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) + __asm__ volatile( + "movd 12(%0), %%mm1 \n\t" + "punpcklwd 8(%0), %%mm1 \n\t" + "movq %%mm1, %%mm2 \n\t" + "psrlq $0x20, %%mm1 \n\t" + "movq 0(%0), %%mm0 \n\t" + "punpcklwd %%mm2, %%mm1 \n\t" + "movq %%mm0, %%mm5 \n\t" + "paddsw %%mm1, %%mm0 \n\t" + "psubsw %%mm1, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "punpckldq %%mm5, %%mm0 \n\t" + "punpckhdq %%mm5, %%mm2 \n\t" + "movq 0(%1), %%mm1 \n\t" + "movq 8(%1), %%mm3 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm5 \n\t" + "movq 32(%1), %%mm6 \n\t" + "movq 40(%1), %%mm7 \n\t" + "pmaddwd %%mm0, %%mm1 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "pmaddwd %%mm0, %%mm4 \n\t" + "pmaddwd %%mm2, %%mm5 \n\t" + "pmaddwd %%mm0, %%mm6 \n\t" + "pmaddwd %%mm2, %%mm7 \n\t" + "pmaddwd 48(%1), %%mm0 \n\t" + "pmaddwd 56(%1), %%mm2 \n\t" + "paddd %%mm1, %%mm3 \n\t" + "paddd %%mm4, %%mm5 \n\t" + "paddd %%mm6, %%mm7 \n\t" + "paddd %%mm0, %%mm2 \n\t" + "movq (%2), %%mm0 \n\t" + "paddd %%mm0, %%mm3 \n\t" + "paddd %%mm0, %%mm5 \n\t" + "paddd %%mm0, %%mm7 \n\t" + "paddd %%mm0, %%mm2 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" + "packssdw %%mm5, %%mm3 \n\t" + "packssdw %%mm2, %%mm7 \n\t" + "movq %%mm3, 0(%3) \n\t" + "movq %%mm7, 8(%3) \n\t" + : + : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); +} + +void ff_fdct_mmx(int16_t *block) +{ + int64_t align_tmp[16] ATTR_ALIGN(8); + int16_t * block1= (int16_t*)align_tmp; + const int16_t *table= tab_frw_01234567; + int i; + + fdct_col_mmx(block, block1, 0); + fdct_col_mmx(block, block1, 4); + + for(i=8;i>0;i--) { + fdct_row_mmx(block1, block, table); + block1 += 8; + table += 32; + block += 8; + } +} + +void ff_fdct_mmx2(int16_t *block) +{ + int64_t align_tmp[16] ATTR_ALIGN(8); + int16_t *block1= (int16_t*)align_tmp; + const int16_t *table= tab_frw_01234567; + int i; + + fdct_col_mmx(block, block1, 0); + fdct_col_mmx(block, block1, 4); + + for(i=8;i>0;i--) { + fdct_row_mmx2(block1, block, table); + block1 += 8; + table += 32; + block += 8; + } +} + +void ff_fdct_sse2(int16_t *block) +{ + int64_t align_tmp[16] ATTR_ALIGN(16); + int16_t * const block1= (int16_t*)align_tmp; + + fdct_col_sse2(block, block1, 0); + fdct_row_sse2(block1, block); +} +
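
The row tables above are fixed-point cosine products. Going by the "c1..c7 * cos(pi/4) * 2^15" comment, the first coefficient block of tab_frw_01234567_sse2 can be regenerated with a short standalone program (a sketch, not part of the tree; the later blocks reuse the same cosines with different per-row scale factors):

#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Reproduces the first C1..C7 block: C_k = round(cos(k*pi/16) * cos(pi/4) * 2^15).
 * Prints 22725 21407 19266 16384 12873 8867 4520. */
int main(void)
{
    int k;
    for (k = 1; k <= 7; k++)
        printf("C%d = %d\n", k,
               (int)lrint(cos(k*M_PI/16) * cos(M_PI/4) * 32768));
    return 0;
}
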
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/x86/fft_3dn.c	Mon Dec 22 09:12:42 2008 +0000
@@ -0,0 +1,23 @@
+/*
+ * FFT/MDCT transform with 3DNow! optimizations
+ * Copyright (c) 2008 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define EMULATE_3DNOWEXT
+#include "fft_3dn2.c"
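
fft_3dn.c carries no code of its own: defining EMULATE_3DNOWEXT before including fft_3dn2.c switches that file to the emulated pswapd path and renames its entry points from *_3dn2 to *_3dn. A tiny illustration of the same one-body/two-variants pattern, with purely hypothetical file and function names:

/* template_add.c -- shared body, compiled twice (all names illustrative) */
#ifdef EMULATE_EXT
#  define my_add my_add_basic   /* emulated-instruction build */
#else
#  define my_add my_add_ext     /* native-instruction build   */
#endif

int my_add(int a, int b)
{
    return a + b;
}

/* wrapper_basic.c -- the analogue of fft_3dn.c is just:
 *     #define EMULATE_EXT
 *     #include "template_add.c"
 * so building wrapper_basic.c and template_add.c as separate objects
 * yields both my_add_basic and my_add_ext from the same body. */
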
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/fft_3dn2.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,173 @@ +/* + * FFT/MDCT transform with Extended 3DNow! optimizations + * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" + +DECLARE_ALIGNED_8(static const int, m1m1[2]) = { 1<<31, 1<<31 }; + +#ifdef EMULATE_3DNOWEXT +#define PSWAPD(s,d)\ + "movq "#s","#d"\n"\ + "psrlq $32,"#d"\n"\ + "punpckldq "#s","#d"\n" +#define ff_fft_calc_3dn2 ff_fft_calc_3dn +#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn +#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn +#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn +#define ff_imdct_half_3dn2 ff_imdct_half_3dn +#else +#define PSWAPD(s,d) "pswapd "#s","#d"\n" +#endif + +void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); +void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); + +void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) +{ + int n = 1<<s->nbits; + int i; + ff_fft_dispatch_interleave_3dn2(z, s->nbits); + __asm__ volatile("femms"); + if(n <= 8) + for(i=0; i<n; i+=2) + FFSWAP(FFTSample, z[i].im, z[i+1].re); +} + +void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) +{ + x86_reg j, k; + long n = 1 << s->nbits; + long n2 = n >> 1; + long n4 = n >> 2; + long n8 = n >> 3; + const uint16_t *revtab = s->fft.revtab; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + const FFTSample *in1, *in2; + FFTComplex *z = (FFTComplex *)output; + + /* pre rotation */ + in1 = input; + in2 = input + n2 - 1; +#ifdef EMULATE_3DNOWEXT + __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31)); +#endif + for(k = 0; k < n4; k++) { + // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it + __asm__ volatile( + "movd %0, %%mm0 \n" + "movd %2, %%mm1 \n" + "punpckldq %1, %%mm0 \n" + "punpckldq %3, %%mm1 \n" + "movq %%mm0, %%mm2 \n" + PSWAPD( %%mm1, %%mm3 ) + "pfmul %%mm1, %%mm0 \n" + "pfmul %%mm3, %%mm2 \n" +#ifdef EMULATE_3DNOWEXT + "movq %%mm0, %%mm1 \n" + "punpckhdq %%mm2, %%mm0 \n" + "punpckldq %%mm2, %%mm1 \n" + "pxor %%mm7, %%mm0 \n" + "pfadd %%mm1, %%mm0 \n" +#else + "pfpnacc %%mm2, %%mm0 \n" +#endif + ::"m"(in2[-2*k]), "m"(in1[2*k]), + "m"(tcos[k]), "m"(tsin[k]) + ); + __asm__ volatile( + "movq %%mm0, %0 \n\t" + :"=m"(z[revtab[k]]) + ); + } + + ff_fft_dispatch_3dn2(z, s->fft.nbits); + +#define CMUL(j,mm0,mm1)\ + "movq (%2,"#j",2), %%mm6 \n"\ + "movq 8(%2,"#j",2), "#mm0"\n"\ + "movq %%mm6, "#mm1"\n"\ + "movq "#mm0",%%mm7 \n"\ + "pfmul (%3,"#j"), %%mm6 \n"\ + "pfmul (%4,"#j"), "#mm0"\n"\ + "pfmul (%4,"#j"), "#mm1"\n"\ + "pfmul (%3,"#j"), %%mm7 \n"\ + "pfsub %%mm6, "#mm0"\n"\ + "pfadd %%mm7, "#mm1"\n" + + /* post rotation */ + j = -n2; + k = 
n2-8; + __asm__ volatile( + "1: \n" + CMUL(%0, %%mm0, %%mm1) + CMUL(%1, %%mm2, %%mm3) + "movd %%mm0, (%2,%0,2) \n" + "movd %%mm1,12(%2,%1,2) \n" + "movd %%mm2, (%2,%1,2) \n" + "movd %%mm3,12(%2,%0,2) \n" + "psrlq $32, %%mm0 \n" + "psrlq $32, %%mm1 \n" + "psrlq $32, %%mm2 \n" + "psrlq $32, %%mm3 \n" + "movd %%mm0, 8(%2,%0,2) \n" + "movd %%mm1, 4(%2,%1,2) \n" + "movd %%mm2, 8(%2,%1,2) \n" + "movd %%mm3, 4(%2,%0,2) \n" + "sub $8, %1 \n" + "add $8, %0 \n" + "jl 1b \n" + :"+r"(j), "+r"(k) + :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) + :"memory" + ); + __asm__ volatile("femms"); +} + +void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input) +{ + x86_reg j, k; + long n = 1 << s->nbits; + long n4 = n >> 2; + + ff_imdct_half_3dn2(s, output+n4, input); + + j = -n; + k = n-8; + __asm__ volatile( + "movq %4, %%mm7 \n" + "1: \n" + PSWAPD((%2,%1), %%mm0) + PSWAPD((%3,%0), %%mm1) + "pxor %%mm7, %%mm0 \n" + "movq %%mm1, (%3,%1) \n" + "movq %%mm0, (%2,%0) \n" + "sub $8, %1 \n" + "add $8, %0 \n" + "jl 1b \n" + :"+r"(j), "+r"(k) + :"r"(output+n4), "r"(output+n4*3), + "m"(*m1m1) + ); + __asm__ volatile("femms"); +} +
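
The only Extended 3DNow! instruction the file relies on is pswapd; under EMULATE_3DNOWEXT the PSWAPD macro rebuilds it from movq/psrlq/punpckldq. What that sequence computes, as a scalar sketch (helper name is illustrative):

#include <stdint.h>

/* pswapd swaps the two 32-bit lanes of a 64-bit MMX register. The emulation
 * copies the source, shifts the high lane down, then punpckldq's the
 * original low lane into the high half -- same result. */
static uint64_t pswapd_ref(uint64_t s)
{
    return (s >> 32) | (s << 32);
}
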
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/fft_mmx.asm Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,467 @@ +;****************************************************************************** +;* FFT transform with SSE/3DNow optimizations +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; These functions are not individually interchangeable with the C versions. +; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results +; in blocks as conventient to the vector size. +; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) + +%include "x86inc.asm" + +SECTION_RODATA + +%define M_SQRT1_2 0.70710678118654752440 +ps_root2: times 4 dd M_SQRT1_2 +ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +ps_m1p1: dd 1<<31, 0 + +%assign i 16 +%rep 13 +cextern ff_cos_ %+ i +%assign i i<<1 +%endrep + +%ifdef ARCH_X86_64 + %define pointer dq +%else + %define pointer dd +%endif + +%macro IF0 1+ +%endmacro +%macro IF1 1+ + %1 +%endmacro + +section .text align=16 + +%macro T2_3DN 4 ; z0, z1, mem0, mem1 + mova %1, %3 + mova %2, %1 + pfadd %1, %4 + pfsub %2, %4 +%endmacro + +%macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1 + mova %5, %3 + pfsub %3, %4 + pfadd %5, %4 ; {t6,t5} + pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7} + mova %6, %1 + pswapd %3, %3 + pfadd %1, %5 ; {r0,i0} + pfsub %6, %5 ; {r2,i2} + mova %4, %2 + pfadd %2, %3 ; {r1,i1} + pfsub %4, %3 ; {r3,i3} + SWAP %3, %6 +%endmacro + +; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} +%macro T4_SSE 3 + mova %3, %1 + shufps %1, %2, 0x64 ; {r0,i0,r3,i2} + shufps %3, %2, 0xce ; {r1,i1,r2,i3} + mova %2, %1 + addps %1, %3 ; {t1,t2,t6,t5} + subps %2, %3 ; {t3,t4,t8,t7} + mova %3, %1 + shufps %1, %2, 0x44 ; {t1,t2,t3,t4} + shufps %3, %2, 0xbe ; {t6,t5,t7,t8} + mova %2, %1 + addps %1, %3 ; {r0,i0,r1,i1} + subps %2, %3 ; {r2,i2,r3,i3} + mova %3, %1 + shufps %1, %2, 0x88 ; {r0,r1,r2,r3} + shufps %3, %2, 0xdd ; {i0,i1,i2,i3} + SWAP %2, %3 +%endmacro + +%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 + mova %5, %3 + shufps %3, %4, 0x44 ; {r4,i4,r6,i6} + shufps %5, %4, 0xee ; {r5,i5,r7,i7} + mova %6, %3 + subps %3, %5 ; {r5,i5,r7,i7} + addps %6, %5 ; {t1,t2,t3,t4} + mova %5, %3 + shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} + mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7} + mulps %5, [ps_root2 GLOBAL] + addps %3, %5 ; {t8,t7,ta,t9} + mova %5, %6 + shufps %6, %3, 0x36 ; {t3,t2,t9,t8} + shufps %5, %3, 0x9c ; {t1,t4,t7,ta} + mova %3, %6 + addps %6, %5 ; {t1,t2,t9,ta} + subps %3, %5 ; {t6,t5,tc,tb} + mova %5, %6 + shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} + shufps %5, %3, 0x8d ; {t2,ta,t6,tc} + mova %3, %1 + mova %4, %2 + addps %1, %6 ; {r0,r1,r2,r3} + addps %2, %5 ; {i0,i1,i2,i3} + subps %3, %6 ; 
{r4,r5,r6,r7} + subps %4, %5 ; {i4,i5,i6,i7} +%endmacro + +; scheduled for cpu-bound sizes +%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim +IF%1 mova m4, Z(4) +IF%1 mova m5, Z(5) + mova m0, %2 ; wre + mova m2, m4 + mova m1, %3 ; wim + mova m3, m5 + mulps m2, m0 ; r2*wre +IF%1 mova m6, Z(6) + mulps m3, m1 ; i2*wim +IF%1 mova m7, Z(7) + mulps m4, m1 ; r2*wim + mulps m5, m0 ; i2*wre + addps m2, m3 ; r2*wre + i2*wim + mova m3, m1 + mulps m1, m6 ; r3*wim + subps m5, m4 ; i2*wre - r2*wim + mova m4, m0 + mulps m3, m7 ; i3*wim + mulps m4, m6 ; r3*wre + mulps m0, m7 ; i3*wre + subps m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m1 ; i3*wre + r3*wim + mova m1, m4 + addps m4, m2 ; t5 + subps m1, m2 ; t3 + subps m3, m4 ; r2 + addps m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + mova m3, m5 + subps m5, m0 ; t4 + mova m4, m6 + subps m6, m5 ; r3 + addps m5, m4 ; r1 + mova Z(6), m6 + mova Z(2), m5 + mova m2, Z(3) + addps m3, m0 ; t6 + subps m2, m1 ; i3 + mova m7, Z(1) + addps m1, Z(3) ; i1 + mova Z(7), m2 + mova Z(3), m1 + mova m4, m7 + subps m7, m3 ; i2 + addps m3, m4 ; i0 + mova Z(5), m7 + mova Z(1), m3 +%endmacro + +; scheduled to avoid store->load aliasing +%macro PASS_BIG 1 ; (!interleave) + mova m4, Z(4) ; r2 + mova m5, Z(5) ; i2 + mova m2, m4 + mova m0, [wq] ; wre + mova m3, m5 + mova m1, [wq+o1q] ; wim + mulps m2, m0 ; r2*wre + mova m6, Z(6) ; r3 + mulps m3, m1 ; i2*wim + mova m7, Z(7) ; i3 + mulps m4, m1 ; r2*wim + mulps m5, m0 ; i2*wre + addps m2, m3 ; r2*wre + i2*wim + mova m3, m1 + mulps m1, m6 ; r3*wim + subps m5, m4 ; i2*wre - r2*wim + mova m4, m0 + mulps m3, m7 ; i3*wim + mulps m4, m6 ; r3*wre + mulps m0, m7 ; i3*wre + subps m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m1 ; i3*wre + r3*wim + mova m1, m4 + addps m4, m2 ; t5 + subps m1, m2 ; t3 + subps m3, m4 ; r2 + addps m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + mova m3, m5 + subps m5, m0 ; t4 + mova m4, m6 + subps m6, m5 ; r3 + addps m5, m4 ; r1 +IF%1 mova Z(6), m6 +IF%1 mova Z(2), m5 + mova m2, Z(3) + addps m3, m0 ; t6 + subps m2, m1 ; i3 + mova m7, Z(1) + addps m1, Z(3) ; i1 +IF%1 mova Z(7), m2 +IF%1 mova Z(3), m1 + mova m4, m7 + subps m7, m3 ; i2 + addps m3, m4 ; i0 +IF%1 mova Z(5), m7 +IF%1 mova Z(1), m3 +%if %1==0 + mova m4, m5 ; r1 + mova m0, m6 ; r3 + unpcklps m5, m1 + unpckhps m4, m1 + unpcklps m6, m2 + unpckhps m0, m2 + mova m1, Z(0) + mova m2, Z(4) + mova Z(2), m5 + mova Z(3), m4 + mova Z(6), m6 + mova Z(7), m0 + mova m5, m1 ; r0 + mova m4, m2 ; r2 + unpcklps m1, m3 + unpckhps m5, m3 + unpcklps m2, m7 + unpckhps m4, m7 + mova Z(0), m1 + mova Z(1), m5 + mova Z(4), m2 + mova Z(5), m4 +%endif +%endmacro + +%macro PUNPCK 3 + mova %3, %1 + punpckldq %1, %2 + punpckhdq %3, %2 +%endmacro + +INIT_XMM + +%define Z(x) [r0+mmsize*x] + +align 16 +fft4_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova Z(0), m0 + mova Z(1), m1 + ret + +align 16 +fft8_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + ret + +align 16 +fft16_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova m4, Z(4) + mova m5, Z(5) + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + T4_SSE m4, m5, m6 + mova m6, Z(6) + mova m7, Z(7) + T4_SSE m6, m7, m0 + PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL] + ret + + +INIT_MMX + +%macro FFT48_3DN 1 +align 16 +fft4%1: + T2_3DN m0, m1, 
Z(0), Z(1) + mova m2, Z(2) + mova m3, Z(3) + T4_3DN m0, m1, m2, m3, m4, m5 + PUNPCK m0, m1, m4 + PUNPCK m2, m3, m5 + mova Z(0), m0 + mova Z(1), m4 + mova Z(2), m2 + mova Z(3), m5 + ret + +align 16 +fft8%1: + T2_3DN m0, m1, Z(0), Z(1) + mova m2, Z(2) + mova m3, Z(3) + T4_3DN m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(2), m2 + T2_3DN m4, m5, Z(4), Z(5) + T2_3DN m6, m7, Z(6), Z(7) + pswapd m0, m5 + pswapd m2, m7 + pxor m0, [ps_m1p1 GLOBAL] + pxor m2, [ps_m1p1 GLOBAL] + pfsub m5, m0 + pfadd m7, m2 + pfmul m5, [ps_root2 GLOBAL] + pfmul m7, [ps_root2 GLOBAL] + T4_3DN m1, m3, m5, m7, m0, m2 + mova Z(5), m5 + mova Z(7), m7 + mova m0, Z(0) + mova m2, Z(2) + T4_3DN m0, m2, m4, m6, m5, m7 + PUNPCK m0, m1, m5 + PUNPCK m2, m3, m7 + mova Z(0), m0 + mova Z(1), m5 + mova Z(2), m2 + mova Z(3), m7 + PUNPCK m4, Z(5), m5 + PUNPCK m6, Z(7), m7 + mova Z(4), m4 + mova Z(5), m5 + mova Z(6), m6 + mova Z(7), m7 + ret +%endmacro + +FFT48_3DN _3dn2 + +%macro pswapd 2 +%ifidn %1, %2 + movd [r0+12], %1 + punpckhdq %1, [r0+8] +%else + movq %1, %2 + psrlq %1, 32 + punpckldq %1, %2 +%endif +%endmacro + +FFT48_3DN _3dn + + +%define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)] + +%macro DECL_PASS 2+ ; name, payload +align 16 +%1: +DEFINE_ARGS z, w, n, o1, o3 + lea o3q, [nq*3] + lea o1q, [nq*8] + shl o3q, 4 +.loop: + %2 + add zq, mmsize*2 + add wq, mmsize + sub nd, mmsize/8 + jg .loop + rep ret +%endmacro + +INIT_XMM +DECL_PASS pass_sse, PASS_BIG 1 +DECL_PASS pass_interleave_sse, PASS_BIG 0 + +INIT_MMX +%define mulps pfmul +%define addps pfadd +%define subps pfsub +%define unpcklps punpckldq +%define unpckhps punpckhdq +DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q] +DECL_PASS pass_interleave_3dn, PASS_BIG 0 +%define pass_3dn2 pass_3dn +%define pass_interleave_3dn2 pass_interleave_3dn + + +%macro DECL_FFT 2-3 ; nbits, cpu, suffix +%xdefine list_of_fft fft4%2, fft8%2 +%if %1==5 +%xdefine list_of_fft list_of_fft, fft16%2 +%endif + +%assign n 1<<%1 +%rep 17-%1 +%assign n2 n/2 +%assign n4 n/4 +%xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2 + +align 16 +fft %+ n %+ %3%2: + call fft %+ n2 %+ %2 + add r0, n*4 - (n&(-2<<%1)) + call fft %+ n4 %+ %2 + add r0, n*2 - (n2&(-2<<%1)) + call fft %+ n4 %+ %2 + sub r0, n*6 + (n2&(-2<<%1)) + lea r1, [ff_cos_ %+ n GLOBAL] + mov r2d, n4/2 + jmp pass%3%2 + +%assign n n*2 +%endrep +%undef n + +align 8 +dispatch_tab%3%2: pointer list_of_fft + +; On x86_32, this function does the register saving and restoring for all of fft. +; The others pass args in registers and don't spill anything. +cglobal fft_dispatch%3%2, 2,5,0, z, nbits + lea r2, [dispatch_tab%3%2 GLOBAL] + mov r2, [r2 + (nbitsq-2)*gprsize] + call r2 + RET +%endmacro ; DECL_FFT + +DECL_FFT 5, _sse +DECL_FFT 5, _sse, _interleave +DECL_FFT 4, _3dn +DECL_FFT 4, _3dn, _interleave +DECL_FFT 4, _3dn2 +DECL_FFT 4, _3dn2, _interleave +
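
As the header comment notes, the SSE passes keep intermediate results in blocks of four reals followed by four imaginaries rather than interleaved FFTComplex (pairs, for the 3DNow! passes). A sketch of the corresponding de-blocking step in C, roughly what the unpcklps/unpckhps epilogue of PASS_BIG 0 performs four elements at a time (helper name and struct are illustrative, and n is assumed to be a multiple of 4):

#include <stddef.h>

typedef struct { float re, im; } FFTComplexRef;   /* stand-in for FFTComplex */

/* Convert one blocked buffer ({4x re, 4x im, 4x re, ...}) back to
 * interleaved complex order. */
static void blocked_to_interleaved(const float *blocked, FFTComplexRef *z, size_t n)
{
    size_t i, j;
    for (i = 0; i < n; i += 4) {
        for (j = 0; j < 4; j++) {
            z[i + j].re = blocked[2*i + j];       /* block of 4 reals       */
            z[i + j].im = blocked[2*i + 4 + j];   /* block of 4 imaginaries */
        }
    }
}
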
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/fft_sse.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,202 @@ +/* + * FFT/MDCT transform with SSE optimizations + * Copyright (c) 2008 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" + +static const int m1m1m1m1[4] __attribute__((aligned(16))) = + { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; + +void ff_fft_dispatch_sse(FFTComplex *z, int nbits); +void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); + +void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) +{ + int n = 1 << s->nbits; + + ff_fft_dispatch_interleave_sse(z, s->nbits); + + if(n <= 16) { + x86_reg i = -8*n; + __asm__ volatile( + "1: \n" + "movaps (%0,%1), %%xmm0 \n" + "movaps %%xmm0, %%xmm1 \n" + "unpcklps 16(%0,%1), %%xmm0 \n" + "unpckhps 16(%0,%1), %%xmm1 \n" + "movaps %%xmm0, (%0,%1) \n" + "movaps %%xmm1, 16(%0,%1) \n" + "add $32, %0 \n" + "jl 1b \n" + :"+r"(i) + :"r"(z+n) + :"memory" + ); + } +} + +void ff_fft_permute_sse(FFTContext *s, FFTComplex *z) +{ + int n = 1 << s->nbits; + int i; + for(i=0; i<n; i+=2) { + __asm__ volatile( + "movaps %2, %%xmm0 \n" + "movlps %%xmm0, %0 \n" + "movhps %%xmm0, %1 \n" + :"=m"(s->tmp_buf[s->revtab[i]]), + "=m"(s->tmp_buf[s->revtab[i+1]]) + :"m"(z[i]) + ); + } + memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); +} + +void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input) +{ + av_unused x86_reg i, j, k, l; + long n = 1 << s->nbits; + long n2 = n >> 1; + long n4 = n >> 2; + long n8 = n >> 3; + const uint16_t *revtab = s->fft.revtab + n8; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + FFTComplex *z = (FFTComplex *)output; + + /* pre rotation */ + for(k=n8-2; k>=0; k-=2) { + __asm__ volatile( + "movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1].re, z[k+1].im } + "movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } + "movaps %%xmm0, %%xmm2 \n" + "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } + "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } + "movlps (%3,%1), %%xmm4 \n" + "movlps (%4,%1), %%xmm5 \n" + "movhps -8(%3,%0), %%xmm4 \n" // { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } + "movhps -8(%4,%0), %%xmm5 \n" // { sin[k], sin[k+1], sin[-k-2], sin[-k-1] } + "movaps %%xmm0, %%xmm2 \n" + "movaps %%xmm1, %%xmm3 \n" + "mulps %%xmm5, %%xmm0 \n" // re*sin + "mulps %%xmm4, %%xmm1 \n" // im*cos + "mulps %%xmm4, %%xmm2 \n" // re*cos + "mulps %%xmm5, %%xmm3 \n" // im*sin + "subps %%xmm0, %%xmm1 \n" // -> re + "addps %%xmm3, %%xmm2 \n" // -> im + "movaps %%xmm1, %%xmm0 \n" + "unpcklps %%xmm2, %%xmm1 \n" // { z[k], z[k+1] } + "unpckhps %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] } + ::"r"(-4*k), "r"(4*k), 
+ "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8) + ); +#ifdef ARCH_X86_64 + // if we have enough regs, don't let gcc make the luts latency-bound + // but if not, latency is faster than spilling + __asm__("movlps %%xmm0, %0 \n" + "movhps %%xmm0, %1 \n" + "movlps %%xmm1, %2 \n" + "movhps %%xmm1, %3 \n" + :"=m"(z[revtab[-k-2]]), + "=m"(z[revtab[-k-1]]), + "=m"(z[revtab[ k ]]), + "=m"(z[revtab[ k+1]]) + ); +#else + __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]])); + __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]])); + __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]])); + __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]])); +#endif + } + + ff_fft_dispatch_sse(z, s->fft.nbits); + + /* post rotation + reinterleave + reorder */ + +#define CMUL(j,xmm0,xmm1)\ + "movaps (%2,"#j",2), %%xmm6 \n"\ + "movaps 16(%2,"#j",2), "#xmm0"\n"\ + "movaps %%xmm6, "#xmm1"\n"\ + "movaps "#xmm0",%%xmm7 \n"\ + "mulps (%3,"#j"), %%xmm6 \n"\ + "mulps (%4,"#j"), "#xmm0"\n"\ + "mulps (%4,"#j"), "#xmm1"\n"\ + "mulps (%3,"#j"), %%xmm7 \n"\ + "subps %%xmm6, "#xmm0"\n"\ + "addps %%xmm7, "#xmm1"\n" + + j = -n2; + k = n2-16; + __asm__ volatile( + "1: \n" + CMUL(%0, %%xmm0, %%xmm1) + CMUL(%1, %%xmm4, %%xmm5) + "shufps $0x1b, %%xmm1, %%xmm1 \n" + "shufps $0x1b, %%xmm5, %%xmm5 \n" + "movaps %%xmm4, %%xmm6 \n" + "unpckhps %%xmm1, %%xmm4 \n" + "unpcklps %%xmm1, %%xmm6 \n" + "movaps %%xmm0, %%xmm2 \n" + "unpcklps %%xmm5, %%xmm0 \n" + "unpckhps %%xmm5, %%xmm2 \n" + "movaps %%xmm6, (%2,%1,2) \n" + "movaps %%xmm4, 16(%2,%1,2) \n" + "movaps %%xmm0, (%2,%0,2) \n" + "movaps %%xmm2, 16(%2,%0,2) \n" + "sub $16, %1 \n" + "add $16, %0 \n" + "jl 1b \n" + :"+&r"(j), "+&r"(k) + :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) + :"memory" + ); +} + +void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input) +{ + x86_reg j, k; + long n = 1 << s->nbits; + long n4 = n >> 2; + + ff_imdct_half_sse(s, output+n4, input); + + j = -n; + k = n-16; + __asm__ volatile( + "movaps %4, %%xmm7 \n" + "1: \n" + "movaps (%2,%1), %%xmm0 \n" + "movaps (%3,%0), %%xmm1 \n" + "shufps $0x1b, %%xmm0, %%xmm0 \n" + "shufps $0x1b, %%xmm1, %%xmm1 \n" + "xorps %%xmm7, %%xmm0 \n" + "movaps %%xmm1, (%3,%1) \n" + "movaps %%xmm0, (%2,%0) \n" + "sub $16, %1 \n" + "add $16, %0 \n" + "jl 1b \n" + :"+r"(j), "+r"(k) + :"r"(output+n4), "r"(output+n4*3), + "m"(*m1m1m1m1) + ); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/flacdsp_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,139 @@ +/* + * MMX optimized FLAC DSP utils + * Copyright (c) 2007 Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "dsputil_mmx.h" + +static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data) +{ + double c = 2.0 / (len-1.0); + int n2 = len>>1; + x86_reg i = -n2*sizeof(int32_t); + x86_reg j = n2*sizeof(int32_t); + __asm__ volatile( + "movsd %0, %%xmm7 \n\t" + "movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t" + "movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t" + "movlhps %%xmm7, %%xmm7 \n\t" + "subpd %%xmm5, %%xmm7 \n\t" + "addsd %%xmm6, %%xmm7 \n\t" + ::"m"(c) + ); +#define WELCH(MOVPD, offset)\ + __asm__ volatile(\ + "1: \n\t"\ + "movapd %%xmm7, %%xmm1 \n\t"\ + "mulpd %%xmm1, %%xmm1 \n\t"\ + "movapd %%xmm6, %%xmm0 \n\t"\ + "subpd %%xmm1, %%xmm0 \n\t"\ + "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ + "cvtpi2pd (%3,%0), %%xmm2 \n\t"\ + "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\ + "mulpd %%xmm0, %%xmm2 \n\t"\ + "mulpd %%xmm1, %%xmm3 \n\t"\ + "movapd %%xmm2, (%2,%0,2) \n\t"\ + MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\ + "subpd %%xmm5, %%xmm7 \n\t"\ + "sub $8, %1 \n\t"\ + "add $8, %0 \n\t"\ + "jl 1b \n\t"\ + :"+&r"(i), "+&r"(j)\ + :"r"(w_data+n2), "r"(data+n2)\ + ); + if(len&1) + WELCH("movupd", -1) + else + WELCH("movapd", -2) +#undef WELCH +} + +void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag, + double *autoc) +{ + double tmp[len + lag + 2]; + double *data1 = tmp + lag; + int j; + + if((x86_reg)data1 & 15) + data1++; + + apply_welch_window_sse2(data, len, data1); + + for(j=0; j<lag; j++) + data1[j-lag]= 0.0; + data1[len] = 0.0; + + for(j=0; j<lag; j+=2){ + x86_reg i = -len*sizeof(double); + if(j == lag-2) { + __asm__ volatile( + "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t" + "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t" + "movsd "MANGLE(ff_pd_1)", %%xmm2 \n\t" + "1: \n\t" + "movapd (%4,%0), %%xmm3 \n\t" + "movupd -8(%5,%0), %%xmm4 \n\t" + "movapd (%5,%0), %%xmm5 \n\t" + "mulpd %%xmm3, %%xmm4 \n\t" + "mulpd %%xmm3, %%xmm5 \n\t" + "mulpd -16(%5,%0), %%xmm3 \n\t" + "addpd %%xmm4, %%xmm1 \n\t" + "addpd %%xmm5, %%xmm0 \n\t" + "addpd %%xmm3, %%xmm2 \n\t" + "add $16, %0 \n\t" + "jl 1b \n\t" + "movhlps %%xmm0, %%xmm3 \n\t" + "movhlps %%xmm1, %%xmm4 \n\t" + "movhlps %%xmm2, %%xmm5 \n\t" + "addsd %%xmm3, %%xmm0 \n\t" + "addsd %%xmm4, %%xmm1 \n\t" + "addsd %%xmm5, %%xmm2 \n\t" + "movsd %%xmm0, %1 \n\t" + "movsd %%xmm1, %2 \n\t" + "movsd %%xmm2, %3 \n\t" + :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2]) + :"r"(data1+len), "r"(data1+len-j) + ); + } else { + __asm__ volatile( + "movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t" + "movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t" + "1: \n\t" + "movapd (%3,%0), %%xmm3 \n\t" 
+ "movupd -8(%4,%0), %%xmm4 \n\t" + "mulpd %%xmm3, %%xmm4 \n\t" + "mulpd (%4,%0), %%xmm3 \n\t" + "addpd %%xmm4, %%xmm1 \n\t" + "addpd %%xmm3, %%xmm0 \n\t" + "add $16, %0 \n\t" + "jl 1b \n\t" + "movhlps %%xmm0, %%xmm3 \n\t" + "movhlps %%xmm1, %%xmm4 \n\t" + "addsd %%xmm3, %%xmm0 \n\t" + "addsd %%xmm4, %%xmm1 \n\t" + "movsd %%xmm0, %1 \n\t" + "movsd %%xmm1, %2 \n\t" + :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) + :"r"(data1+len), "r"(data1+len-j) + ); + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/h264_deblock_sse2.asm Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,747 @@ +;***************************************************************************** +;* deblock-a.asm: h264 encoder library +;***************************************************************************** +;* Copyright (C) 2005-2008 x264 project +;* +;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA +pb_00: times 16 db 0x00 +pb_01: times 16 db 0x01 +pb_03: times 16 db 0x03 +pb_a1: times 16 db 0xa1 + +SECTION .text + +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base+stride], [base+stride*2], [base3], \ + [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] + +; in: 8 rows of 4 bytes in %1..%8 +; out: 4 rows of 8 bytes in m0..m3 +%macro TRANSPOSE4x8_LOAD 8 + movd m0, %1 + movd m2, %2 + movd m1, %3 + movd m3, %4 + punpcklbw m0, m2 + punpcklbw m1, m3 + movq m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + + movd m4, %5 + movd m6, %6 + movd m5, %7 + movd m7, %8 + punpcklbw m4, m6 + punpcklbw m5, m7 + movq m6, m4 + punpcklwd m4, m5 + punpckhwd m6, m5 + + movq m1, m0 + movq m3, m2 + punpckldq m0, m4 + punpckhdq m1, m4 + punpckldq m2, m6 + punpckhdq m3, m6 +%endmacro + +; in: 4 rows of 8 bytes in m0..m3 +; out: 8 rows of 4 bytes in %1..%8 +%macro TRANSPOSE8x4_STORE 8 + movq m4, m0 + movq m5, m1 + movq m6, m2 + punpckhdq m4, m4 + punpckhdq m5, m5 + punpckhdq m6, m6 + + punpcklbw m0, m1 + punpcklbw m2, m3 + movq m1, m0 + punpcklwd m0, m2 + punpckhwd m1, m2 + movd %1, m0 + punpckhdq m0, m0 + movd %2, m0 + movd %3, m1 + punpckhdq m1, m1 + movd %4, m1 + + punpckhdq m3, m3 + punpcklbw m4, m5 + punpcklbw m6, m3 + movq m5, m4 + punpcklwd m4, m6 + punpckhwd m5, m6 + movd %5, m4 + punpckhdq m4, m4 + movd %6, m4 + movd %7, m5 + punpckhdq m5, m5 + movd %8, m5 +%endmacro + +%macro SBUTTERFLY 4 + movq %4, %2 + punpckl%1 %2, %3 + punpckh%1 %4, %3 +%endmacro + +; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 +; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] +%macro TRANSPOSE6x8_MEM 9 + movq m0, %1 + movq m1, %2 + movq m2, %3 + movq m3, %4 + movq m4, %5 + movq m5, %6 + movq m6, %7 + SBUTTERFLY bw, m0, m1, m7 + SBUTTERFLY bw, m2, m3, m1 + SBUTTERFLY bw, m4, m5, m3 + movq [%9+0x10], m1 + SBUTTERFLY bw, m6, %8, m5 + SBUTTERFLY wd, m0, m2, m1 + SBUTTERFLY wd, m4, m6, m2 + punpckhdq m0, m4 + movq [%9+0x00], m0 + SBUTTERFLY wd, m7, [%9+0x10], m6 + SBUTTERFLY wd, m3, m5, m4 + SBUTTERFLY dq, m7, m3, m0 + SBUTTERFLY dq, m1, m2, m5 + punpckldq m6, m4 + movq [%9+0x10], m1 + movq [%9+0x20], m5 + movq [%9+0x30], m7 + movq [%9+0x40], m0 + movq [%9+0x50], m6 +%endmacro + +; in: 8 rows of 8 in %1..%8 +; out: 8 rows of 8 in %9..%16 +%macro TRANSPOSE8x8_MEM 16 + movq m0, %1 + movq m1, %2 + movq m2, %3 + movq m3, %4 + movq m4, %5 + movq m5, %6 + movq m6, %7 + SBUTTERFLY bw, m0, m1, m7 + SBUTTERFLY bw, m2, m3, m1 + SBUTTERFLY bw, m4, m5, m3 + SBUTTERFLY bw, m6, %8, m5 + movq %9, m3 + SBUTTERFLY wd, m0, m2, m3 + SBUTTERFLY wd, m4, m6, m2 + SBUTTERFLY wd, m7, m1, m6 + movq %11, m2 + movq m2, %9 + SBUTTERFLY wd, m2, m5, m1 + SBUTTERFLY dq, m0, m4, m5 + SBUTTERFLY dq, m7, m2, m4 + movq %9, m0 + movq %10, m5 + movq %13, m7 + movq %14, m4 + SBUTTERFLY dq, m3, %11, m0 + SBUTTERFLY dq, m6, m1, m5 + movq %11, m3 + movq %12, m0 + movq %15, m6 + movq %16, m5 +%endmacro + +; out: %4 = |%1-%2|>%3 +; clobbers: %5 +%macro DIFF_GT 5 + mova %5, %2 + mova %4, %1 + psubusb %5, %1 + psubusb %4, %2 + por %4, %5 + psubusb %4, %3 +%endmacro + +; out: %4 = |%1-%2|>%3 +; clobbers: %5 +%macro DIFF_GT2 5 + mova %5, %2 + mova %4, %1 + psubusb %5, %1 + psubusb %4, %2 + psubusb %5, %3 + psubusb %4, %3 + pcmpeqb %4, %5 +%endmacro + +%macro SPLATW 1 +%ifidn m0, xmm0 + pshuflw %1, %1, 0 + punpcklqdq %1, %1 +%else + pshufw %1, %1, 0 +%endif +%endmacro + +; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 +; out: m5=beta-1, m7=mask, %3=alpha-1 +; clobbers: m4,m6 +%macro LOAD_MASK 2-3 + movd m4, %1 + movd m5, %2 + SPLATW m4 + SPLATW m5 + packuswb m4, m4 ; 16x alpha-1 + packuswb m5, m5 ; 16x beta-1 +%if %0>2 + mova %3, m4 +%endif + DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 + DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 + por m7, m4 + DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 + por m7, m4 + pxor m6, m6 + pcmpeqb m7, m6 +%endmacro + +; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) +; out: m1=p0' m2=q0' +; clobbers: m0,3-6 +%macro DEBLOCK_P0_Q0 0 + mova m5, m1 + pxor m5, m2 ; p0^q0 + pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 + pcmpeqb m4, m4 + pxor m3, m4 + pavgb m3, m0 ; (p1 - q1 + 256)>>1 + pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 + pxor m4, m1 + pavgb m4, m2 ; (q0 - p0 + 256)>>1 + pavgb m3, m5 + paddusb m3, m4 ; d+128+33 + mova m6, [pb_a1 GLOBAL] + psubusb m6, m3 + psubusb m3, [pb_a1 GLOBAL] + pminub m6, m7 + pminub m3, m7 + psubusb m1, m6 + psubusb m2, m3 + paddusb m1, m3 + paddusb m2, m6 +%endmacro + +; in: m1=p0 m2=q0 +; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp +; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) +; clobbers: q2, tmp, tc0 +%macro LUMA_Q1 6 + mova %6, m1 + pavgb %6, m2 + pavgb %2, %6 ; avg(p2,avg(p0,q0)) + pxor %6, %3 + pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 + psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 + mova %6, %1 + psubusb %6, %5 + paddusb %5, %1 + pmaxub %2, %6 + pminub %2, %5 + mova %4, %2 +%endmacro + +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 
+;----------------------------------------------------------------------------- +INIT_XMM +cglobal x264_deblock_v_luma_sse2 + movd m8, [r4] ; tc0 + lea r4, [r1*3] + dec r2d ; alpha-1 + neg r4 + dec r3d ; beta-1 + add r4, r0 ; pix-3*stride + + mova m0, [r4+r1] ; p1 + mova m1, [r4+2*r1] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 + LOAD_MASK r2d, r3d + + punpcklbw m8, m8 + punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] + pcmpeqb m9, m9 + pcmpeqb m9, m8 + pandn m9, m7 + pand m8, m9 + + movdqa m3, [r4] ; p2 + DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 + pand m6, m9 + mova m7, m8 + psubb m7, m6 + pand m6, m8 + LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 + + movdqa m4, [r0+2*r1] ; q2 + DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 + pand m6, m9 + pand m8, m6 + psubb m7, m6 + mova m3, [r0+r1] + LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 + + DEBLOCK_P0_Q0 + mova [r4+2*r1], m1 + mova [r0], m2 + ret + +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +INIT_MMX +cglobal x264_deblock_h_luma_sse2 + movsxd r10, esi + lea r11, [r10+r10*2] + lea rax, [r0-4] + lea r9, [r0-4+r11] + sub rsp, 0x68 + %define pix_tmp rsp + + ; transpose 6x16 -> tmp space + TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp + lea rax, [rax+r10*8] + lea r9, [r9 +r10*8] + TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8 + + ; vertical filter + ; alpha, beta, tc0 are still in r2d, r3d, r4 + ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + lea r0, [pix_tmp+0x30] + mov esi, 0x10 + call x264_deblock_v_luma_sse2 + + ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) + add rax, 2 + add r9, 2 + movq m0, [pix_tmp+0x18] + movq m1, [pix_tmp+0x28] + movq m2, [pix_tmp+0x38] + movq m3, [pix_tmp+0x48] + TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + + shl r10, 3 + sub rax, r10 + sub r9, r10 + shr r10, 3 + movq m0, [pix_tmp+0x10] + movq m1, [pix_tmp+0x20] + movq m2, [pix_tmp+0x30] + movq m3, [pix_tmp+0x40] + TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11) + + add rsp, 0x68 + ret + +%else + +%macro DEBLOCK_LUMA 3 +;----------------------------------------------------------------------------- +; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_%2_luma_%1, 5,5 + lea r4, [r1*3] + dec r2 ; alpha-1 + neg r4 + dec r3 ; beta-1 + add r4, r0 ; pix-3*stride + %assign pad 2*%3+12-(stack_offset&15) + SUB esp, pad + + mova m0, [r4+r1] ; p1 + mova m1, [r4+2*r1] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 + LOAD_MASK r2, r3 + + mov r3, r4m + movd m4, [r3] ; tc0 + punpcklbw m4, m4 + punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] + mova [esp+%3], m4 ; tc + pcmpeqb m3, m3 + pcmpgtb m4, m3 + pand m4, m7 + mova [esp], m4 ; mask + + mova m3, [r4] ; p2 + DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 + pand m6, m4 + pand m4, [esp+%3] ; tc + mova m7, m4 + psubb m7, m6 + pand m6, m4 + LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 + + mova m4, [r0+2*r1] ; q2 + DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 + mova m5, [esp] ; mask + pand m6, m5 + mova m5, [esp+%3] ; tc + pand m5, m6 + psubb m7, m6 + mova m3, [r0+r1] + LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 + + DEBLOCK_P0_Q0 + mova [r4+2*r1], m1 
+ mova [r0], m2 + ADD esp, pad + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +INIT_MMX +cglobal x264_deblock_h_luma_%1, 0,5 + mov r0, r0m + mov r3, r1m + lea r4, [r3*3] + sub r0, 4 + lea r1, [r0+r4] + %assign pad 0x78-(stack_offset&15) + SUB esp, pad +%define pix_tmp esp+12 + + ; transpose 6x16 -> tmp space + TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp + lea r0, [r0+r3*8] + lea r1, [r1+r3*8] + TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 + + ; vertical filter + lea r0, [pix_tmp+0x30] + PUSH dword r4m + PUSH dword r3m + PUSH dword r2m + PUSH dword 16 + PUSH dword r0 + call x264_deblock_%2_luma_%1 +%ifidn %2, v8 + add dword [esp ], 8 ; pix_tmp+0x38 + add dword [esp+16], 2 ; tc0+2 + call x264_deblock_%2_luma_%1 +%endif + ADD esp, 20 + + ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) + mov r0, r0m + sub r0, 2 + lea r1, [r0+r4] + + movq m0, [pix_tmp+0x10] + movq m1, [pix_tmp+0x20] + movq m2, [pix_tmp+0x30] + movq m3, [pix_tmp+0x40] + TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + + lea r0, [r0+r3*8] + lea r1, [r1+r3*8] + movq m0, [pix_tmp+0x18] + movq m1, [pix_tmp+0x28] + movq m2, [pix_tmp+0x38] + movq m3, [pix_tmp+0x48] + TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + + ADD esp, pad + RET +%endmacro ; DEBLOCK_LUMA + +INIT_XMM +DEBLOCK_LUMA sse2, v, 16 + +%endif ; ARCH + + + +%macro LUMA_INTRA_P012 4 ; p0..p3 in memory + mova t0, p2 + mova t1, p0 + pavgb t0, p1 + pavgb t1, q0 + pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 + mova t5, t1 + mova t2, p2 + mova t3, p0 + paddb t2, p1 + paddb t3, q0 + paddb t2, t3 + mova t3, t2 + mova t4, t2 + psrlw t2, 1 + pavgb t2, mpb_00 + pxor t2, t0 + pand t2, mpb_01 + psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; + + mova t1, p2 + mova t2, p2 + pavgb t1, q1 + psubb t2, q1 + paddb t3, t3 + psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 + pand t2, mpb_01 + psubb t1, t2 + pavgb t1, p1 + pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 + psrlw t3, 2 + pavgb t3, mpb_00 + pxor t3, t1 + pand t3, mpb_01 + psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 + + mova t3, p0 + mova t2, p0 + pxor t3, q1 + pavgb t2, q1 + pand t3, mpb_01 + psubb t2, t3 + pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 + + pxor t1, t2 + pxor t2, p0 + pand t1, mask1p + pand t2, mask0 + pxor t1, t2 + pxor t1, p0 + mova %1, t1 ; store p0 + + mova t1, %4 ; p3 + mova t2, t1 + pavgb t1, p2 + paddb t2, p2 + pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 + paddb t2, t2 + paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 + psrlw t2, 2 + pavgb t2, mpb_00 + pxor t2, t1 + pand t2, mpb_01 + psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 + + pxor t0, p1 + pxor t1, p2 + pand t0, mask1p + pand t1, mask1p + pxor t0, p1 + pxor t1, p2 + mova %2, t0 ; store p1 + mova %3, t1 ; store p2 +%endmacro + +%macro LUMA_INTRA_SWAP_PQ 0 + %define q1 m0 + %define q0 m1 + %define p0 m2 + %define p1 m3 + %define p2 q2 + %define mask1p mask1q +%endmacro + +%macro DEBLOCK_LUMA_INTRA 2 + %define p1 m0 + %define p0 m1 + %define q0 m2 + %define q1 m3 + %define t0 m4 + %define t1 m5 + %define t2 m6 + %define t3 m7 +%ifdef ARCH_X86_64 + %define p2 m8 + %define q2 m9 + %define t4 m10 + %define t5 m11 + %define mask0 m12 + %define mask1p m13 + %define mask1q [rsp-24] + %define mpb_00 m14 + %define mpb_01 m15 +%else + %define spill(x) [esp+16*x+((stack_offset+4)&15)] + %define p2 [r4+r1] + %define 
q2 [r0+2*r1] + %define t4 spill(0) + %define t5 spill(1) + %define mask0 spill(2) + %define mask1p spill(3) + %define mask1q spill(4) + %define mpb_00 [pb_00 GLOBAL] + %define mpb_01 [pb_01 GLOBAL] +%endif + +;----------------------------------------------------------------------------- +; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_%2_luma_intra_%1, 4,6 +%ifndef ARCH_X86_64 + sub esp, 0x60 +%endif + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + dec r2d ; alpha-1 + jl .end + neg r4 + dec r3d ; beta-1 + jl .end + add r4, r0 ; pix-4*stride + mova p1, [r4+2*r1] + mova p0, [r4+r5] + mova q0, [r0] + mova q1, [r0+r1] +%ifdef ARCH_X86_64 + pxor mpb_00, mpb_00 + mova mpb_01, [pb_01 GLOBAL] + LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 + SWAP 7, 12 ; m12=mask0 + pavgb t5, mpb_00 + pavgb t5, mpb_01 ; alpha/4+1 + movdqa p2, [r4+r1] + movdqa q2, [r0+2*r1] + DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 + DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 + DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 + pand t0, mask0 + pand t4, t0 + pand t2, t0 + mova mask1q, t4 + mova mask1p, t2 +%else + LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 + mova m4, t5 + mova mask0, m7 + pavgb m4, [pb_00 GLOBAL] + pavgb m4, [pb_01 GLOBAL] ; alpha/4+1 + DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 + pand m6, mask0 + DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 + pand m4, m6 + mova mask1p, m4 + DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 + pand m4, m6 + mova mask1q, m4 +%endif + LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] + LUMA_INTRA_SWAP_PQ + LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] +.end: +%ifndef ARCH_X86_64 + add esp, 0x60 +%endif + RET + +INIT_MMX +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_h_luma_intra_%1 + movsxd r10, r1d + lea r11, [r10*3] + lea rax, [r0-4] + lea r9, [r0-4+r11] + sub rsp, 0x88 + %define pix_tmp rsp + + ; transpose 8x16 -> tmp space + TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea rax, [rax+r10*8] + lea r9, [r9+r10*8] + TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + + lea r0, [pix_tmp+0x40] + mov r1, 0x10 + call x264_deblock_v_luma_intra_%1 + + ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) + lea r9, [rax+r11] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) + shl r10, 3 + sub rax, r10 + sub r9, r10 + shr r10, 3 + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11) + add rsp, 0x88 + ret +%else +cglobal x264_deblock_h_luma_intra_%1, 2,4 + lea r3, [r1*3] + sub r0, 4 + lea r2, [r0+r3] +%assign pad 0x8c-(stack_offset&15) + SUB rsp, pad + %define pix_tmp rsp + + ; transpose 8x16 -> tmp space + TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r0, [r0+r1*8] + lea r2, [r2+r1*8] + TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + + lea r0, [pix_tmp+0x40] + PUSH dword r3m + PUSH dword r2m + PUSH dword 16 + PUSH r0 + call 
x264_deblock_%2_luma_intra_%1 +%ifidn %2, v8 + add dword [rsp], 8 ; pix_tmp+8 + call x264_deblock_%2_luma_intra_%1 +%endif + ADD esp, 16 + + mov r1, r1m + mov r0, r0m + lea r3, [r1*3] + sub r0, 4 + lea r2, [r0+r3] + ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) + lea r0, [r0+r1*8] + lea r2, [r2+r1*8] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) + ADD rsp, pad + RET +%endif ; ARCH_X86_64 +%endmacro ; DEBLOCK_LUMA_INTRA + +INIT_XMM +DEBLOCK_LUMA_INTRA sse2, v +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_LUMA_INTRA mmxext, v8 +%endif
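The DEBLOCK_P0_Q0 macro above evaluates the H.264 normal-filter edge update using pavgb byte averages. In plain arithmetic the same update looks roughly like this (the clip helper and names are assumptions of this sketch):

```c
#include <stdint.h>

/* Scalar sketch of the p0/q0 update done by DEBLOCK_P0_Q0; the clip helper
 * and the function name are assumptions of this sketch. */
static int clip3(int v, int lo, int hi) { return v < lo ? lo : v > hi ? hi : v; }

static void deblock_p0_q0_sketch(uint8_t *p0, uint8_t *q0,
                                 int p1, int q1, int tc)
{
    int delta = clip3((((*q0 - *p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc);
    *p0 = clip3(*p0 + delta, 0, 255);
    *q0 = clip3(*q0 - delta, 0, 255);
}
```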
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/h264_i386.h Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,155 @@ +/* + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file h264_i386.h + * H.264 / AVC / MPEG4 part10 codec. + * non-MMX i386-specific optimizations for H.264 + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#ifndef AVCODEC_X86_H264_I386_H +#define AVCODEC_X86_H264_I386_H + +#include "libavcodec/cabac.h" + +//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet +//as that would make optimization work hard) +#if defined(ARCH_X86) && defined(HAVE_7REGS) && \ + defined(HAVE_EBX_AVAILABLE) && \ + !defined(BROKEN_RELOCATIONS) +static int decode_significance_x86(CABACContext *c, int max_coeff, + uint8_t *significant_coeff_ctx_base, + int *index){ + void *end= significant_coeff_ctx_base + max_coeff - 1; + int minusstart= -(int)significant_coeff_ctx_base; + int minusindex= 4-(int)index; + int coeff_count; + __asm__ volatile( + "movl "RANGE "(%3), %%esi \n\t" + "movl "LOW "(%3), %%ebx \n\t" + + "2: \n\t" + + BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx", + "%%bx", "%%esi", "%%eax", "%%al") + + "test $1, %%edx \n\t" + " jz 3f \n\t" + + BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", + "%%bx", "%%esi", "%%eax", "%%al") + + "mov %2, %%"REG_a" \n\t" + "movl %4, %%ecx \n\t" + "add %1, %%"REG_c" \n\t" + "movl %%ecx, (%%"REG_a") \n\t" + + "test $1, %%edx \n\t" + " jnz 4f \n\t" + + "add $4, %%"REG_a" \n\t" + "mov %%"REG_a", %2 \n\t" + + "3: \n\t" + "add $1, %1 \n\t" + "cmp %5, %1 \n\t" + " jb 2b \n\t" + "mov %2, %%"REG_a" \n\t" + "movl %4, %%ecx \n\t" + "add %1, %%"REG_c" \n\t" + "movl %%ecx, (%%"REG_a") \n\t" + "4: \n\t" + "add %6, %%eax \n\t" + "shr $2, %%eax \n\t" + + "movl %%esi, "RANGE "(%3) \n\t" + "movl %%ebx, "LOW "(%3) \n\t" + :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index) + :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex) + : "%"REG_c, "%ebx", "%edx", "%esi", "memory" + ); + return coeff_count; +} + +static int decode_significance_8x8_x86(CABACContext *c, + uint8_t *significant_coeff_ctx_base, + int *index, const uint8_t *sig_off){ + int minusindex= 4-(int)index; + int coeff_count; + x86_reg last=0; + __asm__ volatile( + "movl "RANGE "(%3), %%esi \n\t" + "movl "LOW "(%3), %%ebx \n\t" + + "mov %1, %%"REG_D" \n\t" + "2: \n\t" + + "mov %6, %%"REG_a" \n\t" + "movzbl (%%"REG_a", %%"REG_D"), %%edi \n\t" + "add %5, %%"REG_D" \n\t" + + BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx", + "%%bx", "%%esi", "%%eax", "%%al") + + "mov %1, %%edi \n\t" + "test $1, %%edx \n\t" + " jz 3f \n\t" + + "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t" + "add %5, %%"REG_D" \n\t" 
+ + BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx", + "%%bx", "%%esi", "%%eax", "%%al") + + "mov %2, %%"REG_a" \n\t" + "mov %1, %%edi \n\t" + "movl %%edi, (%%"REG_a") \n\t" + + "test $1, %%edx \n\t" + " jnz 4f \n\t" + + "add $4, %%"REG_a" \n\t" + "mov %%"REG_a", %2 \n\t" + + "3: \n\t" + "addl $1, %%edi \n\t" + "mov %%edi, %1 \n\t" + "cmpl $63, %%edi \n\t" + " jb 2b \n\t" + "mov %2, %%"REG_a" \n\t" + "movl %%edi, (%%"REG_a") \n\t" + "4: \n\t" + "addl %4, %%eax \n\t" + "shr $2, %%eax \n\t" + + "movl %%esi, "RANGE "(%3) \n\t" + "movl %%ebx, "LOW "(%3) \n\t" + :"=&a"(coeff_count),"+m"(last), "+m"(index) + :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off) + : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory" + ); + return coeff_count; +} +#endif /* defined(ARCH_X86) && defined(HAVE_7REGS) && */ + /* defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) */ + +#endif /* AVCODEC_X86_H264_I386_H */
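decode_significance_x86 and decode_significance_8x8_x86 above walk the significance map with a branchless CABAC decode. A plain-C outline of the scan they implement (decode_bin, CABACSketch, and the context pointers are stand-ins, not the real cabac.h entry points):

```c
#include <stdint.h>

typedef struct CABACSketch CABACSketch;  /* opaque stand-in for CABACContext */

/* Plain-C outline of the significance scan the asm performs branchlessly.
 * decode_bin() stands in for the CABAC bin decoder (get_cabac in cabac.h);
 * sig_ctx/last_ctx are the significant_coeff_flag and
 * last_significant_coeff_flag context arrays. */
static int decode_significance_sketch(CABACSketch *c, int max_coeff,
                                      uint8_t *sig_ctx, uint8_t *last_ctx,
                                      int *index,
                                      int (*decode_bin)(CABACSketch *, uint8_t *))
{
    int i, coeff_count = 0;

    for (i = 0; i < max_coeff - 1; i++) {
        if (decode_bin(c, sig_ctx + i)) {
            index[coeff_count++] = i;
            if (decode_bin(c, last_ctx + i))
                return coeff_count;          /* last_coeff_flag ends the scan */
        }
    }
    index[coeff_count++] = max_coeff - 1;    /* last position is implicitly significant */
    return coeff_count;
}
```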
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/h264dsp_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,2208 @@ +/* + * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dsputil_mmx.h" + +DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; +DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; + +/***********************************/ +/* IDCT */ + +#define SUMSUB_BADC( a, b, c, d ) \ + "paddw "#b", "#a" \n\t"\ + "paddw "#d", "#c" \n\t"\ + "paddw "#b", "#b" \n\t"\ + "paddw "#d", "#d" \n\t"\ + "psubw "#a", "#b" \n\t"\ + "psubw "#c", "#d" \n\t" + +#define SUMSUBD2_AB( a, b, t ) \ + "movq "#b", "#t" \n\t"\ + "psraw $1 , "#b" \n\t"\ + "paddw "#a", "#b" \n\t"\ + "psraw $1 , "#a" \n\t"\ + "psubw "#t", "#a" \n\t" + +#define IDCT4_1D( s02, s13, d02, d13, t ) \ + SUMSUB_BA ( s02, d02 )\ + SUMSUBD2_AB( s13, d13, t )\ + SUMSUB_BADC( d13, s02, s13, d02 ) + +#define STORE_DIFF_4P( p, t, z ) \ + "psraw $6, "#p" \n\t"\ + "movd (%0), "#t" \n\t"\ + "punpcklbw "#z", "#t" \n\t"\ + "paddsw "#t", "#p" \n\t"\ + "packuswb "#z", "#p" \n\t"\ + "movd "#p", (%0) \n\t" + +static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) +{ + /* Load dct coeffs */ + __asm__ volatile( + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm1 \n\t" + "movq 16(%0), %%mm2 \n\t" + "movq 24(%0), %%mm3 \n\t" + :: "r"(block) ); + + __asm__ volatile( + /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ + IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) + + "movq %0, %%mm6 \n\t" + /* in: 1,4,0,2 out: 1,2,3,0 */ + TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) + + "paddw %%mm6, %%mm3 \n\t" + + /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ + IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) + + "pxor %%mm7, %%mm7 \n\t" + :: "m"(ff_pw_32)); + + __asm__ volatile( + STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) + "add %1, %0 \n\t" + STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) + "add %1, %0 \n\t" + STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) + "add %1, %0 \n\t" + STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) + : "+r"(dst) + : "r" ((x86_reg)stride) + ); +} + +static inline void h264_idct8_1d(int16_t *block) +{ + __asm__ volatile( + "movq 112(%0), %%mm7 \n\t" + "movq 80(%0), %%mm0 \n\t" + "movq 48(%0), %%mm3 \n\t" + "movq 16(%0), %%mm5 \n\t" + + "movq %%mm0, %%mm4 \n\t" + "movq %%mm5, %%mm1 \n\t" + "psraw $1, %%mm4 \n\t" + "psraw $1, %%mm1 \n\t" + "paddw %%mm0, %%mm4 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "paddw %%mm7, %%mm4 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "psubw %%mm5, %%mm4 \n\t" + "paddw %%mm3, %%mm1 \n\t" + + "psubw %%mm3, %%mm5 \n\t" + "psubw %%mm3, %%mm0 \n\t" + "paddw %%mm7, %%mm5 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psraw $1, %%mm3 \n\t" + "psraw $1, %%mm7 \n\t" + "psubw %%mm3, %%mm5 \n\t" + "psubw %%mm7, %%mm0 \n\t" 
+ + "movq %%mm4, %%mm3 \n\t" + "movq %%mm1, %%mm7 \n\t" + "psraw $2, %%mm1 \n\t" + "psraw $2, %%mm3 \n\t" + "paddw %%mm5, %%mm3 \n\t" + "psraw $2, %%mm5 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "psraw $2, %%mm0 \n\t" + "psubw %%mm4, %%mm5 \n\t" + "psubw %%mm0, %%mm7 \n\t" + + "movq 32(%0), %%mm2 \n\t" + "movq 96(%0), %%mm6 \n\t" + "movq %%mm2, %%mm4 \n\t" + "movq %%mm6, %%mm0 \n\t" + "psraw $1, %%mm4 \n\t" + "psraw $1, %%mm6 \n\t" + "psubw %%mm0, %%mm4 \n\t" + "paddw %%mm2, %%mm6 \n\t" + + "movq (%0), %%mm2 \n\t" + "movq 64(%0), %%mm0 \n\t" + SUMSUB_BA( %%mm0, %%mm2 ) + SUMSUB_BA( %%mm6, %%mm0 ) + SUMSUB_BA( %%mm4, %%mm2 ) + SUMSUB_BA( %%mm7, %%mm6 ) + SUMSUB_BA( %%mm5, %%mm4 ) + SUMSUB_BA( %%mm3, %%mm2 ) + SUMSUB_BA( %%mm1, %%mm0 ) + :: "r"(block) + ); +} + +static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) +{ + int i; + int16_t __attribute__ ((aligned(8))) b2[64]; + + block[0] += 32; + + for(i=0; i<2; i++){ + DECLARE_ALIGNED_8(uint64_t, tmp); + + h264_idct8_1d(block+4*i); + + __asm__ volatile( + "movq %%mm7, %0 \n\t" + TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) + "movq %%mm0, 8(%1) \n\t" + "movq %%mm6, 24(%1) \n\t" + "movq %%mm7, 40(%1) \n\t" + "movq %%mm4, 56(%1) \n\t" + "movq %0, %%mm7 \n\t" + TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) + "movq %%mm7, (%1) \n\t" + "movq %%mm1, 16(%1) \n\t" + "movq %%mm0, 32(%1) \n\t" + "movq %%mm3, 48(%1) \n\t" + : "=m"(tmp) + : "r"(b2+32*i) + : "memory" + ); + } + + for(i=0; i<2; i++){ + h264_idct8_1d(b2+4*i); + + __asm__ volatile( + "psraw $6, %%mm7 \n\t" + "psraw $6, %%mm6 \n\t" + "psraw $6, %%mm5 \n\t" + "psraw $6, %%mm4 \n\t" + "psraw $6, %%mm3 \n\t" + "psraw $6, %%mm2 \n\t" + "psraw $6, %%mm1 \n\t" + "psraw $6, %%mm0 \n\t" + + "movq %%mm7, (%0) \n\t" + "movq %%mm5, 16(%0) \n\t" + "movq %%mm3, 32(%0) \n\t" + "movq %%mm1, 48(%0) \n\t" + "movq %%mm0, 64(%0) \n\t" + "movq %%mm2, 80(%0) \n\t" + "movq %%mm4, 96(%0) \n\t" + "movq %%mm6, 112(%0) \n\t" + :: "r"(b2+4*i) + : "memory" + ); + } + + add_pixels_clamped_mmx(b2, dst, stride); +} + +#define STORE_DIFF_8P( p, d, t, z )\ + "movq "#d", "#t" \n"\ + "psraw $6, "#p" \n"\ + "punpcklbw "#z", "#t" \n"\ + "paddsw "#t", "#p" \n"\ + "packuswb "#p", "#p" \n"\ + "movq "#p", "#d" \n" + +#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ + "movdqa "#c", "#a" \n"\ + "movdqa "#g", "#e" \n"\ + "psraw $1, "#c" \n"\ + "psraw $1, "#g" \n"\ + "psubw "#e", "#c" \n"\ + "paddw "#a", "#g" \n"\ + "movdqa "#b", "#e" \n"\ + "psraw $1, "#e" \n"\ + "paddw "#b", "#e" \n"\ + "paddw "#d", "#e" \n"\ + "paddw "#f", "#e" \n"\ + "movdqa "#f", "#a" \n"\ + "psraw $1, "#a" \n"\ + "paddw "#f", "#a" \n"\ + "paddw "#h", "#a" \n"\ + "psubw "#b", "#a" \n"\ + "psubw "#d", "#b" \n"\ + "psubw "#d", "#f" \n"\ + "paddw "#h", "#b" \n"\ + "psubw "#h", "#f" \n"\ + "psraw $1, "#d" \n"\ + "psraw $1, "#h" \n"\ + "psubw "#d", "#b" \n"\ + "psubw "#h", "#f" \n"\ + "movdqa "#e", "#d" \n"\ + "movdqa "#a", "#h" \n"\ + "psraw $2, "#d" \n"\ + "psraw $2, "#h" \n"\ + "paddw "#f", "#d" \n"\ + "paddw "#b", "#h" \n"\ + "psraw $2, "#f" \n"\ + "psraw $2, "#b" \n"\ + "psubw "#f", "#e" \n"\ + "psubw "#a", "#b" \n"\ + "movdqa 0x00(%1), "#a" \n"\ + "movdqa 0x40(%1), "#f" \n"\ + SUMSUB_BA(f, a)\ + SUMSUB_BA(g, f)\ + SUMSUB_BA(c, a)\ + SUMSUB_BA(e, g)\ + SUMSUB_BA(b, c)\ + SUMSUB_BA(h, a)\ + SUMSUB_BA(d, f) + +static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) +{ + __asm__ volatile( + "movdqa 0x10(%1), %%xmm1 \n" + "movdqa 0x20(%1), %%xmm2 \n" + "movdqa 0x30(%1), %%xmm3 \n" + "movdqa 0x50(%1), %%xmm5 \n" + "movdqa 0x60(%1), 
%%xmm6 \n" + "movdqa 0x70(%1), %%xmm7 \n" + H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) + TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) + "paddw %4, %%xmm4 \n" + "movdqa %%xmm4, 0x00(%1) \n" + "movdqa %%xmm2, 0x40(%1) \n" + H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) + "movdqa %%xmm6, 0x60(%1) \n" + "movdqa %%xmm7, 0x70(%1) \n" + "pxor %%xmm7, %%xmm7 \n" + STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) + "lea (%0,%2,4), %0 \n" + STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) + "movdqa 0x60(%1), %%xmm0 \n" + "movdqa 0x70(%1), %%xmm1 \n" + STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) + STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) + :"+r"(dst) + :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) + ); +} + +static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + __asm__ volatile( + "movd %0, %%mm2 \n\t" + "movd %1, %%mm3 \n\t" + "movd %2, %%mm4 \n\t" + "movd %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movd %%mm2, %0 \n\t" + "movd %%mm3, %1 \n\t" + "movd %%mm4, %2 \n\t" + "movd %%mm5, %3 \n\t" + :"+m"(*(uint32_t*)(dst+0*stride)), + "+m"(*(uint32_t*)(dst+1*stride)), + "+m"(*(uint32_t*)(dst+2*stride)), + "+m"(*(uint32_t*)(dst+3*stride)) + ); +} + +static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +{ + int dc = (block[0] + 32) >> 6; + int y; + __asm__ volatile( + "movd %0, %%mm0 \n\t" + "pshufw $0, %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "psubw %%mm0, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + ::"r"(dc) + ); + for(y=2; y--; dst += 4*stride){ + __asm__ volatile( + "movq %0, %%mm2 \n\t" + "movq %1, %%mm3 \n\t" + "movq %2, %%mm4 \n\t" + "movq %3, %%mm5 \n\t" + "paddusb %%mm0, %%mm2 \n\t" + "paddusb %%mm0, %%mm3 \n\t" + "paddusb %%mm0, %%mm4 \n\t" + "paddusb %%mm0, %%mm5 \n\t" + "psubusb %%mm1, %%mm2 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm1, %%mm5 \n\t" + "movq %%mm2, %0 \n\t" + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %3 \n\t" + :"+m"(*(uint64_t*)(dst+0*stride)), + "+m"(*(uint64_t*)(dst+1*stride)), + "+m"(*(uint64_t*)(dst+2*stride)), + "+m"(*(uint64_t*)(dst+3*stride)) + ); + } +} + +//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split +static const uint8_t scan8[16 + 2*4]={ + 4+1*8, 5+1*8, 4+2*8, 5+2*8, + 6+1*8, 7+1*8, 6+2*8, 7+2*8, + 4+3*8, 5+3*8, 4+4*8, 5+4*8, + 6+3*8, 7+3*8, 6+4*8, 7+4*8, + 1+1*8, 2+1*8, + 1+2*8, 2+2*8, + 1+4*8, 2+4*8, + 1+5*8, 2+5*8, +}; + +static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + if(nnzc[ scan8[i] ]) + ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, 
stride); + } +} + +static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i+=4){ + if(nnzc[ scan8[i] ]) + ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride); + } +} + + +static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); + else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); + } + } +} + +static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + if(nnzc[ scan8[i] ] || block[i*16]) + ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); + } +} + +static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i++){ + if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); + else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); + } +} + +static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i+=4){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); + else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); + } + } +} + +static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=0; i<16; i+=4){ + int nnz = nnzc[ scan8[i] ]; + if(nnz){ + if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); + else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); + } + } +} + +static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=16; i<16+8; i++){ + if(nnzc[ scan8[i] ] || block[i*16]) + ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); + } +} + +static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ + int i; + for(i=16; i<16+8; i++){ + if(nnzc[ scan8[i] ]) + ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); + else if(block[i*16]) + ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); + } +} + +/***********************************/ +/* deblocking */ + +// out: o = |x-y|>a +// clobbers: t +#define DIFF_GT_MMX(x,y,a,o,t)\ + "movq "#y", "#t" \n\t"\ + "movq "#x", "#o" \n\t"\ + "psubusb "#x", "#t" \n\t"\ + "psubusb "#y", "#o" \n\t"\ + "por "#t", "#o" \n\t"\ + "psubusb "#a", "#o" \n\t" + +// out: o = |x-y|>a +// clobbers: t +#define DIFF_GT2_MMX(x,y,a,o,t)\ + "movq "#y", "#t" \n\t"\ + "movq "#x", "#o" \n\t"\ + "psubusb "#x", "#t" \n\t"\ + "psubusb "#y", "#o" \n\t"\ + "psubusb "#a", "#t" \n\t"\ + "psubusb "#a", "#o" \n\t"\ + "pcmpeqb "#t", "#o" \n\t"\ + +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 +// out: mm5=beta-1, mm7=mask +// clobbers: mm4,mm6 +#define H264_DEBLOCK_MASK(alpha1, beta1) \ + "pshufw $0, "#alpha1", %%mm4 \n\t"\ + "pshufw $0, "#beta1 ", %%mm5 \n\t"\ 
+ "packuswb %%mm4, %%mm4 \n\t"\ + "packuswb %%mm5, %%mm5 \n\t"\ + DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ + DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ + "por %%mm4, %%mm7 \n\t"\ + DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ + "por %%mm4, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "pcmpeqb %%mm6, %%mm7 \n\t" + +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) +// out: mm1=p0' mm2=q0' +// clobbers: mm0,3-6 +#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ + "movq %%mm1 , %%mm5 \n\t"\ + "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ + "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ + "pcmpeqb %%mm4 , %%mm4 \n\t"\ + "pxor %%mm4 , %%mm3 \n\t"\ + "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ + "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ + "pxor %%mm1 , %%mm4 \n\t"\ + "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ + "pavgb %%mm5 , %%mm3 \n\t"\ + "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ + "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ + "psubusb %%mm3 , %%mm6 \n\t"\ + "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ + "pminub %%mm7 , %%mm6 \n\t"\ + "pminub %%mm7 , %%mm3 \n\t"\ + "psubusb %%mm6 , %%mm1 \n\t"\ + "psubusb %%mm3 , %%mm2 \n\t"\ + "paddusb %%mm3 , %%mm1 \n\t"\ + "paddusb %%mm6 , %%mm2 \n\t" + +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone +// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) +// clobbers: q2, tmp, tc0 +#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ + "movq %%mm1, "#tmp" \n\t"\ + "pavgb %%mm2, "#tmp" \n\t"\ + "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ + "pxor "q2addr", "#tmp" \n\t"\ + "pand %8, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ + "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ + "movq "#p1", "#tmp" \n\t"\ + "psubusb "#tc0", "#tmp" \n\t"\ + "paddusb "#p1", "#tc0" \n\t"\ + "pmaxub "#tmp", "#q2" \n\t"\ + "pminub "#tc0", "#q2" \n\t"\ + "movq "#q2", "q1addr" \n\t" + +static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) +{ + DECLARE_ALIGNED_8(uint64_t, tmp0[2]); + + __asm__ volatile( + "movq (%1,%3), %%mm0 \n\t" //p1 + "movq (%1,%3,2), %%mm1 \n\t" //p0 + "movq (%2), %%mm2 \n\t" //q0 + "movq (%2,%3), %%mm3 \n\t" //q1 + H264_DEBLOCK_MASK(%6, %7) + + "movd %5, %%mm4 \n\t" + "punpcklbw %%mm4, %%mm4 \n\t" + "punpcklwd %%mm4, %%mm4 \n\t" + "pcmpeqb %%mm3, %%mm3 \n\t" + "movq %%mm4, %%mm6 \n\t" + "pcmpgtb %%mm3, %%mm4 \n\t" + "movq %%mm6, 8+%0 \n\t" + "pand %%mm4, %%mm7 \n\t" + "movq %%mm7, %0 \n\t" + + /* filter p1 */ + "movq (%1), %%mm3 \n\t" //p2 + DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 + "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta + "pand 8+%0, %%mm7 \n\t" // mask & tc0 + "movq %%mm7, %%mm4 \n\t" + "psubb %%mm6, %%mm7 \n\t" + "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 + H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) + + /* filter q1 */ + "movq (%2,%3,2), %%mm4 \n\t" //q2 + DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 + "pand %0, %%mm6 \n\t" + "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then + "pand %%mm6, %%mm5 \n\t" + "psubb %%mm6, %%mm7 \n\t" + "movq (%2,%3), %%mm3 \n\t" + H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) + + /* filter p0, q0 */ + H264_DEBLOCK_P0_Q0(%8, unused) + "movq %%mm1, (%1,%3,2) \n\t" + "movq %%mm2, (%2) \n\t" + + : "=m"(*tmp0) + : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), + 
"m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), + "m"(ff_bone) + ); +} + +static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + if((tc0[0] & tc0[1]) >= 0) + h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); + if((tc0[2] & tc0[3]) >= 0) + h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); +} +static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + //FIXME: could cut some load/stores by merging transpose with filter + // also, it only needs to transpose 6x8 + DECLARE_ALIGNED_8(uint8_t, trans[8*8]); + int i; + for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { + if((tc0[0] & tc0[1]) < 0) + continue; + transpose4x4(trans, pix-4, 8, stride); + transpose4x4(trans +4*8, pix, 8, stride); + transpose4x4(trans+4, pix-4+4*stride, 8, stride); + transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); + h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); + transpose4x4(pix-2, trans +2*8, stride, 8); + transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); + } +} + +static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) +{ + __asm__ volatile( + "movq (%0), %%mm0 \n\t" //p1 + "movq (%0,%2), %%mm1 \n\t" //p0 + "movq (%1), %%mm2 \n\t" //q0 + "movq (%1,%2), %%mm3 \n\t" //q1 + H264_DEBLOCK_MASK(%4, %5) + "movd %3, %%mm6 \n\t" + "punpcklbw %%mm6, %%mm6 \n\t" + "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask + H264_DEBLOCK_P0_Q0(%6, %7) + "movq %%mm1, (%0,%2) \n\t" + "movq %%mm2, (%1) \n\t" + + :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), + "r"(*(uint32_t*)tc0), + "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) + ); +} + +static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); +} + +static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + //FIXME: could cut some load/stores by merging transpose with filter + DECLARE_ALIGNED_8(uint8_t, trans[8*4]); + transpose4x4(trans, pix-2, 8, stride); + transpose4x4(trans+4, pix-2+4*stride, 8, stride); + h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); + transpose4x4(pix-2, trans, stride, 8); + transpose4x4(pix-2+4*stride, trans+4, stride, 8); +} + +// p0 = (p0 + q1 + 2*p1 + 2) >> 2 +#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ + "movq "#p0", %%mm4 \n\t"\ + "pxor "#q1", %%mm4 \n\t"\ + "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ + "pavgb "#q1", "#p0" \n\t"\ + "psubusb %%mm4, "#p0" \n\t"\ + "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ + +static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) +{ + __asm__ volatile( + "movq (%0), %%mm0 \n\t" + "movq (%0,%2), %%mm1 \n\t" + "movq (%1), %%mm2 \n\t" + "movq (%1,%2), %%mm3 \n\t" + H264_DEBLOCK_MASK(%3, %4) + "movq %%mm1, %%mm5 \n\t" + "movq %%mm2, %%mm6 \n\t" + H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' + H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' + "psubb %%mm5, %%mm1 \n\t" + "psubb %%mm6, %%mm2 \n\t" + "pand %%mm7, %%mm1 \n\t" + "pand %%mm7, %%mm2 \n\t" + "paddb %%mm5, %%mm1 \n\t" + "paddb %%mm6, %%mm2 \n\t" + "movq %%mm1, (%0,%2) \n\t" + "movq %%mm2, (%1) \n\t" + :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), + "m"(alpha1), "m"(beta1), "m"(ff_bone) + ); +} + +static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, 
int alpha, int beta) +{ + h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); +} + +static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) +{ + //FIXME: could cut some load/stores by merging transpose with filter + DECLARE_ALIGNED_8(uint8_t, trans[8*4]); + transpose4x4(trans, pix-2, 8, stride); + transpose4x4(trans+4, pix-2+4*stride, 8, stride); + h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); + transpose4x4(pix-2, trans, stride, 8); + transpose4x4(pix-2+4*stride, trans+4, stride, 8); +} + +static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { + int dir; + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "movq %0, %%mm6 \n\t" + "movq %1, %%mm5 \n\t" + "movq %2, %%mm4 \n\t" + ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) + ); + if(field) + __asm__ volatile( + "movq %0, %%mm5 \n\t" + "movq %1, %%mm4 \n\t" + ::"m"(ff_pb_3_1), "m"(ff_pb_7_3) + ); + + // could do a special case for dir==0 && edges==1, but it only reduces the + // average filter time by 1.2% + for( dir=1; dir>=0; dir-- ) { + const int d_idx = dir ? -8 : -1; + const int mask_mv = dir ? mask_mv1 : mask_mv0; + DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; + int b_idx, edge, l; + for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { + __asm__ volatile( + "pand %0, %%mm0 \n\t" + ::"m"(mask_dir) + ); + if(!(mask_mv & edge)) { + __asm__ volatile("pxor %%mm0, %%mm0 \n\t":); + for( l = bidir; l >= 0; l-- ) { + __asm__ volatile( + "movd %0, %%mm1 \n\t" + "punpckldq %1, %%mm1 \n\t" + "movq %%mm1, %%mm2 \n\t" + "psrlw $7, %%mm2 \n\t" + "pand %%mm6, %%mm2 \n\t" + "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1 + "punpckldq %%mm1, %%mm2 \n\t" + "pcmpeqb %%mm2, %%mm1 \n\t" + "paddb %%mm6, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] + "por %%mm1, %%mm0 \n\t" + + "movq %2, %%mm1 \n\t" + "movq %3, %%mm2 \n\t" + "psubw %4, %%mm1 \n\t" + "psubw %5, %%mm2 \n\t" + "packsswb %%mm2, %%mm1 \n\t" + "paddb %%mm5, %%mm1 \n\t" + "pminub %%mm4, %%mm1 \n\t" + "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit + "por %%mm1, %%mm0 \n\t" + ::"m"(ref[l][b_idx]), + "m"(ref[l][b_idx+d_idx]), + "m"(mv[l][b_idx][0]), + "m"(mv[l][b_idx+2][0]), + "m"(mv[l][b_idx+d_idx][0]), + "m"(mv[l][b_idx+d_idx+2][0]) + ); + } + } + __asm__ volatile( + "movd %0, %%mm1 \n\t" + "por %1, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] + ::"m"(nnz[b_idx]), + "m"(nnz[b_idx+d_idx]) + ); + __asm__ volatile( + "pcmpeqw %%mm7, %%mm0 \n\t" + "pcmpeqw %%mm7, %%mm0 \n\t" + "psrlw $15, %%mm0 \n\t" // nonzero -> 1 + "psrlw $14, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "por %%mm1, %%mm2 \n\t" + "psrlw $1, %%mm1 \n\t" + "pandn %%mm2, %%mm1 \n\t" + "movq %%mm1, %0 \n\t" + :"=m"(*bS[dir][edge]) + ::"memory" + ); + } + edges = 4; + step = 1; + } + __asm__ volatile( + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm1 \n\t" + "movq 16(%0), %%mm2 \n\t" + "movq 24(%0), %%mm3 \n\t" + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) + "movq %%mm0, (%0) \n\t" + "movq %%mm3, 8(%0) \n\t" + "movq %%mm4, 16(%0) \n\t" + "movq %%mm2, 24(%0) \n\t" + ::"r"(bS[0]) + :"memory" + ); +} + +/***********************************/ +/* motion compensation */ + +#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ + "mov"#q" "#C", "#T" \n\t"\ + "mov"#d" (%0), "#F" \n\t"\ + "paddw "#D", "#T" \n\t"\ + 
"psllw $2, "#T" \n\t"\ + "psubw "#B", "#T" \n\t"\ + "psubw "#E", "#T" \n\t"\ + "punpcklbw "#Z", "#F" \n\t"\ + "pmullw %4, "#T" \n\t"\ + "paddw %5, "#A" \n\t"\ + "add %2, %0 \n\t"\ + "paddw "#F", "#A" \n\t"\ + "paddw "#A", "#T" \n\t"\ + "psraw $5, "#T" \n\t"\ + "packuswb "#T", "#T" \n\t"\ + OP(T, (%1), A, d)\ + "add %3, %1 \n\t" + +#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ + "mov"#q" "#C", "#T" \n\t"\ + "mov"#d" (%0), "#F" \n\t"\ + "paddw "#D", "#T" \n\t"\ + "psllw $2, "#T" \n\t"\ + "paddw %4, "#A" \n\t"\ + "psubw "#B", "#T" \n\t"\ + "psubw "#E", "#T" \n\t"\ + "punpcklbw "#Z", "#F" \n\t"\ + "pmullw %3, "#T" \n\t"\ + "paddw "#F", "#A" \n\t"\ + "add %2, %0 \n\t"\ + "paddw "#A", "#T" \n\t"\ + "mov"#q" "#T", "#OF"(%1) \n\t" + +#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) +#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) +#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) +#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) + + +#define QPEL_H264(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=4;\ +\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %5, %%mm4 \n\t"\ + "movq %6, %%mm5 \n\t"\ + "1: \n\t"\ + "movd -1(%0), %%mm1 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "movd 1(%0), %%mm3 \n\t"\ + "movd 2(%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "paddw %%mm0, %%mm1 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "movd -2(%0), %%mm0 \n\t"\ + "movd 3(%0), %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm3, %%mm0 \n\t"\ + "psllw $2, %%mm2 \n\t"\ + "psubw %%mm1, %%mm2 \n\t"\ + "pmullw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm6, d)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=4;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %0, %%mm4 \n\t"\ + "movq %1, %%mm5 \n\t"\ + :: "m"(ff_pw_5), "m"(ff_pw_16)\ + );\ + do{\ + __asm__ volatile(\ + "movd -1(%0), %%mm1 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "movd 1(%0), %%mm3 \n\t"\ + "movd 2(%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "paddw %%mm0, %%mm1 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "movd -2(%0), %%mm0 \n\t"\ + "movd 3(%0), %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm3, %%mm0 \n\t"\ + "psllw $2, %%mm2 \n\t"\ + "psubw %%mm1, %%mm2 \n\t"\ + "pmullw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "movd (%2), %%mm3 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + PAVGB" %%mm3, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm6, d)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }while(--h);\ +}\ +static 
av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + src -= 2*srcStride;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + int h=4;\ + int w=3;\ + src -= 2*srcStride+2;\ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ + \ + : "+a"(src)\ + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + tmp += 4;\ + src += 4 - 9*srcStride;\ + }\ + tmp -= 3*4;\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "paddw 10(%0), %%mm0 \n\t"\ + "movq 2(%0), %%mm1 \n\t"\ + "paddw 8(%0), %%mm1 \n\t"\ + "movq 4(%0), %%mm2 \n\t"\ + "paddw 6(%0), %%mm2 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ + "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ + "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ + "paddsw %%mm2, %%mm0 \n\t"\ + "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ + "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ + "psraw $6, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm7, d)\ + "add $24, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %5, %%mm6 \n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, 
%%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movd -2(%0), %%mm2 \n\t"\ + "movd 7(%0), %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "movq %6, %%mm5 \n\t"\ + "paddw %%mm5, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm4, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %0, %%mm6 \n\t"\ + :: "m"(ff_pw_5)\ + );\ + do{\ + __asm__ volatile(\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movd -2(%0), %%mm2 \n\t"\ + "movd 7(%0), %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "movq %5, %%mm5 \n\t"\ + "paddw %%mm5, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm4, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "movq (%2), %%mm4 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + PAVGB" %%mm4, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_16)\ + : "memory"\ + );\ + }while(--h);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + int w= 2;\ + src -= 2*srcStride;\ + \ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + QPEL_H264V(%%mm4, %%mm5, 
%%mm0, %%mm1, %%mm2, %%mm3, OP)\ + QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + if(h==16){\ + __asm__ volatile(\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ + QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + }\ + src += 4-(h+5)*srcStride;\ + dst += 4-h*dstStride;\ + }\ +}\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ + int w = (size+8)>>2;\ + src -= 2*srcStride+2;\ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ + QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ + QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ + : "+a"(src)\ + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + if(size==16){\ + __asm__ volatile(\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ + QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ + QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ + : "+a"(src)\ + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + }\ + tmp += 4;\ + src += 4 - (size+5)*srcStride;\ + }\ +}\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ + int w = size>>4;\ + do{\ + int h = size;\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 8(%0), %%mm3 \n\t"\ + "movq 2(%0), %%mm1 \n\t"\ + "movq 10(%0), %%mm4 \n\t"\ + "paddw %%mm4, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw 18(%0), %%mm3 \n\t"\ + "paddw 16(%0), %%mm4 \n\t"\ + "movq 4(%0), %%mm2 \n\t"\ + "movq 12(%0), %%mm5 \n\t"\ 
+ "paddw 6(%0), %%mm2 \n\t"\ + "paddw 14(%0), %%mm5 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"\ + "psubw %%mm4, %%mm3 \n\t"\ + "psraw $2, %%mm0 \n\t"\ + "psraw $2, %%mm3 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"\ + "psubw %%mm4, %%mm3 \n\t"\ + "paddsw %%mm2, %%mm0 \n\t"\ + "paddsw %%mm5, %%mm3 \n\t"\ + "psraw $2, %%mm0 \n\t"\ + "psraw $2, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm5, %%mm3 \n\t"\ + "psraw $6, %%mm0 \n\t"\ + "psraw $6, %%mm3 \n\t"\ + "packuswb %%mm3, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm7, q)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + tmp += 8 - size*24;\ + dst += 8 - size*dstStride;\ + }while(w--);\ +}\ +\ +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + src += 8*dstStride;\ + dst += 8*dstStride;\ + src2 += 8*src2Stride;\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ + OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +}\ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ +}\ +\ +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ +}\ +\ +static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + __asm__ volatile(\ + "movq (%1), %%mm0 \n\t"\ + "movq 24(%1), %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + "packuswb %%mm1, %%mm1 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm1 \n\t"\ 
+ OP(%%mm0, (%2), %%mm4, d)\ + OP(%%mm1, (%2,%4), %%mm5, d)\ + "lea (%0,%3,2), %0 \n\t"\ + "lea (%2,%4,2), %2 \n\t"\ + "movq 48(%1), %%mm0 \n\t"\ + "movq 72(%1), %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + "packuswb %%mm1, %%mm1 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm1 \n\t"\ + OP(%%mm0, (%2), %%mm4, d)\ + OP(%%mm1, (%2,%4), %%mm5, d)\ + :"+a"(src8), "+c"(src16), "+d"(dst)\ + :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ + :"memory");\ +}\ +static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + do{\ + __asm__ volatile(\ + "movq (%1), %%mm0 \n\t"\ + "movq 8(%1), %%mm1 \n\t"\ + "movq 48(%1), %%mm2 \n\t"\ + "movq 8+48(%1), %%mm3 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "psraw $5, %%mm2 \n\t"\ + "psraw $5, %%mm3 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + "packuswb %%mm3, %%mm2 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm2 \n\t"\ + OP(%%mm0, (%2), %%mm5, q)\ + OP(%%mm2, (%2,%4), %%mm5, q)\ + ::"a"(src8), "c"(src16), "d"(dst),\ + "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ + :"memory");\ + src8 += 2L*src8Stride;\ + src16 += 48;\ + dst += 2L*dstStride;\ + }while(h-=2);\ +}\ +static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ + OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ +}\ + + +#ifdef ARCH_X86_64 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=16;\ + __asm__ volatile(\ + "pxor %%xmm15, %%xmm15 \n\t"\ + "movdqa %6, %%xmm14 \n\t"\ + "movdqa %7, %%xmm13 \n\t"\ + "1: \n\t"\ + "lddqu 3(%0), %%xmm1 \n\t"\ + "lddqu -5(%0), %%xmm7 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm15, %%xmm1 \n\t"\ + "punpcklbw %%xmm15, %%xmm0 \n\t"\ + "punpcklbw %%xmm15, %%xmm7 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm0, %%xmm6 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm0, %%xmm8 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm0, %%xmm9 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "movdqa %%xmm0, %%xmm10 \n\t"\ + "palignr $6, %%xmm0, %%xmm5 \n\t"\ + "palignr $6, %%xmm7, %%xmm10\n\t"\ + "palignr $8, %%xmm0, %%xmm4 \n\t"\ + "palignr $8, %%xmm7, %%xmm9 \n\t"\ + "palignr $10,%%xmm0, %%xmm3 \n\t"\ + "palignr $10,%%xmm7, %%xmm8 \n\t"\ + "paddw %%xmm1, %%xmm5 \n\t"\ + "paddw %%xmm0, %%xmm10 \n\t"\ + "palignr $12,%%xmm0, %%xmm2 \n\t"\ + "palignr $12,%%xmm7, %%xmm6 \n\t"\ + "palignr $14,%%xmm0, %%xmm1 \n\t"\ + "palignr $14,%%xmm7, %%xmm0 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm8, %%xmm6 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm9, %%xmm0 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "psllw $2, %%xmm6 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "psubw %%xmm0, %%xmm6 \n\t"\ + "paddw %%xmm13,%%xmm5 \n\t"\ + "paddw %%xmm13,%%xmm10 \n\t"\ + "pmullw %%xmm14,%%xmm2 \n\t"\ + "pmullw %%xmm14,%%xmm6 \n\t"\ + "lddqu (%2), %%xmm3 \n\t"\ + "paddw %%xmm5, %%xmm2 \n\t"\ + "paddw %%xmm10,%%xmm6 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "psraw $5, %%xmm6 \n\t"\ + "packuswb %%xmm2,%%xmm6 \n\t"\ + "pavgb %%xmm3, %%xmm6 \n\t"\ + OP(%%xmm6, (%1), %%xmm4, dqa)\ + "add %5, %0 \n\t"\ + "add %5, %1 \n\t"\ + "add %4, %2 \n\t"\ + "decl %3 \n\t"\ + "jg 1b 
\n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +} +#else // ARCH_X86_64 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + src += 8*dstStride;\ + dst += 8*dstStride;\ + src2 += 8*src2Stride;\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +} +#endif // ARCH_X86_64 + +#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movdqa %0, %%xmm6 \n\t"\ + :: "m"(ff_pw_5)\ + );\ + do{\ + __asm__ volatile(\ + "lddqu -5(%0), %%xmm1 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $6, %%xmm0, %%xmm5 \n\t"\ + "palignr $8, %%xmm0, %%xmm4 \n\t"\ + "palignr $10,%%xmm0, %%xmm3 \n\t"\ + "paddw %%xmm1, %%xmm5 \n\t"\ + "palignr $12,%%xmm0, %%xmm2 \n\t"\ + "palignr $14,%%xmm0, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "movq (%2), %%xmm3 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "paddw %5, %%xmm5 \n\t"\ + "pmullw %%xmm6, %%xmm2 \n\t"\ + "paddw %%xmm5, %%xmm2 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "packuswb %%xmm2, %%xmm2 \n\t"\ + "pavgb %%xmm3, %%xmm2 \n\t"\ + OP(%%xmm2, (%1), %%xmm4, q)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_16)\ + : "memory"\ + );\ + }while(--h);\ +}\ +QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movdqa %5, %%xmm6 \n\t"\ + "1: \n\t"\ + "lddqu -5(%0), %%xmm1 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $6, %%xmm0, %%xmm5 \n\t"\ + "palignr $8, %%xmm0, %%xmm4 \n\t"\ + "palignr $10,%%xmm0, %%xmm3 \n\t"\ + "paddw %%xmm1, %%xmm5 \n\t"\ + "palignr $12,%%xmm0, %%xmm2 \n\t"\ + "palignr $14,%%xmm0, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "paddw %6, %%xmm5 \n\t"\ + "pmullw %%xmm6, %%xmm2 \n\t"\ + "paddw %%xmm5, %%xmm2 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "packuswb %%xmm2, %%xmm2 \n\t"\ + OP(%%xmm2, (%1), %%xmm4, q)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, 
uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ + +#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + src -= 2*srcStride;\ + \ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movq (%0), %%xmm0 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm1 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm2 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm3 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm2 \n\t"\ + "punpcklbw %%xmm7, %%xmm3 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + if(h==16){\ + __asm__ volatile(\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + }\ +}\ +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +} + +static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ + int w = (size+8)>>3; + src -= 2*srcStride+2; + while(w--){ + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "movq (%0), %%xmm0 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm1 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm2 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm3 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm4 \n\t" + "add %2, %0 \n\t" + "punpcklbw %%xmm7, %%xmm0 \n\t" + "punpcklbw %%xmm7, 
%%xmm1 \n\t" + "punpcklbw %%xmm7, %%xmm2 \n\t" + "punpcklbw %%xmm7, %%xmm3 \n\t" + "punpcklbw %%xmm7, %%xmm4 \n\t" + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) + : "+a"(src) + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) + : "memory" + ); + if(size==16){ + __asm__ volatile( + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) + : "+a"(src) + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) + : "memory" + ); + } + tmp += 8; + src += 8 - (size+5)*srcStride; + } +} + +#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ + int h = size;\ + if(size == 16){\ + __asm__ volatile(\ + "1: \n\t"\ + "movdqa 32(%0), %%xmm4 \n\t"\ + "movdqa 16(%0), %%xmm5 \n\t"\ + "movdqa (%0), %%xmm7 \n\t"\ + "movdqa %%xmm4, %%xmm3 \n\t"\ + "movdqa %%xmm4, %%xmm2 \n\t"\ + "movdqa %%xmm4, %%xmm1 \n\t"\ + "movdqa %%xmm4, %%xmm0 \n\t"\ + "palignr $10, %%xmm5, %%xmm0 \n\t"\ + "palignr $8, %%xmm5, %%xmm1 \n\t"\ + "palignr $6, %%xmm5, %%xmm2 \n\t"\ + "palignr $4, %%xmm5, %%xmm3 \n\t"\ + "palignr $2, %%xmm5, %%xmm4 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "movdqa %%xmm5, %%xmm6 \n\t"\ + "movdqa %%xmm5, %%xmm4 \n\t"\ + "movdqa %%xmm5, %%xmm3 \n\t"\ + "palignr $8, %%xmm7, %%xmm4 \n\t"\ + "palignr $2, %%xmm7, %%xmm6 \n\t"\ + "palignr $10, %%xmm7, %%xmm3 \n\t"\ + "paddw %%xmm6, %%xmm4 \n\t"\ + "movdqa %%xmm5, %%xmm6 \n\t"\ + "palignr $6, %%xmm7, %%xmm5 \n\t"\ + "palignr $4, %%xmm7, %%xmm6 \n\t"\ + "paddw %%xmm7, %%xmm3 \n\t"\ + "paddw %%xmm6, %%xmm5 \n\t"\ + \ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psubw %%xmm4, %%xmm3 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psraw $2, %%xmm3 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psubw %%xmm4, %%xmm3 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "paddw %%xmm5, %%xmm3 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psraw $2, %%xmm3 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "paddw %%xmm5, %%xmm3 \n\t"\ + "psraw $6, %%xmm0 \n\t"\ + "psraw $6, %%xmm3 \n\t"\ + "packuswb %%xmm0, %%xmm3 \n\t"\ + OP(%%xmm3, (%1), %%xmm7, dqa)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }else{\ + __asm__ volatile(\ + "1: \n\t"\ + "movdqa 16(%0), %%xmm1 \n\t"\ + "movdqa (%0), %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + 
"movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $10, %%xmm0, %%xmm5 \n\t"\ + "palignr $8, %%xmm0, %%xmm4 \n\t"\ + "palignr $6, %%xmm0, %%xmm3 \n\t"\ + "palignr $4, %%xmm0, %%xmm2 \n\t"\ + "palignr $2, %%xmm0, %%xmm1 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "psraw $6, %%xmm0 \n\t"\ + "packuswb %%xmm0, %%xmm0 \n\t"\ + OP(%%xmm0, (%1), %%xmm7, q)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }\ +} + +#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ + OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +}\ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ +}\ +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ +}\ + +#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 +#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 +#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 +#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 +#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 +#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 +#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 +#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 + +#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 +#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 +#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 +#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 +#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 +#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 +#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 +#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 + +#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 +#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 +#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 +#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 + +#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 +#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 +#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 +#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 + +#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 +#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 + +#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ +H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ + +static void 
put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ + put_pixels16_sse2(dst, src, stride, 16); +} +static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ + avg_pixels16_sse2(dst, src, stride, 16); +} +#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 +#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 + +#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ +}\ + +#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ +}\ + +#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ +}\ + +#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, 
uint16_t, temp[SIZE*(SIZE<8?12:24)]);\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ +}\ + +#define H264_MC_4816(MMX)\ +H264_MC(put_, 4, MMX, 8)\ +H264_MC(put_, 8, MMX, 8)\ +H264_MC(put_, 16,MMX, 8)\ +H264_MC(avg_, 4, MMX, 8)\ +H264_MC(avg_, 8, MMX, 8)\ +H264_MC(avg_, 16,MMX, 8)\ + +#define H264_MC_816(QPEL, XMM)\ +QPEL(put_, 8, XMM, 16)\ +QPEL(put_, 16,XMM, 16)\ +QPEL(avg_, 8, XMM, 16)\ +QPEL(avg_, 16,XMM, 16)\ + + +#define AVG_3DNOW_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgusb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" +#define AVG_MMX2_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" + +#define PAVGB "pavgusb" +QPEL_H264(put_, PUT_OP, 3dnow) +QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) +#undef PAVGB +#define PAVGB "pavgb" +QPEL_H264(put_, PUT_OP, mmx2) +QPEL_H264(avg_, AVG_MMX2_OP, mmx2) +QPEL_H264_V_XMM(put_, PUT_OP, sse2) +QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) +QPEL_H264_HV_XMM(put_, PUT_OP, sse2) +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) +#ifdef HAVE_SSSE3 +QPEL_H264_H_XMM(put_, PUT_OP, ssse3) +QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) +QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) +QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) +QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) +#endif +#undef PAVGB + +H264_MC_4816(3dnow) +H264_MC_4816(mmx2) +H264_MC_816(H264_MC_V, sse2) +H264_MC_816(H264_MC_HV, sse2) +#ifdef HAVE_SSSE3 +H264_MC_816(H264_MC_H, ssse3) +H264_MC_816(H264_MC_HV, ssse3) 
+#endif + + +#define H264_CHROMA_OP(S,D) +#define H264_CHROMA_OP4(S,D,T) +#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx +#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx +#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 +#define H264_CHROMA_MC8_MV0 put_pixels8_mmx +#include "dsputil_h264_template_mmx.c" + +static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1); +} +static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0); +} + +#undef H264_CHROMA_OP +#undef H264_CHROMA_OP4 +#undef H264_CHROMA_MC8_TMPL +#undef H264_CHROMA_MC4_TMPL +#undef H264_CHROMA_MC2_TMPL +#undef H264_CHROMA_MC8_MV0 + +#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t" +#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ + "pavgb " #T ", " #D " \n\t" +#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2 +#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2 +#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2 +#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 +#include "dsputil_h264_template_mmx.c" +static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1); +} +#undef H264_CHROMA_OP +#undef H264_CHROMA_OP4 +#undef H264_CHROMA_MC8_TMPL +#undef H264_CHROMA_MC4_TMPL +#undef H264_CHROMA_MC2_TMPL +#undef H264_CHROMA_MC8_MV0 + +#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t" +#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ + "pavgusb " #T ", " #D " \n\t" +#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow +#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow +#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow +#include "dsputil_h264_template_mmx.c" +static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1); +} +#undef H264_CHROMA_OP +#undef H264_CHROMA_OP4 +#undef H264_CHROMA_MC8_TMPL +#undef H264_CHROMA_MC4_TMPL +#undef H264_CHROMA_MC8_MV0 + +#ifdef HAVE_SSSE3 +#define AVG_OP(X) +#undef H264_CHROMA_MC8_TMPL +#undef H264_CHROMA_MC4_TMPL +#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3 +#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3 +#define H264_CHROMA_MC8_MV0 put_pixels8_mmx +#include "dsputil_h264_template_ssse3.c" +static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); +} +static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0); +} + +#undef AVG_OP +#undef H264_CHROMA_MC8_TMPL +#undef H264_CHROMA_MC4_TMPL +#undef H264_CHROMA_MC8_MV0 +#define AVG_OP(X) X +#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3 +#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3 +#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 +#include "dsputil_h264_template_ssse3.c" +static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +{ + avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); +} +#undef AVG_OP +#undef H264_CHROMA_MC8_TMPL +#undef 
H264_CHROMA_MC4_TMPL
+#undef H264_CHROMA_MC8_MV0
+#endif
+
+/***********************************/
+/* weighted prediction */
+
+static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
+{
+    int x, y;
+    offset <<= log2_denom;
+    offset += (1 << log2_denom) >> 1;
+    __asm__ volatile(
+        "movd    %0, %%mm4        \n\t"
+        "movd    %1, %%mm5        \n\t"
+        "movd    %2, %%mm6        \n\t"
+        "pshufw  $0, %%mm4, %%mm4 \n\t"
+        "pshufw  $0, %%mm5, %%mm5 \n\t"
+        "pxor    %%mm7, %%mm7     \n\t"
+        :: "g"(weight), "g"(offset), "g"(log2_denom)
+    );
+    for(y=0; y<h; y+=2){
+        for(x=0; x<w; x+=4){
+            __asm__ volatile(
+                "movd      %0,    %%mm0 \n\t"
+                "movd      %1,    %%mm1 \n\t"
+                "punpcklbw %%mm7, %%mm0 \n\t"
+                "punpcklbw %%mm7, %%mm1 \n\t"
+                "pmullw    %%mm4, %%mm0 \n\t"
+                "pmullw    %%mm4, %%mm1 \n\t"
+                "paddsw    %%mm5, %%mm0 \n\t"
+                "paddsw    %%mm5, %%mm1 \n\t"
+                "psraw     %%mm6, %%mm0 \n\t"
+                "psraw     %%mm6, %%mm1 \n\t"
+                "packuswb  %%mm7, %%mm0 \n\t"
+                "packuswb  %%mm7, %%mm1 \n\t"
+                "movd      %%mm0, %0    \n\t"
+                "movd      %%mm1, %1    \n\t"
+                : "+m"(*(uint32_t*)(dst+x)),
+                  "+m"(*(uint32_t*)(dst+x+stride))
+            );
+        }
+        dst += 2*stride;
+    }
+}
+
+static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
+{
+    int x, y;
+    offset = ((offset + 1) | 1) << log2_denom;
+    __asm__ volatile(
+        "movd    %0, %%mm3        \n\t"
+        "movd    %1, %%mm4        \n\t"
+        "movd    %2, %%mm5        \n\t"
+        "movd    %3, %%mm6        \n\t"
+        "pshufw  $0, %%mm3, %%mm3 \n\t"
+        "pshufw  $0, %%mm4, %%mm4 \n\t"
+        "pshufw  $0, %%mm5, %%mm5 \n\t"
+        "pxor    %%mm7, %%mm7     \n\t"
+        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
+    );
+    for(y=0; y<h; y++){
+        for(x=0; x<w; x+=4){
+            __asm__ volatile(
+                "movd      %0,    %%mm0 \n\t"
+                "movd      %1,    %%mm1 \n\t"
+                "punpcklbw %%mm7, %%mm0 \n\t"
+                "punpcklbw %%mm7, %%mm1 \n\t"
+                "pmullw    %%mm3, %%mm0 \n\t"
+                "pmullw    %%mm4, %%mm1 \n\t"
+                "paddsw    %%mm1, %%mm0 \n\t"
+                "paddsw    %%mm5, %%mm0 \n\t"
+                "psraw     %%mm6, %%mm0 \n\t"
+                "packuswb  %%mm0, %%mm0 \n\t"
+                "movd      %%mm0, %0    \n\t"
+                : "+m"(*(uint32_t*)(dst+x))
+                : "m"(*(uint32_t*)(src+x))
+            );
+        }
+        src += stride;
+        dst += stride;
+    }
+}
+
+#define H264_WEIGHT(W,H) \
+static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
+    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
+} \
+static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
+    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+H264_WEIGHT( 4, 8)
+H264_WEIGHT( 4, 4)
+H264_WEIGHT( 4, 2)
+
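The MMX weight/biweight kernels above implement H.264 explicit weighted prediction: each sample is scaled, a rounding term folded into the offset is added, the result is shifted back by log2_denom (or log2_denom+1 for bi-prediction) and clipped to 8 bits. For reference, a plain-C sketch of what the vector code computes follows; it is illustrative only, the helper names weight_c/biweight_c/clip_uint8 are not part of this patch, and libavcodec's own av_clip_uint8() would normally be used instead of the local clip helper.

#include <stdint.h>

static uint8_t clip_uint8(int v)            /* stand-in for av_clip_uint8() */
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* scalar equivalent of ff_h264_weight_WxH_mmx2() */
static void weight_c(uint8_t *dst, int stride, int log2_denom,
                     int weight, int offset, int w, int h)
{
    int x, y;
    offset <<= log2_denom;                  /* same pre-scaling as the asm */
    offset += (1 << log2_denom) >> 1;       /* rounding term */
    for (y = 0; y < h; y++, dst += stride)
        for (x = 0; x < w; x++)
            dst[x] = clip_uint8((dst[x] * weight + offset) >> log2_denom);
}

/* scalar equivalent of ff_h264_biweight_WxH_mmx2() */
static void biweight_c(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
                       int weightd, int weights, int offset, int w, int h)
{
    int x, y;
    offset = ((offset + 1) | 1) << log2_denom;   /* rounding folded into offset */
    for (y = 0; y < h; y++, dst += stride, src += stride)
        for (x = 0; x < w; x++)
            dst[x] = clip_uint8((dst[x] * weightd + src[x] * weights + offset)
                                >> (log2_denom + 1));
}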
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/x86/idct_mmx.c    Mon Dec 22 09:12:42 2008 +0000
@@ -0,0 +1,605 @@
+/*
+ * idct_mmx.c
+ * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with mpeg2dec; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/common.h"
+#include "libavcodec/dsputil.h"
+
+#include "mmx.h"
+
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
+
+#define ROW_SHIFT 11
+#define COL_SHIFT 6
+
+#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
+#define rounder(bias) {round (bias), round (bias)}
+
+
+#if 0
+/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
+static inline void idct_row (int16_t * row, int offset,
+                             int16_t * table, int32_t * rounder)
+{
+    int C1, C2, C3, C4, C5, C6, C7;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+
+    row += offset;
+
+    C1 = table[1];
+    C2 = table[2];
+    C3 = table[3];
+    C4 = table[4];
+    C5 = table[5];
+    C6 = table[6];
+    C7 = table[7];
+
+    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
+    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
+    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
+    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;
+
+    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+    row[0] = (a0 + b0) >> ROW_SHIFT;
+    row[1] = (a1 + b1) >> ROW_SHIFT;
+    row[2] = (a2 + b2) >> ROW_SHIFT;
+    row[3] = (a3 + b3) >> ROW_SHIFT;
+    row[4] = (a3 - b3) >> ROW_SHIFT;
+    row[5] = (a2 - b2) >> ROW_SHIFT;
+    row[6] = (a1 - b1) >> ROW_SHIFT;
+    row[7] = (a0 - b0) >> ROW_SHIFT;
+}
+#endif
+
+
+/* MMXEXT row IDCT */
+
+#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4,  c2, -c4, -c2, \
+                                             c4,  c6,  c4,  c6, \
+                                             c1,  c3, -c1, -c5, \
+                                             c5,  c7,  c3, -c7, \
+                                             c4, -c6,  c4, -c6, \
+                                            -c4,  c2,  c4, -c2, \
+                                             c5, -c1,  c3, -c1, \
+                                             c7,  c3,  c7, -c5 }
+
+static inline void mmxext_row_head (int16_t * const row, const int offset,
+                                    const int16_t * const table)
+{
+    movq_m2r (*(row+offset), mm2);      /* mm2 = x6 x4 x2 x0 */
+
+    movq_m2r (*(row+offset+4), mm5);    /* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);                /* mm0 = x6 x4 x2 x0 */
+
+    movq_m2r (*table, mm3);             /* mm3 = -C2 -C4 C2 C4 */
+    movq_r2r (mm5, mm6);                /* mm6 = x7 x5 x3 x1 */
+
+    movq_m2r (*(table+4), mm4);         /* mm4 = C6 C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);             /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
+
+    pshufw_r2r (mm2, mm2, 0x4e);        /* mm2 = x2 x0 x6 x4 */
+}
+
+static inline void mmxext_row (const int16_t * const table,
+                               const int32_t * const rounder)
+{
+    movq_m2r (*(table+8), mm1);         /* mm1 = -C5 -C1 C3 C1 */
+    pmaddwd_r2r (mm2, mm4);             /* mm4 = C4*x0+C6*x2
C4*x4+C6*x6 */ + + pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ + pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ + + movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ + pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ + + paddd_m2r (*rounder, mm3); /* mm3 += rounder */ + pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ + + pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ + paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ + + pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ + movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ + + pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ + paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ + + paddd_m2r (*rounder, mm0); /* mm0 += rounder */ + psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ + + psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ + paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ + + paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ + psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ + + paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ + movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ + + paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ + psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ +} + +static inline void mmxext_row_tail (int16_t * const row, const int store) +{ + psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ + + psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ + + packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ + + packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ + + movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ + pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ + + /* slot */ + + movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ +} + +static inline void mmxext_row_mid (int16_t * const row, const int store, + const int offset, + const int16_t * const table) +{ + movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ + psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ + + movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ + + packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ + movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ + + packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ + movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ + + movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ + pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ + + movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ + movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ + + pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ + + movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ + pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ +} + + +/* MMX row IDCT */ + +#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ + c4, c6, -c4, -c2, \ + c1, c3, c3, -c7, \ + c5, c7, -c1, -c5, \ + c4, -c6, c4, -c2, \ + -c4, c2, c4, -c6, \ + c5, -c1, c7, -c5, \ + c7, c3, c3, -c1 } + +static inline void mmx_row_head (int16_t * const row, const int offset, + const int16_t * const table) +{ + movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ + + movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ + + movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ + movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ + + punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ + + movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ + pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ + + movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ + punpckhdq_r2r 
(mm2, mm2); /* mm2 = x6 x4 x6 x4 */ +} + +static inline void mmx_row (const int16_t * const table, + const int32_t * const rounder) +{ + pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ + punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ + + pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ + punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ + + movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ + pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ + + paddd_m2r (*rounder, mm3); /* mm3 += rounder */ + pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ + + pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ + paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ + + pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ + movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ + + pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ + paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ + + paddd_m2r (*rounder, mm0); /* mm0 += rounder */ + psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ + + psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ + paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ + + paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ + psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ + + paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ + movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ + + paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ + psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ +} + +static inline void mmx_row_tail (int16_t * const row, const int store) +{ + psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ + + psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ + + packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ + + packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ + + movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ + movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ + + pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ + + psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ + + por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ + + /* slot */ + + movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ +} + +static inline void mmx_row_mid (int16_t * const row, const int store, + const int offset, const int16_t * const table) +{ + movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ + psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ + + movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ + + packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ + movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ + + packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ + movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ + + movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ + movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ + + punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ + psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ + + movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ + pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ + + movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ + por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ + + movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ + punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ + + movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ + pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ +} + + +#if 0 +/* C column IDCT - it is just here to document the MMXEXT and MMX versions */ +static inline void idct_col (int16_t * col, int offset) +{ +/* multiplication - as implemented on mmx */ +#define F(c,x) (((c) * (x)) >> 16) + +/* saturation - it helps us 
handle torture test cases */ +#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) + + int16_t x0, x1, x2, x3, x4, x5, x6, x7; + int16_t y0, y1, y2, y3, y4, y5, y6, y7; + int16_t a0, a1, a2, a3, b0, b1, b2, b3; + int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; + + col += offset; + + x0 = col[0*8]; + x1 = col[1*8]; + x2 = col[2*8]; + x3 = col[3*8]; + x4 = col[4*8]; + x5 = col[5*8]; + x6 = col[6*8]; + x7 = col[7*8]; + + u04 = S (x0 + x4); + v04 = S (x0 - x4); + u26 = S (F (T2, x6) + x2); + v26 = S (F (T2, x2) - x6); + + a0 = S (u04 + u26); + a1 = S (v04 + v26); + a2 = S (v04 - v26); + a3 = S (u04 - u26); + + u17 = S (F (T1, x7) + x1); + v17 = S (F (T1, x1) - x7); + u35 = S (F (T3, x5) + x3); + v35 = S (F (T3, x3) - x5); + + b0 = S (u17 + u35); + b3 = S (v17 - v35); + u12 = S (u17 - u35); + v12 = S (v17 + v35); + u12 = S (2 * F (C4, u12)); + v12 = S (2 * F (C4, v12)); + b1 = S (u12 + v12); + b2 = S (u12 - v12); + + y0 = S (a0 + b0) >> COL_SHIFT; + y1 = S (a1 + b1) >> COL_SHIFT; + y2 = S (a2 + b2) >> COL_SHIFT; + y3 = S (a3 + b3) >> COL_SHIFT; + + y4 = S (a3 - b3) >> COL_SHIFT; + y5 = S (a2 - b2) >> COL_SHIFT; + y6 = S (a1 - b1) >> COL_SHIFT; + y7 = S (a0 - b0) >> COL_SHIFT; + + col[0*8] = y0; + col[1*8] = y1; + col[2*8] = y2; + col[3*8] = y3; + col[4*8] = y4; + col[5*8] = y5; + col[6*8] = y6; + col[7*8] = y7; +} +#endif + + +/* MMX column IDCT */ +static inline void idct_col (int16_t * const col, const int offset) +{ +#define T1 13036 +#define T2 27146 +#define T3 43790 +#define C4 23170 + + static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; + static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; + static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; + static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; + + /* column code adapted from Peter Gubanov */ + /* http://www.elecard.com/peter/idct.shtml */ + + movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ + + movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ + movq_r2r (mm0, mm2); /* mm2 = T1 */ + + movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ + pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ + + movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ + pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ + + movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ + movq_r2r (mm5, mm7); /* mm7 = T3-1 */ + + movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ + psubsw_r2r (mm4, mm0); /* mm0 = v17 */ + + movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ + pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ + + paddsw_r2r (mm2, mm1); /* mm1 = u17 */ + pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ + + /* slot */ + + movq_r2r (mm4, mm2); /* mm2 = T2 */ + paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ + + pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ + paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ + + psubsw_r2r (mm6, mm5); /* mm5 = v35 */ + paddsw_r2r (mm3, mm7); /* mm7 = u35 */ + + movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ + movq_r2r (mm0, mm6); /* mm6 = v17 */ + + pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ + psubsw_r2r (mm5, mm0); /* mm0 = b3 */ + + psubsw_r2r (mm3, mm4); /* mm4 = v26 */ + paddsw_r2r (mm6, mm5); /* mm5 = v12 */ + + movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ + movq_r2r (mm1, mm6); /* mm6 = u17 */ + + paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ + paddsw_r2r (mm7, mm6); /* mm6 = b0 */ + + psubsw_r2r (mm7, mm1); /* mm1 = u12 */ + movq_r2r (mm1, mm7); /* mm7 = u12 */ + + movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ + paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ + + movq_m2r (*c4_vector, mm0); /* mm0 = 
C4/2 */ + psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ + + movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ + pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ + + movq_r2r (mm4, mm6); /* mm6 = v26 */ + pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ + + movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ + movq_r2r (mm3, mm0); /* mm0 = x0 */ + + psubsw_r2r (mm5, mm3); /* mm3 = v04 */ + paddsw_r2r (mm5, mm0); /* mm0 = u04 */ + + paddsw_r2r (mm3, mm4); /* mm4 = a1 */ + movq_r2r (mm0, mm5); /* mm5 = u04 */ + + psubsw_r2r (mm6, mm3); /* mm3 = a2 */ + paddsw_r2r (mm2, mm5); /* mm5 = a0 */ + + paddsw_r2r (mm1, mm1); /* mm1 = b1 */ + psubsw_r2r (mm2, mm0); /* mm0 = a3 */ + + paddsw_r2r (mm7, mm7); /* mm7 = b2 */ + movq_r2r (mm3, mm2); /* mm2 = a2 */ + + movq_r2r (mm4, mm6); /* mm6 = a1 */ + paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ + + psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ + paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ + + psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ + psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ + + movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ + psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ + + psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ + movq_r2r (mm5, mm7); /* mm7 = a0 */ + + movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ + psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ + + movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ + paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ + + movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ + psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ + + psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ + movq_r2r (mm0, mm3); /* mm3 = a3 */ + + movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ + psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ + + psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ + paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ + + movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ + psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ + + movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ + psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ + + movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ + + movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ + + movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ + +#undef T1 +#undef T2 +#undef T3 +#undef C4 +} + + +static const int32_t rounder0[] ATTR_ALIGN(8) = + rounder ((1 << (COL_SHIFT - 1)) - 0.5); +static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); +static const int32_t rounder1[] ATTR_ALIGN(8) = + rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ +static const int32_t rounder7[] ATTR_ALIGN(8) = + rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ +static const int32_t rounder2[] ATTR_ALIGN(8) = + rounder (0.60355339059); /* C2 * (C6+C2)/2 */ +static const int32_t rounder6[] ATTR_ALIGN(8) = + rounder (-0.25); /* C2 * (C6-C2)/2 */ +static const int32_t rounder3[] ATTR_ALIGN(8) = + rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ +static const int32_t rounder5[] ATTR_ALIGN(8) = + rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ + +#undef COL_SHIFT +#undef ROW_SHIFT + +#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ +void idct (int16_t * const block) \ +{ \ + static const int16_t table04[] ATTR_ALIGN(16) = \ + table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ + static const int16_t table17[] ATTR_ALIGN(16) = \ + table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ + static const int16_t table26[] ATTR_ALIGN(16) = \ + table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ + static const int16_t table35[] ATTR_ALIGN(16) = \ + table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ + \ + idct_row_head (block, 0*8, 
table04); \ + idct_row (table04, rounder0); \ + idct_row_mid (block, 0*8, 4*8, table04); \ + idct_row (table04, rounder4); \ + idct_row_mid (block, 4*8, 1*8, table17); \ + idct_row (table17, rounder1); \ + idct_row_mid (block, 1*8, 7*8, table17); \ + idct_row (table17, rounder7); \ + idct_row_mid (block, 7*8, 2*8, table26); \ + idct_row (table26, rounder2); \ + idct_row_mid (block, 2*8, 6*8, table26); \ + idct_row (table26, rounder6); \ + idct_row_mid (block, 6*8, 3*8, table35); \ + idct_row (table35, rounder3); \ + idct_row_mid (block, 3*8, 5*8, table35); \ + idct_row (table35, rounder5); \ + idct_row_tail (block, 5*8); \ + \ + idct_col (block, 0); \ + idct_col (block, 4); \ +} + +void ff_mmx_idct(DCTELEM *block); +void ff_mmxext_idct(DCTELEM *block); + +declare_idct (ff_mmxext_idct, mmxext_table, + mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) + +declare_idct (ff_mmx_idct, mmx_table, + mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) +
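For reference, the fractional bias terms passed to rounder() above follow directly from the cosine constants C_k = cos(k*pi/16). The short standalone program below is only a verification sketch (not part of the file); it reproduces the values quoted in the comments next to rounder1..rounder7, such as 1.25683487303 and 0.60355339059.

#include <math.h>
#include <stdio.h>

int main(void)
{
    const double pi = acos(-1.0);
    double C[8];
    for (int k = 0; k < 8; k++)
        C[k] = cos(k * pi / 16.0);

    /* Each expression matches the comment beside the corresponding rounder. */
    printf("rounder1 = %.11f\n", C[1] * ( C[1] / C[4] + C[1] + C[7]) / 2); /*  1.25683487303  */
    printf("rounder7 = %.11f\n", C[1] * ( C[7] / C[4] + C[7] - C[1]) / 2); /* -0.25           */
    printf("rounder2 = %.11f\n", C[2] * ( C[6] + C[2]) / 2);               /*  0.60355339059  */
    printf("rounder6 = %.11f\n", C[2] * ( C[6] - C[2]) / 2);               /* -0.25           */
    printf("rounder3 = %.11f\n", C[3] * (-C[3] / C[4] + C[3] + C[5]) / 2); /*  0.087788325588 */
    printf("rounder5 = %.11f\n", C[3] * (-C[5] / C[4] + C[5] - C[3]) / 2); /* -0.441341716183 */
    return 0;
}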
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/idct_mmx_xvid.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,525 @@ +/* + * XVID MPEG-4 VIDEO CODEC + * - MMX and XMM forward discrete cosine transform - + * + * Copyright(C) 2001 Peter Ross <pross@xvid.org> + * + * Originally provided by Intel at AP-922 + * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm + * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm) + * but in a limited edition. + * New macro implements a column part for precise iDCT + * The routine precision now satisfies IEEE standard 1180-1990. + * + * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru> + * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org> + * + * http://www.elecard.com/peter/idct.html + * http://www.linuxvideo.org/mpeg2dec/ + * + * These examples contain code fragments for first stage iDCT 8x8 + * (for rows) and first stage DCT 8x8 (for columns) + * + * conversion to gcc syntax by Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with FFmpeg; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <inttypes.h> +#include "libavcodec/avcodec.h" + +//============================================================================= +// Macros and other preprocessor constants +//============================================================================= + +#define BITS_INV_ACC 5 // 4 or 5 for IEEE +#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11 +#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6 +#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) +#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) +#define RND_INV_CORR (RND_INV_COL - 1) + +#define BITS_FRW_ACC 3 // 2 or 3 for accuracy +#define SHIFT_FRW_COL BITS_FRW_ACC +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) +#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1)) + + +//----------------------------------------------------------------------------- +// Various memory constants (trigonometric values or rounding values) +//----------------------------------------------------------------------------- + + +DECLARE_ALIGNED(8, static const int16_t, tg_1_16[4*4]) = { + 13036,13036,13036,13036, // tg * (2<<16) + 0.5 + 27146,27146,27146,27146, // tg * (2<<16) + 0.5 + -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 + 23170,23170,23170,23170}; // cos * (2<<15) + 0.5 + +DECLARE_ALIGNED(8, static const int32_t, rounder_0[2*8]) = { + 65536,65536, + 3597,3597, + 2260,2260, + 1203,1203, + 0,0, + 120,120, + 512,512, + 512,512}; + +//----------------------------------------------------------------------------- +// +// The first stage iDCT 8x8 - inverse DCTs of rows +// +//----------------------------------------------------------------------------- +// The 8-point inverse DCT direct algorithm +//----------------------------------------------------------------------------- +// +// static const short 
w[32] = { +// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16), +// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16), +// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16), +// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16), +// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16), +// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16), +// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16), +// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) }; +// +// #define DCT_8_INV_ROW(x, y) +// { +// int a0, a1, a2, a3, b0, b1, b2, b3; +// +// a0 =x[0]*w[0]+x[2]*w[1]+x[4]*w[2]+x[6]*w[3]; +// a1 =x[0]*w[4]+x[2]*w[5]+x[4]*w[6]+x[6]*w[7]; +// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11]; +// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15]; +// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19]; +// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23]; +// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27]; +// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31]; +// +// y[0] = SHIFT_ROUND ( a0 + b0 ); +// y[1] = SHIFT_ROUND ( a1 + b1 ); +// y[2] = SHIFT_ROUND ( a2 + b2 ); +// y[3] = SHIFT_ROUND ( a3 + b3 ); +// y[4] = SHIFT_ROUND ( a3 - b3 ); +// y[5] = SHIFT_ROUND ( a2 - b2 ); +// y[6] = SHIFT_ROUND ( a1 - b1 ); +// y[7] = SHIFT_ROUND ( a0 - b0 ); +// } +// +//----------------------------------------------------------------------------- +// +// In this implementation the outputs of the iDCT-1D are multiplied +// for rows 0,4 - by cos_4_16, +// for rows 1,7 - by cos_1_16, +// for rows 2,6 - by cos_2_16, +// for rows 3,5 - by cos_3_16 +// and are shifted to the left for better accuracy +// +// For the constants used, +// FIX(float_const) = (short) (float_const * (1<<15) + 0.5) +// +//----------------------------------------------------------------------------- + +//----------------------------------------------------------------------------- +// Tables for mmx processors +//----------------------------------------------------------------------------- + +// Table for rows 0,4 - constants are multiplied by cos_4_16 +DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx[32*4]) = { + 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 + 21407,8867,8867,-21407, // w07 w05 w03 w01 + 16384,-16384,16384,16384, // w14 w12 w10 w08 + -8867,21407,-21407,-8867, // w15 w13 w11 w09 + 22725,12873,19266,-22725, // w22 w20 w18 w16 + 19266,4520,-4520,-12873, // w23 w21 w19 w17 + 12873,4520,4520,19266, // w30 w28 w26 w24 + -22725,19266,-12873,-22725, // w31 w29 w27 w25 +// Table for rows 1,7 - constants are multiplied by cos_1_16 + 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00 + 29692,12299,12299,-29692, // w07 w05 w03 w01 + 22725,-22725,22725,22725, // w14 w12 w10 w08 + -12299,29692,-29692,-12299, // w15 w13 w11 w09 + 31521,17855,26722,-31521, // w22 w20 w18 w16 + 26722,6270,-6270,-17855, // w23 w21 w19 w17 + 17855,6270,6270,26722, // w30 w28 w26 w24 + -31521,26722,-17855,-31521, // w31 w29 w27 w25 +// Table for rows 2,6 - constants are multiplied by cos_2_16 + 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00 + 27969,11585,11585,-27969, // w07 w05 w03 w01 + 21407,-21407,21407,21407, // w14 w12 w10 w08 + -11585,27969,-27969,-11585, // w15 w13 w11 w09 + 29692,16819,25172,-29692, // w22 w20 w18 w16 + 25172,5906,-5906,-16819, // w23 w21 w19 w17 + 16819,5906,5906,25172, // w30 w28 w26 w24 + -29692,25172,-16819,-29692, // w31 w29 w27 w25 +// Table for rows 
3,5 - constants are multiplied by cos_3_16 + 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00 + 25172,10426,10426,-25172, // w07 w05 w03 w01 + 19266,-19266,19266,19266, // w14 w12 w10 w08 + -10426,25172,-25172,-10426, // w15 w13 w11 w09 + 26722,15137,22654,-26722, // w22 w20 w18 w16 + 22654,5315,-5315,-15137, // w23 w21 w19 w17 + 15137,5315,5315,22654, // w30 w28 w26 w24 + -26722,22654,-15137,-26722, // w31 w29 w27 w25 +}; +//----------------------------------------------------------------------------- +// Tables for xmm processors +//----------------------------------------------------------------------------- + +// %3 for rows 0,4 - constants are multiplied by cos_4_16 +DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm[32*4]) = { + 16384,21407,16384,8867, // movq-> w05 w04 w01 w00 + 16384,8867,-16384,-21407, // w07 w06 w03 w02 + 16384,-8867,16384,-21407, // w13 w12 w09 w08 + -16384,21407,16384,-8867, // w15 w14 w11 w10 + 22725,19266,19266,-4520, // w21 w20 w17 w16 + 12873,4520,-22725,-12873, // w23 w22 w19 w18 + 12873,-22725,4520,-12873, // w29 w28 w25 w24 + 4520,19266,19266,-22725, // w31 w30 w27 w26 +// %3 for rows 1,7 - constants are multiplied by cos_1_16 + 22725,29692,22725,12299, // movq-> w05 w04 w01 w00 + 22725,12299,-22725,-29692, // w07 w06 w03 w02 + 22725,-12299,22725,-29692, // w13 w12 w09 w08 + -22725,29692,22725,-12299, // w15 w14 w11 w10 + 31521,26722,26722,-6270, // w21 w20 w17 w16 + 17855,6270,-31521,-17855, // w23 w22 w19 w18 + 17855,-31521,6270,-17855, // w29 w28 w25 w24 + 6270,26722,26722,-31521, // w31 w30 w27 w26 +// %3 for rows 2,6 - constants are multiplied by cos_2_16 + 21407,27969,21407,11585, // movq-> w05 w04 w01 w00 + 21407,11585,-21407,-27969, // w07 w06 w03 w02 + 21407,-11585,21407,-27969, // w13 w12 w09 w08 + -21407,27969,21407,-11585, // w15 w14 w11 w10 + 29692,25172,25172,-5906, // w21 w20 w17 w16 + 16819,5906,-29692,-16819, // w23 w22 w19 w18 + 16819,-29692,5906,-16819, // w29 w28 w25 w24 + 5906,25172,25172,-29692, // w31 w30 w27 w26 +// %3 for rows 3,5 - constants are multiplied by cos_3_16 + 19266,25172,19266,10426, // movq-> w05 w04 w01 w00 + 19266,10426,-19266,-25172, // w07 w06 w03 w02 + 19266,-10426,19266,-25172, // w13 w12 w09 w08 + -19266,25172,19266,-10426, // w15 w14 w11 w10 + 26722,22654,22654,-5315, // w21 w20 w17 w16 + 15137,5315,-26722,-15137, // w23 w22 w19 w18 + 15137,-26722,5315,-15137, // w29 w28 w25 w24 + 5315,22654,22654,-26722, // w31 w30 w27 w26 +}; +//============================================================================= +// Helper macros for the code +//============================================================================= + +//----------------------------------------------------------------------------- +// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER +//----------------------------------------------------------------------------- + +#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\ + "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ + "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ + "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ + "movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\ + "punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\ + "movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\ + "punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\ + "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\ + "punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\ + "pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\ + "movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\ + "movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\ + "punpckldq %%mm2,%%mm2 \n\t"/* 
x6 x2 x6 x2*/\ + "pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\ + "punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\ + "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\ + "punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\ + "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\ + "pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\ + "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ + "pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\ + "pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\ + "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ + "pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\ + "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ + "pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\ + "paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ + "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ + "psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\ + "psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\ + "paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\ + "paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\ + "psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\ + "paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\ + "movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\ + "paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\ + "psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\ + "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ + "psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\ + "packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\ + "packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\ + "movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\ + "psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\ + "pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\ + "movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\ + "por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\ + "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ + + +//----------------------------------------------------------------------------- +// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER +//----------------------------------------------------------------------------- + +#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\ + "movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ + "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ + "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ + "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\ + "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\ + "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\ + "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\ + "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ + "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\ + "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\ + "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ + "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\ + "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\ + "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ + "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\ + "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ + "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ + "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ + "paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ + "pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\ + "movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ + "pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\ + "paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ + "pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\ + "paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\ + "paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ + "psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\ + "paddd %%mm1,%%mm0 \n\t"/* 1 ; 
a3=sum(even3) a2=sum(even2)*/\ + "psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\ + "movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\ + "paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\ + "paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\ + "psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\ + "psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\ + "psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ + "psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\ + "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\ + "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\ + "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ + "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\ + "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ + + +//----------------------------------------------------------------------------- +// +// The first stage DCT 8x8 - forward DCTs of columns +// +// The %2puts are multiplied +// for rows 0,4 - on cos_4_16, +// for rows 1,7 - on cos_1_16, +// for rows 2,6 - on cos_2_16, +// for rows 3,5 - on cos_3_16 +// and are shifted to the left for rise of accuracy +// +//----------------------------------------------------------------------------- +// +// The 8-point scaled forward DCT algorithm (26a8m) +// +//----------------------------------------------------------------------------- +// +// #define DCT_8_FRW_COL(x, y) +//{ +// short t0, t1, t2, t3, t4, t5, t6, t7; +// short tp03, tm03, tp12, tm12, tp65, tm65; +// short tp465, tm465, tp765, tm765; +// +// t0 = LEFT_SHIFT ( x[0] + x[7] ); +// t1 = LEFT_SHIFT ( x[1] + x[6] ); +// t2 = LEFT_SHIFT ( x[2] + x[5] ); +// t3 = LEFT_SHIFT ( x[3] + x[4] ); +// t4 = LEFT_SHIFT ( x[3] - x[4] ); +// t5 = LEFT_SHIFT ( x[2] - x[5] ); +// t6 = LEFT_SHIFT ( x[1] - x[6] ); +// t7 = LEFT_SHIFT ( x[0] - x[7] ); +// +// tp03 = t0 + t3; +// tm03 = t0 - t3; +// tp12 = t1 + t2; +// tm12 = t1 - t2; +// +// y[0] = tp03 + tp12; +// y[4] = tp03 - tp12; +// +// y[2] = tm03 + tm12 * tg_2_16; +// y[6] = tm03 * tg_2_16 - tm12; +// +// tp65 =(t6 +t5 )*cos_4_16; +// tm65 =(t6 -t5 )*cos_4_16; +// +// tp765 = t7 + tp65; +// tm765 = t7 - tp65; +// tp465 = t4 + tm65; +// tm465 = t4 - tm65; +// +// y[1] = tp765 + tp465 * tg_1_16; +// y[7] = tp765 * tg_1_16 - tp465; +// y[5] = tm765 * tg_3_16 + tm465; +// y[3] = tm765 - tm465 * tg_3_16; +//} +// +//----------------------------------------------------------------------------- + +//----------------------------------------------------------------------------- +// DCT_8_INV_COL_4 INP,OUT +//----------------------------------------------------------------------------- + +#define DCT_8_INV_COL(A1,A2)\ + "movq 2*8(%3),%%mm0\n\t"\ + "movq 16*3+" #A1 ",%%mm3\n\t"\ + "movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\ + "movq 16*5+" #A1 ",%%mm5\n\t"\ + "pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\ + "movq (%3),%%mm4\n\t"\ + "pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\ + "movq 16*7+" #A1 ",%%mm7\n\t"\ + "movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\ + "movq 16*1+" #A1 ",%%mm6\n\t"\ + "pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\ + "paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\ + "pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\ + "paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\ + "psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\ + "movq 3*8(%3),%%mm3\n\t"\ + "paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\ + "paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\ + "psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\ + "movq %%mm4,%%mm5 \n\t"/* tp17*/\ + "movq %%mm2,%%mm6 \n\t"/* tm17*/\ + "paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\ + "psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\ + "psubsw %%mm1,%%mm4 \n\t"/* 
tp17-tp35 = t1*/\ + "paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\ + "movq 1*8(%3),%%mm7\n\t"\ + "movq %%mm4,%%mm1 \n\t"/* t1*/\ + "movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\ + "paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\ + "movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\ + "psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\ + "movq 2*16+" #A1 ",%%mm5\n\t"\ + "movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\ + "movq 6*16+" #A1 ",%%mm6\n\t"\ + "pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\ + "pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\ + "pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\ + "movq 0*16+" #A1 ",%%mm2\n\t"\ + "pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\ + "psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\ + "movq %%mm2,%%mm3 \n\t"/* x0*/\ + "movq 4*16+" #A1 ",%%mm6\n\t"\ + "paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\ + "paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\ + "psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\ + "movq %%mm2,%%mm5 \n\t"/* tp04*/\ + "movq %%mm3,%%mm6 \n\t"/* tm04*/\ + "psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\ + "paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\ + "paddsw %%mm1,%%mm1 \n\t"/* b1*/\ + "paddsw %%mm4,%%mm4 \n\t"/* b2*/\ + "paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\ + "psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\ + "movq %%mm3,%%mm7 \n\t"/* a1*/\ + "movq %%mm6,%%mm0 \n\t"/* a2*/\ + "paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\ + "paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\ + "psraw $6,%%mm3 \n\t"/* dst1*/\ + "psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\ + "psraw $6,%%mm6 \n\t"/* dst2*/\ + "psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\ + "movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\ + "psraw $6,%%mm7 \n\t"/* dst6*/\ + "movq %%mm5,%%mm4 \n\t"/* a0*/\ + "psraw $6,%%mm0 \n\t"/* dst5*/\ + "movq %%mm3,1*16+" #A2 "\n\t"\ + "paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\ + "movq %%mm6,2*16+" #A2 "\n\t"\ + "psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\ + "movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\ + "psraw $6,%%mm5 \n\t"/* dst0*/\ + "movq %%mm2,%%mm6 \n\t"/* a3*/\ + "psraw $6,%%mm4 \n\t"/* dst7*/\ + "movq %%mm0,5*16+" #A2 "\n\t"\ + "paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\ + "movq %%mm7,6*16+" #A2 "\n\t"\ + "psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\ + "movq %%mm5,0*16+" #A2 "\n\t"\ + "psraw $6,%%mm2 \n\t"/* dst3*/\ + "movq %%mm4,7*16+" #A2 "\n\t"\ + "psraw $6,%%mm6 \n\t"/* dst4*/\ + "movq %%mm2,3*16+" #A2 "\n\t"\ + "movq %%mm6,4*16+" #A2 "\n\t" + +//============================================================================= +// Code +//============================================================================= + +//----------------------------------------------------------------------------- +// void idct_mmx(uint16_t block[64]); +//----------------------------------------------------------------------------- + + +void ff_idct_xvid_mmx(short *block){ +__asm__ volatile( + //# Process each row + DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) + DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) + DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) + DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) + DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) + DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) + DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) + DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) + + //# Process the columns (4 at a time) + DCT_8_INV_COL(0(%0), 0(%0)) + DCT_8_INV_COL(8(%0), 8(%0)) + :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16)); +} + +//----------------------------------------------------------------------------- +// void idct_xmm(uint16_t block[64]); 
+//----------------------------------------------------------------------------- + + +void ff_idct_xvid_mmx2(short *block){ +__asm__ volatile( + //# Process each row + DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) + DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) + DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) + DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) + DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) + DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) + DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) + DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) + + //# Process the columns (4 at a time) + DCT_8_INV_COL(0(%0), 0(%0)) + DCT_8_INV_COL(8(%0), 8(%0)) + :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16)); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/idct_sse2_xvid.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,394 @@ +/* + * XVID MPEG-4 VIDEO CODEC + * - SSE2 inverse discrete cosine transform - + * + * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net> + * + * Conversion to gcc syntax with modifications + * by Alexander Strange <astrange@ithinksw.com> + * + * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. + * + * This file is part of FFmpeg. + * + * Vertical pass is an implementation of the scheme: + * Loeffler C., Ligtenberg A., and Moschytz C.S.: + * Practical Fast 1D DCT Algorithm with Eleven Multiplications, + * Proc. ICASSP 1989, 988-991. + * + * Horizontal pass is a double 4x4 vector/matrix multiplication, + * (see also Intel's Application Note 922: + * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm + * Copyright (C) 1999 Intel Corporation) + * + * More details at http://skal.planet-d.net/coding/dct.html + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with FFmpeg; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/dsputil.h" +#include "idct_xvid.h" + +/*! + * @file idct_sse2_xvid.c + * @brief SSE2 idct compatible with xvidmmx + */ + +#define X8(x) x,x,x,x,x,x,x,x + +#define ROW_SHIFT 11 +#define COL_SHIFT 6 + +DECLARE_ASM_CONST(16, int16_t, tan1[]) = {X8(13036)}; // tan( pi/16) +DECLARE_ASM_CONST(16, int16_t, tan2[]) = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1 +DECLARE_ASM_CONST(16, int16_t, tan3[]) = {X8(43790)}; // tan(3pi/16)-1 +DECLARE_ASM_CONST(16, int16_t, sqrt2[])= {X8(23170)}; // 0.5/sqrt(2) +DECLARE_ASM_CONST(8, uint8_t, m127[]) = {X8(127)}; + +DECLARE_ASM_CONST(16, int16_t, iTab1[]) = { + 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, + 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, + 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, + 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b +}; + +DECLARE_ASM_CONST(16, int16_t, iTab2[]) = { + 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, + 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, + 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, + 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df +}; + +DECLARE_ASM_CONST(16, int16_t, iTab3[]) = { + 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, + 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, + 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, + 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 +}; + +DECLARE_ASM_CONST(16, int16_t, iTab4[]) = { + 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746, + 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, + 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, + 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e +}; + +DECLARE_ASM_CONST(16, 
int32_t, walkenIdctRounders[]) = { + 65536, 65536, 65536, 65536, + 3597, 3597, 3597, 3597, + 2260, 2260, 2260, 2260, + 1203, 1203, 1203, 1203, + 120, 120, 120, 120, + 512, 512, 512, 512 +}; + +// Temporary storage before the column pass +#define ROW1 "%%xmm6" +#define ROW3 "%%xmm4" +#define ROW5 "%%xmm5" +#define ROW7 "%%xmm7" + +#define CLEAR_ODD(r) "pxor "r","r" \n\t" +#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t" + +#ifdef ARCH_X86_64 + +# define ROW0 "%%xmm8" +# define REG0 ROW0 +# define ROW2 "%%xmm9" +# define REG2 ROW2 +# define ROW4 "%%xmm10" +# define REG4 ROW4 +# define ROW6 "%%xmm11" +# define REG6 ROW6 +# define CLEAR_EVEN(r) CLEAR_ODD(r) +# define PUT_EVEN(dst) PUT_ODD(dst) +# define XMMS "%%xmm12" +# define MOV_32_ONLY "#" +# define SREG2 REG2 +# define TAN3 "%%xmm13" +# define TAN1 "%%xmm14" + +#else + +# define ROW0 "(%0)" +# define REG0 "%%xmm4" +# define ROW2 "2*16(%0)" +# define REG2 "%%xmm4" +# define ROW4 "4*16(%0)" +# define REG4 "%%xmm6" +# define ROW6 "6*16(%0)" +# define REG6 "%%xmm6" +# define CLEAR_EVEN(r) +# define PUT_EVEN(dst) \ + "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \ + "movdqa %%xmm2, "dst" \n\t" +# define XMMS "%%xmm2" +# define MOV_32_ONLY "movdqa " +# define SREG2 "%%xmm7" +# define TAN3 "%%xmm0" +# define TAN1 "%%xmm2" + +#endif + +#define ROUND(x) "paddd "MANGLE(x) + +#define JZ(reg, to) \ + "testl "reg","reg" \n\t" \ + "jz "to" \n\t" + +#define JNZ(reg, to) \ + "testl "reg","reg" \n\t" \ + "jnz "to" \n\t" + +#define TEST_ONE_ROW(src, reg, clear) \ + clear \ + "movq "src", %%mm1 \n\t" \ + "por 8+"src", %%mm1 \n\t" \ + "paddusb %%mm0, %%mm1 \n\t" \ + "pmovmskb %%mm1, "reg" \n\t" + +#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ + clear1 \ + clear2 \ + "movq "row1", %%mm1 \n\t" \ + "por 8+"row1", %%mm1 \n\t" \ + "movq "row2", %%mm2 \n\t" \ + "por 8+"row2", %%mm2 \n\t" \ + "paddusb %%mm0, %%mm1 \n\t" \ + "paddusb %%mm0, %%mm2 \n\t" \ + "pmovmskb %%mm1, "reg1" \n\t" \ + "pmovmskb %%mm2, "reg2" \n\t" + +///IDCT pass on rows. +#define iMTX_MULT(src, table, rounder, put) \ + "movdqa "src", %%xmm3 \n\t" \ + "movdqa %%xmm3, %%xmm0 \n\t" \ + "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \ + "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \ + "pmaddwd "table", %%xmm0 \n\t" \ + "pmaddwd 16+"table", %%xmm1 \n\t" \ + "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \ + "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \ + "pmaddwd 32+"table", %%xmm2 \n\t" \ + "pmaddwd 48+"table", %%xmm3 \n\t" \ + "paddd %%xmm1, %%xmm0 \n\t" \ + "paddd %%xmm3, %%xmm2 \n\t" \ + rounder", %%xmm0 \n\t" \ + "movdqa %%xmm2, %%xmm3 \n\t" \ + "paddd %%xmm0, %%xmm2 \n\t" \ + "psubd %%xmm3, %%xmm0 \n\t" \ + "psrad $11, %%xmm2 \n\t" \ + "psrad $11, %%xmm0 \n\t" \ + "packssdw %%xmm0, %%xmm2 \n\t" \ + put \ + "1: \n\t" + +#define iLLM_HEAD \ + "movdqa "MANGLE(tan3)", "TAN3" \n\t" \ + "movdqa "MANGLE(tan1)", "TAN1" \n\t" \ + +///IDCT pass on columns. 
+#define iLLM_PASS(dct) \ + "movdqa "TAN3", %%xmm1 \n\t" \ + "movdqa "TAN1", %%xmm3 \n\t" \ + "pmulhw %%xmm4, "TAN3" \n\t" \ + "pmulhw %%xmm5, %%xmm1 \n\t" \ + "paddsw %%xmm4, "TAN3" \n\t" \ + "paddsw %%xmm5, %%xmm1 \n\t" \ + "psubsw %%xmm5, "TAN3" \n\t" \ + "paddsw %%xmm4, %%xmm1 \n\t" \ + "pmulhw %%xmm7, %%xmm3 \n\t" \ + "pmulhw %%xmm6, "TAN1" \n\t" \ + "paddsw %%xmm6, %%xmm3 \n\t" \ + "psubsw %%xmm7, "TAN1" \n\t" \ + "movdqa %%xmm3, %%xmm7 \n\t" \ + "movdqa "TAN1", %%xmm6 \n\t" \ + "psubsw %%xmm1, %%xmm3 \n\t" \ + "psubsw "TAN3", "TAN1" \n\t" \ + "paddsw %%xmm7, %%xmm1 \n\t" \ + "paddsw %%xmm6, "TAN3" \n\t" \ + "movdqa %%xmm3, %%xmm6 \n\t" \ + "psubsw "TAN3", %%xmm3 \n\t" \ + "paddsw %%xmm6, "TAN3" \n\t" \ + "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ + "pmulhw %%xmm4, %%xmm3 \n\t" \ + "pmulhw %%xmm4, "TAN3" \n\t" \ + "paddsw "TAN3", "TAN3" \n\t" \ + "paddsw %%xmm3, %%xmm3 \n\t" \ + "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \ + MOV_32_ONLY ROW2", "REG2" \n\t" \ + MOV_32_ONLY ROW6", "REG6" \n\t" \ + "movdqa %%xmm7, %%xmm5 \n\t" \ + "pmulhw "REG6", %%xmm7 \n\t" \ + "pmulhw "REG2", %%xmm5 \n\t" \ + "paddsw "REG2", %%xmm7 \n\t" \ + "psubsw "REG6", %%xmm5 \n\t" \ + MOV_32_ONLY ROW0", "REG0" \n\t" \ + MOV_32_ONLY ROW4", "REG4" \n\t" \ + MOV_32_ONLY" "TAN1", (%0) \n\t" \ + "movdqa "REG0", "XMMS" \n\t" \ + "psubsw "REG4", "REG0" \n\t" \ + "paddsw "XMMS", "REG4" \n\t" \ + "movdqa "REG4", "XMMS" \n\t" \ + "psubsw %%xmm7, "REG4" \n\t" \ + "paddsw "XMMS", %%xmm7 \n\t" \ + "movdqa "REG0", "XMMS" \n\t" \ + "psubsw %%xmm5, "REG0" \n\t" \ + "paddsw "XMMS", %%xmm5 \n\t" \ + "movdqa %%xmm5, "XMMS" \n\t" \ + "psubsw "TAN3", %%xmm5 \n\t" \ + "paddsw "XMMS", "TAN3" \n\t" \ + "movdqa "REG0", "XMMS" \n\t" \ + "psubsw %%xmm3, "REG0" \n\t" \ + "paddsw "XMMS", %%xmm3 \n\t" \ + MOV_32_ONLY" (%0), "TAN1" \n\t" \ + "psraw $6, %%xmm5 \n\t" \ + "psraw $6, "REG0" \n\t" \ + "psraw $6, "TAN3" \n\t" \ + "psraw $6, %%xmm3 \n\t" \ + "movdqa "TAN3", 1*16("dct") \n\t" \ + "movdqa %%xmm3, 2*16("dct") \n\t" \ + "movdqa "REG0", 5*16("dct") \n\t" \ + "movdqa %%xmm5, 6*16("dct") \n\t" \ + "movdqa %%xmm7, %%xmm0 \n\t" \ + "movdqa "REG4", %%xmm4 \n\t" \ + "psubsw %%xmm1, %%xmm7 \n\t" \ + "psubsw "TAN1", "REG4" \n\t" \ + "paddsw %%xmm0, %%xmm1 \n\t" \ + "paddsw %%xmm4, "TAN1" \n\t" \ + "psraw $6, %%xmm1 \n\t" \ + "psraw $6, %%xmm7 \n\t" \ + "psraw $6, "TAN1" \n\t" \ + "psraw $6, "REG4" \n\t" \ + "movdqa %%xmm1, ("dct") \n\t" \ + "movdqa "TAN1", 3*16("dct") \n\t" \ + "movdqa "REG4", 4*16("dct") \n\t" \ + "movdqa %%xmm7, 7*16("dct") \n\t" + +///IDCT pass on columns, assuming rows 4-7 are zero. 
+#define iLLM_PASS_SPARSE(dct) \ + "pmulhw %%xmm4, "TAN3" \n\t" \ + "paddsw %%xmm4, "TAN3" \n\t" \ + "movdqa %%xmm6, %%xmm3 \n\t" \ + "pmulhw %%xmm6, "TAN1" \n\t" \ + "movdqa %%xmm4, %%xmm1 \n\t" \ + "psubsw %%xmm1, %%xmm3 \n\t" \ + "paddsw %%xmm6, %%xmm1 \n\t" \ + "movdqa "TAN1", %%xmm6 \n\t" \ + "psubsw "TAN3", "TAN1" \n\t" \ + "paddsw %%xmm6, "TAN3" \n\t" \ + "movdqa %%xmm3, %%xmm6 \n\t" \ + "psubsw "TAN3", %%xmm3 \n\t" \ + "paddsw %%xmm6, "TAN3" \n\t" \ + "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ + "pmulhw %%xmm4, %%xmm3 \n\t" \ + "pmulhw %%xmm4, "TAN3" \n\t" \ + "paddsw "TAN3", "TAN3" \n\t" \ + "paddsw %%xmm3, %%xmm3 \n\t" \ + "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \ + MOV_32_ONLY ROW2", "SREG2" \n\t" \ + "pmulhw "SREG2", %%xmm5 \n\t" \ + MOV_32_ONLY ROW0", "REG0" \n\t" \ + "movdqa "REG0", %%xmm6 \n\t" \ + "psubsw "SREG2", %%xmm6 \n\t" \ + "paddsw "REG0", "SREG2" \n\t" \ + MOV_32_ONLY" "TAN1", (%0) \n\t" \ + "movdqa "REG0", "XMMS" \n\t" \ + "psubsw %%xmm5, "REG0" \n\t" \ + "paddsw "XMMS", %%xmm5 \n\t" \ + "movdqa %%xmm5, "XMMS" \n\t" \ + "psubsw "TAN3", %%xmm5 \n\t" \ + "paddsw "XMMS", "TAN3" \n\t" \ + "movdqa "REG0", "XMMS" \n\t" \ + "psubsw %%xmm3, "REG0" \n\t" \ + "paddsw "XMMS", %%xmm3 \n\t" \ + MOV_32_ONLY" (%0), "TAN1" \n\t" \ + "psraw $6, %%xmm5 \n\t" \ + "psraw $6, "REG0" \n\t" \ + "psraw $6, "TAN3" \n\t" \ + "psraw $6, %%xmm3 \n\t" \ + "movdqa "TAN3", 1*16("dct") \n\t" \ + "movdqa %%xmm3, 2*16("dct") \n\t" \ + "movdqa "REG0", 5*16("dct") \n\t" \ + "movdqa %%xmm5, 6*16("dct") \n\t" \ + "movdqa "SREG2", %%xmm0 \n\t" \ + "movdqa %%xmm6, %%xmm4 \n\t" \ + "psubsw %%xmm1, "SREG2" \n\t" \ + "psubsw "TAN1", %%xmm6 \n\t" \ + "paddsw %%xmm0, %%xmm1 \n\t" \ + "paddsw %%xmm4, "TAN1" \n\t" \ + "psraw $6, %%xmm1 \n\t" \ + "psraw $6, "SREG2" \n\t" \ + "psraw $6, "TAN1" \n\t" \ + "psraw $6, %%xmm6 \n\t" \ + "movdqa %%xmm1, ("dct") \n\t" \ + "movdqa "TAN1", 3*16("dct") \n\t" \ + "movdqa %%xmm6, 4*16("dct") \n\t" \ + "movdqa "SREG2", 7*16("dct") \n\t" + +inline void ff_idct_xvid_sse2(short *block) +{ + __asm__ volatile( + "movq "MANGLE(m127)", %%mm0 \n\t" + iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) + iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) + iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) + + TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) + JZ("%%eax", "1f") + iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) + + TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) + TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) + iLLM_HEAD + ASMALIGN(4) + JNZ("%%ecx", "2f") + JNZ("%%eax", "3f") + JNZ("%%edx", "4f") + JNZ("%%esi", "5f") + iLLM_PASS_SPARSE("%0") + "jmp 6f \n\t" + "2: \n\t" + iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) + "3: \n\t" + iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) + JZ("%%edx", "1f") + "4: \n\t" + iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) + JZ("%%esi", "1f") + "5: \n\t" + iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) +#ifndef ARCH_X86_64 + iLLM_HEAD +#endif + iLLM_PASS("%0") + "6: \n\t" + : "+r"(block) + : + : "%eax", "%ecx", "%edx", "%esi", "memory"); +} + +void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) +{ + ff_idct_xvid_sse2(block); + put_pixels_clamped_mmx(block, dest, line_size); +} + +void 
ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) +{ + ff_idct_xvid_sse2(block); + add_pixels_clamped_mmx(block, dest, line_size); +}
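The _put and _add wrappers above differ only in how the transformed block reaches the picture: put_pixels_clamped_mmx stores the 64 coefficients clamped to 0..255, while add_pixels_clamped_mmx adds them to the existing pixels before clamping. A plain-C sketch of the _put step (illustrative only, not the MMX implementation):

#include <stdint.h>

/* Clamp each of the 8x8 transformed coefficients to 0..255 and store them
 * into the destination plane, one row every line_size bytes -- the step the
 * _put wrapper performs after ff_idct_xvid_sse2() has run in place. */
static void put_pixels_clamped_ref(const short *block, uint8_t *dest, int line_size)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            dest[j] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }
        dest += line_size;
    }
}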
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/idct_xvid.h Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,37 @@ +/* + * XVID MPEG-4 VIDEO CODEC + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/*! + * @file idct_xvid.h + * header for Xvid IDCT functions + */ + +#ifndef AVCODEC_X86_IDCT_XVID_H +#define AVCODEC_X86_IDCT_XVID_H + +#include <stdint.h> + +void ff_idct_xvid_mmx(short *block); +void ff_idct_xvid_mmx2(short *block); +void ff_idct_xvid_sse2(short *block); +void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block); +void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block); + +#endif /* AVCODEC_X86_IDCT_XVID_H */
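All three transforms declared above operate in place on a row-major 8x8 block of 16-bit coefficients; the sse2 _put/_add variants additionally write the result into a picture plane. A hypothetical caller (the helper name and CPU check below are illustrative only, not how dsputil wires these up) might use them like this:

#include <stdint.h>
#include "idct_xvid.h"

/* have_sse2 would come from the caller's own CPU detection; this sketch only
 * shows the calling convention of the functions declared in idct_xvid.h. */
static void idct_put_sketch(uint8_t *dest, int line_size, short block[64], int have_sse2)
{
    if (have_sse2) {
        ff_idct_xvid_sse2_put(dest, line_size, block); /* transform + clamp + store */
    } else {
        ff_idct_xvid_mmx2(block);                      /* in-place transform only   */
        /* the caller must then clamp/store block[] itself */
    }
}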
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/mathops.h Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,43 @@ +/* + * simple math operations + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_MATHOPS_H +#define AVCODEC_X86_MATHOPS_H + +#define MULL(ra, rb, shift) \ + ({ int rt, dummy; __asm__ (\ + "imull %3 \n\t"\ + "shrdl %4, %%edx, %%eax \n\t"\ + : "=a"(rt), "=d"(dummy)\ + : "a" ((int)ra), "rm" ((int)rb), "i"(shift));\ + rt; }) + +#define MULH(ra, rb) \ + ({ int rt, dummy;\ + __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)ra), "rm" ((int)rb));\ + rt; }) + +#define MUL64(ra, rb) \ + ({ int64_t rt;\ + __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)ra), "g" ((int)rb));\ + rt; }) + +#endif /* AVCODEC_X86_MATHOPS_H */
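Semantically, the three macros above are a widening signed 32x32->64 multiply followed by a shift (MULL, MULH) or by nothing at all (MUL64); the imull/shrdl pair in MULL keeps only the low 32 bits of the shifted product, hence the truncation below. The portable equivalents here are a sketch of that behaviour for reference, not taken from this changeset.

#include <stdint.h>

static inline int     mull_c(int a, int b, unsigned shift) { return (int)(((int64_t)a * b) >> shift); }
static inline int     mulh_c(int a, int b)                 { return (int)(((int64_t)a * b) >> 32);    }
static inline int64_t mul64_c(int a, int b)                { return (int64_t)a * b;                   }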
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/mmx.h Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,267 @@ +/* + * mmx.h + * Copyright (C) 1997-2001 H. Dietz and R. Fisher + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef AVCODEC_X86_MMX_H +#define AVCODEC_X86_MMX_H + +#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected. + + +#define mmx_i2r(op,imm,reg) \ + __asm__ volatile (#op " %0, %%" #reg \ + : /* nothing */ \ + : "i" (imm) ) + +#define mmx_m2r(op,mem,reg) \ + __asm__ volatile (#op " %0, %%" #reg \ + : /* nothing */ \ + : "m" (mem)) + +#define mmx_r2m(op,reg,mem) \ + __asm__ volatile (#op " %%" #reg ", %0" \ + : "=m" (mem) \ + : /* nothing */ ) + +#define mmx_r2r(op,regs,regd) \ + __asm__ volatile (#op " %" #regs ", %" #regd) + + +#define emms() __asm__ volatile ("emms") + +#define movd_m2r(var,reg) mmx_m2r (movd, var, reg) +#define movd_r2m(reg,var) mmx_r2m (movd, reg, var) +#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd) + +#define movq_m2r(var,reg) mmx_m2r (movq, var, reg) +#define movq_r2m(reg,var) mmx_r2m (movq, reg, var) +#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd) + +#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg) +#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd) +#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg) +#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd) + +#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg) +#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd) + +#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg) +#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd) +#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg) +#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd) +#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg) +#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd) + +#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg) +#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd) +#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg) +#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd) + +#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg) +#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd) +#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg) +#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd) + +#define pand_m2r(var,reg) mmx_m2r (pand, var, reg) +#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd) + +#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg) +#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd) + +#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg) +#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd) +#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg) +#define pcmpeqd_r2r(regs,regd) mmx_r2r 
(pcmpeqd, regs, regd) +#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg) +#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd) + +#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg) +#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd) +#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg) +#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd) +#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg) +#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd) + +#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg) +#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd) + +#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg) +#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd) + +#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg) +#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd) + +#define por_m2r(var,reg) mmx_m2r (por, var, reg) +#define por_r2r(regs,regd) mmx_r2r (por, regs, regd) + +#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg) +#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg) +#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd) +#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg) +#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg) +#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd) +#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg) +#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg) +#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd) + +#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg) +#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg) +#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd) +#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg) +#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg) +#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd) + +#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg) +#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg) +#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd) +#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg) +#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg) +#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd) +#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg) +#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg) +#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd) + +#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg) +#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd) +#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg) +#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd) +#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg) +#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd) + +#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg) +#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd) +#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg) +#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd) + +#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg) +#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd) +#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg) +#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd) + +#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg) +#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd) +#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg) +#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd) +#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg) +#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd) + +#define punpcklbw_m2r(var,reg) mmx_m2r 
(punpcklbw, var, reg) +#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd) +#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg) +#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd) +#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg) +#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd) + +#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg) +#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd) + + +/* 3DNOW extensions */ + +#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg) +#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd) + + +/* AMD MMX extensions - also available in intel SSE */ + + +#define mmx_m2ri(op,mem,reg,imm) \ + __asm__ volatile (#op " %1, %0, %%" #reg \ + : /* nothing */ \ + : "m" (mem), "i" (imm)) +#define mmx_r2ri(op,regs,regd,imm) \ + __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \ + : /* nothing */ \ + : "i" (imm) ) + +#define mmx_fetch(mem,hint) \ + __asm__ volatile ("prefetch" #hint " %0" \ + : /* nothing */ \ + : "m" (mem)) + + +#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) + +#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) + +#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg) +#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd) +#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg) +#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd) + +#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm) + +#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm) + +#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg) +#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd) + +#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg) +#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd) + +#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg) +#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd) + +#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg) +#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd) + +#define pmovmskb(mmreg,reg) \ + __asm__ volatile ("movmskps %" #mmreg ", %" #reg) + +#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg) +#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd) + +#define prefetcht0(mem) mmx_fetch (mem, t0) +#define prefetcht1(mem) mmx_fetch (mem, t1) +#define prefetcht2(mem) mmx_fetch (mem, t2) +#define prefetchnta(mem) mmx_fetch (mem, nta) + +#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg) +#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd) + +#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm) +#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm) + +#define sfence() __asm__ volatile ("sfence\n\t") + +/* SSE2 */ +#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm) +#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm) +#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm) +#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm) + +#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm) + +#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg) +#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var) +#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd) +#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg) +#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var) +#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd) + +#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var) + +#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg) +#define 
psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg) + +#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd) +#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd) + + +#endif /* AVCODEC_X86_MMX_H */
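As the #warning at the top of mmx.h states, new code is expected to use plain __asm__() blocks instead of these wrappers. For illustration, the x86-only fragment below (a hypothetical helper, not taken from the tree) does in one __asm__ statement what a movq_m2r / paddw_m2r / movq_r2m / emms sequence of wrappers would expand to:

#include <stdint.h>

/* Adds two packed groups of four 16-bit words in memory: *dst += *src.
 * Each comment names the deprecated wrapper the instruction corresponds to. */
static void paddw_words(uint64_t *dst, const uint64_t *src)
{
    __asm__ volatile(
        "movq  %1, %%mm0    \n\t"   /* movq_m2r  (*src, mm0)  */
        "paddw %0, %%mm0    \n\t"   /* paddw_m2r (*dst, mm0)  */
        "movq  %%mm0, %0    \n\t"   /* movq_r2m  (mm0, *dst)  */
        "emms               \n\t"   /* emms()                 */
        : "+m" (*dst)
        : "m"  (*src));
}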
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/motion_est_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,461 @@ +/* + * MMX optimized motion estimation + * Copyright (c) 2001 Fabrice Bellard. + * Copyright (c) 2002-2004 Michael Niedermayer + * + * mostly by Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" + +DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={ +0x0000000000000000ULL, +0x0001000100010001ULL, +0x0002000200020002ULL, +}; + +DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; + +static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + x86_reg len= -(stride*h); + __asm__ volatile( + ASMALIGN(4) + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + "add %3, %%"REG_a" \n\t" + "psubusb %%mm0, %%mm2 \n\t" + "psubusb %%mm4, %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm5 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm5, %%mm1 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "add %3, %%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) + ); +} + +static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + __asm__ volatile( + ASMALIGN(4) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); +} + +static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + int ret; + __asm__ volatile( + "pxor %%xmm6, %%xmm6 \n\t" + ASMALIGN(4) + "1: \n\t" + "movdqu (%1), %%xmm0 \n\t" + "movdqu (%1, %3), %%xmm1 \n\t" + "psadbw (%2), %%xmm0 \n\t" + "psadbw (%2, %3), %%xmm1 \n\t" + "paddw %%xmm0, %%xmm6 \n\t" + "paddw %%xmm1, %%xmm6 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); + __asm__ volatile( + "movhlps %%xmm6, %%xmm0 \n\t" + "paddw %%xmm0, %%xmm6 \n\t" + "movd %%xmm6, %0 \n\t" + : "=r"(ret) + ); + return ret; +} + +static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) 
+{ + __asm__ volatile( + ASMALIGN(4) + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "pavgb 1(%1), %%mm0 \n\t" + "pavgb 1(%1, %3), %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); +} + +static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "add %3, %1 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "pavgb %%mm1, %%mm0 \n\t" + "pavgb %%mm2, %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); +} + +static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + __asm__ volatile( + "movq "MANGLE(bone)", %%mm5 \n\t" + "movq (%1), %%mm0 \n\t" + "pavgb 1(%1), %%mm0 \n\t" + "add %3, %1 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1,%3), %%mm2 \n\t" + "pavgb 1(%1), %%mm1 \n\t" + "pavgb 1(%1,%3), %%mm2 \n\t" + "psubusb %%mm5, %%mm1 \n\t" + "pavgb %%mm1, %%mm0 \n\t" + "pavgb %%mm2, %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2,%3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); +} + +static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) +{ + x86_reg len= -(stride*h); + __asm__ volatile( + ASMALIGN(4) + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "paddw %%mm2, %%mm3 \n\t" + "movq (%3, %%"REG_a"), %%mm4 \n\t" + "movq (%3, %%"REG_a"), %%mm2 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "paddw %%mm5, %%mm3 \n\t" + "psrlw $1, %%mm1 \n\t" + "psrlw $1, %%mm3 \n\t" + "packuswb %%mm3, %%mm1 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm2, %%mm1 \n\t" + "por %%mm4, %%mm1 \n\t" + "movq %%mm1, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "add %4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) + ); +} + +static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + x86_reg len= -(stride*h); + __asm__ volatile( + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm3, %%mm1 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" + "movq 1(%2, %%"REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + 
"punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddw %%mm4, %%mm2 \n\t" + "paddw %%mm5, %%mm3 \n\t" + "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm3, %%mm1 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "movq (%3, %%"REG_a"), %%mm4 \n\t" + "movq (%3, %%"REG_a"), %%mm5 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "psubusb %%mm0, %%mm4 \n\t" + "psubusb %%mm5, %%mm0 \n\t" + "por %%mm4, %%mm0 \n\t" + "movq %%mm0, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm4, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "movq %%mm3, %%mm1 \n\t" + "add %4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) + ); +} + +static inline int sum_mmx(void) +{ + int ret; + __asm__ volatile( + "movq %%mm6, %%mm0 \n\t" + "psrlq $32, %%mm6 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "movq %%mm6, %%mm0 \n\t" + "psrlq $16, %%mm6 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "movd %%mm6, %0 \n\t" + : "=r" (ret) + ); + return ret&0xFFFF; +} + +static inline int sum_mmx2(void) +{ + int ret; + __asm__ volatile( + "movd %%mm6, %0 \n\t" + : "=r" (ret) + ); + return ret; +} + +static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + sad8_2_mmx(blk1, blk1+1, blk2, stride, h); +} +static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + sad8_2_mmx(blk1, blk1+stride, blk2, stride, h); +} + + +#define PIX_SAD(suf)\ +static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + assert(h==8);\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t":);\ +\ + sad8_1_ ## suf(blk1, blk2, stride, 8);\ +\ + return sum_ ## suf();\ +}\ +static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + assert(h==8);\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "movq %0, %%mm5 \n\t"\ + :: "m"(round_tab[1]) \ + );\ +\ + sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ +\ + return sum_ ## suf();\ +}\ +\ +static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + assert(h==8);\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "movq %0, %%mm5 \n\t"\ + :: "m"(round_tab[1]) \ + );\ +\ + sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ +\ + return sum_ ## suf();\ +}\ +\ +static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + assert(h==8);\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + ::);\ +\ + sad8_4_ ## suf(blk1, blk2, stride, 8);\ +\ + return sum_ ## suf();\ +}\ +\ +static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t":);\ +\ + sad8_1_ ## suf(blk1 , blk2 , stride, h);\ + sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ +\ + return sum_ ## suf();\ +}\ +static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "movq %0, %%mm5 \n\t"\ + :: "m"(round_tab[1]) \ + );\ +\ + sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ + sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ +\ + return sum_ ## suf();\ +}\ +static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor 
%%mm6, %%mm6 \n\t"\ + "movq %0, %%mm5 \n\t"\ + :: "m"(round_tab[1]) \ + );\ +\ + sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ + sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ +\ + return sum_ ## suf();\ +}\ +static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + ::);\ +\ + sad8_4_ ## suf(blk1 , blk2 , stride, h);\ + sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ +\ + return sum_ ## suf();\ +}\ + +PIX_SAD(mmx) +PIX_SAD(mmx2) + +void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) +{ + if (mm_flags & FF_MM_MMX) { + c->pix_abs[0][0] = sad16_mmx; + c->pix_abs[0][1] = sad16_x2_mmx; + c->pix_abs[0][2] = sad16_y2_mmx; + c->pix_abs[0][3] = sad16_xy2_mmx; + c->pix_abs[1][0] = sad8_mmx; + c->pix_abs[1][1] = sad8_x2_mmx; + c->pix_abs[1][2] = sad8_y2_mmx; + c->pix_abs[1][3] = sad8_xy2_mmx; + + c->sad[0]= sad16_mmx; + c->sad[1]= sad8_mmx; + } + if (mm_flags & FF_MM_MMXEXT) { + c->pix_abs[0][0] = sad16_mmx2; + c->pix_abs[1][0] = sad8_mmx2; + + c->sad[0]= sad16_mmx2; + c->sad[1]= sad8_mmx2; + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->pix_abs[0][1] = sad16_x2_mmx2; + c->pix_abs[0][2] = sad16_y2_mmx2; + c->pix_abs[0][3] = sad16_xy2_mmx2; + c->pix_abs[1][1] = sad8_x2_mmx2; + c->pix_abs[1][2] = sad8_y2_mmx2; + c->pix_abs[1][3] = sad8_xy2_mmx2; + } + } + if ((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)) { + c->sad[0]= sad16_sse2; + } +}
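[editor's note] The SAD kernels in motion_est_mmx.c above all accumulate the same scalar quantity: the psubusb-in-both-directions/por pair yields the per-byte |a - b|, and pavgb supplies the rounded average used by the half-pel variants. A minimal C sketch of what sad8_1_* and sad8_x2a_* compute, with illustrative helper names that are not part of this patch:

#include <stdint.h>
#include <stdlib.h>

/* Plain 8-wide SAD over h rows: the quantity sad8_1_mmx/sad8_1_mmx2 accumulate.
 * (psubusb in both directions followed by por gives the per-byte |a - b|.) */
static int sad8_c_ref(const uint8_t *blk1, const uint8_t *blk2, int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            sum += abs(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}

/* Horizontal half-pel case: round-average blk1[x] and blk1[x+1] (pavgb)
 * before taking the SAD against the reference block, as in sad8_x2a_mmx2. */
static int sad8_x2_c_ref(const uint8_t *blk1, const uint8_t *blk2, int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            sum += abs(((blk1[x] + blk1[x + 1] + 1) >> 1) - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}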
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/mpegvideo_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,654 @@ +/* + * The simplest mpeg encoder (well, it was the simplest!) + * Copyright (c) 2000,2001 Fabrice Bellard. + * + * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> + * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegvideo.h" +#include "dsputil_mmx.h" + +extern uint16_t inv_zigzag_direct16[64]; + + +static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + x86_reg level, qmul, qadd, nCoeffs; + + qmul = qscale << 1; + + assert(s->block_last_index[n]>=0 || s->h263_aic); + + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level= block[0]; + } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; +//printf("%d %d ", qmul, qadd); +__asm__ volatile( + "movd %1, %%mm6 \n\t" //qmul + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movd %2, %%mm5 \n\t" //qadd + "pxor %%mm7, %%mm7 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "psubw %%mm5, %%mm7 \n\t" + "pxor %%mm4, %%mm4 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %3), %%mm0 \n\t" + "movq 8(%0, %3), %%mm1 \n\t" + + "pmullw %%mm6, %%mm0 \n\t" + "pmullw %%mm6, %%mm1 \n\t" + + "movq (%0, %3), %%mm2 \n\t" + "movq 8(%0, %3), %%mm3 \n\t" + + "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + + "paddw %%mm7, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + + "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 + + "pandn %%mm2, %%mm0 \n\t" + "pandn %%mm3, %%mm1 \n\t" + + "movq %%mm0, (%0, %3) \n\t" + "movq %%mm1, 8(%0, %3) \n\t" + + "add $16, %3 \n\t" + "jng 1b \n\t" + ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) + : "memory" + ); + block[0]= level; +} + + +static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + x86_reg qmul, qadd, nCoeffs; + + qmul = qscale << 1; + qadd = (qscale - 1) | 1; + + assert(s->block_last_index[n]>=0 || s->h263_aic); + + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; +//printf("%d %d ", qmul, qadd); +__asm__ volatile( + "movd %1, %%mm6 \n\t" //qmul + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movd %2, %%mm5 \n\t" //qadd + "pxor %%mm7, %%mm7 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "psubw %%mm5, %%mm7 \n\t" + "pxor %%mm4, %%mm4 \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %3), %%mm0 \n\t" + "movq 8(%0, %3), %%mm1 \n\t" + + "pmullw %%mm6, %%mm0 \n\t" + "pmullw %%mm6, %%mm1 \n\t" + + "movq (%0, %3), %%mm2 \n\t" + "movq 8(%0, %3), %%mm3 \n\t" + + "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + + "paddw %%mm7, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + + "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 + + "pandn %%mm2, %%mm0 \n\t" + "pandn %%mm3, %%mm1 \n\t" + + "movq %%mm0, (%0, %3) \n\t" + "movq %%mm1, 8(%0, %3) \n\t" + + "add $16, %3 \n\t" + "jng 1b \n\t" + ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) + : "memory" + ); +} + + +/* + NK: + Note: looking at PARANOID: + "enable all paranoid tests for rounding, overflows, etc..." + +#ifdef PARANOID + if (level < -2048 || level > 2047) + fprintf(stderr, "unquant error %d %d\n", i, level); +#endif + We can suppose that result of two multiplications can't be greater than 0xFFFF + i.e. is 16-bit, so we use here only PMULLW instruction and can avoid + a complex multiplication. 
+===================================================== + Full formula for multiplication of 2 integer numbers + which are represent as high:low words: + input: value1 = high1:low1 + value2 = high2:low2 + output: value3 = value1*value2 + value3=high3:low3 (on overflow: modulus 2^32 wrap-around) + this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 + but this algorithm will compute only 0x66cb0ce4 + this limited by 16-bit size of operands + --------------------------------- + tlow1 = high1*low2 + tlow2 = high2*low1 + tlow1 = tlow1 + tlow2 + high3:low3 = low1*low2 + high3 += tlow1 +*/ +static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + x86_reg nCoeffs; + const uint16_t *quant_matrix; + int block0; + + assert(s->block_last_index[n]>=0); + + nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; + + if (n < 4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + /* XXX: only mpeg1 */ + quant_matrix = s->intra_matrix; +__asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "mov %3, %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq 8(%0, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm4 \n\t" + "movq 8(%1, %%"REG_a"), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 + "psraw $3, %%mm0 \n\t" + "psraw $3, %%mm1 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm1 \n\t" + "por %%mm7, %%mm0 \n\t" + "por %%mm7, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %%"REG_a") \n\t" + "movq %%mm5, 8(%0, %%"REG_a") \n\t" + + "add $16, %%"REG_a" \n\t" + "js 1b \n\t" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) + : "%"REG_a, "memory" + ); + block[0]= block0; +} + +static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + x86_reg nCoeffs; + const uint16_t *quant_matrix; + + assert(s->block_last_index[n]>=0); + + nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; + + quant_matrix = s->inter_matrix; +__asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "mov %3, %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq 8(%0, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm4 \n\t" + "movq 8(%1, %%"REG_a"), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 + "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 + "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 + "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 + "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q + "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 + "psraw $4, %%mm0 \n\t" + "psraw $4, %%mm1 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm1 \n\t" + "por %%mm7, %%mm0 \n\t" + "por %%mm7, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %%"REG_a") \n\t" + "movq %%mm5, 8(%0, %%"REG_a") \n\t" + + "add $16, %%"REG_a" \n\t" + "js 1b \n\t" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) + : "%"REG_a, "memory" + ); +} + +static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + x86_reg nCoeffs; + const uint16_t *quant_matrix; + int block0; + + assert(s->block_last_index[n]>=0); + + if(s->alternate_scan) nCoeffs= 63; //FIXME + else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; + + if (n < 4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + quant_matrix = s->intra_matrix; +__asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "mov %3, %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq 8(%0, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm4 \n\t" + "movq 8(%1, %%"REG_a"), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 + "psraw $3, %%mm0 \n\t" + "psraw $3, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %%"REG_a") \n\t" + "movq %%mm5, 8(%0, %%"REG_a") \n\t" + + "add $16, %%"REG_a" \n\t" + "jng 1b \n\t" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) + : "%"REG_a, "memory" + ); + block[0]= block0; + //Note, we do not do mismatch control for intra as errors cannot accumulate +} + +static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + x86_reg nCoeffs; + const uint16_t *quant_matrix; + + assert(s->block_last_index[n]>=0); + + if(s->alternate_scan) nCoeffs= 63; //FIXME + else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; + + quant_matrix = s->inter_matrix; +__asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlq $48, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "mov %3, %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq 8(%0, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm4 \n\t" + "movq 8(%1, %%"REG_a"), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 + "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q + "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q + "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 + "psrlw $4, %%mm0 \n\t" + "psrlw $4, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "pxor %%mm4, %%mm7 \n\t" + "pxor %%mm5, %%mm7 \n\t" + "movq %%mm4, (%0, %%"REG_a") \n\t" + "movq %%mm5, 8(%0, %%"REG_a") \n\t" + + "add $16, %%"REG_a" \n\t" + "jng 1b \n\t" + "movd 124(%0, %3), %%mm0 \n\t" + "movq %%mm7, %%mm6 \n\t" + "psrlq $32, %%mm7 \n\t" + "pxor %%mm6, %%mm7 \n\t" + "movq %%mm7, %%mm6 \n\t" + "psrlq $16, %%mm7 \n\t" + "pxor %%mm6, %%mm7 \n\t" + "pslld $31, %%mm7 \n\t" + "psrlq $15, %%mm7 \n\t" + "pxor %%mm7, %%mm0 \n\t" + "movd %%mm0, 124(%0, %3) \n\t" + + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) + : "%"REG_a, "memory" + ); +} + +static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ + const int intra= s->mb_intra; + int *sum= s->dct_error_sum[intra]; + uint16_t *offset= s->dct_offset[intra]; + + s->dct_count[intra]++; + + __asm__ volatile( + "pxor %%mm7, %%mm7 \n\t" + "1: \n\t" + "pxor %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "movq (%0), %%mm2 \n\t" + "movq 8(%0), %%mm3 \n\t" + "pcmpgtw %%mm2, %%mm0 \n\t" + "pcmpgtw %%mm3, %%mm1 \n\t" + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + "psubw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psubusw (%2), %%mm2 \n\t" + "psubusw 8(%2), %%mm3 \n\t" + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + "psubw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "movq %%mm2, (%0) \n\t" + "movq %%mm3, 8(%0) \n\t" + "movq %%mm4, %%mm2 \n\t" + "movq %%mm5, %%mm3 \n\t" + "punpcklwd %%mm7, %%mm4 \n\t" + "punpckhwd %%mm7, %%mm2 \n\t" + "punpcklwd %%mm7, %%mm5 \n\t" + "punpckhwd %%mm7, %%mm3 \n\t" + "paddd (%1), %%mm4 \n\t" + "paddd 8(%1), %%mm2 \n\t" + "paddd 16(%1), %%mm5 \n\t" + "paddd 24(%1), %%mm3 \n\t" + "movq %%mm4, (%1) \n\t" + "movq %%mm2, 8(%1) \n\t" + "movq %%mm5, 16(%1) \n\t" + "movq %%mm3, 24(%1) \n\t" + "add $16, %0 \n\t" + "add $32, %1 \n\t" + "add $16, %2 \n\t" + "cmp %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (block), "+r" (sum), "+r" (offset) + : "r"(block+64) + ); +} + +static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ + const int intra= s->mb_intra; + int *sum= s->dct_error_sum[intra]; + uint16_t *offset= s->dct_offset[intra]; + + s->dct_count[intra]++; + + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "1: \n\t" + "pxor %%xmm0, %%xmm0 \n\t" + "pxor %%xmm1, %%xmm1 \n\t" + "movdqa (%0), %%xmm2 \n\t" + "movdqa 16(%0), %%xmm3 \n\t" + "pcmpgtw %%xmm2, %%xmm0 \n\t" + "pcmpgtw %%xmm3, %%xmm1 \n\t" + "pxor %%xmm0, %%xmm2 \n\t" + "pxor %%xmm1, %%xmm3 \n\t" + "psubw %%xmm0, %%xmm2 \n\t" + "psubw %%xmm1, %%xmm3 \n\t" + "movdqa %%xmm2, %%xmm4 \n\t" + "movdqa %%xmm3, %%xmm5 \n\t" + "psubusw (%2), %%xmm2 \n\t" + "psubusw 16(%2), %%xmm3 \n\t" + "pxor %%xmm0, %%xmm2 \n\t" + "pxor %%xmm1, %%xmm3 \n\t" + "psubw %%xmm0, %%xmm2 \n\t" + "psubw %%xmm1, %%xmm3 \n\t" + "movdqa %%xmm2, (%0) \n\t" + "movdqa %%xmm3, 16(%0) \n\t" + "movdqa %%xmm4, %%xmm6 \n\t" + "movdqa %%xmm5, %%xmm0 \n\t" + "punpcklwd %%xmm7, %%xmm4 \n\t" + "punpckhwd %%xmm7, %%xmm6 \n\t" + "punpcklwd %%xmm7, %%xmm5 \n\t" + "punpckhwd %%xmm7, %%xmm0 \n\t" + "paddd (%1), %%xmm4 \n\t" + "paddd 16(%1), %%xmm6 \n\t" + "paddd 32(%1), %%xmm5 \n\t" + "paddd 48(%1), %%xmm0 \n\t" + "movdqa %%xmm4, (%1) \n\t" + "movdqa %%xmm6, 16(%1) \n\t" + "movdqa %%xmm5, 32(%1) \n\t" + "movdqa %%xmm0, 48(%1) \n\t" + "add $32, %0 \n\t" + "add $64, 
%1 \n\t" + "add $32, %2 \n\t" + "cmp %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (block), "+r" (sum), "+r" (offset) + : "r"(block+64) + ); +} + +#ifdef HAVE_SSSE3 +#define HAVE_SSSE3_BAK +#endif +#undef HAVE_SSSE3 + +#undef HAVE_SSE2 +#undef HAVE_MMX2 +#define RENAME(a) a ## _MMX +#define RENAMEl(a) a ## _mmx +#include "mpegvideo_mmx_template.c" + +#define HAVE_MMX2 +#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _MMX2 +#define RENAMEl(a) a ## _mmx2 +#include "mpegvideo_mmx_template.c" + +#define HAVE_SSE2 +#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _SSE2 +#define RENAMEl(a) a ## _sse2 +#include "mpegvideo_mmx_template.c" + +#ifdef HAVE_SSSE3_BAK +#define HAVE_SSSE3 +#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _SSSE3 +#define RENAMEl(a) a ## _sse2 +#include "mpegvideo_mmx_template.c" +#endif + +void MPV_common_init_mmx(MpegEncContext *s) +{ + if (mm_flags & FF_MM_MMX) { + const int dct_algo = s->avctx->dct_algo; + + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; + s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; + s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; + if(!(s->flags & CODEC_FLAG_BITEXACT)) + s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; + s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; + + if (mm_flags & FF_MM_SSE2) { + s->denoise_dct= denoise_dct_sse2; + } else { + s->denoise_dct= denoise_dct_mmx; + } + + if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ +#ifdef HAVE_SSSE3 + if(mm_flags & FF_MM_SSSE3){ + s->dct_quantize= dct_quantize_SSSE3; + } else +#endif + if(mm_flags & FF_MM_SSE2){ + s->dct_quantize= dct_quantize_SSE2; + } else if(mm_flags & FF_MM_MMXEXT){ + s->dct_quantize= dct_quantize_MMX2; + } else { + s->dct_quantize= dct_quantize_MMX; + } + } + } +}
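[editor's note] The H.263 dequantizers in mpegvideo_mmx.c above expand each nonzero coefficient to |block[i]|*qmul + qadd with the sign folded back in (qmul = 2*qscale, qadd = (qscale-1)|1), exactly as the asm comments describe; the pxor/paddw/pxor sequence is the branchless form of the sign handling. A scalar sketch of the inter loop, with a hypothetical reference name not present in the patch:

#include <stdint.h>

/* Scalar sketch of the loop vectorized by dct_unquantize_h263_inter_mmx. */
static void dct_unquantize_h263_inter_ref(int16_t *block, int nCoeffs, int qscale)
{
    const int qmul = qscale << 1;
    const int qadd = (qscale - 1) | 1;
    int i;

    for (i = 0; i <= nCoeffs; i++) {
        int level = block[i];
        if (level) {
            /* the pxor/paddw/pxor sequence in the asm folds the sign back in */
            if (level < 0)
                level = level * qmul - qadd;
            else
                level = level * qmul + qadd;
            block[i] = level;
        }
    }
}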
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/mpegvideo_mmx_template.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,376 @@ +/* + * MPEG video MMX templates + * + * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#undef MMREG_WIDTH +#undef MM +#undef MOVQ +#undef SPREADW +#undef PMAXW +#undef PMAX +#undef SAVE_SIGN +#undef RESTORE_SIGN + +#if defined(HAVE_SSE2) +#define MMREG_WIDTH "16" +#define MM "%%xmm" +#define MOVQ "movdqa" +#define SPREADW(a) \ + "pshuflw $0, "a", "a" \n\t"\ + "punpcklwd "a", "a" \n\t" +#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" +#define PMAX(a,b) \ + "movhlps "a", "b" \n\t"\ + PMAXW(b, a)\ + "pshuflw $0x0E, "a", "b" \n\t"\ + PMAXW(b, a)\ + "pshuflw $0x01, "a", "b" \n\t"\ + PMAXW(b, a) +#else +#define MMREG_WIDTH "8" +#define MM "%%mm" +#define MOVQ "movq" +#if defined(HAVE_MMX2) +#define SPREADW(a) "pshufw $0, "a", "a" \n\t" +#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" +#define PMAX(a,b) \ + "pshufw $0x0E, "a", "b" \n\t"\ + PMAXW(b, a)\ + "pshufw $0x01, "a", "b" \n\t"\ + PMAXW(b, a) +#else +#define SPREADW(a) \ + "punpcklwd "a", "a" \n\t"\ + "punpcklwd "a", "a" \n\t" +#define PMAXW(a,b) \ + "psubusw "a", "b" \n\t"\ + "paddw "a", "b" \n\t" +#define PMAX(a,b) \ + "movq "a", "b" \n\t"\ + "psrlq $32, "a" \n\t"\ + PMAXW(b, a)\ + "movq "a", "b" \n\t"\ + "psrlq $16, "a" \n\t"\ + PMAXW(b, a) + +#endif +#endif + +#ifdef HAVE_SSSE3 +#define SAVE_SIGN(a,b) \ + "movdqa "b", "a" \n\t"\ + "pabsw "b", "b" \n\t" +#define RESTORE_SIGN(a,b) \ + "psignw "a", "b" \n\t" +#else +#define SAVE_SIGN(a,b) \ + "pxor "a", "a" \n\t"\ + "pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\ + "pxor "a", "b" \n\t"\ + "psubw "a", "b" \n\t" /* ABS(block[i]) */ +#define RESTORE_SIGN(a,b) \ + "pxor "a", "b" \n\t"\ + "psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) +#endif + +static int RENAME(dct_quantize)(MpegEncContext *s, + DCTELEM *block, int n, + int qscale, int *overflow) +{ + x86_reg last_non_zero_p1; + int level=0, q; //=0 is because gcc says uninitialized ... + const uint16_t *qmat, *bias; + DECLARE_ALIGNED_16(int16_t, temp_block[64]); + + assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? + + //s->fdct (block); + RENAMEl(ff_fdct) (block); //cannot be anything else ... 
+ + if(s->dct_error_sum) + s->denoise_dct(s, block); + + if (s->mb_intra) { + int dummy; + if (n < 4) + q = s->y_dc_scale; + else + q = s->c_dc_scale; + /* note: block[0] is assumed to be positive */ + if (!s->h263_aic) { +#if 1 + __asm__ volatile ( + "mul %%ecx \n\t" + : "=d" (level), "=a"(dummy) + : "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1]) + ); +#else + __asm__ volatile ( + "xorl %%edx, %%edx \n\t" + "divw %%cx \n\t" + "movzwl %%ax, %%eax \n\t" + : "=a" (level) + : "a" ((block[0]>>2) + q), "c" (q<<1) + : "%edx" + ); +#endif + } else + /* For AIC we skip quant/dequant of INTRADC */ + level = (block[0] + 4)>>3; + + block[0]=0; //avoid fake overflow +// temp_block[0] = (block[0] + (q >> 1)) / q; + last_non_zero_p1 = 1; + bias = s->q_intra_matrix16[qscale][1]; + qmat = s->q_intra_matrix16[qscale][0]; + } else { + last_non_zero_p1 = 0; + bias = s->q_inter_matrix16[qscale][1]; + qmat = s->q_inter_matrix16[qscale][0]; + } + + if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ + + __asm__ volatile( + "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 + SPREADW(MM"3") + "pxor "MM"7, "MM"7 \n\t" // 0 + "pxor "MM"4, "MM"4 \n\t" // 0 + MOVQ" (%2), "MM"5 \n\t" // qmat[0] + "pxor "MM"6, "MM"6 \n\t" + "psubw (%3), "MM"6 \n\t" // -bias[0] + "mov $-128, %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] + SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) + "psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] + "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 + "por "MM"0, "MM"4 \n\t" + RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) + MOVQ" "MM"0, (%5, %%"REG_a") \n\t" + "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 + MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" + MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 + "pandn "MM"1, "MM"0 \n\t" + PMAXW(MM"0", MM"3") + "add $"MMREG_WIDTH", %%"REG_a" \n\t" + " js 1b \n\t" + PMAX(MM"3", MM"0") + "movd "MM"3, %%"REG_a" \n\t" + "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 + : "+a" (last_non_zero_p1) + : "r" (block+64), "r" (qmat), "r" (bias), + "r" (inv_zigzag_direct16+64), "r" (temp_block+64) + ); + }else{ // FMT_H263 + __asm__ volatile( + "movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 + SPREADW(MM"3") + "pxor "MM"7, "MM"7 \n\t" // 0 + "pxor "MM"4, "MM"4 \n\t" // 0 + "mov $-128, %%"REG_a" \n\t" + ASMALIGN(4) + "1: \n\t" + MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] + SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) + MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0] + "paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] + MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i] + "pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 + "por "MM"0, "MM"4 \n\t" + RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) + MOVQ" "MM"0, (%5, %%"REG_a") \n\t" + "pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 
0xFF : 0x00 + MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" + MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 + "pandn "MM"1, "MM"0 \n\t" + PMAXW(MM"0", MM"3") + "add $"MMREG_WIDTH", %%"REG_a" \n\t" + " js 1b \n\t" + PMAX(MM"3", MM"0") + "movd "MM"3, %%"REG_a" \n\t" + "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 + : "+a" (last_non_zero_p1) + : "r" (block+64), "r" (qmat+64), "r" (bias+64), + "r" (inv_zigzag_direct16+64), "r" (temp_block+64) + ); + } + __asm__ volatile( + "movd %1, "MM"1 \n\t" // max_qcoeff + SPREADW(MM"1") + "psubusw "MM"1, "MM"4 \n\t" + "packuswb "MM"4, "MM"4 \n\t" +#ifdef HAVE_SSE2 + "packuswb "MM"4, "MM"4 \n\t" +#endif + "movd "MM"4, %0 \n\t" // *overflow + : "=g" (*overflow) + : "g" (s->max_qcoeff) + ); + + if(s->mb_intra) block[0]= level; + else block[0]= temp_block[0]; + + if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ + if(last_non_zero_p1 <= 1) goto end; + block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; + block[0x20] = temp_block[0x10]; + if(last_non_zero_p1 <= 4) goto end; + block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02]; + block[0x09] = temp_block[0x03]; + if(last_non_zero_p1 <= 7) goto end; + block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11]; + block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20]; + if(last_non_zero_p1 <= 11) goto end; + block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12]; + block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04]; + block[0x0C] = temp_block[0x05]; + if(last_non_zero_p1 <= 16) goto end; + block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13]; + block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21]; + block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30]; + block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22]; + if(last_non_zero_p1 <= 24) goto end; + block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14]; + block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06]; + block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E]; + block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C]; + if(last_non_zero_p1 <= 32) goto end; + block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A]; + block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38]; + block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32]; + block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24]; + if(last_non_zero_p1 <= 40) goto end; + block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16]; + block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17]; + block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25]; + block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33]; + if(last_non_zero_p1 <= 48) goto end; + block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; + block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D]; + block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E]; + if(last_non_zero_p1 <= 56) goto end; + block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C]; + block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; + block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; + block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; + }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){ + if(last_non_zero_p1 <= 1) goto end; + block[0x04] = temp_block[0x01]; + block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; + if(last_non_zero_p1 <= 4) goto 
end; + block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; + block[0x05] = temp_block[0x03]; + if(last_non_zero_p1 <= 7) goto end; + block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; + block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; + if(last_non_zero_p1 <= 11) goto end; + block[0x1C] = temp_block[0x19]; + block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; + block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; + if(last_non_zero_p1 <= 16) goto end; + block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; + block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; + block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; + block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; + if(last_non_zero_p1 <= 24) goto end; + block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; + block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; + block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; + block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C]; + if(last_non_zero_p1 <= 32) goto end; + block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; + block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; + block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32]; + block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; + if(last_non_zero_p1 <= 40) goto end; + block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; + block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; + block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; + block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; + if(last_non_zero_p1 <= 48) goto end; + block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; + block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; + block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; + if(last_non_zero_p1 <= 56) goto end; + block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; + block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; + block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; + block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; + }else{ + if(last_non_zero_p1 <= 1) goto end; + block[0x01] = temp_block[0x01]; + block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; + if(last_non_zero_p1 <= 4) goto end; + block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02]; + block[0x03] = temp_block[0x03]; + if(last_non_zero_p1 <= 7) goto end; + block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11]; + block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; + if(last_non_zero_p1 <= 11) goto end; + block[0x19] = temp_block[0x19]; + block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B]; + block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05]; + if(last_non_zero_p1 <= 16) goto end; + block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13]; + block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21]; + block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; + block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22]; + if(last_non_zero_p1 <= 24) goto end; + block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14]; + block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06]; + block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E]; + block[0x15] = temp_block[0x15]; block[0x1C] = 
temp_block[0x1C]; + if(last_non_zero_p1 <= 32) goto end; + block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A]; + block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38]; + block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32]; + block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24]; + if(last_non_zero_p1 <= 40) goto end; + block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16]; + block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; + block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25]; + block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33]; + if(last_non_zero_p1 <= 48) goto end; + block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; + block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D]; + block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E]; + if(last_non_zero_p1 <= 56) goto end; + block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C]; + block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36]; + block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; + block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; + } + end: +/* + for(i=0; i<last_non_zero_p1; i++) + { + int j= zigzag_direct_noperm[i]; + block[block_permute_op(j)]= temp_block[j]; + } +*/ + + return last_non_zero_p1 - 1; +}
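[editor's note] RENAME(dct_quantize) above quantizes each coefficient as roughly ((|block[i]| + bias[i]) * qmat[i]) >> 16 with the sign restored (SAVE_SIGN/RESTORE_SIGN), clears the input block, tracks the largest inv_zigzag_direct16 index of any nonzero output to obtain last_non_zero_p1, and finally scatters temp_block back into block following the IDCT permutation; the intra DC term is handled separately before the loop. A scalar sketch of the core loop, under the assumption that qmat/bias are the 16-bit q_*_matrix16 tables referenced above and with an illustrative function name:

#include <stdint.h>

/* Scalar sketch of the core quantization loop; inv_zigzag corresponds to
 * inv_zigzag_direct16.  This mirrors the pmulhw >>16 high product only
 * approximately and ignores the overflow tracking done via %%mm4. */
static int dct_quantize_core_ref(int16_t *block, int16_t *temp_block,
                                 const uint16_t *qmat, const uint16_t *bias,
                                 const uint16_t *inv_zigzag)
{
    int last_non_zero_p1 = 0;
    int i;

    for (i = 0; i < 64; i++) {
        int level = block[i];
        int sign  = level >> 31;                              /* SAVE_SIGN */
        level     = (level ^ sign) - sign;                    /* |block[i]| */
        level     = (int)(((level + bias[i]) * (int64_t)qmat[i]) >> 16);
        level     = (level ^ sign) - sign;                    /* RESTORE_SIGN */
        temp_block[i] = level;
        block[i]      = 0;                                    /* the asm stores %%mm7 (zero) back */
        if (level && inv_zigzag[i] > last_non_zero_p1)
            last_non_zero_p1 = inv_zigzag[i];                 /* PMAXW over inv_zigzag_direct16 */
    }
    return last_non_zero_p1;
}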
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/simple_idct_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,1294 @@ +/* + * Simple IDCT MMX + * + * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "libavcodec/dsputil.h" +#include "libavcodec/simple_idct.h" + +/* +23170.475006 +22725.260826 +21406.727617 +19265.545870 +16384.000000 +12872.826198 +8866.956905 +4520.335430 +*/ +#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#if 0 +#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#else +#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 +#endif +#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + +#define ROW_SHIFT 11 +#define COL_SHIFT 20 // 6 + +DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; +DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; + +DECLARE_ALIGNED(8, static const int16_t, coeffs[])= { + 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, +// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, +// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), + 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, + // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) +// 0, 0, 0, 0, +// 0, 0, 0, 0, + + C4, C4, C4, C4, + C4, -C4, C4, -C4, + + C2, C6, C2, C6, + C6, -C2, C6, -C2, + + C1, C3, C1, C3, + C5, C7, C5, C7, + + C3, -C7, C3, -C7, +-C1, -C5, -C1, -C5, + + C5, -C1, C5, -C1, + C7, C3, C7, C3, + + C7, -C5, C7, -C5, + C3, -C1, C3, -C1 +}; + +#if 0 +static void unused_var_killer(){ + int a= wm1010 + d40000; + temp[0]=a; +} + +static void inline idctCol (int16_t * col, int16_t *input) +{ +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 + int a0, a1, a2, a3, b0, b1, b2, b3; + const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +/* + if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { + col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = + col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; + return; + }*/ + +col[8*0] = input[8*0 + 0]; +col[8*1] = input[8*2 + 0]; +col[8*2] = input[8*0 + 1]; 
+col[8*3] = input[8*2 + 1]; +col[8*4] = input[8*4 + 0]; +col[8*5] = input[8*6 + 0]; +col[8*6] = input[8*4 + 1]; +col[8*7] = input[8*6 + 1]; + + a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1)); + a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1)); + a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1)); + a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1)); + + b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7]; + b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7]; + b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7]; + b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7]; + + col[8*0] = (a0 + b0) >> COL_SHIFT; + col[8*1] = (a1 + b1) >> COL_SHIFT; + col[8*2] = (a2 + b2) >> COL_SHIFT; + col[8*3] = (a3 + b3) >> COL_SHIFT; + col[8*4] = (a3 - b3) >> COL_SHIFT; + col[8*5] = (a2 - b2) >> COL_SHIFT; + col[8*6] = (a1 - b1) >> COL_SHIFT; + col[8*7] = (a0 - b0) >> COL_SHIFT; +} + +static void inline idctRow (int16_t * output, int16_t * input) +{ + int16_t row[8]; + + int a0, a1, a2, a3, b0, b1, b2, b3; + const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + +row[0] = input[0]; +row[2] = input[1]; +row[4] = input[4]; +row[6] = input[5]; +row[1] = input[8]; +row[3] = input[9]; +row[5] = input[12]; +row[7] = input[13]; + + if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) { + row[0] = row[1] = row[2] = row[3] = row[4] = + row[5] = row[6] = row[7] = row[0]<<3; + output[0] = row[0]; + output[2] = row[1]; + output[4] = row[2]; + output[6] = row[3]; + output[8] = row[4]; + output[10] = row[5]; + output[12] = row[6]; + output[14] = row[7]; + return; + } + + a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1)); + a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1)); + a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1)); + a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1)); + + b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; + b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; + b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; + b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; + + row[0] = (a0 + b0) >> ROW_SHIFT; + row[1] = (a1 + b1) >> ROW_SHIFT; + row[2] = (a2 + b2) >> ROW_SHIFT; + row[3] = (a3 + b3) >> ROW_SHIFT; + row[4] = (a3 - b3) >> ROW_SHIFT; + row[5] = (a2 - b2) >> ROW_SHIFT; + row[6] = (a1 - b1) >> ROW_SHIFT; + row[7] = (a0 - b0) >> ROW_SHIFT; + + output[0] = row[0]; + output[2] = row[1]; + output[4] = row[2]; + output[6] = row[3]; + output[8] = row[4]; + output[10] = row[5]; + output[12] = row[6]; + output[14] = row[7]; +} +#endif + +static inline void idct(int16_t *block) +{ + DECLARE_ALIGNED(8, int64_t, align_tmp[16]); + int16_t * const temp= (int16_t*)align_tmp; + + __asm__ volatile( +#if 0 //Alternative, simpler variant + +#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 
\n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ + +#define COL_IDCT(src0, src4, src1, src5, dst, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + 
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t"\ + + +#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq "MANGLE(wm1010)", %%mm4 \n\t"\ + "pand %%mm0, %%mm4 \n\t"\ + "por %%mm1, %%mm4 \n\t"\ + "por %%mm2, %%mm4 \n\t"\ + "por %%mm3, %%mm4 \n\t"\ + "packssdw %%mm4,%%mm4 \n\t"\ + "movd %%mm4, %%eax \n\t"\ + "orl %%eax, %%eax \n\t"\ + "jz 1f \n\t"\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 
\n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ + "jmp 2f \n\t"\ + "1: \n\t"\ + "pslld $16, %%mm0 \n\t"\ + "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ + "psrad $13, %%mm0 \n\t"\ + "packssdw %%mm0, %%mm0 \n\t"\ + "movq %%mm0, " #dst " \n\t"\ + "movq %%mm0, 8+" #dst " \n\t"\ + "movq %%mm0, 16+" #dst " \n\t"\ + "movq %%mm0, 24+" #dst " \n\t"\ + "2: \n\t" + + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) +/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) +ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) +ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ + +DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) +DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) +DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) + + +//IDCT( src0, src4, src1, src5, dst, shift) +COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +COL_IDCT( 8(%1), 72(%1), 40(%1), 
104(%1), 4(%0), 20) +COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + +#else + +#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq "MANGLE(wm1010)", %%mm4 \n\t"\ + "pand %%mm0, %%mm4 \n\t"\ + "por %%mm1, %%mm4 \n\t"\ + "por %%mm2, %%mm4 \n\t"\ + "por %%mm3, %%mm4 \n\t"\ + "packssdw %%mm4,%%mm4 \n\t"\ + "movd %%mm4, %%eax \n\t"\ + "orl %%eax, %%eax \n\t"\ + "jz 1f \n\t"\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ + "jmp 2f \n\t"\ + "1: \n\t"\ + "pslld $16, 
%%mm0 \n\t"\ + "paddd "MANGLE(d40000)", %%mm0 \n\t"\ + "psrad $13, %%mm0 \n\t"\ + "packssdw %%mm0, %%mm0 \n\t"\ + "movq %%mm0, " #dst " \n\t"\ + "movq %%mm0, 8+" #dst " \n\t"\ + "movq %%mm0, 16+" #dst " \n\t"\ + "movq %%mm0, 24+" #dst " \n\t"\ + "2: \n\t" + +#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq %%mm0, %%mm4 \n\t"\ + "por %%mm1, %%mm4 \n\t"\ + "por %%mm2, %%mm4 \n\t"\ + "por %%mm3, %%mm4 \n\t"\ + "packssdw %%mm4,%%mm4 \n\t"\ + "movd %%mm4, %%eax \n\t"\ + "orl %%eax, %%eax \n\t"\ + "jz " #bt " \n\t"\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 
a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ + +#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + #rounder ", %%mm4 \n\t"\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm0, %%mm0 \n\t" \ + "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ + "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ + "movq %%mm7, " #dst " \n\t"\ + "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "movq %%mm2, 24+" #dst " \n\t"\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ + "movq %%mm2, 8+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ + "movq %%mm4, 16+" #dst " \n\t"\ + +//IDCT( src0, src4, src1, src5, dst, rounder, shift) +DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) +Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) +Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) +Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) + +#undef IDCT +#define IDCT(src0, src4, src1, 
src5, dst, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 
56(%1), 120(%1), 12(%0), 20) + "jmp 9f \n\t" + + "#" ASMALIGN(4) \ + "4: \n\t" +Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) +Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm1, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm1, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "jmp 9f \n\t" + + "#" ASMALIGN(4) \ + "6: \n\t" +Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 
16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm1, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm1, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "jmp 9f \n\t" + + "#" ASMALIGN(4) \ + "2: \n\t" +Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) + +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ + "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 
*/\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm2, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ + "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ + "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ + "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm2, 32+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "jmp 9f \n\t" + + "#" ASMALIGN(4) \ + "3: \n\t" +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 64(%2), %%mm3 \n\t"\ + "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ + "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ 
+ "movd %%mm1, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ + "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm1, 32+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "jmp 9f \n\t" + + "#" ASMALIGN(4) \ + "5: \n\t" +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ + "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ + "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ + "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ + "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ + "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ + "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ + "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ + "psrad $" #shift ", %%mm4 \n\t"\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm3 \n\t"\ + "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ + "movq %%mm4, " #dst " \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ + "movq %%mm0, 16+" #dst " \n\t"\ + "movq %%mm0, 96+" #dst " \n\t"\ + "movq %%mm4, 112+" #dst " \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movq %%mm5, 32+" #dst " \n\t"\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 
*/\ + "movq %%mm6, 48+" #dst " \n\t"\ + "movq %%mm6, 64+" #dst " \n\t"\ + "movq %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "jmp 9f \n\t" + + + "#" ASMALIGN(4) \ + "1: \n\t" +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ + "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ + "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ + "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ + "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ + "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ + "movq 64(%2), %%mm1 \n\t"\ + "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ + "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "psrad $" #shift ", %%mm3 \n\t"\ + "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "movd %%mm7, " #dst " \n\t"\ + "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ + "movd %%mm0, 16+" #dst " \n\t"\ + "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ + "movd %%mm3, 96+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "movd %%mm4, 112+" #dst " \n\t"\ + "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ + "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ + "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ + "psrad $" #shift ", %%mm3 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ + "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ + "movd %%mm3, 32+" #dst " \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ + "movd %%mm6, 48+" #dst " \n\t"\ + "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ + "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ + "movd %%mm4, 64+" #dst " \n\t"\ + "movd %%mm5, 80+" #dst " \n\t" + + +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + "jmp 9f \n\t" + + + "#" ASMALIGN(4) + "7: \n\t" +#undef IDCT +#define IDCT(src0, src4, src1, src5, dst, shift) \ + "movq " #src0 
", %%mm0 \n\t" /* R4 R0 r4 r0 */\ + "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "psrad $" #shift ", %%mm4 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ + "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ + "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ + "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ + "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ + "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ + "movq %%mm4, " #dst " \n\t"\ + "psrad $" #shift ", %%mm2 \n\t"\ + "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ + "movq %%mm0, 16+" #dst " \n\t"\ + "movq %%mm0, 96+" #dst " \n\t"\ + "movq %%mm4, 112+" #dst " \n\t"\ + "movq %%mm0, 32+" #dst " \n\t"\ + "movq %%mm4, 48+" #dst " \n\t"\ + "movq %%mm4, 64+" #dst " \n\t"\ + "movq %%mm0, 80+" #dst " \n\t" + +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) + + +#endif + +/* +Input + 00 40 04 44 20 60 24 64 + 10 30 14 34 50 70 54 74 + 01 41 03 43 21 61 23 63 + 11 31 13 33 51 71 53 73 + 02 42 06 46 22 62 26 66 + 12 32 16 36 52 72 56 76 + 05 45 07 47 25 65 27 67 + 15 35 17 37 55 75 57 77 + +Temp + 00 04 10 14 20 24 30 34 + 40 44 50 54 60 64 70 74 + 01 03 11 13 21 23 31 33 + 41 43 51 53 61 63 71 73 + 02 06 12 16 22 26 32 36 + 42 46 52 56 62 66 72 76 + 05 07 15 17 25 27 35 37 + 45 47 55 57 65 67 75 77 +*/ + +"9: \n\t" + :: "r" (block), "r" (temp), "r" (coeffs) + : "%eax" + ); +} + +void ff_simple_idct_mmx(int16_t *block) +{ + idct(block); +} + +//FIXME merge add/put into the idct + +void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) +{ + idct(block); + put_pixels_clamped_mmx(block, dest, line_size); +} +void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) +{ + idct(block); + add_pixels_clamped_mmx(block, dest, line_size); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/snowdsp_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,871 @@ +/* + * MMX and SSE2 optimized snow DSP utils + * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/snow.h" + +void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){ + const int w2= (width+1)>>1; + DECLARE_ALIGNED_16(IDWTELEM, temp[width>>1]); + const int w_l= (width>>1); + const int w_r= w2 - 1; + int i; + + { // Lift 0 + IDWTELEM * const ref = b + w2 - 1; + IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice + // (the first time erroneously), we allow the SSE2 code to run an extra pass. + // The savings in code and time are well worth having to store this value and + // calculate b[0] correctly afterwards. + + i = 0; + __asm__ volatile( + "pcmpeqd %%xmm7, %%xmm7 \n\t" + "pcmpeqd %%xmm3, %%xmm3 \n\t" + "psllw $1, %%xmm3 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "psllw $13, %%xmm3 \n\t" + ::); + for(; i<w_l-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm1 \n\t" + "movdqu 16(%1), %%xmm5 \n\t" + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw %%xmm1, %%xmm2 \n\t" + "paddw %%xmm5, %%xmm6 \n\t" + "paddw %%xmm7, %%xmm2 \n\t" + "paddw %%xmm7, %%xmm6 \n\t" + "pmulhw %%xmm3, %%xmm2 \n\t" + "pmulhw %%xmm3, %%xmm6 \n\t" + "paddw (%0), %%xmm2 \n\t" + "paddw 16(%0), %%xmm6 \n\t" + "movdqa %%xmm2, (%0) \n\t" + "movdqa %%xmm6, 16(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); + b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); + } + + { // Lift 1 + IDWTELEM * const dst = b+w2; + + i = 0; + for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){ + dst[i] = dst[i] - (b[i] + b[i + 1]); + } + for(; i<w_r-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm1 \n\t" + "movdqu 16(%1), %%xmm5 \n\t" + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw %%xmm1, %%xmm2 \n\t" + "paddw %%xmm5, %%xmm6 \n\t" + "movdqa (%0), %%xmm0 \n\t" + "movdqa 16(%0), %%xmm4 \n\t" + "psubw %%xmm2, %%xmm0 \n\t" + "psubw %%xmm6, %%xmm4 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm4, 16(%0) \n\t" + :: "r"(&dst[i]), "r"(&b[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); + } + + { // Lift 2 + IDWTELEM * const ref = b+w2 - 1; + IDWTELEM b_0 = b[0]; + + i = 0; + __asm__ volatile( + "psllw $15, %%xmm7 \n\t" + "pcmpeqw %%xmm6, %%xmm6 \n\t" + "psrlw $13, %%xmm6 \n\t" + "paddw %%xmm7, %%xmm6 \n\t" + ::); + for(; i<w_l-15; i+=16){ + __asm__ volatile( + "movdqu (%1), %%xmm0 \n\t" + "movdqu 16(%1), %%xmm4 \n\t" + "movdqu 
2(%1), %%xmm1 \n\t" + "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts + "paddw %%xmm6, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm4 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm5 \n\t" + "pavgw %%xmm1, %%xmm0 \n\t" + "pavgw %%xmm5, %%xmm4 \n\t" + "psubw %%xmm7, %%xmm0 \n\t" + "psubw %%xmm7, %%xmm4 \n\t" + "psraw $1, %%xmm0 \n\t" + "psraw $1, %%xmm4 \n\t" + "movdqa (%0), %%xmm1 \n\t" + "movdqa 16(%0), %%xmm5 \n\t" + "paddw %%xmm1, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm4 \n\t" + "psraw $2, %%xmm0 \n\t" + "psraw $2, %%xmm4 \n\t" + "paddw %%xmm1, %%xmm0 \n\t" + "paddw %%xmm5, %%xmm4 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm4, 16(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); + b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); + } + + { // Lift 3 + IDWTELEM * const src = b+w2; + + i = 0; + for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){ + temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); + } + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movdqu 2(%1), %%xmm2 \n\t" + "movdqu 18(%1), %%xmm6 \n\t" + "paddw (%1), %%xmm2 \n\t" + "paddw 16(%1), %%xmm6 \n\t" + "movdqu (%0), %%xmm0 \n\t" + "movdqu 16(%0), %%xmm4 \n\t" + "paddw %%xmm2, %%xmm0 \n\t" + "paddw %%xmm6, %%xmm4 \n\t" + "psraw $1, %%xmm2 \n\t" + "psraw $1, %%xmm6 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "paddw %%xmm4, %%xmm6 \n\t" + "movdqa %%xmm2, (%2) \n\t" + "movdqa %%xmm6, 16(%2) \n\t" + :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); + } + + { + snow_interleave_line_header(&i, width, b, temp); + + for (; (i & 0x3E) != 0x3E; i-=2){ + b[i+1] = temp[i>>1]; + b[i] = b[i>>1]; + } + for (i-=62; i>=0; i-=64){ + __asm__ volatile( + "movdqa (%1), %%xmm0 \n\t" + "movdqa 16(%1), %%xmm2 \n\t" + "movdqa 32(%1), %%xmm4 \n\t" + "movdqa 48(%1), %%xmm6 \n\t" + "movdqa (%1), %%xmm1 \n\t" + "movdqa 16(%1), %%xmm3 \n\t" + "movdqa 32(%1), %%xmm5 \n\t" + "movdqa 48(%1), %%xmm7 \n\t" + "punpcklwd (%2), %%xmm0 \n\t" + "punpcklwd 16(%2), %%xmm2 \n\t" + "punpcklwd 32(%2), %%xmm4 \n\t" + "punpcklwd 48(%2), %%xmm6 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm2, 32(%0) \n\t" + "movdqa %%xmm4, 64(%0) \n\t" + "movdqa %%xmm6, 96(%0) \n\t" + "punpckhwd (%2), %%xmm1 \n\t" + "punpckhwd 16(%2), %%xmm3 \n\t" + "punpckhwd 32(%2), %%xmm5 \n\t" + "punpckhwd 48(%2), %%xmm7 \n\t" + "movdqa %%xmm1, 16(%0) \n\t" + "movdqa %%xmm3, 48(%0) \n\t" + "movdqa %%xmm5, 80(%0) \n\t" + "movdqa %%xmm7, 112(%0) \n\t" + :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) + : "memory" + ); + } + } +} + +void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){ + const int w2= (width+1)>>1; + IDWTELEM temp[width >> 1]; + const int w_l= (width>>1); + const int w_r= w2 - 1; + int i; + + { // Lift 0 + IDWTELEM * const ref = b + w2 - 1; + + i = 1; + b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "pcmpeqw %%mm3, %%mm3 \n\t" + "psllw $1, %%mm3 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "psllw $13, %%mm3 \n\t" + ::); + for(; i<w_l-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm2 \n\t" + "movq 8(%1), %%mm6 \n\t" + "paddw 2(%1), %%mm2 \n\t" + "paddw 10(%1), %%mm6 \n\t" + "paddw %%mm7, %%mm2 \n\t" + "paddw %%mm7, %%mm6 \n\t" + "pmulhw %%mm3, %%mm2 \n\t" + "pmulhw %%mm3, %%mm6 \n\t" + "paddw (%0), %%mm2 \n\t" + "paddw 8(%0), %%mm6 \n\t" + "movq %%mm2, (%0) \n\t" + "movq %%mm6, 8(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : 
"memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); + } + + { // Lift 1 + IDWTELEM * const dst = b+w2; + + i = 0; + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm2 \n\t" + "movq 8(%1), %%mm6 \n\t" + "paddw 2(%1), %%mm2 \n\t" + "paddw 10(%1), %%mm6 \n\t" + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm4 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm6, %%mm4 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm4, 8(%0) \n\t" + :: "r"(&dst[i]), "r"(&b[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); + } + + { // Lift 2 + IDWTELEM * const ref = b+w2 - 1; + + i = 1; + b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); + __asm__ volatile( + "psllw $15, %%mm7 \n\t" + "pcmpeqw %%mm6, %%mm6 \n\t" + "psrlw $13, %%mm6 \n\t" + "paddw %%mm7, %%mm6 \n\t" + ::); + for(; i<w_l-7; i+=8){ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm4 \n\t" + "movq 2(%1), %%mm1 \n\t" + "movq 10(%1), %%mm5 \n\t" + "paddw %%mm6, %%mm0 \n\t" + "paddw %%mm6, %%mm4 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm5 \n\t" + "pavgw %%mm1, %%mm0 \n\t" + "pavgw %%mm5, %%mm4 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm4 \n\t" + "psraw $1, %%mm0 \n\t" + "psraw $1, %%mm4 \n\t" + "movq (%0), %%mm1 \n\t" + "movq 8(%0), %%mm5 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm5, %%mm4 \n\t" + "psraw $2, %%mm0 \n\t" + "psraw $2, %%mm4 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm5, %%mm4 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm4, 8(%0) \n\t" + :: "r"(&b[i]), "r"(&ref[i]) + : "memory" + ); + } + snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); + } + + { // Lift 3 + IDWTELEM * const src = b+w2; + i = 0; + + for(; i<w_r-7; i+=8){ + __asm__ volatile( + "movq 2(%1), %%mm2 \n\t" + "movq 10(%1), %%mm6 \n\t" + "paddw (%1), %%mm2 \n\t" + "paddw 8(%1), %%mm6 \n\t" + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm4 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm6, %%mm4 \n\t" + "psraw $1, %%mm2 \n\t" + "psraw $1, %%mm6 \n\t" + "paddw %%mm0, %%mm2 \n\t" + "paddw %%mm4, %%mm6 \n\t" + "movq %%mm2, (%2) \n\t" + "movq %%mm6, 8(%2) \n\t" + :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) + : "memory" + ); + } + snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); + } + + { + snow_interleave_line_header(&i, width, b, temp); + + for (; (i & 0x1E) != 0x1E; i-=2){ + b[i+1] = temp[i>>1]; + b[i] = b[i>>1]; + } + for (i-=30; i>=0; i-=32){ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm6 \n\t" + "movq (%1), %%mm1 \n\t" + "movq 8(%1), %%mm3 \n\t" + "movq 16(%1), %%mm5 \n\t" + "movq 24(%1), %%mm7 \n\t" + "punpcklwd (%2), %%mm0 \n\t" + "punpcklwd 8(%2), %%mm2 \n\t" + "punpcklwd 16(%2), %%mm4 \n\t" + "punpcklwd 24(%2), %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, 16(%0) \n\t" + "movq %%mm4, 32(%0) \n\t" + "movq %%mm6, 48(%0) \n\t" + "punpckhwd (%2), %%mm1 \n\t" + "punpckhwd 8(%2), %%mm3 \n\t" + "punpckhwd 16(%2), %%mm5 \n\t" + "punpckhwd 24(%2), %%mm7 \n\t" + "movq %%mm1, 8(%0) \n\t" + "movq %%mm3, 24(%0) \n\t" + "movq %%mm5, 40(%0) \n\t" + "movq %%mm7, 56(%0) \n\t" + :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) + : "memory" + ); + } + } +} + +#ifdef HAVE_7REGS +#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ + ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ + ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ + ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ + ""op" 48("r",%%"REG_d"), %%"t3" 
\n\t" + +#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ + snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) + +#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ + snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) + +#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ + "psubw %%"s0", %%"t0" \n\t"\ + "psubw %%"s1", %%"t1" \n\t"\ + "psubw %%"s2", %%"t2" \n\t"\ + "psubw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ + "movdqa %%"s0", ("w",%%"REG_d") \n\t"\ + "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ + "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ + "movdqa %%"s3", 48("w",%%"REG_d") \n\t" + +#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ + "psraw $"n", %%"t0" \n\t"\ + "psraw $"n", %%"t1" \n\t"\ + "psraw $"n", %%"t2" \n\t"\ + "psraw $"n", %%"t3" \n\t" + +#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ + "paddw %%"s0", %%"t0" \n\t"\ + "paddw %%"s1", %%"t1" \n\t"\ + "paddw %%"s2", %%"t2" \n\t"\ + "paddw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ + "pmulhw %%"s0", %%"t0" \n\t"\ + "pmulhw %%"s1", %%"t1" \n\t"\ + "pmulhw %%"s2", %%"t2" \n\t"\ + "pmulhw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ + "movdqa %%"s0", %%"t0" \n\t"\ + "movdqa %%"s1", %%"t1" \n\t"\ + "movdqa %%"s2", %%"t2" \n\t"\ + "movdqa %%"s3", %%"t3" \n\t" + +void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ + x86_reg i = width; + + while(i & 0x1F) + { + i--; + b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; + b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; + b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; + b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; + } + i+=i; + + __asm__ volatile ( + "jmp 2f \n\t" + "1: \n\t" + snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") + + + "pcmpeqw %%xmm0, %%xmm0 \n\t" + "pcmpeqw %%xmm2, %%xmm2 \n\t" + "paddw %%xmm2, %%xmm2 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "psllw $13, %%xmm2 \n\t" + snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") + + "pcmpeqw %%xmm7, %%xmm7 \n\t" + "pcmpeqw %%xmm5, %%xmm5 \n\t" + "psllw $15, %%xmm7 \n\t" + "psrlw $13, %%xmm5 \n\t" + "paddw %%xmm7, %%xmm5 \n\t" + snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") + "movq (%2,%%"REG_d"), %%xmm1 \n\t" + "movq 8(%2,%%"REG_d"), %%xmm3 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "pavgw %%xmm1, %%xmm0 \n\t" + "pavgw %%xmm3, %%xmm2 \n\t" + "movq 16(%2,%%"REG_d"), %%xmm1 \n\t" + "movq 24(%2,%%"REG_d"), %%xmm3 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "pavgw %%xmm1, %%xmm4 \n\t" + "pavgw %%xmm3, %%xmm6 \n\t" + snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") + 
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") + + snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") + + "2: \n\t" + "sub $64, %%"REG_d" \n\t" + "jge 1b \n\t" + :"+d"(i) + :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); +} + +#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ + ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ + ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ + ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ + ""op" 24("r",%%"REG_d"), %%"t3" \n\t" + +#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ + snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) + +#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ + snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) + +#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ + "movq %%"s0", ("w",%%"REG_d") \n\t"\ + "movq %%"s1", 8("w",%%"REG_d") \n\t"\ + "movq %%"s2", 16("w",%%"REG_d") \n\t"\ + "movq %%"s3", 24("w",%%"REG_d") \n\t" + +#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ + "movq %%"s0", %%"t0" \n\t"\ + "movq %%"s1", %%"t1" \n\t"\ + "movq %%"s2", %%"t2" \n\t"\ + "movq %%"s3", %%"t3" \n\t" + + +void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ + x86_reg i = width; + while(i & 15) + { + i--; + b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; + b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; + b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; + b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; + } + i+=i; + __asm__ volatile( + "jmp 2f \n\t" + "1: \n\t" + + snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") + "pcmpeqw %%mm0, %%mm0 \n\t" + "pcmpeqw %%mm2, %%mm2 \n\t" + "paddw %%mm2, %%mm2 \n\t" + "paddw %%mm0, %%mm2 \n\t" + "psllw $13, %%mm2 \n\t" + snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") + snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") + snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") + "pcmpeqw %%mm7, %%mm7 \n\t" + "pcmpeqw %%mm5, %%mm5 \n\t" + "psllw $15, %%mm7 \n\t" + "psrlw $13, %%mm5 \n\t" + "paddw %%mm7, %%mm5 \n\t" + snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") + "movq (%2,%%"REG_d"), %%mm1 \n\t" + "movq 8(%2,%%"REG_d"), %%mm3 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "pavgw %%mm1, %%mm0 \n\t" + "pavgw %%mm3, %%mm2 \n\t" + "movq 16(%2,%%"REG_d"), %%mm1 \n\t" + "movq 24(%2,%%"REG_d"), %%mm3 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "pavgw %%mm1, %%mm4 \n\t" + "pavgw %%mm3, %%mm6 
\n\t" + snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") + + snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") + snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") + snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") + + "2: \n\t" + "sub $32, %%"REG_d" \n\t" + "jge 1b \n\t" + :"+d"(i) + :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); +} +#endif //HAVE_7REGS + +#define snow_inner_add_yblock_sse2_header \ + IDWTELEM * * dst_array = sb->line + src_y;\ + x86_reg tmp;\ + __asm__ volatile(\ + "mov %7, %%"REG_c" \n\t"\ + "mov %6, %2 \n\t"\ + "mov %4, %%"REG_S" \n\t"\ + "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ + "pcmpeqd %%xmm3, %%xmm3 \n\t"\ + "psllw $15, %%xmm3 \n\t"\ + "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %3, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_end_common1\ + "add $32, %%"REG_S" \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t" + +#define snow_inner_add_yblock_sse2_end_common2\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :\ + "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"((x86_reg)b_h),"m"((x86_reg)src_stride):\ + "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +#define snow_inner_add_yblock_sse2_end_8\ + "sal $1, 
%%"REG_c" \n\t"\ + "add $"PTR_SIZE"*2, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "sar $1, %%"REG_c" \n\t"\ + "sub $2, %2 \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +#define snow_inner_add_yblock_sse2_end_16\ + "add $"PTR_SIZE"*1, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "dec %2 \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_8("2", "8") +snow_inner_add_yblock_sse2_accum_8("1", "128") +snow_inner_add_yblock_sse2_accum_8("0", "136") + + "mov %0, %%"REG_d" \n\t" + "movdqa (%%"REG_D"), %%xmm0 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + + "punpckhwd %%xmm7, %%xmm1 \n\t" + "punpcklwd %%xmm7, %%xmm2 \n\t" + "paddd %%xmm2, %%xmm0 \n\t" + "movdqa 16(%%"REG_D"), %%xmm2 \n\t" + "paddd %%xmm1, %%xmm2 \n\t" + "paddd %%xmm3, %%xmm0 \n\t" + "paddd %%xmm3, %%xmm2 \n\t" + + "mov %1, %%"REG_D" \n\t" + "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" + "add %3, %%"REG_D" \n\t" + + "movdqa (%%"REG_D"), %%xmm4 \n\t" + "movdqa %%xmm5, %%xmm6 \n\t" + "punpckhwd %%xmm7, %%xmm5 \n\t" + "punpcklwd %%xmm7, %%xmm6 \n\t" + "paddd %%xmm6, %%xmm4 \n\t" + "movdqa 16(%%"REG_D"), %%xmm6 \n\t" + "paddd %%xmm5, %%xmm6 \n\t" + "paddd %%xmm3, %%xmm4 \n\t" + "paddd %%xmm3, %%xmm6 \n\t" + + "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm2, %%xmm0 \n\t" + "packuswb %%xmm7, %%xmm0 \n\t" + "movq %%xmm0, (%%"REG_d") \n\t" + + "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm6, %%xmm4 \n\t" + "packuswb %%xmm7, %%xmm4 \n\t" + "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" +snow_inner_add_yblock_sse2_end_8 +} + +static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_16("2", "16") +snow_inner_add_yblock_sse2_accum_16("1", "512") +snow_inner_add_yblock_sse2_accum_16("0", "528") + + "mov %0, %%"REG_d" \n\t" + "psrlw $4, %%xmm1 \n\t" + "psrlw $4, %%xmm5 \n\t" + "paddw (%%"REG_D"), %%xmm1 \n\t" + "paddw 16(%%"REG_D"), %%xmm5 \n\t" + "paddw %%xmm3, %%xmm1 \n\t" + "paddw %%xmm3, %%xmm5 \n\t" + "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ + "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. 
*/ + "packuswb %%xmm5, %%xmm1 \n\t" + + "movdqu %%xmm1, (%%"REG_d") \n\t" + +snow_inner_add_yblock_sse2_end_16 +} + +#define snow_inner_add_yblock_mmx_header \ + IDWTELEM * * dst_array = sb->line + src_y;\ + x86_reg tmp;\ + __asm__ volatile(\ + "mov %7, %%"REG_c" \n\t"\ + "mov %6, %2 \n\t"\ + "mov %4, %%"REG_S" \n\t"\ + "pxor %%mm7, %%mm7 \n\t" /* 0 */\ + "pcmpeqd %%mm3, %%mm3 \n\t"\ + "psllw $15, %%mm3 \n\t"\ + "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %3, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ + "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%mm7, %%"out_reg1" \n\t"\ + "punpcklbw %%mm7, %%"out_reg2" \n\t"\ + "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ + "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "pmullw %%mm0, %%"out_reg1" \n\t"\ + "pmullw %%mm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ + snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ + "paddusw %%mm2, %%mm1 \n\t"\ + "paddusw %%mm6, %%mm5 \n\t" + +#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ + "mov %0, %%"REG_d" \n\t"\ + "psrlw $4, %%mm1 \n\t"\ + "psrlw $4, %%mm5 \n\t"\ + "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ + "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psraw $4, %%mm1 \n\t"\ + "psraw $4, %%mm5 \n\t"\ + "packuswb %%mm5, %%mm1 \n\t"\ + "movq %%mm1, "write_offset"(%%"REG_d") \n\t" + +#define snow_inner_add_yblock_mmx_end(s_step)\ + "add $"s_step", %%"REG_S" \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t"\ + "add $"PTR_SIZE"*1, %1 \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "dec %2 \n\t"\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :\ + "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"((x86_reg)b_h),"m"((x86_reg)src_stride):\ + "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "8", "0") +snow_inner_add_yblock_mmx_accum("1", "128", "0") +snow_inner_add_yblock_mmx_accum("0", "136", "0") +snow_inner_add_yblock_mmx_mix("0", "0") +snow_inner_add_yblock_mmx_end("16") +} + +static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "16", "0") +snow_inner_add_yblock_mmx_accum("1", "512", "0") +snow_inner_add_yblock_mmx_accum("0", "528", "0") +snow_inner_add_yblock_mmx_mix("0", "0") + +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") +snow_inner_add_yblock_mmx_accum("2", "24", "8") 
+snow_inner_add_yblock_mmx_accum("1", "520", "8") +snow_inner_add_yblock_mmx_accum("0", "536", "8") +snow_inner_add_yblock_mmx_mix("16", "8") +snow_inner_add_yblock_mmx_end("32") +} + +void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) { + if (!(b_h & 1)) + inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + } else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +} + +void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/vc1dsp_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,490 @@ +/* + * VC-1 and WMV3 - DSP functions MMX-optimized + * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" +#include "dsputil_mmx.h" + +/** Add rounder from mm7 to mm3 and pack result at destination */ +#define NORMALIZE_MMX(SHIFT) \ + "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \ + "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \ + "psraw "SHIFT", %%mm3 \n\t" \ + "psraw "SHIFT", %%mm4 \n\t" + +#define TRANSFER_DO_PACK \ + "packuswb %%mm4, %%mm3 \n\t" \ + "movq %%mm3, (%2) \n\t" + +#define TRANSFER_DONT_PACK \ + "movq %%mm3, 0(%2) \n\t" \ + "movq %%mm4, 8(%2) \n\t" + +/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */ +#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t" +#define DONT_UNPACK(reg) + +/** Compute the rounder 32-r or 8-r and unpacks it to mm7 */ +#define LOAD_ROUNDER_MMX(ROUND) \ + "movd "ROUND", %%mm7 \n\t" \ + "punpcklwd %%mm7, %%mm7 \n\t" \ + "punpckldq %%mm7, %%mm7 \n\t" + +#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \ + "paddw %%mm"#R2", %%mm"#R1" \n\t" \ + "movd (%0,%3), %%mm"#R0" \n\t" \ + "pmullw %%mm6, %%mm"#R1" \n\t" \ + "punpcklbw %%mm0, %%mm"#R0" \n\t" \ + "movd (%0,%2), %%mm"#R3" \n\t" \ + "psubw %%mm"#R0", %%mm"#R1" \n\t" \ + "punpcklbw %%mm0, %%mm"#R3" \n\t" \ + "paddw %%mm7, %%mm"#R1" \n\t" \ + "psubw %%mm"#R3", %%mm"#R1" \n\t" \ + "psraw %4, %%mm"#R1" \n\t" \ + "movq %%mm"#R1", "#OFF"(%1) \n\t" \ + "add %2, %0 \n\t" + +DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL; + +/** Sacrifying mm6 allows to pipeline loads from src */ +static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, + const uint8_t *src, x86_reg stride, + int rnd, int64_t shift) +{ + __asm__ volatile( + "mov $3, %%"REG_c" \n\t" + LOAD_ROUNDER_MMX("%5") + "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" + "1: \n\t" + "movd (%0), %%mm2 \n\t" + "add %2, %0 \n\t" + "movd (%0), %%mm3 \n\t" + "punpcklbw %%mm0, %%mm2 \n\t" + "punpcklbw %%mm0, %%mm3 \n\t" + SHIFT2_LINE( 0, 1, 2, 3, 4) + SHIFT2_LINE( 24, 2, 3, 4, 1) + SHIFT2_LINE( 48, 3, 4, 1, 2) + SHIFT2_LINE( 72, 4, 1, 2, 3) + SHIFT2_LINE( 96, 1, 2, 3, 4) + SHIFT2_LINE(120, 2, 3, 4, 1) + SHIFT2_LINE(144, 3, 4, 1, 2) + SHIFT2_LINE(168, 4, 1, 2, 3) + "sub %6, %0 \n\t" + "add $8, %1 \n\t" + "dec %%"REG_c" \n\t" + "jnz 1b \n\t" + : "+r"(src), "+r"(dst) + : "r"(stride), 
"r"(-2*stride), + "m"(shift), "m"(rnd), "r"(9*stride-4) + : "%"REG_c, "memory" + ); +} + +/** + * Data is already unpacked, so some operations can directly be made from + * memory. + */ +static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride, + const int16_t *src, int rnd) +{ + int h = 8; + + src -= 1; + rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ + __asm__ volatile( + LOAD_ROUNDER_MMX("%4") + "movq "MANGLE(ff_pw_128)", %%mm6\n\t" + "movq "MANGLE(ff_pw_9)", %%mm5 \n\t" + "1: \n\t" + "movq 2*0+0(%1), %%mm1 \n\t" + "movq 2*0+8(%1), %%mm2 \n\t" + "movq 2*1+0(%1), %%mm3 \n\t" + "movq 2*1+8(%1), %%mm4 \n\t" + "paddw 2*3+0(%1), %%mm1 \n\t" + "paddw 2*3+8(%1), %%mm2 \n\t" + "paddw 2*2+0(%1), %%mm3 \n\t" + "paddw 2*2+8(%1), %%mm4 \n\t" + "pmullw %%mm5, %%mm3 \n\t" + "pmullw %%mm5, %%mm4 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "psubw %%mm2, %%mm4 \n\t" + NORMALIZE_MMX("$7") + /* Remove bias */ + "paddw %%mm6, %%mm3 \n\t" + "paddw %%mm6, %%mm4 \n\t" + TRANSFER_DO_PACK + "add $24, %1 \n\t" + "add %3, %2 \n\t" + "decl %0 \n\t" + "jnz 1b \n\t" + : "+r"(h), "+r" (src), "+r" (dst) + : "r"(stride), "m"(rnd) + : "memory" + ); +} + + +/** + * Purely vertical or horizontal 1/2 shift interpolation. + * Sacrify mm6 for *9 factor. + */ +static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src, + x86_reg stride, int rnd, x86_reg offset) +{ + rnd = 8-rnd; + __asm__ volatile( + "mov $8, %%"REG_c" \n\t" + LOAD_ROUNDER_MMX("%5") + "movq "MANGLE(ff_pw_9)", %%mm6\n\t" + "1: \n\t" + "movd 0(%0 ), %%mm3 \n\t" + "movd 4(%0 ), %%mm4 \n\t" + "movd 0(%0,%2), %%mm1 \n\t" + "movd 4(%0,%2), %%mm2 \n\t" + "add %2, %0 \n\t" + "punpcklbw %%mm0, %%mm3 \n\t" + "punpcklbw %%mm0, %%mm4 \n\t" + "punpcklbw %%mm0, %%mm1 \n\t" + "punpcklbw %%mm0, %%mm2 \n\t" + "paddw %%mm1, %%mm3 \n\t" + "paddw %%mm2, %%mm4 \n\t" + "movd 0(%0,%3), %%mm1 \n\t" + "movd 4(%0,%3), %%mm2 \n\t" + "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/ + "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/ + "punpcklbw %%mm0, %%mm1 \n\t" + "punpcklbw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/ + "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/ + "movd 0(%0,%2), %%mm1 \n\t" + "movd 4(%0,%2), %%mm2 \n\t" + "punpcklbw %%mm0, %%mm1 \n\t" + "punpcklbw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/ + "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/ + NORMALIZE_MMX("$4") + "packuswb %%mm4, %%mm3 \n\t" + "movq %%mm3, (%1) \n\t" + "add %6, %0 \n\t" + "add %4, %1 \n\t" + "dec %%"REG_c" \n\t" + "jnz 1b \n\t" + : "+r"(src), "+r"(dst) + : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd), + "g"(stride-offset) + : "%"REG_c, "memory" + ); +} + +/** + * Filter coefficients made global to allow access by all 1 or 3 quarter shift + * interpolation functions. + */ +DECLARE_ASM_CONST(16, uint64_t, ff_pw_53) = 0x0035003500350035ULL; +DECLARE_ASM_CONST(16, uint64_t, ff_pw_18) = 0x0012001200120012ULL; + +/** + * Core of the 1/4 and 3/4 shift bicubic interpolation. + * + * @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty). + * @param MOVQ "movd 1" or "movq 2", if data read is already unpacked. + * @param A1 Address of 1st tap (beware of unpacked/packed). 
+ * @param A2 Address of 2nd tap + * @param A3 Address of 3rd tap + * @param A4 Address of 4th tap + */ +#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \ + MOVQ "*0+"A1", %%mm1 \n\t" \ + MOVQ "*4+"A1", %%mm2 \n\t" \ + UNPACK("%%mm1") \ + UNPACK("%%mm2") \ + "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \ + "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \ + MOVQ "*0+"A2", %%mm3 \n\t" \ + MOVQ "*4+"A2", %%mm4 \n\t" \ + UNPACK("%%mm3") \ + UNPACK("%%mm4") \ + "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ + "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \ + "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \ + "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \ + MOVQ "*0+"A4", %%mm1 \n\t" \ + MOVQ "*4+"A4", %%mm2 \n\t" \ + UNPACK("%%mm1") \ + UNPACK("%%mm2") \ + "psllw $2, %%mm1 \n\t" /* 4* */ \ + "psllw $2, %%mm2 \n\t" /* 4* */ \ + "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \ + "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \ + MOVQ "*0+"A3", %%mm1 \n\t" \ + MOVQ "*4+"A3", %%mm2 \n\t" \ + UNPACK("%%mm1") \ + UNPACK("%%mm2") \ + "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ + "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \ + "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \ + "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */ + +/** + * Macro to build the vertical 16bits version of vc1_put_shift[13]. + * Here, offset=src_stride. Parameters passed A1 to A4 must use + * %3 (src_stride) and %4 (3*src_stride). + * + * @param NAME Either 1 or 3 + * @see MSPEL_FILTER13_CORE for information on A1->A4 + */ +#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ +static void \ +vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \ + x86_reg src_stride, \ + int rnd, int64_t shift) \ +{ \ + int h = 8; \ + src -= src_stride; \ + __asm__ volatile( \ + LOAD_ROUNDER_MMX("%5") \ + "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \ + "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ + ASMALIGN(3) \ + "1: \n\t" \ + MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ + NORMALIZE_MMX("%6") \ + TRANSFER_DONT_PACK \ + /* Last 3 (in fact 4) bytes on the line */ \ + "movd 8+"A1", %%mm1 \n\t" \ + DO_UNPACK("%%mm1") \ + "movq %%mm1, %%mm3 \n\t" \ + "paddw %%mm1, %%mm1 \n\t" \ + "paddw %%mm3, %%mm1 \n\t" /* 3* */ \ + "movd 8+"A2", %%mm3 \n\t" \ + DO_UNPACK("%%mm3") \ + "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ + "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \ + "movd 8+"A3", %%mm1 \n\t" \ + DO_UNPACK("%%mm1") \ + "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ + "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \ + "movd 8+"A4", %%mm1 \n\t" \ + DO_UNPACK("%%mm1") \ + "psllw $2, %%mm1 \n\t" /* 4* */ \ + "psubw %%mm1, %%mm3 \n\t" \ + "paddw %%mm7, %%mm3 \n\t" \ + "psraw %6, %%mm3 \n\t" \ + "movq %%mm3, 16(%2) \n\t" \ + "add %3, %1 \n\t" \ + "add $24, %2 \n\t" \ + "decl %0 \n\t" \ + "jnz 1b \n\t" \ + : "+r"(h), "+r" (src), "+r" (dst) \ + : "r"(src_stride), "r"(3*src_stride), \ + "m"(rnd), "m"(shift) \ + : "memory" \ + ); \ +} + +/** + * Macro to build the horizontal 16bits version of vc1_put_shift[13]. + * Here, offset=16bits, so parameters passed A1 to A4 should be simple. 
+ * + * @param NAME Either 1 or 3 + * @see MSPEL_FILTER13_CORE for information on A1->A4 + */ +#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4) \ +static void \ +vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ + const int16_t *src, int rnd) \ +{ \ + int h = 8; \ + src -= 1; \ + rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ + __asm__ volatile( \ + LOAD_ROUNDER_MMX("%4") \ + "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ + "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ + ASMALIGN(3) \ + "1: \n\t" \ + MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ + NORMALIZE_MMX("$7") \ + /* Remove bias */ \ + "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ + "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ + TRANSFER_DO_PACK \ + "add $24, %1 \n\t" \ + "add %3, %2 \n\t" \ + "decl %0 \n\t" \ + "jnz 1b \n\t" \ + : "+r"(h), "+r" (src), "+r" (dst) \ + : "r"(stride), "m"(rnd) \ + : "memory" \ + ); \ +} + +/** + * Macro to build the 8bits, any direction, version of vc1_put_shift[13]. + * Here, offset=src_stride. Parameters passed A1 to A4 must use + * %3 (offset) and %4 (3*offset). + * + * @param NAME Either 1 or 3 + * @see MSPEL_FILTER13_CORE for information on A1->A4 + */ +#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4) \ +static void \ +vc1_put_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ + x86_reg stride, int rnd, x86_reg offset) \ +{ \ + int h = 8; \ + src -= offset; \ + rnd = 32-rnd; \ + __asm__ volatile ( \ + LOAD_ROUNDER_MMX("%6") \ + "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ + "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ + ASMALIGN(3) \ + "1: \n\t" \ + MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ + NORMALIZE_MMX("$6") \ + TRANSFER_DO_PACK \ + "add %5, %1 \n\t" \ + "add %5, %2 \n\t" \ + "decl %0 \n\t" \ + "jnz 1b \n\t" \ + : "+r"(h), "+r" (src), "+r" (dst) \ + : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ + : "memory" \ + ); \ +} + +/** 1/4 shift bicubic interpolation */ +MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") +MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") +MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)") + +/** 3/4 shift bicubic interpolation */ +MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") +MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") +MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)") + +typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift); +typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd); +typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset); + +/** + * Interpolates fractional pel values by applying proper vertical then + * horizontal filter. + * + * @param dst Destination buffer for interpolated pels. + * @param src Source buffer. + * @param stride Stride for both src and dst buffers. + * @param hmode Horizontal filter (expressed in quarter pixels shift). + * @param hmode Vertical filter. + * @param rnd Rounding bias. 
+ */ +static void vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride, + int hmode, int vmode, int rnd) +{ + static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] = + { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx }; + static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] = + { NULL, vc1_put_hor_16b_shift1_mmx, vc1_put_hor_16b_shift2_mmx, vc1_put_hor_16b_shift3_mmx }; + static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = + { NULL, vc1_put_shift1_mmx, vc1_put_shift2_mmx, vc1_put_shift3_mmx }; + + __asm__ volatile( + "pxor %%mm0, %%mm0 \n\t" + ::: "memory" + ); + + if (vmode) { /* Vertical filter to apply */ + if (hmode) { /* Horizontal filter to apply, output to tmp */ + static const int shift_value[] = { 0, 5, 1, 5 }; + int shift = (shift_value[hmode]+shift_value[vmode])>>1; + int r; + DECLARE_ALIGNED_16(int16_t, tmp[12*8]); + + r = (1<<(shift-1)) + rnd-1; + vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); + + vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); + return; + } + else { /* No horizontal filter, output 8 lines to dst */ + vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); + return; + } + } + + /* Horizontal mode with no vertical mode */ + vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); +} + +void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); + +/** Macro to ease bicubic filter interpolation functions declarations */ +#define DECLARE_FUNCTION(a, b) \ +static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ + vc1_mspel_mc(dst, src, stride, a, b, rnd); \ +} + +DECLARE_FUNCTION(0, 1) +DECLARE_FUNCTION(0, 2) +DECLARE_FUNCTION(0, 3) + +DECLARE_FUNCTION(1, 0) +DECLARE_FUNCTION(1, 1) +DECLARE_FUNCTION(1, 2) +DECLARE_FUNCTION(1, 3) + +DECLARE_FUNCTION(2, 0) +DECLARE_FUNCTION(2, 1) +DECLARE_FUNCTION(2, 2) +DECLARE_FUNCTION(2, 3) + +DECLARE_FUNCTION(3, 0) +DECLARE_FUNCTION(3, 1) +DECLARE_FUNCTION(3, 2) +DECLARE_FUNCTION(3, 3) + +void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { + dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; + dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; + dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; + dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; + + dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; + dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; + dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; + dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; + + dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; + dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; + dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; + dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; + + dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; + dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; + dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; + dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; +}
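
For reference, the MMX code above implements two small FIR kernels of the VC-1 motion-compensation filter: the half-pel "shift2" kernel (-1, 9, 9, -1) normalized with >> 4, and the quarter-pel bicubic "shift1"/"shift3" kernels (-4, 53, 18, -3) and (-3, 18, 53, -4) normalized with >> 6 in the 8-bit path (the 16-bit intermediate passes keep extra precision and fold the rounding bias differently). A scalar sketch of the 8-bit single-direction case follows; the function name and the way rnd is folded in are illustrative simplifications.

#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

/* Scalar sketch of one output sample of the 8-bit single-direction filters.
 * taps is (-1,9,9,-1) with shift 4 for the half-pel case, or
 * (-4,53,18,-3) / (-3,18,53,-4) with shift 6 for the quarter-pel cases;
 * step is 1 for horizontal filtering and the line stride for vertical. */
static inline uint8_t vc1_filter_sketch(const uint8_t *src, int step,
                                        const int taps[4], int rnd, int shift)
{
    int v = taps[0] * src[-step] + taps[1] * src[0]
          + taps[2] * src[ step] + taps[3] * src[2 * step];
    return av_clip_uint8((v + rnd) >> shift);
}

The tap sums (16 for the half-pel kernel, 64 for the bicubic ones) explain the >> 4 and >> 6 normalization shifts visible in NORMALIZE_MMX above.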
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/vp3dsp_mmx.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,396 @@ +/* + * Copyright (C) 2004 the ffmpeg project + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file vp3dsp_mmx.c + * MMX-optimized functions cribbed from the original VP3 source code. + */ + +#include "libavutil/x86_cpu.h" +#include "libavcodec/dsputil.h" +#include "dsputil_mmx.h" + +extern const uint16_t ff_vp3_idct_data[]; + +// this is off by one or two for some cases when filter_limit is greater than 63 +// in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 +// out: p1 in mm4, p2 in mm3 +#define VP3_LOOP_FILTER(flim) \ + "movq %%mm6, %%mm7 \n\t" \ + "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \ + "psrlw $3, %%mm7 \n\t" \ + "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \ + "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \ + "pxor %%mm4, %%mm2 \n\t" \ + "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \ + "movq %%mm2, %%mm5 \n\t" \ + "paddb %%mm2, %%mm2 \n\t" \ + "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \ + "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \ + "pcmpeqb %%mm0, %%mm0 \n\t" \ + "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \ + "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \ + "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \ + "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \ + "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \ + "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \ + "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \ + "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \ + "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \ + "psubusb %%mm7, %%mm6 \n\t" \ + "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \ +\ + "movq "#flim", %%mm5 \n\t" \ + "pminub %%mm5, %%mm6 \n\t" \ + "pminub %%mm5, %%mm7 \n\t" \ + "movq %%mm6, %%mm0 \n\t" \ + "movq %%mm7, %%mm1 \n\t" \ + "paddb %%mm6, %%mm6 \n\t" \ + "paddb %%mm7, %%mm7 \n\t" \ + "pminub %%mm5, %%mm6 \n\t" \ + "pminub %%mm5, %%mm7 \n\t" \ + "psubb %%mm0, %%mm6 \n\t" \ + "psubb %%mm1, %%mm7 \n\t" \ + "paddusb %%mm7, %%mm4 \n\t" \ + "psubusb %%mm6, %%mm4 \n\t" \ + "psubusb %%mm7, %%mm3 \n\t" \ + "paddusb %%mm6, %%mm3 \n\t" + +#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \ + "movd "#mm", %0 \n\t" \ + "movw %w0, -1"#dst0" \n\t" \ + "psrlq $32, "#mm" \n\t" \ + "shr $16, %0 \n\t" \ + "movw %w0, -1"#dst1" \n\t" \ + "movd "#mm", %0 \n\t" \ + "movw %w0, -1"#dst2" \n\t" \ + "shr $16, %0 \n\t" \ + "movw %w0, -1"#dst3" \n\t" + +void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) +{ + __asm__ volatile( + "movq %0, %%mm6 \n\t" + "movq %1, %%mm4 \n\t" + "movq %2, %%mm2 \n\t" + "movq %3, %%mm1 \n\t" + + VP3_LOOP_FILTER(%4) + + "movq %%mm4, %1 \n\t" + "movq %%mm3, %2 \n\t" + + : "+m" (*(uint64_t*)(src - 2*stride)), + "+m" (*(uint64_t*)(src - 1*stride)), + "+m" 
(*(uint64_t*)(src + 0*stride)), + "+m" (*(uint64_t*)(src + 1*stride)) + : "m"(*(uint64_t*)(bounding_values+129)) + ); +} + +void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) +{ + x86_reg tmp; + + __asm__ volatile( + "movd -2(%1), %%mm6 \n\t" + "movd -2(%1,%3), %%mm0 \n\t" + "movd -2(%1,%3,2), %%mm1 \n\t" + "movd -2(%1,%4), %%mm4 \n\t" + + TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) + VP3_LOOP_FILTER(%5) + SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) + + STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) + STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) + + : "=&r"(tmp) + : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), + "m"(*(uint64_t*)(bounding_values+129)) + : "memory" + ); +} + +/* from original comments: The Macro does IDct on 4 1-D Dcts */ +#define BeginIDCT() \ + "movq "I(3)", %%mm2 \n\t" \ + "movq "C(3)", %%mm6 \n\t" \ + "movq %%mm2, %%mm4 \n\t" \ + "movq "J(5)", %%mm7 \n\t" \ + "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \ + "movq "C(5)", %%mm1 \n\t" \ + "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \ + "movq %%mm1, %%mm5 \n\t" \ + "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \ + "movq "I(1)", %%mm3 \n\t" \ + "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \ + "movq "C(1)", %%mm0 \n\t" \ + "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \ + "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \ + "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \ + "movq "J(7)", %%mm1 \n\t" \ + "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \ + "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \ + "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \ + "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \ + "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \ + "movq "C(7)", %%mm7 \n\t" \ + "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \ + "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \ + "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \ + "movq "I(2)", %%mm2 \n\t" \ + "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \ + "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \ + "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \ + "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \ + "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \ + "movq "J(6)", %%mm5 \n\t" \ + "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \ + "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \ + "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \ + "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \ + "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \ + "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \ + "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \ + "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \ + "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \ + "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \ + "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \ + "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \ + "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \ + "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \ + "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \ + "movq "C(4)", %%mm4 \n\t" \ + "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \ + "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ + "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ + "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \ + "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \ + "movq "I(0)", %%mm6 \n\t" \ + "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \ + "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \ + "movq "J(4)", %%mm3 \n\t" \ + "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. 
- H */ \ + "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \ + "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \ + "movq %%mm6, %%mm0 \n\t" \ + "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \ + "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \ + "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \ + "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \ + "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \ + "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \ + "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \ + "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \ + "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \ + "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \ + "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \ + "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \ + "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */ + +/* RowIDCT gets ready to transpose */ +#define RowIDCT() \ + BeginIDCT() \ + "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ + "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ + "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ + "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ + "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ + "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \ + "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ + "paddsw %%mm3, %%mm3 \n\t" \ + "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ + "paddsw %%mm5, %%mm5 \n\t" \ + "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ + "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ + "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ + "paddsw %%mm0, %%mm0 \n\t" \ + "movq %%mm1, "I(1)"\n\t" /* save R1 */ \ + "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ + +/* Column IDCT normalizes and stores final results */ +#define ColumnIDCT() \ + BeginIDCT() \ + "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \ + "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ + "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ + "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \ + "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ + "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \ + "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ + "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ + "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \ + "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ + "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \ + "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ + "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \ + "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \ + "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ + "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \ + "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ + "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \ + "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \ + "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \ + "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ + "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \ + "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \ + "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \ + "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \ + "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ + "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \ + "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \ + "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. 
*/ \ + "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \ + "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \ + "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \ + "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \ + "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \ + "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */ + +/* Following macro does two 4x4 transposes in place. + + At entry (we assume): + + r0 = a3 a2 a1 a0 + I(1) = b3 b2 b1 b0 + r2 = c3 c2 c1 c0 + r3 = d3 d2 d1 d0 + + r4 = e3 e2 e1 e0 + r5 = f3 f2 f1 f0 + r6 = g3 g2 g1 g0 + r7 = h3 h2 h1 h0 + + At exit, we have: + + I(0) = d0 c0 b0 a0 + I(1) = d1 c1 b1 a1 + I(2) = d2 c2 b2 a2 + I(3) = d3 c3 b3 a3 + + J(4) = h0 g0 f0 e0 + J(5) = h1 g1 f1 e1 + J(6) = h2 g2 f2 e2 + J(7) = h3 g3 f3 e3 + + I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. + J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. + + Since r1 is free at entry, we calculate the Js first. */ +#define Transpose() \ + "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \ + "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \ + "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \ + "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \ + "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \ + "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \ + "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \ + "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \ + "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \ + "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \ + "movq %%mm4, "J(4)"\n\t" \ + "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \ + "movq %%mm5, "J(5)"\n\t" \ + "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \ + "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \ + "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \ + "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \ + "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \ + "movq %%mm6, "J(7)"\n\t" \ + "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \ + "movq %%mm1, "J(6)"\n\t" \ + "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \ + "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \ + "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \ + "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \ + "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \ + "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \ + "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \ + "movq %%mm0, "I(0)"\n\t" \ + "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \ + "movq %%mm1, "I(1)"\n\t" \ + "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \ + "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \ + "movq %%mm4, "I(3)"\n\t" \ + "movq %%mm2, "I(2)"\n\t" + +void ff_vp3_idct_mmx(int16_t *output_data) +{ + /* eax = quantized input + * ebx = dequantizer matrix + * ecx = IDCT constants + * M(I) = ecx + MaskOffset(0) + I * 8 + * C(I) = ecx + CosineOffset(32) + (I-1) * 8 + * edx = output + * r0..r7 = mm0..mm7 + */ + +#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" +#define OC_8 "%2" + + /* at this point, function has completed dequantization + dezigzag + + * partial transposition; now do the idct itself */ +#define I(x) AV_STRINGIFY(16* x )"(%0)" +#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" + + __asm__ volatile ( + RowIDCT() + Transpose() + +#undef I +#undef J +#define I(x) AV_STRINGIFY(16* x + 64)"(%0)" +#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" + + RowIDCT() + Transpose() + +#undef I +#undef J +#define I(x) AV_STRINGIFY(16*x)"(%0)" +#define J(x) AV_STRINGIFY(16*x)"(%0)" + + ColumnIDCT() + +#undef I +#undef J 
+#define I(x) AV_STRINGIFY(16*x + 8)"(%0)" +#define J(x) AV_STRINGIFY(16*x + 8)"(%0)" + + ColumnIDCT() + :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) + ); +#undef I +#undef J + +} + +void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_vp3_idct_mmx(block); + put_signed_pixels_clamped_mmx(block, dest, line_size); +} + +void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_vp3_idct_mmx(block); + add_pixels_clamped_mmx(block, dest, line_size); +}
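
The MMX2 loop filters above approximate the scalar VP3 deblocking rule: a filter value is computed from the four pixels straddling the edge, limited via the bounding_values table, and then added to and subtracted from the two pixels nearest the edge. The asm replaces the table lookup with a pminub clamp against a limit read from near the middle of the table (the +129 offset), which is why the comment in VP3_LOOP_FILTER warns it can be off by one or two when filter_limit exceeds 63. A scalar sketch of the per-pixel rule for a horizontal edge, with illustrative naming:

#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

/* One pixel of the VP3 edge filter across a horizontal edge (illustrative).
 * src points at the first pixel below the edge; bounding_values is the
 * centred lookup table the scalar filter uses. */
static inline void vp3_filter_pixel_sketch(uint8_t *src, int stride,
                                           const int *bounding_values)
{
    int d = (src[-2 * stride] - src[stride]) + 3 * (src[0] - src[-stride]);
    d = bounding_values[(d + 4) >> 3];
    src[-stride] = av_clip_uint8(src[-stride] + d);
    src[0]       = av_clip_uint8(src[0]       - d);
}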
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/vp3dsp_mmx.h Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,35 @@ +/* + * vp3dsp MMX function declarations + * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_VP3DSP_MMX_H +#define AVCODEC_X86_VP3DSP_MMX_H + +#include <stdint.h> +#include "libavcodec/dsputil.h" + +void ff_vp3_idct_mmx(int16_t *data); +void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); + +void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); +void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); + +#endif /* AVCODEC_X86_VP3DSP_MMX_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/vp3dsp_sse2.c Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2004 the ffmpeg project + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file vp3dsp_sse2.c + * SSE2-optimized functions cribbed from the original VP3 source code. + */ + +#include "libavcodec/dsputil.h" +#include "dsputil_mmx.h" + +DECLARE_ALIGNED_16(const uint16_t, ff_vp3_idct_data[7 * 8]) = +{ + 64277,64277,64277,64277,64277,64277,64277,64277, + 60547,60547,60547,60547,60547,60547,60547,60547, + 54491,54491,54491,54491,54491,54491,54491,54491, + 46341,46341,46341,46341,46341,46341,46341,46341, + 36410,36410,36410,36410,36410,36410,36410,36410, + 25080,25080,25080,25080,25080,25080,25080,25080, + 12785,12785,12785,12785,12785,12785,12785,12785 +}; + + +#define VP3_1D_IDCT_SSE2(ADD, SHIFT) \ + "movdqa "I(3)", %%xmm2 \n\t" /* xmm2 = i3 */ \ + "movdqa "C(3)", %%xmm6 \n\t" /* xmm6 = c3 */ \ + "movdqa %%xmm2, %%xmm4 \n\t" /* xmm4 = i3 */ \ + "movdqa "I(5)", %%xmm7 \n\t" /* xmm7 = i5 */ \ + "pmulhw %%xmm6, %%xmm4 \n\t" /* xmm4 = c3 * i3 - i3 */ \ + "movdqa "C(5)", %%xmm1 \n\t" /* xmm1 = c5 */ \ + "pmulhw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 - i5 */ \ + "movdqa %%xmm1, %%xmm5 \n\t" /* xmm5 = c5 */ \ + "pmulhw %%xmm2, %%xmm1 \n\t" /* xmm1 = c5 * i3 - i3 */ \ + "movdqa "I(1)", %%xmm3 \n\t" /* xmm3 = i1 */ \ + "pmulhw %%xmm7, %%xmm5 \n\t" /* xmm5 = c5 * i5 - i5 */ \ + "movdqa "C(1)", %%xmm0 \n\t" /* xmm0 = c1 */ \ + "paddw %%xmm2, %%xmm4 \n\t" /* xmm4 = c3 * i3 */ \ + "paddw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 */ \ + "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = c5 * i3 */ \ + "movdqa "I(7)", %%xmm1 \n\t" /* xmm1 = i7 */ \ + "paddw %%xmm5, %%xmm7 \n\t" /* xmm7 = c5 * i5 */ \ + "movdqa %%xmm0, %%xmm5 \n\t" /* xmm5 = c1 */ \ + "pmulhw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 - i1 */ \ + "paddsw %%xmm7, %%xmm4 \n\t" /* xmm4 = c3 * i3 + c5 * i5 = C */ \ + "pmulhw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 - i7 */ \ + "movdqa "C(7)", %%xmm7 \n\t" /* xmm7 = c7 */ \ + "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = c3 * i5 - c5 * i3 = D */ \ + "paddw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 */ \ + "pmulhw %%xmm7, %%xmm3 \n\t" /* xmm3 = c7 * i1 */ \ + "movdqa "I(2)", %%xmm2 \n\t" /* xmm2 = i2 */ \ + "pmulhw %%xmm1, %%xmm7 \n\t" /* xmm7 = c7 * i7 */ \ + "paddw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 */ \ + "movdqa %%xmm2, %%xmm1 \n\t" /* xmm1 = i2 */ \ + "pmulhw "C(2)", %%xmm2 \n\t" /* xmm2 = i2 * c2 -i2 */ \ + "psubsw %%xmm5, %%xmm3 \n\t" /* xmm3 = c7 * i1 - c1 * i7 = B */ \ + "movdqa "I(6)", %%xmm5 \n\t" /* xmm5 = i6 */ \ + "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = c1 * i1 + c7 * i7 = A */ \ + "movdqa %%xmm5, %%xmm7 \n\t" /* xmm7 = i6 */ \ + "psubsw %%xmm4, %%xmm0 \n\t" /* xmm0 = A - C */ \ + "pmulhw "C(2)", %%xmm5 \n\t" /* xmm5 = c2 * i6 - i6 */ \ + "paddw 
%%xmm1, %%xmm2 \n\t" /* xmm2 = i2 * c2 */ \ + "pmulhw "C(6)", %%xmm1 \n\t" /* xmm1 = c6 * i2 */ \ + "paddsw %%xmm4, %%xmm4 \n\t" /* xmm4 = C + C */ \ + "paddsw %%xmm0, %%xmm4 \n\t" /* xmm4 = A + C = C. */ \ + "psubsw %%xmm6, %%xmm3 \n\t" /* xmm3 = B - D */ \ + "paddw %%xmm7, %%xmm5 \n\t" /* xmm5 = c2 * i6 */ \ + "paddsw %%xmm6, %%xmm6 \n\t" /* xmm6 = D + D */ \ + "pmulhw "C(6)", %%xmm7 \n\t" /* xmm7 = c6 * i6 */ \ + "paddsw %%xmm3, %%xmm6 \n\t" /* xmm6 = B + D = D. */ \ + "movdqa %%xmm4, "I(1)" \n\t" /* Save C. at I(1) */ \ + "psubsw %%xmm5, %%xmm1 \n\t" /* xmm1 = c6 * i2 - c2 * i6 = H */ \ + "movdqa "C(4)", %%xmm4 \n\t" /* xmm4 = c4 */ \ + "movdqa %%xmm3, %%xmm5 \n\t" /* xmm5 = B - D */ \ + "pmulhw %%xmm4, %%xmm3 \n\t" /* xmm3 = ( c4 -1 ) * ( B - D ) */ \ + "paddsw %%xmm2, %%xmm7 \n\t" /* xmm7 = c2 * i2 + c6 * i6 = G */ \ + "movdqa %%xmm6, "I(2)" \n\t" /* Save D. at I(2) */ \ + "movdqa %%xmm0, %%xmm2 \n\t" /* xmm2 = A - C */ \ + "movdqa "I(0)", %%xmm6 \n\t" /* xmm6 = i0 */ \ + "pmulhw %%xmm4, %%xmm0 \n\t" /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \ + "paddw %%xmm3, %%xmm5 \n\t" /* xmm5 = c4 * ( B - D ) = B. */ \ + "movdqa "I(4)", %%xmm3 \n\t" /* xmm3 = i4 */ \ + "psubsw %%xmm1, %%xmm5 \n\t" /* xmm5 = B. - H = B.. */ \ + "paddw %%xmm0, %%xmm2 \n\t" /* xmm2 = c4 * ( A - C) = A. */ \ + "psubsw %%xmm3, %%xmm6 \n\t" /* xmm6 = i0 - i4 */ \ + "movdqa %%xmm6, %%xmm0 \n\t" /* xmm0 = i0 - i4 */ \ + "pmulhw %%xmm4, %%xmm6 \n\t" /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \ + "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = i4 + i4 */ \ + "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H + H */ \ + "paddsw %%xmm0, %%xmm3 \n\t" /* xmm3 = i0 + i4 */ \ + "paddsw %%xmm5, %%xmm1 \n\t" /* xmm1 = B. + H = H. */ \ + "pmulhw %%xmm3, %%xmm4 \n\t" /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ + "paddw %%xmm0, %%xmm6 \n\t" /* xmm6 = c4 * ( i0 - i4 ) */ \ + "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = F - A. = F. */ \ + "paddsw %%xmm2, %%xmm2 \n\t" /* xmm2 = A. + A. */ \ + "movdqa "I(1)", %%xmm0 \n\t" /* Load C. from I(1) */ \ + "paddsw %%xmm6, %%xmm2 \n\t" /* xmm2 = F + A. = A.. */ \ + "paddw %%xmm3, %%xmm4 \n\t" /* xmm4 = c4 * ( i0 + i4 ) = 3 */ \ + "psubsw %%xmm1, %%xmm2 \n\t" /* xmm2 = A.. - H. = R2 */ \ + ADD(%%xmm2) /* Adjust R2 and R1 before shifting */ \ + "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H. + H. */ \ + "paddsw %%xmm2, %%xmm1 \n\t" /* xmm1 = A.. + H. = R1 */ \ + SHIFT(%%xmm2) /* xmm2 = op2 */ \ + "psubsw %%xmm7, %%xmm4 \n\t" /* xmm4 = E - G = E. */ \ + SHIFT(%%xmm1) /* xmm1 = op1 */ \ + "movdqa "I(2)", %%xmm3 \n\t" /* Load D. from I(2) */ \ + "paddsw %%xmm7, %%xmm7 \n\t" /* xmm7 = G + G */ \ + "paddsw %%xmm4, %%xmm7 \n\t" /* xmm7 = E + G = G. */ \ + "psubsw %%xmm3, %%xmm4 \n\t" /* xmm4 = E. - D. = R4 */ \ + ADD(%%xmm4) /* Adjust R4 and R3 before shifting */ \ + "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = D. + D. */ \ + "paddsw %%xmm4, %%xmm3 \n\t" /* xmm3 = E. + D. = R3 */ \ + SHIFT(%%xmm4) /* xmm4 = op4 */ \ + "psubsw %%xmm5, %%xmm6 \n\t" /* xmm6 = F. - B..= R6 */ \ + SHIFT(%%xmm3) /* xmm3 = op3 */ \ + ADD(%%xmm6) /* Adjust R6 and R5 before shifting */ \ + "paddsw %%xmm5, %%xmm5 \n\t" /* xmm5 = B.. + B.. */ \ + "paddsw %%xmm6, %%xmm5 \n\t" /* xmm5 = F. + B.. = R5 */ \ + SHIFT(%%xmm6) /* xmm6 = op6 */ \ + SHIFT(%%xmm5) /* xmm5 = op5 */ \ + "psubsw %%xmm0, %%xmm7 \n\t" /* xmm7 = G. - C. = R7 */ \ + ADD(%%xmm7) /* Adjust R7 and R0 before shifting */ \ + "paddsw %%xmm0, %%xmm0 \n\t" /* xmm0 = C. + C. */ \ + "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = G. + C. 
*/ \ + SHIFT(%%xmm7) /* xmm7 = op7 */ \ + SHIFT(%%xmm0) /* xmm0 = op0 */ + +#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) \ + "movdqa " #r0 ", " O(0) "\n\t" \ + "movdqa " #r1 ", " O(1) "\n\t" \ + "movdqa " #r2 ", " O(2) "\n\t" \ + "movdqa " #r3 ", " O(3) "\n\t" \ + "movdqa " #r4 ", " O(4) "\n\t" \ + "movdqa " #r5 ", " O(5) "\n\t" \ + "movdqa " #r6 ", " O(6) "\n\t" \ + "movdqa " #r7 ", " O(7) "\n\t" + +#define NOP(xmm) +#define SHIFT4(xmm) "psraw $4, "#xmm"\n\t" +#define ADD8(xmm) "paddsw %2, "#xmm"\n\t" + +void ff_vp3_idct_sse2(int16_t *input_data) +{ +#define I(x) AV_STRINGIFY(16*x)"(%0)" +#define O(x) I(x) +#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" + + __asm__ volatile ( + VP3_1D_IDCT_SSE2(NOP, NOP) + + TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0)) + PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1) + + VP3_1D_IDCT_SSE2(ADD8, SHIFT4) + PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) + :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) + ); +} + +void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_vp3_idct_sse2(block); + put_signed_pixels_clamped_mmx(block, dest, line_size); +} + +void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_vp3_idct_sse2(block); + add_pixels_clamped_mmx(block, dest, line_size); +}
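
A detail worth noting for readers of both IDCT implementations: ff_vp3_idct_data stores round(cos(k*pi/16) * 65536), which wraps to a negative value when read as a signed 16-bit word, so "pmulhw C(k)" yields roughly i*cos - i and the following paddw restores i*cos (hence asm comments such as "c3*i3 - i3" followed by "c3*i3"). The column pass also applies the final (x + 8) >> 4 normalization through the ADD8/SHIFT4 parameters, while the row pass uses NOP/NOP. A minimal, self-contained check of the pmulhw identity (values and names here are only for illustration):

#include <stdint.h>
#include <stdio.h>

/* Model of pmulhw: signed 16x16 multiply, keep the high 16 bits
 * (the arithmetic right shift models the high-word extraction). */
static int16_t pmulhw(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b) >> 16);
}

int main(void)
{
    const uint16_t c3 = 54491;              /* round(cos(3*pi/16) * 65536), from ff_vp3_idct_data */
    int16_t i    = 1000;
    int16_t part = pmulhw(i, (int16_t)c3);  /* ~ i*cos(3*pi/16) - i, since (int16_t)c3 == c3 - 65536 */
    printf("%d\n", part + i);               /* prints 831, i.e. ~ 1000 * 0.8315 */
    return 0;
}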
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/vp3dsp_sse2.h Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,31 @@ +/* + * vp3dsp SSE2 function declarations + * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_VP3DSP_SSE2_H +#define AVCODEC_X86_VP3DSP_SSE2_H + +#include "libavcodec/dsputil.h" + +void ff_vp3_idct_sse2(int16_t *input_data); +void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); + +#endif /* AVCODEC_X86_VP3DSP_SSE2_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/x86inc.asm Mon Dec 22 09:12:42 2008 +0000 @@ -0,0 +1,540 @@ +;***************************************************************************** +;* x86inc.asm +;***************************************************************************** +;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +; FIXME: All of the 64bit asm functions that take a stride as an argument +; via register, assume that the high dword of that register is filled with 0. +; This is true in practice (since we never do any 64bit arithmetic on strides, +; and x264's strides are all positive), but is not guaranteed by the ABI. + +; Name of the .rodata section. +; Kludge: Something on OS X fails to align .rodata even given an align attribute, +; so use a different read-only section. +%macro SECTION_RODATA 0 + %ifidn __OUTPUT_FORMAT__,macho64 + SECTION .text align=16 + %elifidn __OUTPUT_FORMAT__,macho + SECTION .text align=16 + fakegot: + %else + SECTION .rodata align=16 + %endif +%endmacro + +; PIC support macros. All these macros are totally harmless when PIC is +; not defined but can ruin everything if misused in PIC mode. On x86_32, shared +; objects cannot directly access global variables by address, they need to +; go through the GOT (global offset table). Most OSes do not care about it +; and let you load non-shared .so objects (Linux, Win32...). However, OS X +; requires PIC code in its .dylib objects. +; +; - GLOBAL should be used as a suffix for global addressing, eg. +; picgetgot ebx +; mov eax, [foo GLOBAL] +; instead of +; mov eax, [foo] +; +; - picgetgot computes the GOT address into the given register in PIC +; mode, otherwise does nothing. You need to do this before using GLOBAL. +; Before in both execution order and compiled code order (so GLOBAL knows +; which register the GOT is in). + +%ifndef PIC + %define GLOBAL + %macro picgetgot 1 + %endmacro +%elifdef ARCH_X86_64 + %define PIC64 + %define GLOBAL wrt rip + %macro picgetgot 1 + %endmacro +%else + %define PIC32 + %ifidn __OUTPUT_FORMAT__,macho + ; There is no real global offset table on OS X, but we still + ; need to reference our variables by offset. 
+ %macro picgetgot 1 + call %%getgot + %%getgot: + pop %1 + add %1, $$ - %%getgot + %undef GLOBAL + %define GLOBAL + %1 - fakegot + %endmacro + %else ; elf + extern _GLOBAL_OFFSET_TABLE_ + %macro picgetgot 1 + call %%getgot + %%getgot: + pop %1 + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%getgot wrt ..gotpc + %undef GLOBAL + %define GLOBAL + %1 wrt ..gotoff + %endmacro + %endif +%endif + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed. +; %3 = whether global constants are used in this function. inits x86_32 PIC if needed. +; %4 = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE + +; REP_RET: +; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons +; which are slow when a normal ret follows a branch. + +%macro DECLARE_REG 6 + %define r%1q %2 + %define r%1d %3 + %define r%1w %4 + %define r%1b %5 + %define r%1m %6 + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 2 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1b %2 + %define e%1b %2 +%ifndef ARCH_X86_64 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al +DECLARE_REG_SIZE bx, bl +DECLARE_REG_SIZE cx, cl +DECLARE_REG_SIZE dx, dl +DECLARE_REG_SIZE si, sil +DECLARE_REG_SIZE di, dil +DECLARE_REG_SIZE bp, bpl + +%ifdef ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %assign stack_offset stack_offset+gprsize +%endmacro + +%macro POP 1 + pop %1 + %assign stack_offset stack_offset-gprsize +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1b r %+ %%i %+ b + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %assign n_arg_names %%i +%endmacro + +%ifdef ARCH_X86_64 ;========================================================== +%ifidn __OUTPUT_FORMAT__,win32 + +DECLARE_REG 0, rcx, ecx, cx, cl, ecx +DECLARE_REG 1, rdx, edx, dx, dl, edx +DECLARE_REG 2, r8, r8d, r8w, 
r8b, r8d +DECLARE_REG 3, r9, r9d, r9w, r9b, r9d +DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40] +DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48] +DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56] +%define r7m [rsp + stack_offset + 64] +%define r8m [rsp + stack_offset + 72] + +%macro LOAD_IF_USED 2 ; reg_id, number_of_args + %if %1 < %2 + mov r%1, [rsp + 8 + %1*8] + %endif +%endmacro + +%else ;======================================================================= + +DECLARE_REG 0, rdi, edi, di, dil, edi +DECLARE_REG 1, rsi, esi, si, sil, esi +DECLARE_REG 2, rdx, edx, dx, dl, edx +DECLARE_REG 3, rcx, ecx, cx, cl, ecx +DECLARE_REG 4, r8, r8d, r8w, r8b, r8d +DECLARE_REG 5, r9, r9d, r9w, r9b, r9d +DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] +%define r7m [rsp + stack_offset + 16] +%define r8m [rsp + stack_offset + 24] + +%macro LOAD_IF_USED 2 ; reg_id, number_of_args + %if %1 < %2 + mov r%1, [rsp - 40 + %1*8] + %endif +%endmacro + +%endif ; !WIN64 + +%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... + ASSERT %2 >= %1 + ASSERT %2 <= 7 + %assign stack_offset 0 +%ifidn __OUTPUT_FORMAT__,win32 + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 +%endif + LOAD_IF_USED 6, %1 + DEFINE_ARGS %4 +%endmacro + +%macro RET 0 + ret +%endmacro + +%macro REP_RET 0 + rep ret +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4] +DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8] +DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12] +DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16] +DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20] +DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24] +DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] +%define r7m [esp + stack_offset + 32] +%define r8m [esp + stack_offset + 36] +%define rsp esp + +%macro PUSH_IF_USED 1 ; reg_id + %if %1 < regs_used + push r%1 + %assign stack_offset stack_offset+4 + %endif +%endmacro + +%macro POP_IF_USED 1 ; reg_id + %if %1 < regs_used + pop r%1 + %endif +%endmacro + +%macro LOAD_IF_USED 2 ; reg_id, number_of_args + %if %1 < %2 + mov r%1, [esp + stack_offset + 4 + %1*4] + %endif +%endmacro + +%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names... 
+ ASSERT %2 >= %1 + %assign stack_offset 0 + %assign regs_used %2 + %ifdef PIC + %if %3 + %assign regs_used regs_used+1 + %endif + %endif + ASSERT regs_used <= 7 + PUSH_IF_USED 3 + PUSH_IF_USED 4 + PUSH_IF_USED 5 + PUSH_IF_USED 6 + LOAD_IF_USED 0, %1 + LOAD_IF_USED 1, %1 + LOAD_IF_USED 2, %1 + LOAD_IF_USED 3, %1 + LOAD_IF_USED 4, %1 + LOAD_IF_USED 5, %1 + LOAD_IF_USED 6, %1 + %if %3 + picgetgot r%2 + %endif + DEFINE_ARGS %4 +%endmacro + +%macro RET 0 + POP_IF_USED 6 + POP_IF_USED 5 + POP_IF_USED 4 + POP_IF_USED 3 + ret +%endmacro + +%macro REP_RET 0 + %if regs_used > 3 + RET + %else + rep ret + %endif +%endmacro + +%endif ;====================================================================== + + + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Symbol prefix for C linkage +%macro cglobal 1-2+ + %xdefine %1 ff_%1 + %ifdef PREFIX + %xdefine %1 _ %+ %1 + %endif + %ifidn __OUTPUT_FORMAT__,elf + global %1:function hidden + %else + global %1 + %endif + align function_align + %1: + RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %if %0 > 1 + PROLOGUE %2 + %endif +%endmacro + +%macro cextern 1 + %ifdef PREFIX + extern _%1 + %define %1 _%1 + %else + extern %1 + %endif +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. +%ifidn __OUTPUT_FORMAT__,elf +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif + +%assign FENC_STRIDE 16 +%assign FDEC_STRIDE 32 + +; merge mmx and sse* + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0 + %define RESET_MM_PERMUTATION INIT_MMX + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnt movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro INIT_XMM 0 + %define RESET_MM_PERMUTATION INIT_XMM + %define mmsize 16 + %define num_mmregs 8 + %ifdef ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnt movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep +%endmacro + +INIT_MMX + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. 
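; A hypothetical usage sketch of the SWAP macro defined below (illustrative
; only). SWAP exchanges register *names*, so later code that refers to m0/m1
; is assembled with the swapped registers and no move instruction is emitted:
;
;     INIT_MMX
;     mova  m0, [r0]
;     mova  m1, [r1]
;     SWAP  0, 1          ; from here on, "m0" names what "m1" named above
;     mova  [r2], m0      ; stores the data that was loaded from [r1]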
+ +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine tmp%2 m%2 + %xdefine ntmp%2 nm%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 tmp%2 + %xdefine nm%1 ntmp%2 + %undef tmp%2 + %undef ntmp%2 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) +%rep %0-1 +%ifdef m%1 + %xdefine tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 +%else + ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. + ; Be careful using this mode in nested macros though, as in some cases there may be + ; other copies of m# that have already been dereferenced and don't get updated correctly. + %xdefine %%n1 n %+ %1 + %xdefine %%n2 n %+ %2 + %xdefine tmp m %+ %%n1 + CAT_XDEFINE m, %%n1, m %+ %%n2 + CAT_XDEFINE m, %%n2, tmp + CAT_XDEFINE n, m %+ %%n1, %%n1 + CAT_XDEFINE n, m %+ %%n2, %%n2 +%endif + %undef tmp + %rotate 1 +%endrep +%endmacro + +%macro SAVE_MM_PERMUTATION 1 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %1_m, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro call 1 + call %1 + %ifdef %1_m0 + LOAD_MM_PERMUTATION %1 + %endif +%endmacro + +; substitutions which are functionally identical but reduce code size +%define movdqa movaps +%define movdqu movups +
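
Finally, on the C-linkage side: as the cglobal macro above shows, every symbol it declares is prefixed with ff_ (plus a leading underscore when PREFIX is defined) and marked hidden on ELF, so a routine written with this framework is reached from C through an ordinary prototype. A small sketch, with a made-up function name and signature purely for illustration:

#include <stdint.h>

/* Hypothetical yasm routine built as "cglobal example_copy, 3,3,0, dst, src, len":
 * the asm source names it example_copy; the linker symbol is ff_example_copy. */
void ff_example_copy(uint8_t *dst, const uint8_t *src, int len);

static void copy_wrapper(uint8_t *dst, const uint8_t *src, int len)
{
    ff_example_copy(dst, src, len);   /* the three args map to r0, r1, r2 in PROLOGUE */
}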