# HG changeset patch # User rbultje # Date 1283185546 0 # Node ID d6d0a43848b498e4c8caf7219e2120ad13b6c97d # Parent fe78a4548d1201c26427ebd821dea563140c112a Move VP3 IDCT functions from inline ASM to YASM. This fixes part of the VP3/5/6 issues on Win64. diff -r fe78a4548d12 -r d6d0a43848b4 x86/Makefile --- a/x86/Makefile Mon Aug 30 16:22:27 2010 +0000 +++ b/x86/Makefile Mon Aug 30 16:25:46 2010 +0000 @@ -26,15 +26,12 @@ MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o -MMX-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp_mmx.o \ - x86/vp3dsp_sse2.o -MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp_mmx.o \ - x86/vp3dsp_sse2.o \ - x86/vp56dsp_init.o -YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o -MMX-OBJS-$(CONFIG_VP6_DECODER) += x86/vp3dsp_mmx.o \ - x86/vp3dsp_sse2.o \ - x86/vp56dsp_init.o +YASM-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp.o +YASM-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp.o +MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp56dsp_init.o +YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp3dsp.o \ + x86/vp56dsp.o +MMX-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp_init.o YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ diff -r fe78a4548d12 -r d6d0a43848b4 x86/dsputil_mmx.c --- a/x86/dsputil_mmx.c Mon Aug 30 16:22:27 2010 +0000 +++ b/x86/dsputil_mmx.c Mon Aug 30 16:25:46 2010 +0000 @@ -28,8 +28,6 @@ #include "libavcodec/mpegvideo.h" #include "libavcodec/simple_idct.h" #include "dsputil_mmx.h" -#include "vp3dsp_mmx.h" -#include "vp3dsp_sse2.h" #include "idct_xvid.h" //#undef NDEBUG @@ -2376,6 +2374,19 @@ ); } +void ff_vp3_idct_mmx(int16_t *input_data); +void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); + +void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block); + +void 
ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); +void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); + +void ff_vp3_idct_sse2(int16_t *input_data); +void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); + void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); diff -r fe78a4548d12 -r d6d0a43848b4 x86/vp3dsp.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/x86/vp3dsp.asm Mon Aug 30 16:25:46 2010 +0000 @@ -0,0 +1,618 @@ +;****************************************************************************** +;* MMX/SSE2-optimized functions for the VP3 decoder +;* Copyright (c) 2007 Aurelien Jacobs +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +; MMX-optimized functions cribbed from the original VP3 source code. 
+ +SECTION_RODATA + +vp3_idct_data: times 8 dw 64277 + times 8 dw 60547 + times 8 dw 54491 + times 8 dw 46341 + times 8 dw 36410 + times 8 dw 25080 + times 8 dw 12785 + +cextern pb_1 +cextern pb_3 +cextern pb_7 +cextern pb_1F +cextern pb_81 + +cextern pw_8 + +cextern put_signed_pixels_clamped_mmx +cextern add_pixels_clamped_mmx + +SECTION .text + +; this is off by one or two for some cases when filter_limit is greater than 63 +; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 +; out: p1 in mm4, p2 in mm3 +%macro VP3_LOOP_FILTER 0 + movq m7, m6 + pand m6, [pb_7] ; p0&7 + psrlw m7, 3 + pand m7, [pb_1F] ; p0>>3 + movq m3, m2 ; p2 + pxor m2, m4 + pand m2, [pb_1] ; (p2^p1)&1 + movq m5, m2 + paddb m2, m2 + paddb m2, m5 ; 3*(p2^p1)&1 + paddb m2, m6 ; extra bits lost in shifts + pcmpeqb m0, m0 + pxor m1, m0 ; 255 - p3 + pavgb m1, m2 ; (256 - p3 + extrabits) >> 1 + pxor m0, m4 ; 255 - p1 + pavgb m0, m3 ; (256 + p2-p1) >> 1 + paddb m1, [pb_3] + pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2 + pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3 + paddusb m7, m1 ; d+128+1 + movq m6, [pb_81] + psubusb m6, m7 + psubusb m7, [pb_81] + + movq m5, [r2+516] ; flim + pminub m6, m5 + pminub m7, m5 + movq m0, m6 + movq m1, m7 + paddb m6, m6 + paddb m7, m7 + pminub m6, m5 + pminub m7, m5 + psubb m6, m0 + psubb m7, m1 + paddusb m4, m7 + psubusb m4, m6 + psubusb m3, m7 + paddusb m3, m6 +%endmacro + +%macro STORE_4_WORDS 1 + movd r2, %1 + mov [r0 -1], r2w + psrlq %1, 32 + shr r2, 16 + mov [r0+r1 -1], r2w + movd r2, %1 + mov [r0+r1*2-1], r2w + shr r2, 16 + mov [r0+r3 -1], r2w +%endmacro + +INIT_MMX +cglobal vp3_v_loop_filter_mmx2, 3, 4 +%ifdef ARCH_X86_64 + movsxd r1, r1d +%endif + mov r3, r1 + neg r1 + movq m6, [r0+r1*2] + movq m4, [r0+r1 ] + movq m2, [r0 ] + movq m1, [r0+r3 ] + + VP3_LOOP_FILTER + + movq [r0+r1], m4 + movq [r0 ], m3 + RET + +cglobal vp3_h_loop_filter_mmx2, 3, 4 +%ifdef ARCH_X86_64 + movsxd r1, r1d +%endif + lea r3, [r1*3] + + movd m6, [r0 -2] + movd m4, [r0+r1 -2] + movd m2, [r0+r1*2-2] + 
movd m1, [r0+r3 -2] + lea r0, [r0+r1*4 ] + punpcklbw m6, [r0 -2] + punpcklbw m4, [r0+r1 -2] + punpcklbw m2, [r0+r1*2-2] + punpcklbw m1, [r0+r3 -2] + sub r0, r3 + sub r0, r1 + + TRANSPOSE4x4B 6, 4, 2, 1, 0 + VP3_LOOP_FILTER + SBUTTERFLY bw, 4, 3, 5 + + STORE_4_WORDS m4 + lea r0, [r0+r1*4 ] + STORE_4_WORDS m3 + RET + +; from original comments: The Macro does IDct on 4 1-D Dcts +%macro BeginIDCT 0 + movq m2, I(3) + movq m6, C(3) + movq m4, m2 + movq m7, J(5) + pmulhw m4, m6 ; r4 = c3*i3 - i3 + movq m1, C(5) + pmulhw m6, m7 ; r6 = c3*i5 - i5 + movq m5, m1 + pmulhw m1, m2 ; r1 = c5*i3 - i3 + movq m3, I(1) + pmulhw m5, m7 ; r5 = c5*i5 - i5 + movq m0, C(1) + paddw m4, m2 ; r4 = c3*i3 + paddw m6, m7 ; r6 = c3*i5 + paddw m2, m1 ; r2 = c5*i3 + movq m1, J(7) + paddw m7, m5 ; r7 = c5*i5 + movq m5, m0 ; r5 = c1 + pmulhw m0, m3 ; r0 = c1*i1 - i1 + paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5 + pmulhw m5, m1 ; r5 = c1*i7 - i7 + movq m7, C(7) + psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3 + paddw m0, m3 ; r0 = c1*i1 + pmulhw m3, m7 ; r3 = c7*i1 + movq m2, I(2) + pmulhw m7, m1 ; r7 = c7*i7 + paddw m5, m1 ; r5 = c1*i7 + movq m1, m2 ; r1 = i2 + pmulhw m2, C(2) ; r2 = c2*i2 - i2 + psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7 + movq m5, J(6) + paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7 + movq m7, m5 ; r7 = i6 + psubsw m0, m4 ; r0 = A - C + pmulhw m5, C(2) ; r5 = c2*i6 - i6 + paddw m2, m1 ; r2 = c2*i2 + pmulhw m1, C(6) ; r1 = c6*i2 + paddsw m4, m4 ; r4 = C + C + paddsw m4, m0 ; r4 = C. = A + C + psubsw m3, m6 ; r3 = B - D + paddw m5, m7 ; r5 = c2*i6 + paddsw m6, m6 ; r6 = D + D + pmulhw m7, C(6) ; r7 = c6*i6 + paddsw m6, m3 ; r6 = D. = B + D + movq I(1), m4 ; save C. at I(1) + psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6 + movq m4, C(4) + movq m5, m3 ; r5 = B - D + pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D) + paddsw m7, m2 ; r7 = G = c6*i6 + c2*i2 + movq I(2), m6 ; save D. at I(2) + movq m2, m0 ; r2 = A - C + movq m6, I(0) + pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C) + paddw m5, m3 ; r5 = B. 
= c4 * (B - D) + movq m3, J(4) + psubsw m5, m1 ; r5 = B.. = B. - H + paddw m2, m0 ; r2 = A. = c4 * (A - C) + psubsw m6, m3 ; r6 = i0 - i4 + movq m0, m6 + pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4) + paddsw m3, m3 ; r3 = i4 + i4 + paddsw m1, m1 ; r1 = H + H + paddsw m3, m0 ; r3 = i0 + i4 + paddsw m1, m5 ; r1 = H. = B + H + pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4) + paddsw m6, m0 ; r6 = F = c4 * (i0 - i4) + psubsw m6, m2 ; r6 = F. = F - A. + paddsw m2, m2 ; r2 = A. + A. + movq m0, I(1) ; r0 = C. + paddsw m2, m6 ; r2 = A.. = F + A. + paddw m4, m3 ; r4 = E = c4 * (i0 + i4) + psubsw m2, m1 ; r2 = R2 = A.. - H. +%endmacro + +; RowIDCT gets ready to transpose +%macro RowIDCT 0 + BeginIDCT + movq m3, I(2) ; r3 = D. + psubsw m4, m7 ; r4 = E. = E - G + paddsw m1, m1 ; r1 = H. + H. + paddsw m7, m7 ; r7 = G + G + paddsw m1, m2 ; r1 = R1 = A.. + H. + paddsw m7, m4 ; r7 = G. = E + G + psubsw m4, m3 ; r4 = R4 = E. - D. + paddsw m3, m3 + psubsw m6, m5 ; r6 = R6 = F. - B.. + paddsw m5, m5 + paddsw m3, m4 ; r3 = R3 = E. + D. + paddsw m5, m6 ; r5 = R5 = F. + B.. + psubsw m7, m0 ; r7 = R7 = G. - C. + paddsw m0, m0 + movq I(1), m1 ; save R1 + paddsw m0, m7 ; r0 = R0 = G. + C. +%endmacro + +; Column IDCT normalizes and stores final results +%macro ColumnIDCT 0 + BeginIDCT + paddsw m2, OC_8 ; adjust R2 (and R1) for shift + paddsw m1, m1 ; r1 = H. + H. + paddsw m1, m2 ; r1 = R1 = A.. + H. + psraw m2, 4 ; r2 = NR2 + psubsw m4, m7 ; r4 = E. = E - G + psraw m1, 4 ; r1 = NR1 + movq m3, I(2) ; r3 = D. + paddsw m7, m7 ; r7 = G + G + movq I(2), m2 ; store NR2 at I2 + paddsw m7, m4 ; r7 = G. = E + G + movq I(1), m1 ; store NR1 at I1 + psubsw m4, m3 ; r4 = R4 = E. - D. + paddsw m4, OC_8 ; adjust R4 (and R3) for shift + paddsw m3, m3 ; r3 = D. + D. + paddsw m3, m4 ; r3 = R3 = E. + D. + psraw m4, 4 ; r4 = NR4 + psubsw m6, m5 ; r6 = R6 = F. - B.. + psraw m3, 4 ; r3 = NR3 + paddsw m6, OC_8 ; adjust R6 (and R5) for shift + paddsw m5, m5 ; r5 = B.. + B.. + paddsw m5, m6 ; r5 = R5 = F. + B.. 
+ psraw m6, 4 ; r6 = NR6 + movq J(4), m4 ; store NR4 at J4 + psraw m5, 4 ; r5 = NR5 + movq I(3), m3 ; store NR3 at I3 + psubsw m7, m0 ; r7 = R7 = G. - C. + paddsw m7, OC_8 ; adjust R7 (and R0) for shift + paddsw m0, m0 ; r0 = C. + C. + paddsw m0, m7 ; r0 = R0 = G. + C. + psraw m7, 4 ; r7 = NR7 + movq J(6), m6 ; store NR6 at J6 + psraw m0, 4 ; r0 = NR0 + movq J(5), m5 ; store NR5 at J5 + movq J(7), m7 ; store NR7 at J7 + movq I(0), m0 ; store NR0 at I0 +%endmacro + +; Following macro does two 4x4 transposes in place. +; +; At entry (we assume): +; +; r0 = a3 a2 a1 a0 +; I(1) = b3 b2 b1 b0 +; r2 = c3 c2 c1 c0 +; r3 = d3 d2 d1 d0 +; +; r4 = e3 e2 e1 e0 +; r5 = f3 f2 f1 f0 +; r6 = g3 g2 g1 g0 +; r7 = h3 h2 h1 h0 +; +; At exit, we have: +; +; I(0) = d0 c0 b0 a0 +; I(1) = d1 c1 b1 a1 +; I(2) = d2 c2 b2 a2 +; I(3) = d3 c3 b3 a3 +; +; J(4) = h0 g0 f0 e0 +; J(5) = h1 g1 f1 e1 +; J(6) = h2 g2 f2 e2 +; J(7) = h3 g3 f3 e3 +; +; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. +; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. +; +; Since r1 is free at entry, we calculate the Js first. 
+%macro Transpose 0 + movq m1, m4 ; r1 = e3 e2 e1 e0 + punpcklwd m4, m5 ; r4 = f1 e1 f0 e0 + movq I(0), m0 ; save a3 a2 a1 a0 + punpckhwd m1, m5 ; r1 = f3 e3 f2 e2 + movq m0, m6 ; r0 = g3 g2 g1 g0 + punpcklwd m6, m7 ; r6 = h1 g1 h0 g0 + movq m5, m4 ; r5 = f1 e1 f0 e0 + punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4 + punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5 + movq m6, m1 ; r6 = f3 e3 f2 e2 + movq J(4), m4 + punpckhwd m0, m7 ; r0 = h3 g3 h2 g2 + movq J(5), m5 + punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7 + movq m4, I(0) ; r4 = a3 a2 a1 a0 + punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6 + movq m5, I(1) ; r5 = b3 b2 b1 b0 + movq m0, m4 ; r0 = a3 a2 a1 a0 + movq J(7), m6 + punpcklwd m0, m5 ; r0 = b1 a1 b0 a0 + movq J(6), m1 + punpckhwd m4, m5 ; r4 = b3 a3 b2 a2 + movq m5, m2 ; r5 = c3 c2 c1 c0 + punpcklwd m2, m3 ; r2 = d1 c1 d0 c0 + movq m1, m0 ; r1 = b1 a1 b0 a0 + punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0 + punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1 + movq m2, m4 ; r2 = b3 a3 b2 a2 + movq I(0), m0 + punpckhwd m5, m3 ; r5 = d3 c3 d2 c2 + movq I(1), m1 + punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3 + punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2 + movq I(3), m4 + movq I(2), m2 +%endmacro + +%macro VP3_IDCT_mmx 1 + ; eax = quantized input + ; ebx = dequantizer matrix + ; ecx = IDCT constants + ; M(I) = ecx + MaskOffset(0) + I * 8 + ; C(I) = ecx + CosineOffset(32) + (I-1) * 8 + ; edx = output + ; r0..r7 = mm0..mm7 +%define OC_8 [pw_8] +%define C(x) [vp3_idct_data+16*(x-1)] + + ; at this point, function has completed dequantization + dezigzag + + ; partial transposition; now do the idct itself +%define I(x) [%1+16* x ] +%define J(x) [%1+16*(x-4)+8] + RowIDCT + Transpose + +%define I(x) [%1+16* x +64] +%define J(x) [%1+16*(x-4)+72] + RowIDCT + Transpose + +%define I(x) [%1+16*x] +%define J(x) [%1+16*x] + ColumnIDCT + +%define I(x) [%1+16*x+8] +%define J(x) [%1+16*x+8] + ColumnIDCT +%endmacro + +%macro VP3_1D_IDCT_SSE2 0 + movdqa m2, I(3) ; xmm2 = i3 + movdqa m6, C(3) ; xmm6 = c3 + movdqa m4, m2 ; 
xmm4 = i3 + movdqa m7, I(5) ; xmm7 = i5 + pmulhw m4, m6 ; xmm4 = c3 * i3 - i3 + movdqa m1, C(5) ; xmm1 = c5 + pmulhw m6, m7 ; xmm6 = c3 * i5 - i5 + movdqa m5, m1 ; xmm5 = c5 + pmulhw m1, m2 ; xmm1 = c5 * i3 - i3 + movdqa m3, I(1) ; xmm3 = i1 + pmulhw m5, m7 ; xmm5 = c5 * i5 - i5 + movdqa m0, C(1) ; xmm0 = c1 + paddw m4, m2 ; xmm4 = c3 * i3 + paddw m6, m7 ; xmm6 = c3 * i5 + paddw m2, m1 ; xmm2 = c5 * i3 + movdqa m1, I(7) ; xmm1 = i7 + paddw m7, m5 ; xmm7 = c5 * i5 + movdqa m5, m0 ; xmm5 = c1 + pmulhw m0, m3 ; xmm0 = c1 * i1 - i1 + paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C + pmulhw m5, m1 ; xmm5 = c1 * i7 - i7 + movdqa m7, C(7) ; xmm7 = c7 + psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D + paddw m0, m3 ; xmm0 = c1 * i1 + pmulhw m3, m7 ; xmm3 = c7 * i1 + movdqa m2, I(2) ; xmm2 = i2 + pmulhw m7, m1 ; xmm7 = c7 * i7 + paddw m5, m1 ; xmm5 = c1 * i7 + movdqa m1, m2 ; xmm1 = i2 + pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2 + psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B + movdqa m5, I(6) ; xmm5 = i6 + paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A + movdqa m7, m5 ; xmm7 = i6 + psubsw m0, m4 ; xmm0 = A - C + pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6 + paddw m2, m1 ; xmm2 = i2 * c2 + pmulhw m1, C(6) ; xmm1 = c6 * i2 + paddsw m4, m4 ; xmm4 = C + C + paddsw m4, m0 ; xmm4 = A + C = C. + psubsw m3, m6 ; xmm3 = B - D + paddw m5, m7 ; xmm5 = c2 * i6 + paddsw m6, m6 ; xmm6 = D + D + pmulhw m7, C(6) ; xmm7 = c6 * i6 + paddsw m6, m3 ; xmm6 = B + D = D. + movdqa I(1), m4 ; Save C. at I(1) + psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H + movdqa m4, C(4) ; xmm4 = C4 + movdqa m5, m3 ; xmm5 = B - D + pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D ) + paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G + movdqa I(2), m6 ; save D. at I(2) + movdqa m2, m0 ; xmm2 = A - C + movdqa m6, I(0) ; xmm6 = i0 + pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A. + paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B. + movdqa m3, I(4) ; xmm3 = i4 + psubsw m5, m1 ; xmm5 = B. - H = B.. + paddw m2, m0 ; xmm2 = c4 * ( A - C) = A. 
+ psubsw m6, m3 ; xmm6 = i0 - i4 + movdqa m0, m6 ; xmm0 = i0 - i4 + pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) + paddsw m3, m3 ; xmm3 = i4 + i4 + paddsw m1, m1 ; xmm1 = H + H + paddsw m3, m0 ; xmm3 = i0 + i4 + paddsw m1, m5 ; xmm1 = B. + H = H. + pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 ) + paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 ) = F + psubsw m6, m2 ; xmm6 = F - A. = F. + paddsw m2, m2 ; xmm2 = A. + A. + movdqa m0, I(1) ; Load C. from I(1) + paddsw m2, m6 ; xmm2 = F + A. = A.. + paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E + psubsw m2, m1 ; xmm2 = A.. - H. = R2 + ADD(m2) ; Adjust R2 and R1 before shifting + paddsw m1, m1 ; xmm1 = H. + H. + paddsw m1, m2 ; xmm1 = A.. + H. = R1 + SHIFT(m2) ; xmm2 = op2 + psubsw m4, m7 ; xmm4 = E - G = E. + SHIFT(m1) ; xmm1 = op1 + movdqa m3, I(2) ; Load D. from I(2) + paddsw m7, m7 ; xmm7 = G + G + paddsw m7, m4 ; xmm7 = E + G = G. + psubsw m4, m3 ; xmm4 = E. - D. = R4 + ADD(m4) ; Adjust R4 and R3 before shifting + paddsw m3, m3 ; xmm3 = D. + D. + paddsw m3, m4 ; xmm3 = E. + D. = R3 + SHIFT(m4) ; xmm4 = op4 + psubsw m6, m5 ; xmm6 = F. - B..= R6 + SHIFT(m3) ; xmm3 = op3 + ADD(m6) ; Adjust R6 and R5 before shifting + paddsw m5, m5 ; xmm5 = B.. + B.. + paddsw m5, m6 ; xmm5 = F. + B.. = R5 + SHIFT(m6) ; xmm6 = op6 + SHIFT(m5) ; xmm5 = op5 + psubsw m7, m0 ; xmm7 = G. - C. = R7 + ADD(m7) ; Adjust R7 and R0 before shifting + paddsw m0, m0 ; xmm0 = C. + C. + paddsw m0, m7 ; xmm0 = G. + C. 
+ SHIFT(m7) ; xmm7 = op7 + SHIFT(m0) ; xmm0 = op0 +%endmacro + +%macro PUT_BLOCK 8 + movdqa O(0), m%1 + movdqa O(1), m%2 + movdqa O(2), m%3 + movdqa O(3), m%4 + movdqa O(4), m%5 + movdqa O(5), m%6 + movdqa O(6), m%7 + movdqa O(7), m%8 +%endmacro + +%macro VP3_IDCT_sse2 1 +%define I(x) [%1+16*x] +%define O(x) [%1+16*x] +%define C(x) [vp3_idct_data+16*(x-1)] +%define SHIFT(x) +%define ADD(x) + VP3_1D_IDCT_SSE2 +%ifdef ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16] +%endif + PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 + +%define SHIFT(x) psraw x, 4 +%define ADD(x) paddsw x, [pw_8] + VP3_1D_IDCT_SSE2 + PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 +%endmacro + +%macro vp3_idct_funcs 3 +cglobal vp3_idct_%1, 1, 1, %2 + VP3_IDCT_%1 r0 + RET + +cglobal vp3_idct_put_%1, 3, %3, %2 + VP3_IDCT_%1 r2 +%ifdef ARCH_X86_64 + mov r3, r2 + mov r2, r1 + mov r1, r0 + mov r0, r3 +%else + mov r0m, r2 + mov r1m, r0 + mov r2m, r1 +%endif +%ifdef WIN64 + call put_signed_pixels_clamped_mmx + RET +%else + jmp put_signed_pixels_clamped_mmx +%endif + +cglobal vp3_idct_add_%1, 3, %3, %2 + VP3_IDCT_%1 r2 +%ifdef ARCH_X86_64 + mov r3, r2 + mov r2, r1 + mov r1, r0 + mov r0, r3 +%else + mov r0m, r2 + mov r1m, r0 + mov r2m, r1 +%endif +%ifdef WIN64 + call add_pixels_clamped_mmx + RET +%else + jmp add_pixels_clamped_mmx +%endif +%endmacro + +%ifdef ARCH_X86_64 +%define REGS 4 +%else +%define REGS 3 +%endif +INIT_MMX +vp3_idct_funcs mmx, 0, REGS +INIT_XMM +vp3_idct_funcs sse2, 9, REGS +%undef REGS + +%macro DC_ADD 0 + movq m2, [r0 ] + movq m3, [r0+r1 ] + paddusb m2, m0 + movq m4, [r0+r1*2] + paddusb m3, m0 + movq m5, [r0+r3 ] + paddusb m4, m0 + paddusb m5, m0 + psubusb m2, m1 + psubusb m3, m1 + movq [r0 ], m2 + psubusb m4, m1 + movq [r0+r1 ], m3 + psubusb m5, m1 + movq [r0+r1*2], m4 + movq [r0+r3 ], m5 +%endmacro + +INIT_MMX +cglobal vp3_idct_dc_add_mmx2, 3, 4 +%ifdef ARCH_X86_64 + movsxd r1, r1d +%endif + lea r3, [r1*3] + movsx r2, word [r2] + add r2, 15 + 
sar r2, 5 + movd m0, r2 + pshufw m0, m0, 0x0 + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 + DC_ADD + lea r0, [r0+r1*4] + DC_ADD + RET diff -r fe78a4548d12 -r d6d0a43848b4 x86/vp3dsp_mmx.c --- a/x86/vp3dsp_mmx.c Mon Aug 30 16:22:27 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,436 +0,0 @@ -/* - * Copyright (C) 2004 the ffmpeg project - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * MMX-optimized functions cribbed from the original VP3 source code. 
- */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" -#include "vp3dsp_mmx.h" - -extern const uint16_t ff_vp3_idct_data[]; - -// this is off by one or two for some cases when filter_limit is greater than 63 -// in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 -// out: p1 in mm4, p2 in mm3 -#define VP3_LOOP_FILTER(flim) \ - "movq %%mm6, %%mm7 \n\t" \ - "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \ - "psrlw $3, %%mm7 \n\t" \ - "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \ - "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \ - "pxor %%mm4, %%mm2 \n\t" \ - "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \ - "movq %%mm2, %%mm5 \n\t" \ - "paddb %%mm2, %%mm2 \n\t" \ - "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \ - "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \ - "pcmpeqb %%mm0, %%mm0 \n\t" \ - "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \ - "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \ - "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \ - "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \ - "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \ - "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \ - "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \ - "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \ - "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \ - "psubusb %%mm7, %%mm6 \n\t" \ - "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \ -\ - "movq "#flim", %%mm5 \n\t" \ - "pminub %%mm5, %%mm6 \n\t" \ - "pminub %%mm5, %%mm7 \n\t" \ - "movq %%mm6, %%mm0 \n\t" \ - "movq %%mm7, %%mm1 \n\t" \ - "paddb %%mm6, %%mm6 \n\t" \ - "paddb %%mm7, %%mm7 \n\t" \ - "pminub %%mm5, %%mm6 \n\t" \ - "pminub %%mm5, %%mm7 \n\t" \ - "psubb %%mm0, %%mm6 \n\t" \ - "psubb %%mm1, %%mm7 \n\t" \ - "paddusb %%mm7, %%mm4 \n\t" \ - "psubusb %%mm6, %%mm4 \n\t" \ - "psubusb %%mm7, %%mm3 \n\t" \ - "paddusb %%mm6, %%mm3 \n\t" - -#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \ - "movd "#mm", %0 \n\t" \ - "movw %w0, -1"#dst0" \n\t" \ - "psrlq $32, "#mm" 
\n\t" \ - "shr $16, %0 \n\t" \ - "movw %w0, -1"#dst1" \n\t" \ - "movd "#mm", %0 \n\t" \ - "movw %w0, -1"#dst2" \n\t" \ - "shr $16, %0 \n\t" \ - "movw %w0, -1"#dst3" \n\t" - -void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) -{ - __asm__ volatile( - "movq %0, %%mm6 \n\t" - "movq %1, %%mm4 \n\t" - "movq %2, %%mm2 \n\t" - "movq %3, %%mm1 \n\t" - - VP3_LOOP_FILTER(%4) - - "movq %%mm4, %1 \n\t" - "movq %%mm3, %2 \n\t" - - : "+m" (*(uint64_t*)(src - 2*stride)), - "+m" (*(uint64_t*)(src - 1*stride)), - "+m" (*(uint64_t*)(src + 0*stride)), - "+m" (*(uint64_t*)(src + 1*stride)) - : "m"(*(uint64_t*)(bounding_values+129)) - ); -} - -void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) -{ - x86_reg tmp; - - __asm__ volatile( - "movd -2(%1), %%mm6 \n\t" - "movd -2(%1,%3), %%mm0 \n\t" - "movd -2(%1,%3,2), %%mm1 \n\t" - "movd -2(%1,%4), %%mm4 \n\t" - - TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) - VP3_LOOP_FILTER(%5) - SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) - - STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) - STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) - - : "=&r"(tmp) - : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), - "m"(*(uint64_t*)(bounding_values+129)) - : "memory" - ); -} - -/* from original comments: The Macro does IDct on 4 1-D Dcts */ -#define BeginIDCT() \ - "movq "I(3)", %%mm2 \n\t" \ - "movq "C(3)", %%mm6 \n\t" \ - "movq %%mm2, %%mm4 \n\t" \ - "movq "J(5)", %%mm7 \n\t" \ - "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \ - "movq "C(5)", %%mm1 \n\t" \ - "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \ - "movq %%mm1, %%mm5 \n\t" \ - "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \ - "movq "I(1)", %%mm3 \n\t" \ - "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \ - "movq "C(1)", %%mm0 \n\t" \ - "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \ - "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \ - "paddw %%mm1, %%mm2 \n\t" /* 
r2 = c5*i3 */ \ - "movq "J(7)", %%mm1 \n\t" \ - "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \ - "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \ - "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \ - "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \ - "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \ - "movq "C(7)", %%mm7 \n\t" \ - "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \ - "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \ - "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \ - "movq "I(2)", %%mm2 \n\t" \ - "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \ - "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \ - "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \ - "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \ - "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \ - "movq "J(6)", %%mm5 \n\t" \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \ - "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \ - "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \ - "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \ - "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \ - "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \ - "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \ - "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \ - "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \ - "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \ - "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \ - "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \ - "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \ - "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \ - "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \ - "movq "C(4)", %%mm4 \n\t" \ - "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \ - "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ - "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ - "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \ - "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \ - "movq "I(0)", %%mm6 \n\t" \ - "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \ - "paddw %%mm3, %%mm5 \n\t" /* r5 = B. 
= c4 * (B - D) */ \ - "movq "J(4)", %%mm3 \n\t" \ - "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \ - "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \ - "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \ - "movq %%mm6, %%mm0 \n\t" \ - "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \ - "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \ - "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \ - "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \ - "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \ - "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \ - "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \ - "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \ - "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \ - "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \ - "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \ - "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */ - -/* RowIDCT gets ready to transpose */ -#define RowIDCT() \ - BeginIDCT() \ - "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ - "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ - "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ - "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ - "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \ - "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ - "paddsw %%mm3, %%mm3 \n\t" \ - "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ - "paddsw %%mm5, %%mm5 \n\t" \ - "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ - "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ - "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ - "paddsw %%mm0, %%mm0 \n\t" \ - "movq %%mm1, "I(1)"\n\t" /* save R1 */ \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. 
*/ - -/* Column IDCT normalizes and stores final results */ -#define ColumnIDCT() \ - BeginIDCT() \ - "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ - "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ - "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \ - "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ - "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \ - "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ - "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ - "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \ - "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ - "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \ - "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ - "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \ - "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \ - "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ - "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \ - "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ - "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \ - "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \ - "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \ - "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ - "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \ - "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \ - "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \ - "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \ - "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ - "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \ - "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \ - "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \ - "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \ - "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \ - "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \ - "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \ - "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */ - -/* Following macro does two 4x4 transposes in place. 
- - At entry (we assume): - - r0 = a3 a2 a1 a0 - I(1) = b3 b2 b1 b0 - r2 = c3 c2 c1 c0 - r3 = d3 d2 d1 d0 - - r4 = e3 e2 e1 e0 - r5 = f3 f2 f1 f0 - r6 = g3 g2 g1 g0 - r7 = h3 h2 h1 h0 - - At exit, we have: - - I(0) = d0 c0 b0 a0 - I(1) = d1 c1 b1 a1 - I(2) = d2 c2 b2 a2 - I(3) = d3 c3 b3 a3 - - J(4) = h0 g0 f0 e0 - J(5) = h1 g1 f1 e1 - J(6) = h2 g2 f2 e2 - J(7) = h3 g3 f3 e3 - - I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. - J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. - - Since r1 is free at entry, we calculate the Js first. */ -#define Transpose() \ - "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \ - "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \ - "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \ - "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \ - "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \ - "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \ - "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \ - "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \ - "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \ - "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \ - "movq %%mm4, "J(4)"\n\t" \ - "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \ - "movq %%mm5, "J(5)"\n\t" \ - "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \ - "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \ - "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \ - "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \ - "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \ - "movq %%mm6, "J(7)"\n\t" \ - "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \ - "movq %%mm1, "J(6)"\n\t" \ - "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \ - "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \ - "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \ - "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \ - "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \ - "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \ - "movq %%mm4, 
%%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \ - "movq %%mm0, "I(0)"\n\t" \ - "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \ - "movq %%mm1, "I(1)"\n\t" \ - "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \ - "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \ - "movq %%mm4, "I(3)"\n\t" \ - "movq %%mm2, "I(2)"\n\t" - -void ff_vp3_idct_mmx(int16_t *output_data) -{ - /* eax = quantized input - * ebx = dequantizer matrix - * ecx = IDCT constants - * M(I) = ecx + MaskOffset(0) + I * 8 - * C(I) = ecx + CosineOffset(32) + (I-1) * 8 - * edx = output - * r0..r7 = mm0..mm7 - */ - -#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" -#define OC_8 "%2" - - /* at this point, function has completed dequantization + dezigzag + - * partial transposition; now do the idct itself */ -#define I(x) AV_STRINGIFY(16* x )"(%0)" -#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" - - __asm__ volatile ( - RowIDCT() - Transpose() - -#undef I -#undef J -#define I(x) AV_STRINGIFY(16* x + 64)"(%0)" -#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" - - RowIDCT() - Transpose() - -#undef I -#undef J -#define I(x) AV_STRINGIFY(16*x)"(%0)" -#define J(x) AV_STRINGIFY(16*x)"(%0)" - - ColumnIDCT() - -#undef I -#undef J -#define I(x) AV_STRINGIFY(16*x + 8)"(%0)" -#define J(x) AV_STRINGIFY(16*x + 8)"(%0)" - - ColumnIDCT() - :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) - ); -#undef I -#undef J - -} - -void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_mmx(block); - ff_put_signed_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_mmx(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block) -{ - int dc = (block[0] + 15) >> 5; - - __asm__ volatile( - "movd %3, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 
\n\t" - "packuswb %%mm1, %%mm1 \n\t" - -#define DC_ADD \ - "movq (%0), %%mm2 \n\t" \ - "movq (%0,%1), %%mm3 \n\t" \ - "paddusb %%mm0, %%mm2 \n\t" \ - "movq (%0,%1,2), %%mm4 \n\t" \ - "paddusb %%mm0, %%mm3 \n\t" \ - "movq (%0,%2), %%mm5 \n\t" \ - "paddusb %%mm0, %%mm4 \n\t" \ - "paddusb %%mm0, %%mm5 \n\t" \ - "psubusb %%mm1, %%mm2 \n\t" \ - "psubusb %%mm1, %%mm3 \n\t" \ - "movq %%mm2, (%0) \n\t" \ - "psubusb %%mm1, %%mm4 \n\t" \ - "movq %%mm3, (%0,%1) \n\t" \ - "psubusb %%mm1, %%mm5 \n\t" \ - "movq %%mm4, (%0,%1,2) \n\t" \ - "movq %%mm5, (%0,%2) \n\t" - - DC_ADD - "lea (%0,%1,4), %0 \n\t" - DC_ADD - - : "+r"(dest) - : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc) - ); -} diff -r fe78a4548d12 -r d6d0a43848b4 x86/vp3dsp_mmx.h --- a/x86/vp3dsp_mmx.h Mon Aug 30 16:22:27 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ -/* - * vp3dsp MMX function declarations - * Copyright (c) 2007 Aurelien Jacobs - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_VP3DSP_MMX_H -#define AVCODEC_X86_VP3DSP_MMX_H - -#include <stdint.h> -#include "libavcodec/dsputil.h" - -void ff_vp3_idct_mmx(int16_t *data); -void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block); - -void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); -void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); - -#endif /* AVCODEC_X86_VP3DSP_MMX_H */ diff -r fe78a4548d12 -r d6d0a43848b4 x86/vp3dsp_sse2.c --- a/x86/vp3dsp_sse2.c Mon Aug 30 16:22:27 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,187 +0,0 @@ -/* - * Copyright (C) 2004 the ffmpeg project - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * SSE2-optimized functions cribbed from the original VP3 source code.
- */ - -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" -#include "vp3dsp_sse2.h" - -DECLARE_ALIGNED(16, const uint16_t, ff_vp3_idct_data)[7 * 8] = -{ - 64277,64277,64277,64277,64277,64277,64277,64277, - 60547,60547,60547,60547,60547,60547,60547,60547, - 54491,54491,54491,54491,54491,54491,54491,54491, - 46341,46341,46341,46341,46341,46341,46341,46341, - 36410,36410,36410,36410,36410,36410,36410,36410, - 25080,25080,25080,25080,25080,25080,25080,25080, - 12785,12785,12785,12785,12785,12785,12785,12785 -}; - - -#define VP3_1D_IDCT_SSE2(ADD, SHIFT) \ - "movdqa "I(3)", %%xmm2 \n\t" /* xmm2 = i3 */ \ - "movdqa "C(3)", %%xmm6 \n\t" /* xmm6 = c3 */ \ - "movdqa %%xmm2, %%xmm4 \n\t" /* xmm4 = i3 */ \ - "movdqa "I(5)", %%xmm7 \n\t" /* xmm7 = i5 */ \ - "pmulhw %%xmm6, %%xmm4 \n\t" /* xmm4 = c3 * i3 - i3 */ \ - "movdqa "C(5)", %%xmm1 \n\t" /* xmm1 = c5 */ \ - "pmulhw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 - i5 */ \ - "movdqa %%xmm1, %%xmm5 \n\t" /* xmm5 = c5 */ \ - "pmulhw %%xmm2, %%xmm1 \n\t" /* xmm1 = c5 * i3 - i3 */ \ - "movdqa "I(1)", %%xmm3 \n\t" /* xmm3 = i1 */ \ - "pmulhw %%xmm7, %%xmm5 \n\t" /* xmm5 = c5 * i5 - i5 */ \ - "movdqa "C(1)", %%xmm0 \n\t" /* xmm0 = c1 */ \ - "paddw %%xmm2, %%xmm4 \n\t" /* xmm4 = c3 * i3 */ \ - "paddw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 */ \ - "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = c5 * i3 */ \ - "movdqa "I(7)", %%xmm1 \n\t" /* xmm1 = i7 */ \ - "paddw %%xmm5, %%xmm7 \n\t" /* xmm7 = c5 * i5 */ \ - "movdqa %%xmm0, %%xmm5 \n\t" /* xmm5 = c1 */ \ - "pmulhw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 - i1 */ \ - "paddsw %%xmm7, %%xmm4 \n\t" /* xmm4 = c3 * i3 + c5 * i5 = C */ \ - "pmulhw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 - i7 */ \ - "movdqa "C(7)", %%xmm7 \n\t" /* xmm7 = c7 */ \ - "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = c3 * i5 - c5 * i3 = D */ \ - "paddw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 */ \ - "pmulhw %%xmm7, %%xmm3 \n\t" /* xmm3 = c7 * i1 */ \ - "movdqa "I(2)", %%xmm2 \n\t" /* xmm2 = i2 */ \ - "pmulhw %%xmm1, %%xmm7 \n\t" /* 
xmm7 = c7 * i7 */ \ - "paddw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 */ \ - "movdqa %%xmm2, %%xmm1 \n\t" /* xmm1 = i2 */ \ - "pmulhw "C(2)", %%xmm2 \n\t" /* xmm2 = i2 * c2 -i2 */ \ - "psubsw %%xmm5, %%xmm3 \n\t" /* xmm3 = c7 * i1 - c1 * i7 = B */ \ - "movdqa "I(6)", %%xmm5 \n\t" /* xmm5 = i6 */ \ - "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = c1 * i1 + c7 * i7 = A */ \ - "movdqa %%xmm5, %%xmm7 \n\t" /* xmm7 = i6 */ \ - "psubsw %%xmm4, %%xmm0 \n\t" /* xmm0 = A - C */ \ - "pmulhw "C(2)", %%xmm5 \n\t" /* xmm5 = c2 * i6 - i6 */ \ - "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = i2 * c2 */ \ - "pmulhw "C(6)", %%xmm1 \n\t" /* xmm1 = c6 * i2 */ \ - "paddsw %%xmm4, %%xmm4 \n\t" /* xmm4 = C + C */ \ - "paddsw %%xmm0, %%xmm4 \n\t" /* xmm4 = A + C = C. */ \ - "psubsw %%xmm6, %%xmm3 \n\t" /* xmm3 = B - D */ \ - "paddw %%xmm7, %%xmm5 \n\t" /* xmm5 = c2 * i6 */ \ - "paddsw %%xmm6, %%xmm6 \n\t" /* xmm6 = D + D */ \ - "pmulhw "C(6)", %%xmm7 \n\t" /* xmm7 = c6 * i6 */ \ - "paddsw %%xmm3, %%xmm6 \n\t" /* xmm6 = B + D = D. */ \ - "movdqa %%xmm4, "I(1)" \n\t" /* Save C. at I(1) */ \ - "psubsw %%xmm5, %%xmm1 \n\t" /* xmm1 = c6 * i2 - c2 * i6 = H */ \ - "movdqa "C(4)", %%xmm4 \n\t" /* xmm4 = c4 */ \ - "movdqa %%xmm3, %%xmm5 \n\t" /* xmm5 = B - D */ \ - "pmulhw %%xmm4, %%xmm3 \n\t" /* xmm3 = ( c4 -1 ) * ( B - D ) */ \ - "paddsw %%xmm2, %%xmm7 \n\t" /* xmm7 = c2 * i2 + c6 * i6 = G */ \ - "movdqa %%xmm6, "I(2)" \n\t" /* Save D. at I(2) */ \ - "movdqa %%xmm0, %%xmm2 \n\t" /* xmm2 = A - C */ \ - "movdqa "I(0)", %%xmm6 \n\t" /* xmm6 = i0 */ \ - "pmulhw %%xmm4, %%xmm0 \n\t" /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \ - "paddw %%xmm3, %%xmm5 \n\t" /* xmm5 = c4 * ( B - D ) = B. */ \ - "movdqa "I(4)", %%xmm3 \n\t" /* xmm3 = i4 */ \ - "psubsw %%xmm1, %%xmm5 \n\t" /* xmm5 = B. - H = B.. */ \ - "paddw %%xmm0, %%xmm2 \n\t" /* xmm2 = c4 * ( A - C) = A. 
*/ \ - "psubsw %%xmm3, %%xmm6 \n\t" /* xmm6 = i0 - i4 */ \ - "movdqa %%xmm6, %%xmm0 \n\t" /* xmm0 = i0 - i4 */ \ - "pmulhw %%xmm4, %%xmm6 \n\t" /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \ - "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = i4 + i4 */ \ - "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H + H */ \ - "paddsw %%xmm0, %%xmm3 \n\t" /* xmm3 = i0 + i4 */ \ - "paddsw %%xmm5, %%xmm1 \n\t" /* xmm1 = B. + H = H. */ \ - "pmulhw %%xmm3, %%xmm4 \n\t" /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ - "paddw %%xmm0, %%xmm6 \n\t" /* xmm6 = c4 * ( i0 - i4 ) */ \ - "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = F - A. = F. */ \ - "paddsw %%xmm2, %%xmm2 \n\t" /* xmm2 = A. + A. */ \ - "movdqa "I(1)", %%xmm0 \n\t" /* Load C. from I(1) */ \ - "paddsw %%xmm6, %%xmm2 \n\t" /* xmm2 = F + A. = A.. */ \ - "paddw %%xmm3, %%xmm4 \n\t" /* xmm4 = c4 * ( i0 + i4 ) = 3 */ \ - "psubsw %%xmm1, %%xmm2 \n\t" /* xmm2 = A.. - H. = R2 */ \ - ADD(%%xmm2) /* Adjust R2 and R1 before shifting */ \ - "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H. + H. */ \ - "paddsw %%xmm2, %%xmm1 \n\t" /* xmm1 = A.. + H. = R1 */ \ - SHIFT(%%xmm2) /* xmm2 = op2 */ \ - "psubsw %%xmm7, %%xmm4 \n\t" /* xmm4 = E - G = E. */ \ - SHIFT(%%xmm1) /* xmm1 = op1 */ \ - "movdqa "I(2)", %%xmm3 \n\t" /* Load D. from I(2) */ \ - "paddsw %%xmm7, %%xmm7 \n\t" /* xmm7 = G + G */ \ - "paddsw %%xmm4, %%xmm7 \n\t" /* xmm7 = E + G = G. */ \ - "psubsw %%xmm3, %%xmm4 \n\t" /* xmm4 = E. - D. = R4 */ \ - ADD(%%xmm4) /* Adjust R4 and R3 before shifting */ \ - "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = D. + D. */ \ - "paddsw %%xmm4, %%xmm3 \n\t" /* xmm3 = E. + D. = R3 */ \ - SHIFT(%%xmm4) /* xmm4 = op4 */ \ - "psubsw %%xmm5, %%xmm6 \n\t" /* xmm6 = F. - B..= R6 */ \ - SHIFT(%%xmm3) /* xmm3 = op3 */ \ - ADD(%%xmm6) /* Adjust R6 and R5 before shifting */ \ - "paddsw %%xmm5, %%xmm5 \n\t" /* xmm5 = B.. + B.. */ \ - "paddsw %%xmm6, %%xmm5 \n\t" /* xmm5 = F. + B.. = R5 */ \ - SHIFT(%%xmm6) /* xmm6 = op6 */ \ - SHIFT(%%xmm5) /* xmm5 = op5 */ \ - "psubsw %%xmm0, %%xmm7 \n\t" /* xmm7 = G. - C. 
= R7 */ \ - ADD(%%xmm7) /* Adjust R7 and R0 before shifting */ \ - "paddsw %%xmm0, %%xmm0 \n\t" /* xmm0 = C. + C. */ \ - "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = G. + C. */ \ - SHIFT(%%xmm7) /* xmm7 = op7 */ \ - SHIFT(%%xmm0) /* xmm0 = op0 */ - -#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) \ - "movdqa " #r0 ", " O(0) "\n\t" \ - "movdqa " #r1 ", " O(1) "\n\t" \ - "movdqa " #r2 ", " O(2) "\n\t" \ - "movdqa " #r3 ", " O(3) "\n\t" \ - "movdqa " #r4 ", " O(4) "\n\t" \ - "movdqa " #r5 ", " O(5) "\n\t" \ - "movdqa " #r6 ", " O(6) "\n\t" \ - "movdqa " #r7 ", " O(7) "\n\t" - -#define NOP(xmm) -#define SHIFT4(xmm) "psraw $4, "#xmm"\n\t" -#define ADD8(xmm) "paddsw %2, "#xmm"\n\t" - -void ff_vp3_idct_sse2(int16_t *input_data) -{ -#define I(x) AV_STRINGIFY(16*x)"(%0)" -#define O(x) I(x) -#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" - - __asm__ volatile ( - VP3_1D_IDCT_SSE2(NOP, NOP) - - TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0)) - PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1) - - VP3_1D_IDCT_SSE2(ADD8, SHIFT4) - PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) - :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) - ); -} - -void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_sse2(block); - ff_put_signed_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_sse2(block); - ff_add_pixels_clamped_mmx(block, dest, line_size); -} diff -r fe78a4548d12 -r d6d0a43848b4 x86/vp3dsp_sse2.h --- a/x86/vp3dsp_sse2.h Mon Aug 30 16:22:27 2010 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -/* - * vp3dsp SSE2 function declarations - * Copyright (c) 2007 Aurelien Jacobs - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_VP3DSP_SSE2_H -#define AVCODEC_X86_VP3DSP_SSE2_H - -#include "libavcodec/dsputil.h" - -void ff_vp3_idct_sse2(int16_t *input_data); -void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); - -#endif /* AVCODEC_X86_VP3DSP_SSE2_H */