changeset 1972:8a556283601d libavcodec

hook up support for SSE2-optimized VP3 IDCT
author melanson
date Sun, 25 Apr 2004 03:33:30 +0000
parents 39f2ba94e09b
children 8dc96e383480
files Makefile dsputil.h i386/dsputil_mmx.c vp3.c
diffstat 4 files changed, 23 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/Makefile	Sun Apr 25 03:30:19 2004 +0000
+++ b/Makefile	Sun Apr 25 03:33:30 2004 +0000
@@ -116,7 +116,8 @@
 OBJS += i386/fdct_mmx.o i386/cputest.o \
 	i386/dsputil_mmx.o i386/mpegvideo_mmx.o \
 	i386/idct_mmx.o i386/motion_est_mmx.o \
-	i386/simple_idct_mmx.o i386/fft_sse.o i386/vp3dsp_mmx.o
+	i386/simple_idct_mmx.o i386/fft_sse.o i386/vp3dsp_mmx.o \
+	i386/vp3dsp_sse2.o
 ifdef TARGET_BUILTIN_VECTOR
 i386/fft_sse.o: CFLAGS+= -msse
 depend: CFLAGS+= -msse
--- a/dsputil.h	Sun Apr 25 03:30:19 2004 +0000
+++ b/dsputil.h	Sun Apr 25 03:33:30 2004 +0000
@@ -73,6 +73,12 @@
 void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
     int coeff_count, uint8_t *dest, int stride);
 
+void vp3_dsp_init_sse2(void);
+void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, uint8_t *dest, int stride);
+void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, uint8_t *dest, int stride);
+
 
 /* minimum alignment rules ;)
 if u notice errors in the align stuff, need more alignment for some asm code for some cpu
@@ -403,6 +409,7 @@
 }
 
 #define __align8 __attribute__ ((aligned (8)))
+#define __align16 __attribute__ ((aligned (16)))
 
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
--- a/i386/dsputil_mmx.c	Sun Apr 25 03:30:19 2004 +0000
+++ b/i386/dsputil_mmx.c	Sun Apr 25 03:33:30 2004 +0000
@@ -2147,9 +2147,15 @@
         }
 
         /* VP3 optimized DSP functions */
-        c->vp3_dsp_init = vp3_dsp_init_mmx;
-        c->vp3_idct_put = vp3_idct_put_mmx;
-        c->vp3_idct_add = vp3_idct_add_mmx;
+        if (mm_flags & MM_SSE2) {
+            c->vp3_dsp_init = vp3_dsp_init_sse2;
+            c->vp3_idct_put = vp3_idct_put_sse2;
+            c->vp3_idct_add = vp3_idct_add_sse2;
+        } else {
+            c->vp3_dsp_init = vp3_dsp_init_mmx;
+            c->vp3_idct_put = vp3_idct_put_mmx;
+            c->vp3_idct_add = vp3_idct_add_mmx;
+        }
         
 #ifdef CONFIG_ENCODERS
         c->get_pixels = get_pixels_mmx;
--- a/vp3.c	Sun Apr 25 03:30:19 2004 +0000
+++ b/vp3.c	Sun Apr 25 03:33:30 2004 +0000
@@ -268,9 +268,11 @@
     VLC ac_vlc_3[16];
     VLC ac_vlc_4[16];
 
-    int16_t intra_y_dequant[64];
-    int16_t intra_c_dequant[64];
-    int16_t inter_dequant[64];
+    /* these arrays need to be on 16-byte boundaries since SSE2 operations
+     * index into them */
+    int16_t __align16 intra_y_dequant[64];
+    int16_t __align16 intra_c_dequant[64];
+    int16_t __align16 inter_dequant[64];
 
     /* This table contains superblock_count * 16 entries. Each set of 16
      * numbers corresponds to the fragment indices 0..15 of the superblock.