changeset 11301:a8dfeb7180ff

optimize
author michael
date Tue, 28 Oct 2003 10:45:55 +0000
parents 2b98db967c14
children dc4db88fc4ff
files libmpcodecs/vf_spp.c
diffstat 1 files changed, 76 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/libmpcodecs/vf_spp.c	Tue Oct 28 10:00:51 2003 +0000
+++ b/libmpcodecs/vf_spp.c	Tue Oct 28 10:45:55 2003 +0000
@@ -96,13 +96,12 @@
 
 #define SHIFT 22
 
-static inline void requantize(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){
+static void requantize_c(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){
 	int i; 
-	const int qmul= qp<<1;
 	int bias= 0; //FIXME
 	unsigned int threshold1, threshold2;
 	
-	threshold1= qmul*((1<<3) - bias) - 1;
+	threshold1= qp*((1<<4) - bias) - 1;
 	threshold2= (threshold1<<1);
         
 	memset(dst, 0, 64*sizeof(DCTELEM));
@@ -117,6 +116,76 @@
 	}
 }
 
+#ifdef HAVE_MMX
+static void requantize_mmx(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation){
+	int bias= 0; //FIXME
+	unsigned int threshold1;
+	
+	threshold1= qp*((1<<4) - bias) - 1;
+	
+        asm volatile(
+#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
+		"movq " #src0 ", %%mm0	\n\t"\
+		"movq " #src1 ", %%mm1	\n\t"\
+		"movq " #src2 ", %%mm2	\n\t"\
+		"movq " #src3 ", %%mm3	\n\t"\
+		"psubw %%mm4, %%mm0	\n\t"\
+		"psubw %%mm4, %%mm1	\n\t"\
+		"psubw %%mm4, %%mm2	\n\t"\
+		"psubw %%mm4, %%mm3	\n\t"\
+		"paddusw %%mm5, %%mm0	\n\t"\
+		"paddusw %%mm5, %%mm1	\n\t"\
+		"paddusw %%mm5, %%mm2	\n\t"\
+		"paddusw %%mm5, %%mm3	\n\t"\
+		"paddw %%mm6, %%mm0	\n\t"\
+		"paddw %%mm6, %%mm1	\n\t"\
+		"paddw %%mm6, %%mm2	\n\t"\
+		"paddw %%mm6, %%mm3	\n\t"\
+		"psubusw %%mm6, %%mm0	\n\t"\
+		"psubusw %%mm6, %%mm1	\n\t"\
+		"psubusw %%mm6, %%mm2	\n\t"\
+		"psubusw %%mm6, %%mm3	\n\t"\
+		"psraw $3, %%mm0	\n\t"\
+		"psraw $3, %%mm1	\n\t"\
+		"psraw $3, %%mm2	\n\t"\
+		"psraw $3, %%mm3	\n\t"\
+\
+		"movq %%mm0, %%mm7	\n\t"\
+		"punpcklwd %%mm2, %%mm0	\n\t" /*A*/\
+		"punpckhwd %%mm2, %%mm7	\n\t" /*C*/\
+		"movq %%mm1, %%mm2	\n\t"\
+		"punpcklwd %%mm3, %%mm1	\n\t" /*B*/\
+		"punpckhwd %%mm3, %%mm2	\n\t" /*D*/\
+		"movq %%mm0, %%mm3	\n\t"\
+		"punpcklwd %%mm1, %%mm0	\n\t" /*A*/\
+		"punpckhwd %%mm7, %%mm3	\n\t" /*C*/\
+		"punpcklwd %%mm1, %%mm7	\n\t" /*B*/\
+		"punpckhwd %%mm2, %%mm1	\n\t" /*D*/\
+\
+		"movq %%mm0, " #dst0 "	\n\t"\
+		"movq %%mm7, " #dst1 "	\n\t"\
+		"movq %%mm3, " #dst2 "	\n\t"\
+		"movq %%mm1, " #dst3 "	\n\t"
+                
+		"movd %2, %%mm4		\n\t"
+		"movd %3, %%mm5		\n\t"
+		"movd %4, %%mm6		\n\t"
+		"packssdw %%mm4, %%mm4	\n\t"
+		"packssdw %%mm5, %%mm5	\n\t"
+		"packssdw %%mm6, %%mm6	\n\t"
+		"packssdw %%mm4, %%mm4	\n\t"
+		"packssdw %%mm5, %%mm5	\n\t"
+		"packssdw %%mm6, %%mm6	\n\t"
+		REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
+		REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+		REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+		REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+		: : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate then needed?
+	);
+	dst[0]= (src[0] + 4)>>3;
+}
+#endif
+
 static inline void add_block(int16_t *dst, int stride, DCTELEM block[64]){
 	int y;
 	
@@ -195,6 +264,8 @@
 
 static void (*store_slice)(uint8_t *dst, int16_t *src, int dst_stride, int src_stride, int width, int log2_scale)= store_slice_c;
 
+static void (*requantize)(DCTELEM dst[64], DCTELEM src[64], int qp, uint8_t *permutation)= requantize_c;
+
 static void filter(struct vf_priv_s *p, uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int width, int height, uint8_t *qp_store, int qp_stride, int is_luma){
 	int x, y, i;
 	const int count= 1<<p->log2_count;
@@ -202,7 +273,7 @@
 	uint64_t block_align[32];
 	DCTELEM *block = (DCTELEM *)block_align;
 	DCTELEM *block2= (DCTELEM *)(block_align+16);
-	
+
 	for(y=0; y<height; y++){
 		int index= 8 + 8*stride + y*stride;
 		memcpy(p->src + index, src + y*src_stride, width);
@@ -364,6 +435,7 @@
     dsputil_init(&vf->priv->dsp, vf->priv->avctx);
     if(gCpuCaps.hasMMX){
 	store_slice= store_slice_mmx;
+	requantize= requantize_mmx;
     }
     
     vf->priv->log2_count= 6;