Mercurial > libavcodec.hg
changeset 3093:19f3dd63af7d libavcodec
15% faster put_h264_qpel16_hv_lowpass_mmx2
author | lorenm |
---|---|
date | Tue, 07 Feb 2006 06:25:14 +0000 |
parents | 421579792fc2 |
children | 385fa360f618 |
files | i386/h264dsp_mmx.c |
diffstat | 1 files changed, 45 insertions(+), 26 deletions(-) [+] |
line wrap: on
line diff
--- a/i386/h264dsp_mmx.c Mon Feb 06 11:21:26 2006 +0000 +++ b/i386/h264dsp_mmx.c Tue Feb 07 06:25:14 2006 +0000 @@ -630,9 +630,9 @@ dst += 4-8*dstStride;\ }\ }\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - int h=8;\ - int w=4;\ +static inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + int h = size;\ + int w = (size+8)>>2;\ src -= 2*srcStride+2;\ while(w--){\ asm volatile(\ @@ -652,23 +652,40 @@ "punpcklbw %%mm7, %%mm2 \n\t"\ "punpcklbw %%mm7, %%mm3 \n\t"\ "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\ - \ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ + QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ + QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ : "+a"(src)\ : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\ : "memory"\ );\ + if(size==16){\ + asm volatile(\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ + QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ + QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ + : "+a"(src)\ + : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\ + : "memory"\ + );\ + }\ tmp += 4;\ - src += 4 - 13*srcStride;\ + src += 4 - (size+5)*srcStride;\ }\ - tmp -= 4*4;\ + tmp -= size+8;\ + w = size>>4;\ + do{\ + h = size;\ asm volatile(\ "movq %4, %%mm6 \n\t"\ "1: \n\t"\ @@ -702,7 +719,7 @@ "psraw $6, %%mm3 \n\t"\ "packuswb %%mm3, %%mm0 \n\t"\ OP(%%mm0, (%1),%%mm7, q)\ - "add $32, %0 \n\t"\ + "add $48, %0 \n\t"\ "add %3, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ @@ -710,6 +727,9 @@ : "S"((long)dstStride), "m"(ff_pw_32)\ : "memory"\ );\ + tmp += 8 - size*24;\ + dst += 8 - size*dstStride;\ + }while(w--);\ }\ static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ @@ -729,13 +749,12 @@ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ }\ \ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ +}\ +\ static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ - OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\ - OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ }\ #define H264_MC(OPNAME, SIZE, MMX) \ @@ -816,13 +835,13 @@ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[SIZE*(SIZE+8)/4];\ + uint64_t temp[SIZE*(SIZE<8?12:24)/4];\ int16_t * const tmp= (int16_t*)temp;\ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ + uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\ uint8_t * const halfH= (uint8_t*)temp;\ uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ @@ -832,7 +851,7 @@ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ + uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\ uint8_t * const halfH= (uint8_t*)temp;\ uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ @@ -842,7 +861,7 @@ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ + uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\ uint8_t * const halfV= (uint8_t*)temp;\ uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\ @@ -852,7 +871,7 @@ }\ \ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\ + uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\ uint8_t * const halfV= (uint8_t*)temp;\ uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\ int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\