changeset 8288:800444234375 libavcodec

clear_block mmx
author lorenm
date Wed, 10 Dec 2008 21:35:17 +0000
parents 7a1d037482c4
children e4877f9fc823
files dsputil.c dsputil.h h263.c i386/dsputil_mmx.c intrax8.c mimic.c mjpegdec.c vp3.c vp56.c wmv2.c
diffstat 10 files changed, 56 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/dsputil.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/dsputil.c	Wed Dec 10 21:35:17 2008 +0000
@@ -3420,6 +3420,11 @@
     }
 }
 
+static void clear_block_c(DCTELEM *block) /* plain C version: zero 64 DCTELEMs starting at block */
+{
+    memset(block, 0, sizeof(DCTELEM)*64);
+}
+
 /**
  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
  */
@@ -4288,6 +4293,7 @@
     c->sum_abs_dctelem = sum_abs_dctelem_c;
     c->gmc1 = gmc1_c;
     c->gmc = ff_gmc_c;
+    c->clear_block = clear_block_c;
     c->clear_blocks = clear_blocks_c;
     c->pix_sum = pix_sum_c;
     c->pix_norm1 = pix_norm1_c;
--- a/dsputil.h	Wed Dec 10 21:26:00 2008 +0000
+++ b/dsputil.h	Wed Dec 10 21:35:17 2008 +0000
@@ -203,6 +203,7 @@
      */
     void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
                     int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
+    void (*clear_block)(DCTELEM *block/*align 16*/);
     void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
     int (*pix_sum)(uint8_t * pix, int line_size);
     int (*pix_norm1)(uint8_t * pix, int line_size);
--- a/h263.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/h263.c	Wed Dec 10 21:35:17 2008 +0000
@@ -810,7 +810,7 @@
         for (i = 0; i < 6; i++) {
             if (s->block_last_index[i] >= 0 && ((cbp >> (5 - i))&1)==0 ){
                 s->block_last_index[i]= -1;
-                memset(s->block[i], 0, sizeof(DCTELEM)*64);
+                s->dsp.clear_block(s->block[i]);
             }
         }
     }else{
@@ -853,7 +853,7 @@
         for (i = 0; i < 6; i++) {
             if (s->block_last_index[i] >= 0 && ((cbp >> (5 - i))&1)==0 ){
                 s->block_last_index[i]= -1;
-                memset(s->block[i], 0, sizeof(DCTELEM)*64);
+                s->dsp.clear_block(s->block[i]);
             }
         }
     }else{
@@ -4651,7 +4651,7 @@
                 rl = &rl_intra_aic;
                 i = 0;
                 s->gb= gb;
-                memset(block, 0, sizeof(DCTELEM)*64);
+                s->dsp.clear_block(block);
                 goto retry;
             }
             av_log(s->avctx, AV_LOG_ERROR, "run overflow at %dx%d i:%d\n", s->mb_x, s->mb_y, s->mb_intra);
--- a/i386/dsputil_mmx.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/i386/dsputil_mmx.c	Wed Dec 10 21:35:17 2008 +0000
@@ -464,21 +464,42 @@
         );
 }
 
-static void clear_blocks_mmx(DCTELEM *blocks)
+#define CLEAR_BLOCKS(name,n) /* defines name(blocks): zero the 128*n bytes at blocks using MMX stores */ \
+static void name(DCTELEM *blocks)\
+{\
+    __asm__ volatile(\
+                "pxor %%mm7, %%mm7              \n\t"\
+                "mov     %1, %%"REG_a"          \n\t"\
+                "1:                             \n\t"\
+                "movq %%mm7, (%0, %%"REG_a")    \n\t"\
+                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"\
+                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"\
+                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"\
+                "add $32, %%"REG_a"             \n\t"\
+                " js 1b                         \n\t"\
+                : : "r" (((uint8_t *)blocks)+128*n), /* %0: one past the end of the region */ \
+                    "i" (-128*n) /* %1: negative byte offset, counts up to zero */ \
+                : "%"REG_a, "memory" /* stores go through %0, not through an output operand */ \
+        );\
+}
+CLEAR_BLOCKS(clear_blocks_mmx, 6)
+CLEAR_BLOCKS(clear_block_mmx, 1)
+
+static void clear_block_sse(DCTELEM *block) /* zero 128 bytes at block with eight 16-byte SSE stores */
 {
     __asm__ volatile(
-                "pxor %%mm7, %%mm7              \n\t"
-                "mov $-128*6, %%"REG_a"         \n\t"
-                "1:                             \n\t"
-                "movq %%mm7, (%0, %%"REG_a")    \n\t"
-                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
-                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
-                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
-                "add $32, %%"REG_a"             \n\t"
-                " js 1b                         \n\t"
-                : : "r" (((uint8_t *)blocks)+128*6)
-                : "%"REG_a
-        );
+        "xorps  %%xmm0, %%xmm0  \n"
+        "movaps %%xmm0,    (%0) \n"
+        "movaps %%xmm0,  16(%0) \n"
+        "movaps %%xmm0,  32(%0) \n"
+        "movaps %%xmm0,  48(%0) \n"
+        "movaps %%xmm0,  64(%0) \n"
+        "movaps %%xmm0,  80(%0) \n"
+        "movaps %%xmm0,  96(%0) \n"
+        "movaps %%xmm0, 112(%0) \n"
+        :: "r"(block) /* movaps is an aligned store: block must be 16-byte aligned */
+        : "memory" /* tells the compiler the asm writes memory it cannot see */
+    );
 }
 
 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
@@ -2569,7 +2590,10 @@
         c->put_pixels_clamped = put_pixels_clamped_mmx;
         c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
         c->add_pixels_clamped = add_pixels_clamped_mmx;
+        c->clear_block  = clear_block_mmx;
         c->clear_blocks = clear_blocks_mmx;
+        if (mm_flags & FF_MM_SSE)
+            c->clear_block = clear_block_sse;
 
 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
         c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
--- a/intrax8.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/intrax8.c	Wed Dec 10 21:35:17 2008 +0000
@@ -511,7 +511,7 @@
     int sign;
 
     assert(w->orient<12);
-    memset(s->block[0],0x00,64*sizeof(DCTELEM));
+    s->dsp.clear_block(s->block[0]);
 
     if(chroma){
         dc_mode=2;
--- a/mimic.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/mimic.c	Wed Dec 10 21:35:17 2008 +0000
@@ -163,7 +163,7 @@
     DCTELEM *block = ctx->dct_block;
     unsigned int pos;
 
-    memset(block, 0, 64 * sizeof(DCTELEM));
+    ctx->dsp.clear_block(block);
 
     block[0] = get_bits(&ctx->gb, 8) << 3;
 
--- a/mjpegdec.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/mjpegdec.c	Wed Dec 10 21:35:17 2008 +0000
@@ -444,7 +444,7 @@
                                  int dc_index, int16_t *quant_matrix, int Al)
 {
     int val;
-    memset(block, 0, 64*sizeof(DCTELEM));
+    s->dsp.clear_block(block);
     val = mjpeg_decode_dc(s, dc_index);
     if (val == 0xffff) {
         av_log(s->avctx, AV_LOG_ERROR, "error dc\n");
@@ -800,7 +800,7 @@
                     if(s->interlaced && s->bottom_field)
                         ptr += linesize[c] >> 1;
                     if(!s->progressive) {
-                        memset(s->block, 0, sizeof(s->block));
+                        s->dsp.clear_block(s->block);
                         if(decode_block(s, s->block, i,
                                      s->dc_index[i], s->ac_index[i],
                                      s->quant_matrixes[ s->quant_index[c] ]) < 0) {
--- a/vp3.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/vp3.c	Wed Dec 10 21:35:17 2008 +0000
@@ -1402,14 +1402,14 @@
                     /* dequantize the DCT coefficients */
                     if(s->avctx->idct_algo==FF_IDCT_VP3){
                         Coeff *coeff= s->coeffs + i;
-                        memset(block, 0, sizeof(block));
+                        s->dsp.clear_block(block);
                         while(coeff->next){
                             block[coeff->index]= coeff->coeff * dequantizer[coeff->index];
                             coeff= coeff->next;
                         }
                     }else{
                         Coeff *coeff= s->coeffs + i;
-                        memset(block, 0, sizeof(block));
+                        s->dsp.clear_block(block);
                         while(coeff->next){
                             block[coeff->index]= (coeff->coeff * dequantizer[coeff->index] + 2)>>2;
                             coeff= coeff->next;
--- a/vp56.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/vp56.c	Wed Dec 10 21:35:17 2008 +0000
@@ -405,7 +405,7 @@
         mb_type = vp56_decode_mv(s, row, col);
     ref_frame = vp56_reference_frame[mb_type];
 
-    memset(s->block_coeff, 0, sizeof(s->block_coeff));
+    s->dsp.clear_blocks(*s->block_coeff);
 
     s->parse_coeff(s);
 
--- a/wmv2.c	Wed Dec 10 21:26:00 2008 +0000
+++ b/wmv2.c	Wed Dec 10 21:35:17 2008 +0000
@@ -43,12 +43,12 @@
     case 1:
         ff_simple_idct84_add(dst           , stride, block1);
         ff_simple_idct84_add(dst + 4*stride, stride, w->abt_block2[n]);
-        memset(w->abt_block2[n], 0, 64*sizeof(DCTELEM));
+        s->dsp.clear_block(w->abt_block2[n]);
         break;
     case 2:
         ff_simple_idct48_add(dst           , stride, block1);
         ff_simple_idct48_add(dst + 4       , stride, w->abt_block2[n]);
-        memset(w->abt_block2[n], 0, 64*sizeof(DCTELEM));
+        s->dsp.clear_block(w->abt_block2[n]);
         break;
     default:
         av_log(s->avctx, AV_LOG_ERROR, "internal error in WMV2 abt\n");