# HG changeset patch
# User romansh
# Date 1067020081 0
# Node ID e08df4d22d27c83718857d4ac14be05cc54edf4f
# Parent  396e8526e82cb2e59b1bd0d658e512902c06a915
   * introducing dct248 into the DSP context.
   * simple/accurate implementation of dct248
   * DV encoding now supports 2-4-8 DCT
   * DV encoding gets a bit faster (but still miles away
     from what I think it could do)
   * misc. DV codec cleanups

diff -r 396e8526e82c -r e08df4d22d27 dsputil.c
--- a/dsputil.c	Thu Oct 23 23:24:38 2003 +0000
+++ b/dsputil.c	Fri Oct 24 18:28:01 2003 +0000
@@ -44,6 +44,19 @@
     53, 60, 61, 54, 47, 55, 62, 63
 };
 
+/* Specific zigzag scan for the 2-4-8 DCT/IDCT. NOTE that unlike the
+   specification, we interleave the fields */
+const uint8_t ff_zigzag248_direct[64] = {
+     0,  8,  1,  9, 16, 24,  2, 10,
+    17, 25, 32, 40, 48, 56, 33, 41,
+    18, 26,  3, 11,  4, 12, 19, 27,
+    34, 42, 49, 57, 50, 58, 35, 43,
+    20, 28,  5, 13,  6, 14, 21, 29,
+    36, 44, 51, 59, 52, 60, 37, 45,
+    22, 30,  7, 15, 23, 31, 38, 46,
+    53, 61, 54, 62, 39, 47, 55, 63,
+};
+
 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
 uint16_t __align8 inv_zigzag_direct16[64];
 
@@ -2869,12 +2882,18 @@
     int i;
 
 #ifdef CONFIG_ENCODERS
-    if(avctx->dct_algo==FF_DCT_FASTINT)
+    if(avctx->dct_algo==FF_DCT_FASTINT) {
         c->fdct = fdct_ifast;
-    else if(avctx->dct_algo==FF_DCT_FAAN)
+	c->fdct248 = ff_fdct248_islow; // FIXME: need an optimized version
+    } 
+    else if(avctx->dct_algo==FF_DCT_FAAN) {
         c->fdct = ff_faandct;
-    else
+	c->fdct248 = ff_fdct248_islow; // FIXME: need an optimized version
+    } 
+    else {
         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
+	c->fdct248 = ff_fdct248_islow;
+    }
 #endif //CONFIG_ENCODERS
 
     if(avctx->idct_algo==FF_IDCT_INT){
diff -r 396e8526e82c -r e08df4d22d27 dsputil.h
--- a/dsputil.h	Thu Oct 23 23:24:38 2003 +0000
+++ b/dsputil.h	Fri Oct 24 18:28:01 2003 +0000
@@ -37,6 +37,7 @@
 
 void fdct_ifast (DCTELEM *data);
 void ff_jpeg_fdct_islow (DCTELEM *data);
+void ff_fdct248_islow (DCTELEM *data);
 
 void j_rev_dct (DCTELEM *data);
 
@@ -47,6 +48,7 @@
 extern const uint8_t ff_alternate_horizontal_scan[64];
 extern const uint8_t ff_alternate_vertical_scan[64];
 extern const uint8_t ff_zigzag_direct[64];
+extern const uint8_t ff_zigzag248_direct[64];
 
 /* pixel operations */
 #define MAX_NEG_CROP 384
@@ -244,6 +246,7 @@
     
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
+    void (*fdct248)(DCTELEM *block/* align 16*/);
     
     /* IDCT really*/
     void (*idct)(DCTELEM *block/* align 16*/);
diff -r 396e8526e82c -r e08df4d22d27 dv.c
--- a/dv.c	Thu Oct 23 23:24:38 2003 +0000
+++ b/dv.c	Fri Oct 24 18:28:01 2003 +0000
@@ -35,19 +35,18 @@
 
 typedef struct DVVideoDecodeContext {
     const DVprofile* sys;
-    GetBitContext gb;
     AVFrame picture;
-    DCTELEM block[5*6][64] __align8;
     
-    /* FIXME: the following is extracted from DSP */
     uint8_t dv_zigzag[2][64];
-    uint8_t idct_permutation[64];
+    uint8_t dv_idct_shift[2][22][64];
+    uint8_t dv_dct_shift[2][22][64];
+  
     void (*get_pixels)(DCTELEM *block, const uint8_t *pixels, int line_size);
-    void (*fdct)(DCTELEM *block);
+    void (*fdct[2])(DCTELEM *block);
+    void (*idct_put[2])(uint8_t *dest, int line_size, DCTELEM *block);
     
-    /* XXX: move it to static storage ? */
-    uint8_t dv_shift[2][22][64];
-    void (*idct_put[2])(uint8_t *dest, int line_size, DCTELEM *block);
+    GetBitContext gb;
+    DCTELEM block[5*6][64] __align8;
 } DVVideoDecodeContext;
 
 #define TEX_VLC_BITS 9
@@ -55,25 +54,29 @@
 static RL_VLC_ELEM *dv_rl_vlc[1];
 static VLC_TYPE dv_vlc_codes[15][23];
 
-static void dv_build_unquantize_tables(DVVideoDecodeContext *s)
+static void dv_build_unquantize_tables(DVVideoDecodeContext *s, uint8_t* perm)
 {
     int i, q, j;
 
     /* NOTE: max left shift is 6 */
     for(q = 0; q < 22; q++) {
-        /* 88 unquant */
+        /* 88DCT: idct shifts in permuted order, dct shifts in scan order */
         for(i = 1; i < 64; i++) {
             /* 88 table */
-            j = s->idct_permutation[i];
-            s->dv_shift[0][q][j] =
+            j = perm[i];
+            s->dv_idct_shift[0][q][j] =
                 dv_quant_shifts[q][dv_88_areas[i]] + 1;
+            s->dv_dct_shift[0][q][i] =
+                dv_quant_shifts[q][dv_88_areas[ff_zigzag_direct[i]]] + 4;
         }
         
-        /* 248 unquant */
+        /* 248DCT: idct shifts unpermuted, dct shifts in scan order */
         for(i = 1; i < 64; i++) {
             /* 248 table */
-            s->dv_shift[1][q][i] =  
-                    dv_quant_shifts[q][dv_248_areas[i]] + 1;
+            s->dv_idct_shift[1][q][i] =  
+                dv_quant_shifts[q][dv_248_areas[i]] + 1;
+	    s->dv_dct_shift[1][q][i] =  
+                dv_quant_shifts[q][dv_248_areas[ff_zigzag248_direct[i]]] + 4;
         }
     }
 }
@@ -81,8 +84,9 @@
 static int dvvideo_init(AVCodecContext *avctx)
 {
     DVVideoDecodeContext *s = avctx->priv_data;
-    MpegEncContext s2;
+    DSPContext dsp;
     static int done=0;
+    int i;
 
     if (!done) {
         int i;
@@ -124,27 +128,23 @@
 	}
     }
 
-    /* ugly way to get the idct & scantable */
-    /* XXX: fix it */
-    memset(&s2, 0, sizeof(MpegEncContext));
-    s2.avctx = avctx;
-    dsputil_init(&s2.dsp, avctx);
-    if (DCT_common_init(&s2) < 0)
-       return -1;
+    /* Generic DSP setup */
+    dsputil_init(&dsp, avctx);
+    s->get_pixels = dsp.get_pixels;
 
-    s->get_pixels = s2.dsp.get_pixels;
-    s->fdct = s2.dsp.fdct;
-    
-    s->idct_put[0] = s2.dsp.idct_put;
-    memcpy(s->idct_permutation, s2.dsp.idct_permutation, 64);
-    memcpy(s->dv_zigzag[0], s2.intra_scantable.permutated, 64);
+    /* 88DCT setup */
+    s->fdct[0] = dsp.fdct;
+    s->idct_put[0] = dsp.idct_put;
+    for (i=0; i<64; i++)
+       s->dv_zigzag[0][i] = dsp.idct_permutation[ff_zigzag_direct[i]];
 
-    /* XXX: use MMX also for idct248 */
-    s->idct_put[1] = simple_idct248_put;
-    memcpy(s->dv_zigzag[1], dv_248_zigzag, 64);
+    /* 248DCT setup */
+    s->fdct[1] = dsp.fdct248;
+    s->idct_put[1] = simple_idct248_put;  // FIXME: need to add it to DSP
+    memcpy(s->dv_zigzag[1], ff_zigzag248_direct, 64);
 
     /* XXX: do it only for constant case */
-    dv_build_unquantize_tables(s);
+    dv_build_unquantize_tables(s, dsp.idct_permutation);
 
     /* FIXME: I really don't think this should be here */
     if (dv_codec_profile(avctx))
@@ -367,7 +367,7 @@
             mb->scan_table = s->dv_zigzag[dct_mode];
             class1 = get_bits(&s->gb, 2);
             mb->shift_offset = (class1 == 3);
-            mb->shift_table = s->dv_shift[dct_mode]
+            mb->shift_table = s->dv_idct_shift[dct_mode]
                 [quant + dv_quant_offset[class1]];
             dc = dc << 2;
             /* convert to unsigned because 128 is not added in the
@@ -571,6 +571,8 @@
     int block_size;
     DCTELEM *mb;
     PutBitContext pb;
+    const uint8_t* zigzag_scan;
+    uint8_t *dv_shift;
 } EncBlockInfo;
 
 static inline int dv_bits_left(EncBlockInfo* bi)
@@ -583,11 +585,10 @@
     int i, level, size, run = 0;
     uint32_t vlc;
     PutBitContext* cpb = &bi->pb;
+    int bias = (bi->cno == 3);
     
     for (i=1; i<64; i++) {
-       level = bi->mb[ff_zigzag_direct[i]] / 
-               (1<<(dv_quant_shifts[bi->qno + dv_quant_offset[bi->cno]]
-			       [dv_88_areas[ff_zigzag_direct[i]]] + 4 + (bi->cno == 3)));
+       level = bi->mb[bi->zigzag_scan[i]] / (1<<(bi->dv_shift[i] + bias));
        if (level != 0) {
 	   size = dv_rl2vlc(run, level, &vlc);
 put_vlc:
@@ -663,11 +664,26 @@
         bi->cno = 3;
 }
 
+#define SQ(a) ((a)*(a))
+static int dv_score_lines(DCTELEM *s, int stride) {
+    int score=0;
+    int x, y;
+    
+    for(y=0; y<4; y++) {
+        for(x=0; x<8; x+=4){
+            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) 
+                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
+        }
+        s+= stride;
+    }
+    
+    return score;
+}
+
 /*
  * This is a very rough initial implementaion. The performance is
- * horrible and some features are missing, mainly 2-4-8 DCT encoding.
- * The weighting is missing as well, but it's missing from the decoding
- * step also -- so at least we're on the same page with decoder ;-)
+ * horrible and the weighting is missing. But it's missing from the 
+ * decoding step also -- so at least we're on the same page with decoder ;-)
  */
 static inline void dv_encode_video_segment(DVVideoDecodeContext *s, 
                                            uint8_t *dif, 
@@ -691,6 +707,7 @@
    
     /* Stage 1 -- doing DCT on 5 MBs */
     block = &s->block[0][0];
+    enc_blk = &enc_blks[0];
     for(mb_index = 0; mb_index < 5; mb_index++) {
         v = *mb_pos_ptr++;
         mb_x = v & 0xff;
@@ -731,36 +748,36 @@
 	    } else {             /* Simple copy: 8x8 -> 8x8 */
 	        s->get_pixels(block, data, linesize);
 	    }
-            
-	    s->fdct(block);
+	  
+	    if (dv_score_lines(block, 8) + dv_score_lines(block+8*4, 8) - 100 >
+	        dv_score_lines(block, 16) + dv_score_lines(block+8, 16)) {
+               enc_blk->dct_mode = 1;
+	       enc_blk->zigzag_scan = ff_zigzag248_direct; 
+	    } else {
+	       enc_blk->dct_mode = 0;
+	       enc_blk->zigzag_scan = ff_zigzag_direct;
+	    }
+	    enc_blk->mb = block;
+            enc_blk->block_size = block_sizes[j];
 	    
+	    s->fdct[enc_blk->dct_mode](block);
+	    
+	    dv_set_class_number(enc_blk, j/4*(j%2));
+
 	    block += 64;
+	    enc_blk++;
         }
     }
 
-    /* Stage 2 -- setup for encoding phase */
-    enc_blk = &enc_blks[0];
-    block = &s->block[0][0];
-    for (i=0; i<5; i++) {
-       for (j=0; j<6; j++) {
-	  enc_blk->mb = block;
-	  enc_blk->dct_mode = 0;
-	  enc_blk->block_size = block_sizes[j];
-	  
-	  dv_set_class_number(enc_blk, j/4*(j%2));
-	  
-	  block += 64;
-	  enc_blk++;
-       }
-    }
-   
-    /* Stage 3 -- encoding by trial-and-error */
+    /* Stage 2 -- encoding by trial-and-error */
 encode_vs:
     enc_blk = &enc_blks[0];
     for (i=0; i<5; i++) {
        uint8_t* p = dif + i*80 + 4;
        for (j=0; j<6; j++) {
           enc_blk->qno = QNO;
+	  enc_blk->dv_shift = &(s->dv_dct_shift[0]
+	                           [QNO + dv_quant_offset[enc_blk->cno]][0]);
 	  init_put_bits(&enc_blk->pb, p, block_sizes[j]/8);
 	  enc_blk++;
 	  p += block_sizes[j]/8;
diff -r 396e8526e82c -r e08df4d22d27 dvdata.h
--- a/dvdata.h	Thu Oct 23 23:24:38 2003 +0000
+++ b/dvdata.h	Fri Oct 24 18:28:01 2003 +0000
@@ -270,19 +270,6 @@
   0,
 };
 
-/* Specific zigzag scan for 248 idct. NOTE that unlike the
-   specification, we interleave the fields */
-static const uint8_t dv_248_zigzag[64] = {
-  0,  8,  1,  9, 16, 24,  2, 10,
- 17, 25, 32, 40, 48, 56, 33, 41,
- 18, 26,  3, 11,  4, 12, 19, 27,
- 34, 42, 49, 57, 50, 58, 35, 43,
- 20, 28,  5, 13,  6, 14, 21, 29,
- 36, 44, 51, 59, 52, 60, 37, 45,
- 22, 30,  7, 15, 23, 31, 38, 46,
- 53, 61, 54, 62, 39, 47, 55, 63,
-};
-
 /* unquant tables (not used directly) */
 static const uint8_t dv_88_areas[64] = {
     0,0,0,1,1,1,2,2,
diff -r 396e8526e82c -r e08df4d22d27 jfdctint.c
--- a/jfdctint.c	Thu Oct 23 23:24:38 2003 +0000
+++ b/jfdctint.c	Fri Oct 24 18:28:01 2003 +0000
@@ -295,3 +295,130 @@
     dataptr++;			/* advance pointer to next column */
   }
 }
+
+/*
+ * The secret of DCT2-4-8 is really simple -- you do the usual 1-D DCT
+ * on the rows and then, instead of doing the even and odd parts on the
+ * columns, you do the even part two times.
+ */
+GLOBAL(void)
+ff_fdct248_islow (DCTELEM * data)
+{
+  int32_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int32_t tmp10, tmp11, tmp12, tmp13;
+  int32_t z1, z2, z3, z4, z5;
+  DCTELEM *dataptr;
+  int ctr;
+  SHIFT_TEMPS
+
+  /* Pass 1: process rows. */
+  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
+  /* furthermore, we scale the results by 2**PASS1_BITS. */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+    tmp0 = dataptr[0] + dataptr[7];
+    tmp7 = dataptr[0] - dataptr[7];
+    tmp1 = dataptr[1] + dataptr[6];
+    tmp6 = dataptr[1] - dataptr[6];
+    tmp2 = dataptr[2] + dataptr[5];
+    tmp5 = dataptr[2] - dataptr[5];
+    tmp3 = dataptr[3] + dataptr[4];
+    tmp4 = dataptr[3] - dataptr[4];
+    
+    /* Even part per LL&M figure 1 --- note that published figure is faulty;
+     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
+     */
+    
+    tmp10 = tmp0 + tmp3;
+    tmp13 = tmp0 - tmp3;
+    tmp11 = tmp1 + tmp2;
+    tmp12 = tmp1 - tmp2;
+    
+    dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
+    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
+    
+    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+				   CONST_BITS-PASS1_BITS);
+    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
+				   CONST_BITS-PASS1_BITS);
+    
+    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
+     * cK represents cos(K*pi/16).
+     * i0..i3 in the paper are tmp4..tmp7 here.
+     */
+    
+    z1 = tmp4 + tmp7;
+    z2 = tmp5 + tmp6;
+    z3 = tmp4 + tmp6;
+    z4 = tmp5 + tmp7;
+    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
+    
+    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
+    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
+    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
+    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
+    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
+    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    
+    z3 += z5;
+    z4 += z5;
+    
+    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
+    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
+    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
+    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
+    
+    dataptr += DCTSIZE;		/* advance pointer to next row */
+  }
+
+  /* Pass 2: process columns. The even part is run twice: on the sums and
+   * on the differences of adjacent row pairs. We remove the PASS1_BITS
+   * scaling, but leave the results scaled up by an overall factor of 8.
+   */
+
+  dataptr = data;
+  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*1];
+     tmp1 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
+     tmp2 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
+     tmp3 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
+     tmp4 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*1];
+     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
+     tmp6 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
+     tmp7 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
+      
+     tmp10 = tmp0 + tmp3;
+     tmp11 = tmp1 + tmp2;
+     tmp12 = tmp1 - tmp2;
+     tmp13 = tmp0 - tmp3;
+     
+     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
+     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
+     
+     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+				            CONST_BITS+PASS1_BITS);
+     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
+				            CONST_BITS+PASS1_BITS);
+
+     tmp10 = tmp4 + tmp7;
+     tmp11 = tmp5 + tmp6;
+     tmp12 = tmp5 - tmp6;
+     tmp13 = tmp4 - tmp7;
+
+     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
+     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
+     
+     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
+     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+				            CONST_BITS+PASS1_BITS);
+     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
+				            CONST_BITS+PASS1_BITS);
+    
+     dataptr++;			/* advance pointer to next column */
+  }
+}