# HG changeset patch # User arpi_esp # Date 1012138218 0 # Node ID 0b234715e20513059500c351205533c07bb54031 # Parent 5b88ee1abf97425303544e3689dd56b2dbdd05d5 (commit by michael) mmx & mmx2 quantizer c dct permutation bugfix dont copy input on intra only encodings if it can be avoided dont draw edges on intra only stuff diff -r 5b88ee1abf97 -r 0b234715e205 dsputil.c --- a/dsputil.c Sat Jan 26 00:22:25 2002 +0000 +++ b/dsputil.c Sun Jan 27 13:30:18 2002 +0000 @@ -49,6 +49,12 @@ 53, 60, 61, 54, 47, 55, 62, 63 }; +/* not permutated inverse zigzag_direct + 1 for MMX quantizer */ +UINT16 __align8 inv_zigzag_direct16[64]; + +/* not permutated zigzag_direct for MMX quantizer */ +UINT8 zigzag_direct_noperm[64]; + UINT8 ff_alternate_horizontal_scan[64] = { 0, 1, 2, 3, 8, 9, 16, 17, 10, 11, 4, 5, 6, 7, 15, 14, @@ -83,6 +89,42 @@ 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, }; +/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */ +UINT32 inverse[256]={ + 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757, + 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154, + 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709, + 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333, + 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367, + 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283, + 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315, + 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085, + 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498, + 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675, + 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441, + 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183, + 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712, + 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400, + 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163, + 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641, + 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573, + 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737, + 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493, + 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373, + 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368, + 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671, + 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767, + 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740, + 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751, + 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635, + 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593, + 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944, + 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933, + 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575, + 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532, + 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010, +}; + /* used to skip zeros at the end */ UINT8 zigzag_end[64]; @@ -515,6 +557,9 @@ else for(i=0; i<64; i++) permutation[i]=i; + for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1; + for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i]; + if (use_permuted_idct) { /* permute for IDCT */ for(i=0;i<64;i++) { diff -r 5b88ee1abf97 -r 0b234715e205 i386/mpegvideo_mmx.c --- a/i386/mpegvideo_mmx.c Sat Jan 26 00:22:25 2002 +0000 +++ b/i386/mpegvideo_mmx.c Sun Jan 27 13:30:18 2002 +0000 @@ -22,9 +22,16 @@ #include "../dsputil.h" #include "../mpegvideo.h" +#include "../avcodec.h" +#include "../mangle.h" extern UINT8 zigzag_end[64]; extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w); +extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale); + +extern UINT8 zigzag_direct_noperm[64]; +extern UINT16 inv_zigzag_direct16[64]; +extern UINT32 inverse[256]; #if 0 @@ -252,7 +259,7 @@ } } else { i = 0; - unquant_even: +// unquant_even: quant_matrix = s->non_intra_matrix; /* Align on 4 elements boundary */ while(i&7) @@ -411,6 +418,20 @@ } } +static volatile int esp_temp; + +void unused_var_warning_killer(){ + esp_temp++; +} + +#undef HAVE_MMX2 +#define RENAME(a) a ## _MMX +#include "mpegvideo_mmx_template.c" + +#define HAVE_MMX2 +#undef RENAME +#define RENAME(a) a ## _MMX2 +#include "mpegvideo_mmx_template.c" void MPV_common_init_mmx(MpegEncContext *s) { @@ -421,5 +442,11 @@ s->dct_unquantize = dct_unquantize_mpeg1_mmx; draw_edges = draw_edges_mmx; + + if(mm_flags & MM_MMXEXT){ + dct_quantize= dct_quantize_MMX2; + }else{ + dct_quantize= dct_quantize_MMX; + } } } diff -r 5b88ee1abf97 -r 0b234715e205 i386/mpegvideo_mmx_template.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/i386/mpegvideo_mmx_template.c Sun Jan 27 13:30:18 2002 +0000 @@ -0,0 +1,201 @@ +/* + Copyright (C) 2002 Michael Niedermayer + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#undef SPREADW +#undef PMAXW +#ifdef HAVE_MMX2 +#define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t" +#define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t" + +#else +#define SPREADW(a) \ + "punpcklwd " #a ", " #a " \n\t"\ + "punpcklwd " #a ", " #a " \n\t" +#define PMAXW(a,b) \ + "psubusw " #a ", " #b " \n\t"\ + "paddw " #a ", " #b " \n\t" +#endif + +static int RENAME(dct_quantize)(MpegEncContext *s, + DCTELEM *block, int n, + int qscale) +{ + int i, level, last_non_zero_p1, q; + const UINT16 *qmat; + static __align8 INT16 temp_block[64]; + int minLevel, maxLevel; + + if(s->avctx!=NULL && s->avctx->codec->id==CODEC_ID_MPEG4){ + /* mpeg4 */ + minLevel= -2048; + maxLevel= 2047; + }else if(s->out_format==FMT_MPEG1){ + /* mpeg1 */ + minLevel= -255; + maxLevel= 255; + }else{ + /* h263 / msmpeg4 */ + minLevel= -128; + maxLevel= 127; + } + + av_fdct (block); + + if (s->mb_intra) { + int dummy; + if (n < 4) + q = s->y_dc_scale; + else + q = s->c_dc_scale; + + /* note: block[0] is assumed to be positive */ +#if 1 + asm volatile ( + "xorl %%edx, %%edx \n\t" + "mul %%ebx \n\t" + : "=d" (temp_block[0]), "=a"(dummy) + : "a" (block[0] + (q >> 1)), "b" (inverse[q]) + ); +#else + asm volatile ( + "xorl %%edx, %%edx \n\t" + "divw %%bx \n\t" + "movzwl %%ax, %%eax \n\t" + : "=a" (temp_block[0]) + : "a" (block[0] + (q >> 1)), "b" (q) + : "%edx" + ); +#endif +// temp_block[0] = (block[0] + (q >> 1)) / q; + i = 1; + last_non_zero_p1 = 1; + if (s->out_format == FMT_H263) { + qmat = s->q_non_intra_matrix16; + } else { + qmat = s->q_intra_matrix16; + } + for(i=1;i<4;i++) { + level = block[i] * qmat[i]; + level = level / (1 << (QMAT_SHIFT_MMX - 3)); + /* XXX: currently, this code is not optimal. the range should be: + mpeg1: -255..255 + mpeg2: -2048..2047 + h263: -128..127 + mpeg4: -2048..2047 + */ + if (level > maxLevel) + level = maxLevel; + else if (level < minLevel) + level = minLevel; + temp_block[i] = level; + + if(level) + if(last_non_zero_p1 < inv_zigzag_direct16[i]) last_non_zero_p1= inv_zigzag_direct16[i]; + block[i]=0; + } + } else { + i = 0; + last_non_zero_p1 = 0; + qmat = s->q_non_intra_matrix16; + } + + asm volatile( /* XXX: small rounding bug, but it shouldnt matter */ + "movd %3, %%mm3 \n\t" + SPREADW(%%mm3) + "movd %4, %%mm4 \n\t" + SPREADW(%%mm4) + "movd %5, %%mm5 \n\t" + SPREADW(%%mm5) + "pxor %%mm7, %%mm7 \n\t" + "movd %%eax, %%mm2 \n\t" + SPREADW(%%mm2) + "movl %6, %%eax \n\t" + ".balign 16 \n\t" + "1: \n\t" + "movq (%1, %%eax), %%mm0 \n\t" + "movq (%2, %%eax), %%mm1 \n\t" + "movq %%mm0, %%mm6 \n\t" + "psraw $15, %%mm6 \n\t" + "pmulhw %%mm0, %%mm1 \n\t" + "psubsw %%mm6, %%mm1 \n\t" +#ifdef HAVE_MMX2 + "pminsw %%mm3, %%mm1 \n\t" + "pmaxsw %%mm4, %%mm1 \n\t" +#else + "paddsw %%mm3, %%mm1 \n\t" + "psubusw %%mm4, %%mm1 \n\t" + "paddsw %%mm5, %%mm1 \n\t" +#endif + "movq %%mm1, (%8, %%eax) \n\t" + "pcmpeqw %%mm7, %%mm1 \n\t" + "movq (%7, %%eax), %%mm0 \n\t" + "movq %%mm7, (%1, %%eax) \n\t" + "pandn %%mm0, %%mm1 \n\t" + PMAXW(%%mm1, %%mm2) + "addl $8, %%eax \n\t" + " js 1b \n\t" + "movq %%mm2, %%mm0 \n\t" + "psrlq $32, %%mm2 \n\t" + PMAXW(%%mm0, %%mm2) + "movq %%mm2, %%mm0 \n\t" + "psrlq $16, %%mm2 \n\t" + PMAXW(%%mm0, %%mm2) + "movd %%mm2, %%eax \n\t" + "movzbl %%al, %%eax \n\t" + : "+a" (last_non_zero_p1) + : "r" (block+64), "r" (qmat+64), +#ifdef HAVE_MMX2 + "m" (maxLevel), "m" (minLevel), "m" (0 /* dummy */), "g" (2*i - 128), +#else + "m" (0x7FFF - maxLevel), "m" (0x7FFF -maxLevel + minLevel), "m" (minLevel), "g" (2*i - 128), +#endif + "r" (inv_zigzag_direct16+64), "r" (temp_block+64) + ); +// last_non_zero_p1=64; + /* permute for IDCT */ + asm volatile( + "movl %0, %%eax \n\t" + "pushl %%ebp \n\t" + "movl %%esp, " MANGLE(esp_temp) "\n\t" + "1: \n\t" + "movzbl (%1, %%eax), %%ebx \n\t" + "movzbl 1(%1, %%eax), %%ebp \n\t" + "movw (%2, %%ebx, 2), %%cx \n\t" + "movw (%2, %%ebp, 2), %%sp \n\t" + "movzbl " MANGLE(permutation) "(%%ebx), %%ebx\n\t" + "movzbl " MANGLE(permutation) "(%%ebp), %%ebp\n\t" + "movw %%cx, (%3, %%ebx, 2) \n\t" + "movw %%sp, (%3, %%ebp, 2) \n\t" + "addl $2, %%eax \n\t" + " js 1b \n\t" + "movl " MANGLE(esp_temp) ", %%esp\n\t" + "popl %%ebp \n\t" + : + : "g" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block) + : "%eax", "%ebx", "%ecx" + ); +/* + for(i=0; i= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */ + /* 3444240 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */ - qmat[i] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / - (aanscales[i] * qscale * quant_matrix[i])); + qmat[block_permute_op(i)] = (int)((UINT64_C(1) << (QMAT_SHIFT + 11)) / + (aanscales[i] * qscale * quant_matrix[block_permute_op(i)])); } } else { for(i=0;i<64;i++) { /* We can safely suppose that 16 <= quant_matrix[i] <= 255 - So 16 <= qscale * quant_matrix[i] <= 7905 - so (1 << QMAT_SHIFT) / 16 >= qmat[i] >= (1 << QMAT_SHIFT) / 7905 + So 16 <= qscale * quant_matrix[i] <= 7905 + so (1<<19) / 16 >= (1<<19) / (qscale * quant_matrix[i]) >= (1<<19) / 7905 + so 32768 >= (1<<19) / (qscale * quant_matrix[i]) >= 67 */ - qmat[i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]); + qmat[i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[i]); + qmat16[i] = (1 << QMAT_SHIFT_MMX) / (qscale * quant_matrix[block_permute_op(i)]); } } } @@ -418,7 +416,7 @@ void MPV_frame_end(MpegEncContext *s) { /* draw edge for correct motion prediction if outside */ - if (s->pict_type != B_TYPE) { + if (s->pict_type != B_TYPE && !s->intra_only) { if(s->avctx==NULL || s->avctx->codec->id!=CODEC_ID_MPEG4){ draw_edges(s->current_picture[0], s->linesize, s->mb_width*16, s->mb_height*16, EDGE_WIDTH); draw_edges(s->current_picture[1], s->linesize/2, s->mb_width*8, s->mb_height*8, EDGE_WIDTH/2); @@ -457,7 +455,7 @@ avctx->key_frame = (s->pict_type == I_TYPE); MPV_frame_start(s); - + for(i=0;i<3;i++) { UINT8 *src = pict->data[i]; UINT8 *dest = s->current_picture[i]; @@ -472,11 +470,15 @@ h >>= 1; } - for(j=0;jintra_only && dest_wrap==src_wrap){ + s->current_picture[i] = pict->data[i]; + }else { + for(j=0;jnew_picture[i] = s->current_picture[i]; } @@ -873,10 +875,10 @@ s->intra_matrix[0] = default_intra_matrix[0]; for(i=1;i<64;i++) s->intra_matrix[i] = (default_intra_matrix[i] * s->qscale) >> 3; - convert_matrix(s->q_intra_matrix, s->intra_matrix, 8); + convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->intra_matrix, 8); } else { - convert_matrix(s->q_intra_matrix, s->intra_matrix, s->qscale); - convert_matrix(s->q_non_intra_matrix, s->non_intra_matrix, s->qscale); + convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->intra_matrix, s->qscale); + convert_matrix(s->q_non_intra_matrix, s->q_non_intra_matrix16, s->non_intra_matrix, s->qscale); } switch(s->out_format) { @@ -1011,14 +1013,8 @@ s->y_dc_scale = 8; s->c_dc_scale = 8; } - for(i=0;i<6;i++) { - int last_index; - if (av_fdct == jpeg_fdct_ifast) - last_index = dct_quantize(s, s->block[i], i, s->qscale); - else - last_index = dct_quantize_mmx(s, s->block[i], i, s->qscale); - s->block_last_index[i] = last_index; + s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale); } /* huffman encode */ @@ -1060,7 +1056,7 @@ // fprintf(stderr,"\nNumber of GOB: %d", s->gob_number); } -static int dct_quantize(MpegEncContext *s, +static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale) { @@ -1157,85 +1153,7 @@ level = maxLevel; else if (level < minLevel) level = minLevel; - block[j] = level; - last_non_zero = i; - } else { - block[j] = 0; - } - } - return last_non_zero; -} -static int dct_quantize_mmx(MpegEncContext *s, - DCTELEM *block, int n, - int qscale) -{ - int i, j, level, last_non_zero, q; - const int *qmat; - int minLevel, maxLevel; - - if(s->avctx!=NULL && s->avctx->codec->id==CODEC_ID_MPEG4){ - /* mpeg4 */ - minLevel= -2048; - maxLevel= 2047; - }else if(s->out_format==FMT_MPEG1){ - /* mpeg1 */ - minLevel= -255; - maxLevel= 255; - }else{ - /* h263 / msmpeg4 */ - minLevel= -128; - maxLevel= 127; - } - - av_fdct (block); - - /* we need this permutation so that we correct the IDCT - permutation. will be moved into DCT code */ - block_permute(block); - - if (s->mb_intra) { - if (n < 4) - q = s->y_dc_scale; - else - q = s->c_dc_scale; - - /* note: block[0] is assumed to be positive */ - block[0] = (block[0] + (q >> 1)) / q; - i = 1; - last_non_zero = 0; - if (s->out_format == FMT_H263) { - qmat = s->q_non_intra_matrix; - } else { - qmat = s->q_intra_matrix; - } - } else { - i = 0; - last_non_zero = -1; - qmat = s->q_non_intra_matrix; - } - - for(;i<64;i++) { - j = zigzag_direct[i]; - level = block[j]; - level = level * qmat[j]; - /* XXX: slight error for the low range. Test should be equivalent to - (level <= -(1 << (QMAT_SHIFT_MMX - 3)) || level >= (1 << - (QMAT_SHIFT_MMX - 3))) - */ - if (((level << (31 - (QMAT_SHIFT_MMX - 3))) >> (31 - (QMAT_SHIFT_MMX - 3))) != - level) { - level = level / (1 << (QMAT_SHIFT_MMX - 3)); - /* XXX: currently, this code is not optimal. the range should be: - mpeg1: -255..255 - mpeg2: -2048..2047 - h263: -128..127 - mpeg4: -2048..2047 - */ - if (level > maxLevel) - level = maxLevel; - else if (level < minLevel) - level = minLevel; block[j] = level; last_non_zero = i; } else { diff -r 5b88ee1abf97 -r 0b234715e205 mpegvideo.h --- a/mpegvideo.h Sat Jan 26 00:22:25 2002 +0000 +++ b/mpegvideo.h Sun Jan 27 13:30:18 2002 +0000 @@ -30,6 +30,9 @@ #define MPEG_BUF_SIZE (16 * 1024) +#define QMAT_SHIFT_MMX 19 +#define QMAT_SHIFT 25 + typedef struct MpegEncContext { struct AVCodecContext *avctx; /* the following parameters must be initialized before encoding */ @@ -120,6 +123,9 @@ /* precomputed matrix (combine qscale and DCT renorm) */ int q_intra_matrix[64]; int q_non_intra_matrix[64]; + /* identical to the above but for MMX & these are not permutated */ + UINT16 __align8 q_intra_matrix16[64] ; + UINT16 __align8 q_non_intra_matrix16[64]; int block_last_index[6]; /* last non zero coefficient in block */ void *opaque; /* private data for the user */