Mercurial > libavcodec.hg
annotate cabac.h @ 5076:796c2a5481ad libavcodec
make decoder decode the first frame properly if there's more than just one
frame input
author | michael |
---|---|
date | Sun, 27 May 2007 22:34:49 +0000 |
parents | 9a6a0818e93f |
children | ce36118abbbb |
rev | line source |
---|---|
1287 | 1 /* |
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
1287 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
1287 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
1287 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1287 | 20 * |
21 */ | |
2967 | 22 |
1287 | 23 /** |
24 * @file cabac.h | |
25 * Context Adaptive Binary Arithmetic Coder. | |
26 */ | |
27 | |
4975 | 28 #ifndef CABAC_H |
29 #define CABAC_H | |
30 | |
31 #include "bitstream.h" | |
1287 | 32 |
3284
a224d9752912
don't force asserts in release builds. 2% faster h264.
lorenm
parents:
3036
diff
changeset
|
33 //#undef NDEBUG |
1287 | 34 #include <assert.h> |
4064 | 35 #ifdef ARCH_X86 |
36 #include "x86_cpu.h" | |
37 #endif | |
1287 | 38 |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
39 #define CABAC_BITS 16 |
2323 | 40 #define CABAC_MASK ((1<<CABAC_BITS)-1) |
3984 | 41 #define BRANCHLESS_CABAC_DECODER 1 |
4039 | 42 //#define ARCH_X86_DISABLED 1 |
2323 | 43 |
1287 | 44 typedef struct CABACContext{ |
45 int low; | |
46 int range; | |
47 int outstanding_count; | |
48 #ifdef STRICT_LIMITS | |
49 int symCount; | |
50 #endif | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
51 const uint8_t *bytestream_start; |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
52 const uint8_t *bytestream; |
2116 | 53 const uint8_t *bytestream_end; |
1287 | 54 PutBitContext pb; |
55 }CABACContext; | |
56 | |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
57 extern uint8_t ff_h264_mlps_state[4*64]; |
4039 | 58 extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
59 extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
60 extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
61 extern const uint8_t ff_h264_norm_shift[512]; |
2323 | 62 |
1287 | 63 |
64 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size); | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
65 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
66 void ff_init_cabac_states(CABACContext *c); |
1287 | 67 |
68 | |
69 static inline void put_cabac_bit(CABACContext *c, int b){ | |
2967 | 70 put_bits(&c->pb, 1, b); |
71 for(;c->outstanding_count; c->outstanding_count--){ | |
1287 | 72 put_bits(&c->pb, 1, 1-b); |
73 } | |
74 } | |
75 | |
76 static inline void renorm_cabac_encoder(CABACContext *c){ | |
77 while(c->range < 0x100){ | |
78 //FIXME optimize | |
79 if(c->low<0x100){ | |
80 put_cabac_bit(c, 0); | |
81 }else if(c->low<0x200){ | |
82 c->outstanding_count++; | |
83 c->low -= 0x100; | |
84 }else{ | |
85 put_cabac_bit(c, 1); | |
86 c->low -= 0x200; | |
87 } | |
2967 | 88 |
1287 | 89 c->range+= c->range; |
90 c->low += c->low; | |
91 } | |
92 } | |
93 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
94 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
95 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state]; |
2967 | 96 |
1287 | 97 if(bit == ((*state)&1)){ |
98 c->range -= RangeLPS; | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
99 *state= ff_h264_mps_state[*state]; |
1287 | 100 }else{ |
101 c->low += c->range - RangeLPS; | |
102 c->range = RangeLPS; | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
103 *state= ff_h264_lps_state[*state]; |
1287 | 104 } |
2967 | 105 |
1287 | 106 renorm_cabac_encoder(c); |
107 | |
108 #ifdef STRICT_LIMITS | |
109 c->symCount++; | |
110 #endif | |
111 } | |
112 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
113 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){ |
1287 | 114 assert(c->range > RangeLPS); |
115 | |
116 if(!bit){ | |
117 c->range -= RangeLPS; | |
118 }else{ | |
119 c->low += c->range - RangeLPS; | |
120 c->range = RangeLPS; | |
121 } | |
122 | |
123 renorm_cabac_encoder(c); | |
124 | |
125 #ifdef STRICT_LIMITS | |
126 c->symCount++; | |
127 #endif | |
128 } | |
129 | |
1290 | 130 /** |
131 * @param bit 0 -> write zero bit, !=0 write one bit | |
132 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
133 static void put_cabac_bypass(CABACContext *c, int bit){ |
1287 | 134 c->low += c->low; |
135 | |
136 if(bit){ | |
137 c->low += c->range; | |
138 } | |
139 //FIXME optimize | |
140 if(c->low<0x200){ | |
141 put_cabac_bit(c, 0); | |
142 }else if(c->low<0x400){ | |
143 c->outstanding_count++; | |
144 c->low -= 0x200; | |
145 }else{ | |
146 put_cabac_bit(c, 1); | |
147 c->low -= 0x400; | |
148 } | |
2967 | 149 |
1287 | 150 #ifdef STRICT_LIMITS |
151 c->symCount++; | |
152 #endif | |
153 } | |
154 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
155 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
156 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
157 * @return the number of bytes written |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
158 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
159 static int put_cabac_terminate(CABACContext *c, int bit){ |
1287 | 160 c->range -= 2; |
161 | |
162 if(!bit){ | |
163 renorm_cabac_encoder(c); | |
164 }else{ | |
165 c->low += c->range; | |
166 c->range= 2; | |
2967 | 167 |
1287 | 168 renorm_cabac_encoder(c); |
169 | |
170 assert(c->low <= 0x1FF); | |
171 put_cabac_bit(c, c->low>>9); | |
172 put_bits(&c->pb, 2, ((c->low>>7)&3)|1); | |
2967 | 173 |
1287 | 174 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong |
175 } | |
2967 | 176 |
1287 | 177 #ifdef STRICT_LIMITS |
178 c->symCount++; | |
179 #endif | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
180 |
1787 | 181 return (put_bits_count(&c->pb)+7)>>3; |
1287 | 182 } |
183 | |
1290 | 184 /** |
185 * put (truncated) unary binarization. | |
186 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
187 static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ |
1290 | 188 int i; |
2967 | 189 |
1290 | 190 assert(v <= max); |
2967 | 191 |
1290 | 192 #if 1 |
193 for(i=0; i<v; i++){ | |
194 put_cabac(c, state, 1); | |
195 if(i < max_index) state++; | |
196 } | |
197 if(truncated==0 || v<max) | |
198 put_cabac(c, state, 0); | |
199 #else | |
200 if(v <= max_index){ | |
201 for(i=0; i<v; i++){ | |
202 put_cabac(c, state+i, 1); | |
203 } | |
204 if(truncated==0 || v<max) | |
205 put_cabac(c, state+i, 0); | |
206 }else{ | |
207 for(i=0; i<=max_index; i++){ | |
208 put_cabac(c, state+i, 1); | |
209 } | |
210 for(; i<v; i++){ | |
211 put_cabac(c, state+max_index, 1); | |
212 } | |
213 if(truncated==0 || v<max) | |
214 put_cabac(c, state+max_index, 0); | |
215 } | |
216 #endif | |
217 } | |
218 | |
219 /** | |
220 * put unary exp golomb k-th order binarization. | |
221 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
222 static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ |
1290 | 223 int i; |
2967 | 224 |
1290 | 225 if(v==0) |
226 put_cabac(c, state, 0); | |
227 else{ | |
1298 | 228 const int sign= v < 0; |
2967 | 229 |
4001 | 230 if(is_signed) v= FFABS(v); |
2967 | 231 |
1290 | 232 if(v<max){ |
233 for(i=0; i<v; i++){ | |
234 put_cabac(c, state, 1); | |
235 if(i < max_index) state++; | |
236 } | |
237 | |
238 put_cabac(c, state, 0); | |
239 }else{ | |
240 int m= 1<<k; | |
241 | |
242 for(i=0; i<max; i++){ | |
243 put_cabac(c, state, 1); | |
244 if(i < max_index) state++; | |
245 } | |
246 | |
247 v -= max; | |
248 while(v >= m){ //FIXME optimize | |
249 put_cabac_bypass(c, 1); | |
250 v-= m; | |
251 m+= m; | |
252 } | |
253 put_cabac_bypass(c, 0); | |
254 while(m>>=1){ | |
255 put_cabac_bypass(c, v&m); | |
256 } | |
257 } | |
258 | |
259 if(is_signed) | |
260 put_cabac_bypass(c, sign); | |
261 } | |
262 } | |
263 | |
2323 | 264 static void refill(CABACContext *c){ |
265 #if CABAC_BITS == 16 | |
3946 | 266 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); |
2323 | 267 #else |
268 c->low+= c->bytestream[0]<<1; | |
269 #endif | |
270 c->low -= CABAC_MASK; | |
271 c->bytestream+= CABAC_BITS/8; | |
272 } | |
273 | |
274 static void refill2(CABACContext *c){ | |
275 int i, x; | |
276 | |
277 x= c->low ^ (c->low-1); | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
278 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; |
2323 | 279 |
280 x= -CABAC_MASK; | |
2967 | 281 |
2323 | 282 #if CABAC_BITS == 16 |
283 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); | |
284 #else | |
285 x+= c->bytestream[0]<<1; | |
286 #endif | |
2967 | 287 |
2323 | 288 c->low += x<<i; |
289 c->bytestream+= CABAC_BITS/8; | |
290 } | |
291 | |
1287 | 292 static inline void renorm_cabac_decoder(CABACContext *c){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
293 while(c->range < 0x100){ |
1287 | 294 c->range+= c->range; |
295 c->low+= c->low; | |
2323 | 296 if(!(c->low & CABAC_MASK)) |
297 refill(c); | |
1287 | 298 } |
299 } | |
300 | |
2323 | 301 static inline void renorm_cabac_decoder_once(CABACContext *c){ |
3951 | 302 #ifdef ARCH_X86_DISABLED |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
303 int temp; |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
304 #if 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
305 //P3:683 athlon:475 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
306 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
307 "lea -0x100(%0), %2 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
308 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
309 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
310 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
311 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
312 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
313 #elif 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
314 //P3:680 athlon:474 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
315 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
316 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
317 "setb %%cl \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
318 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
319 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
320 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
321 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
322 #elif 1 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
323 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
324 //P3:665 athlon:517 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
325 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
326 "lea -0x100(%0), %%eax \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
327 "cdq \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
328 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
329 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
330 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
331 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
332 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
333 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
334 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
335 #elif 0 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
336 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
337 //P3:673 athlon:509 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
338 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
339 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
340 "sbb %%edx, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
341 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
342 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
343 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
344 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
345 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
346 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
347 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
348 #else |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
349 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
350 //P3:677 athlon:511 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
351 asm( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
352 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
353 "lea (%0, %0), %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
354 "lea (%1, %1), %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
355 "cmovb %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
356 "cmovb %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
357 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
358 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
359 #endif |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
360 #else |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
361 //P3:675 athlon:476 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
362 int shift= (uint32_t)(c->range - 0x100)>>31; |
3642 | 363 c->range<<= shift; |
364 c->low <<= shift; | |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
365 #endif |
2323 | 366 if(!(c->low & CABAC_MASK)) |
367 refill(c); | |
368 } | |
369 | |
4908
777f250df232
Fix multiple "'inline/static' is not at beginning of declaration" warnings.
diego
parents:
4882
diff
changeset
|
370 static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){ |
3642 | 371 //FIXME gcc generates duplicate load/stores for c->low and c->range |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
372 #define LOW "0" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
373 #define RANGE "4" |
4064 | 374 #ifdef ARCH_X86_64 |
375 #define BYTESTART "16" | |
376 #define BYTE "24" | |
377 #define BYTEEND "32" | |
378 #else | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
379 #define BYTESTART "12" |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
380 #define BYTE "16" |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
381 #define BYTEEND "20" |
4064 | 382 #endif |
4882
8131ccb4ea72
Mark code parts that cannot work on AMD64 due to broken relocations as such.
diego
parents:
4881
diff
changeset
|
383 #if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) |
4044
5ccdefd60f61
Fix PIC compilation, some defines were under #ifdef !PIC but used
diego
parents:
4043
diff
changeset
|
384 int bit; |
5ccdefd60f61
Fix PIC compilation, some defines were under #ifdef !PIC but used
diego
parents:
4043
diff
changeset
|
385 |
3984 | 386 #ifndef BRANCHLESS_CABAC_DECODER |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
387 asm volatile( |
4035 | 388 "movzbl (%1), %0 \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
389 "movl "RANGE "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
390 "movl "RANGE "(%2), %%edx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
391 "andl $0xC0, %%ebx \n\t" |
4035 | 392 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
393 "movl "LOW "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
394 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
395 "subl %%esi, %%edx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
396 "movl %%edx, %%ecx \n\t" |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
397 "shll $17, %%ecx \n\t" |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
398 "cmpl %%ecx, %%ebx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
399 " ja 1f \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
400 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
401 #if 1 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
402 //athlon:4067 P3:4110 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
403 "lea -0x100(%%edx), %%ecx \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
404 "shr $31, %%ecx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
405 "shl %%cl, %%edx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
406 "shl %%cl, %%ebx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
407 #else |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
408 //athlon:4057 P3:4130 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
409 "cmp $0x100, %%edx \n\t" //FIXME avoidable |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
410 "setb %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
411 "shl %%cl, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
412 "shl %%cl, %%ebx \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
413 #endif |
4035 | 414 "movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
415 "movb %%cl, (%1) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
416 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
417 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
418 " jnz 2f \n\t" |
4064 | 419 "mov "BYTE "(%2), %%"REG_S" \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
420 "subl $0xFFFF, %%ebx \n\t" |
4064 | 421 "movzwl (%%"REG_S"), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
422 "bswap %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
423 "shrl $15, %%ecx \n\t" |
4064 | 424 "add $2, %%"REG_S" \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
425 "addl %%ecx, %%ebx \n\t" |
4064 | 426 "mov %%"REG_S", "BYTE "(%2) \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
427 "jmp 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
428 "1: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
429 //eax:state ebx:low, edx:range, esi:RangeLPS |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
430 "subl %%ecx, %%ebx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
431 "movl %%esi, %%edx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
432 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
433 "shll %%cl, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
434 "shll %%cl, %%edx \n\t" |
4035 | 435 "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
436 "movb %%cl, (%1) \n\t" |
4064 | 437 "add $1, %0 \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
438 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
439 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
440 |
4064 | 441 "mov "BYTE "(%2), %%"REG_c" \n\t" |
442 "movzwl (%%"REG_c"), %%esi \n\t" | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
443 "bswap %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
444 "shrl $15, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
445 "subl $0xFFFF, %%esi \n\t" |
4064 | 446 "add $2, %%"REG_c" \n\t" |
447 "mov %%"REG_c", "BYTE "(%2) \n\t" | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
448 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
449 "leal -1(%%ebx), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
450 "xorl %%ebx, %%ecx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
451 "shrl $15, %%ecx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
452 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3994
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
453 "neg %%ecx \n\t" |
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
454 "add $7, %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
455 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
456 "shll %%cl , %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
457 "addl %%esi, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
458 "2: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
459 "movl %%edx, "RANGE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
460 "movl %%ebx, "LOW "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
461 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
462 :"r"(state), "r"(c) |
4064 | 463 : "%"REG_c, "%ebx", "%edx", "%"REG_S, "memory" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
464 ); |
3982
af16271634c2
moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop
michael
parents:
3981
diff
changeset
|
465 bit&=1; |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
466 #else /* BRANCHLESS_CABAC_DECODER */ |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
467 |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
468 |
4418
4cceb7c877af
rename CMOV_IS_FAST to HAVE_FAST_CMOV and simplify configure
mru
parents:
4345
diff
changeset
|
469 #if defined HAVE_FAST_CMOV |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
470 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
471 "mov "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
472 "shl $17 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
473 "cmp "low" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
474 "cmova %%ecx , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
475 "sbb %%ecx , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
476 "and %%ecx , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
477 "sub "tmp" , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
478 "xor %%ecx , "ret" \n\t" |
4418
4cceb7c877af
rename CMOV_IS_FAST to HAVE_FAST_CMOV and simplify configure
mru
parents:
4345
diff
changeset
|
479 #else /* HAVE_FAST_CMOV */ |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
480 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
481 "mov "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
482 "shl $17 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
483 "sub "low" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
484 "sar $31 , "tmp" \n\t" /*lps_mask*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
485 "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
486 "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
487 "add %%ecx , "range" \n\t" /*new range*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
488 "shl $17 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
489 "and "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
490 "sub %%ecx , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
491 "xor "tmp" , "ret" \n\t" |
4418
4cceb7c877af
rename CMOV_IS_FAST to HAVE_FAST_CMOV and simplify configure
mru
parents:
4345
diff
changeset
|
492 #endif /* HAVE_FAST_CMOV */ |
3975 | 493 |
494 | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
495 #define BRANCHLESS_GET_CABAC(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
496 "movzbl "statep" , "ret" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
497 "mov "range" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
498 "and $0xC0 , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
499 "movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
500 "sub "range" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
501 BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
502 "movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
503 "shl %%cl , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
504 "movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
505 "mov "tmpbyte" , "statep" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
506 "shl %%cl , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
507 "test "lowword" , "lowword" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
508 " jnz 1f \n\t"\ |
4064 | 509 "mov "BYTE"("cabac"), %%"REG_c" \n\t"\ |
510 "movzwl (%%"REG_c") , "tmp" \n\t"\ | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
511 "bswap "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
512 "shr $15 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
513 "sub $0xFFFF , "tmp" \n\t"\ |
4064 | 514 "add $2 , %%"REG_c" \n\t"\ |
515 "mov %%"REG_c" , "BYTE "("cabac") \n\t"\ | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
516 "lea -1("low") , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
517 "xor "low" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
518 "shr $15 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
519 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
520 "neg %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
521 "add $7 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
522 "shl %%cl , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
523 "add "tmp" , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
524 "1: \n\t" |
3975 | 525 |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
526 asm volatile( |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
527 "movl "RANGE "(%2), %%esi \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
528 "movl "LOW "(%2), %%ebx \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
529 BRANCHLESS_GET_CABAC("%0", "%2", "(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl") |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
530 "movl %%esi, "RANGE "(%2) \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
531 "movl %%ebx, "LOW "(%2) \n\t" |
3975 | 532 |
533 :"=&a"(bit) | |
534 :"r"(state), "r"(c) | |
4064 | 535 : "%"REG_c, "%ebx", "%edx", "%esi", "memory" |
3975 | 536 ); |
3981
9854f686ba79
move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster)
michael
parents:
3980
diff
changeset
|
537 bit&=1; |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
538 #endif /* BRANCHLESS_CABAC_DECODER */ |
4882
8131ccb4ea72
Mark code parts that cannot work on AMD64 due to broken relocations as such.
diego
parents:
4881
diff
changeset
|
539 #else /* defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) */ |
3642 | 540 int s = *state; |
4039 | 541 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; |
2522
e25782262d7d
kill warnings patch by (M«©ns Rullg«©rd <mru inprovide com>)
michael
parents:
2323
diff
changeset
|
542 int bit, lps_mask attribute_unused; |
2967 | 543 |
1287 | 544 c->range -= RangeLPS; |
3984 | 545 #ifndef BRANCHLESS_CABAC_DECODER |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
546 if(c->low < (c->range<<(CABAC_BITS+1))){ |
3642 | 547 bit= s&1; |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
548 *state= ff_h264_mps_state[s]; |
2323 | 549 renorm_cabac_decoder_once(c); |
1287 | 550 }else{ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
551 bit= ff_h264_norm_shift[RangeLPS]; |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
552 c->low -= (c->range<<(CABAC_BITS+1)); |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
553 *state= ff_h264_lps_state[s]; |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
554 c->range = RangeLPS<<bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
555 c->low <<= bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
556 bit= (s&1)^1; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
557 |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
558 if(!(c->low & CABAC_MASK)){ |
2323 | 559 refill2(c); |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
560 } |
1287 | 561 } |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
562 #else /* BRANCHLESS_CABAC_DECODER */ |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
563 lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; |
2967 | 564 |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
565 c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; |
2323 | 566 c->range += (RangeLPS - c->range) & lps_mask; |
2967 | 567 |
3974 | 568 s^=lps_mask; |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
569 *state= (ff_h264_mlps_state+128)[s]; |
3974 | 570 bit= s&1; |
2967 | 571 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
572 lps_mask= ff_h264_norm_shift[c->range]; |
2323 | 573 c->range<<= lps_mask; |
574 c->low <<= lps_mask; | |
575 if(!(c->low & CABAC_MASK)) | |
576 refill2(c); | |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
577 #endif /* BRANCHLESS_CABAC_DECODER */ |
4882
8131ccb4ea72
Mark code parts that cannot work on AMD64 due to broken relocations as such.
diego
parents:
4881
diff
changeset
|
578 #endif /* defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) */ |
2967 | 579 return bit; |
1287 | 580 } |
581 | |
4579 | 582 static int av_noinline get_cabac_noinline(CABACContext *c, uint8_t * const state){ |
4008
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
583 return get_cabac_inline(c,state); |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
584 } |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
585 |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
586 static int get_cabac(CABACContext *c, uint8_t * const state){ |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
587 return get_cabac_inline(c,state); |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
588 } |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
589 |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
590 static int get_cabac_bypass(CABACContext *c){ |
4040 | 591 #if 0 //not faster |
592 int bit; | |
593 asm volatile( | |
594 "movl "RANGE "(%1), %%ebx \n\t" | |
595 "movl "LOW "(%1), %%eax \n\t" | |
596 "shl $17, %%ebx \n\t" | |
597 "add %%eax, %%eax \n\t" | |
598 "sub %%ebx, %%eax \n\t" | |
599 "cdq \n\t" | |
600 "and %%edx, %%ebx \n\t" | |
601 "add %%ebx, %%eax \n\t" | |
602 "test %%ax, %%ax \n\t" | |
603 " jnz 1f \n\t" | |
4064 | 604 "movl "BYTE "(%1), %%"REG_b" \n\t" |
4040 | 605 "subl $0xFFFF, %%eax \n\t" |
4064 | 606 "movzwl (%%"REG_b"), %%ecx \n\t" |
4040 | 607 "bswap %%ecx \n\t" |
608 "shrl $15, %%ecx \n\t" | |
4064 | 609 "addl $2, %%"REG_b" \n\t" |
4040 | 610 "addl %%ecx, %%eax \n\t" |
4064 | 611 "movl %%"REG_b", "BYTE "(%1) \n\t" |
4040 | 612 "1: \n\t" |
613 "movl %%eax, "LOW "(%1) \n\t" | |
614 | |
615 :"=&d"(bit) | |
616 :"r"(c) | |
4064 | 617 : "%eax", "%"REG_b, "%ecx", "memory" |
4040 | 618 ); |
619 return bit+1; | |
620 #else | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
621 int range; |
1287 | 622 c->low += c->low; |
623 | |
2323 | 624 if(!(c->low & CABAC_MASK)) |
625 refill(c); | |
2967 | 626 |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
627 range= c->range<<(CABAC_BITS+1); |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
628 if(c->low < range){ |
1287 | 629 return 0; |
630 }else{ | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
631 c->low -= range; |
1287 | 632 return 1; |
633 } | |
4040 | 634 #endif |
1287 | 635 } |
4040 | 636 |
637 | |
4283
d6f83e2f8804
rename always_inline to av_always_inline and move to common.h
mru
parents:
4241
diff
changeset
|
638 static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ |
4241 | 639 #if defined(ARCH_X86) && !(defined(PIC) && defined(__GNUC__)) |
4040 | 640 asm volatile( |
641 "movl "RANGE "(%1), %%ebx \n\t" | |
642 "movl "LOW "(%1), %%eax \n\t" | |
643 "shl $17, %%ebx \n\t" | |
644 "add %%eax, %%eax \n\t" | |
645 "sub %%ebx, %%eax \n\t" | |
646 "cdq \n\t" | |
647 "and %%edx, %%ebx \n\t" | |
648 "add %%ebx, %%eax \n\t" | |
649 "xor %%edx, %%ecx \n\t" | |
650 "sub %%edx, %%ecx \n\t" | |
651 "test %%ax, %%ax \n\t" | |
652 " jnz 1f \n\t" | |
4064 | 653 "mov "BYTE "(%1), %%"REG_b" \n\t" |
4040 | 654 "subl $0xFFFF, %%eax \n\t" |
4064 | 655 "movzwl (%%"REG_b"), %%edx \n\t" |
4040 | 656 "bswap %%edx \n\t" |
657 "shrl $15, %%edx \n\t" | |
4064 | 658 "add $2, %%"REG_b" \n\t" |
4040 | 659 "addl %%edx, %%eax \n\t" |
4064 | 660 "mov %%"REG_b", "BYTE "(%1) \n\t" |
4040 | 661 "1: \n\t" |
662 "movl %%eax, "LOW "(%1) \n\t" | |
663 | |
664 :"+c"(val) | |
665 :"r"(c) | |
4064 | 666 : "%eax", "%"REG_b, "%edx", "memory" |
4040 | 667 ); |
668 return val; | |
669 #else | |
670 int range, mask; | |
671 c->low += c->low; | |
672 | |
673 if(!(c->low & CABAC_MASK)) | |
674 refill(c); | |
675 | |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
676 range= c->range<<(CABAC_BITS+1); |
4040 | 677 c->low -= range; |
678 mask= c->low >> 31; | |
679 range &= mask; | |
680 c->low += range; | |
681 return (val^mask)-mask; | |
682 #endif | |
683 } | |
684 | |
//FIXME the x86 code from this file should be moved into i386/h264 or cabac something.c/h (note ill kill you if you move my code away from under my fingers before iam finished with it!)
//FIXME use some macros to avoid duplicatin get_cabac (cant be done yet as that would make optimization work hard)
#if defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
/**
 * Hand-written x86 inline-asm loop decoding the significance map of a
 * residual block (the non-8x8 case; the context pointer walks max_coeff-1
 * consecutive significant_coeff contexts).
 *
 * For each position whose significant_coeff_flag decodes to 1, the position
 * (stored as context_ptr + minusstart, i.e. the offset from
 * significant_coeff_ctx_base) is appended to the array behind *index and the
 * last_coeff_flag is decoded from the context 61 bytes further on
 * (NOTE(review): 61 appears to be the fixed distance between the significant-
 * and last-coeff context tables minus the running offset — confirm against
 * the H.264 context layout in h264.c).  Decoding stops early when a
 * last_coeff_flag is 1, otherwise after max_coeff-1 positions, in which case
 * the final position is stored unconditionally.
 *
 * The CABAC range/low state is kept in esi/ebx across the whole loop and
 * written back to the context only on exit; RANGE/LOW are byte-offset string
 * macros into CABACContext and BRANCHLESS_GET_CABAC leaves the decoded bit
 * in bit 0 of its first argument (%%edx here).
 *
 * @param c                           CABAC decoder context
 * @param max_coeff                   number of coefficient positions in the block
 * @param significant_coeff_ctx_base  base of the significant_coeff_flag contexts
 * @param index                       output array receiving significant positions
 * @return number of significant coefficients found
 *         (computed as (final_index_ptr + minusindex) >> 2, i.e. how far the
 *         int *index cursor advanced)
 */
static int decode_significance_x86(CABACContext *c, int max_coeff, uint8_t *significant_coeff_ctx_base, int *index){
    void *end= significant_coeff_ctx_base + max_coeff - 1;
    int minusstart= -(int)significant_coeff_ctx_base;
    int minusindex= 4-(int)index;
    int coeff_count;
    asm volatile(
        /* load CABAC state into registers for the duration of the loop */
        "movl "RANGE "(%3), %%esi               \n\t"
        "movl "LOW "(%3), %%ebx                 \n\t"

        "2:                                     \n\t"

        /* decode significant_coeff_flag from the current context (%1) */
        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")

        "test $1, %%edx                         \n\t"
        " jz 3f                                 \n\t" /* not significant -> next position */

        /* significant: decode last_coeff_flag from the matching context */
        BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")

        /* store the position (ctx_ptr - ctx_base) into *index */
        "mov %2, %%"REG_a"                      \n\t"
        "movl %4, %%ecx                         \n\t"
        "add %1, %%"REG_c"                      \n\t"
        "movl %%ecx, (%%"REG_a")                \n\t"

        "test $1, %%edx                         \n\t"
        " jnz 4f                                \n\t" /* last coefficient -> done */

        /* advance the output cursor */
        "add $4, %%"REG_a"                      \n\t"
        "mov %%"REG_a", %2                      \n\t"

        "3:                                     \n\t"
        "add $1, %1                             \n\t"
        "cmp %5, %1                             \n\t"
        " jb 2b                                 \n\t"
        /* reached max_coeff-1: last position is significant by implication */
        "mov %2, %%"REG_a"                      \n\t"
        "movl %4, %%ecx                         \n\t"
        "add %1, %%"REG_c"                      \n\t"
        "movl %%ecx, (%%"REG_a")                \n\t"
        "4:                                     \n\t"
        /* coeff_count = (index_cursor + minusindex) >> 2 */
        "add %6, %%eax                          \n\t"
        "shr $2, %%eax                          \n\t"

        /* write CABAC state back to the context */
        "movl %%esi, "RANGE "(%3)               \n\t"
        "movl %%ebx, "LOW "(%3)                 \n\t"
        :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)\
        :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)\
        : "%"REG_c, "%ebx", "%edx", "%esi", "memory"\
    );
    return coeff_count;
}
/**
 * Hand-written x86 inline-asm significance-map loop for 8x8 blocks.
 *
 * Differs from decode_significance_x86 in that the 64 positions do not map
 * 1:1 onto contexts: the significant_coeff context for position i is
 * significant_coeff_ctx_base[sig_off[i]] and the last_coeff context index
 * comes from the global last_coeff_flag_offset_8x8[] table (accessed via
 * MANGLE, which is why this path is disabled under BROKEN_RELOCATIONS).
 * The loop covers positions 0..62; position 63 is stored unconditionally if
 * no earlier last_coeff_flag terminated the scan.
 *
 * As in the non-8x8 version, the CABAC range/low live in esi/ebx for the
 * whole loop and positions of significant coefficients are appended to the
 * array behind *index.
 *
 * @param c                           CABAC decoder context
 * @param significant_coeff_ctx_base  base of the significant_coeff_flag contexts
 * @param index                       output array receiving significant positions
 * @param sig_off                     64-entry position -> context-offset map
 * @return number of significant coefficients found
 */
static int decode_significance_8x8_x86(CABACContext *c, uint8_t *significant_coeff_ctx_base, int *index, uint8_t *sig_off){
    int minusindex= 4-(int)index;
    int coeff_count;
    long last=0;   /* current scan position, kept in memory between iterations */
    asm volatile(
        /* load CABAC state into registers for the duration of the loop */
        "movl "RANGE "(%3), %%esi               \n\t"
        "movl "LOW "(%3), %%ebx                 \n\t"

        "mov %1, %%"REG_D"                      \n\t"
        "2:                                     \n\t"

        /* ctx = significant_coeff_ctx_base + sig_off[last] */
        "mov %6, %%"REG_a"                      \n\t"
        "movzbl (%%"REG_a", %%"REG_D"), %%edi   \n\t"
        "add %5, %%"REG_D"                      \n\t"

        BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")

        "mov %1, %%edi                          \n\t"
        "test $1, %%edx                         \n\t"
        " jz 3f                                 \n\t" /* not significant -> next position */

        /* last_coeff context via the last_coeff_flag_offset_8x8[] table */
        "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t"
        "add %5, %%"REG_D"                      \n\t"

        BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al")

        /* store the significant position into *index */
        "mov %2, %%"REG_a"                      \n\t"
        "mov %1, %%edi                          \n\t"
        "movl %%edi, (%%"REG_a")                \n\t"

        "test $1, %%edx                         \n\t"
        " jnz 4f                                \n\t" /* last coefficient -> done */

        /* advance the output cursor */
        "add $4, %%"REG_a"                      \n\t"
        "mov %%"REG_a", %2                      \n\t"

        "3:                                     \n\t"
        "addl $1, %%edi                         \n\t"
        "mov %%edi, %1                          \n\t"
        "cmpl $63, %%edi                        \n\t"
        " jb 2b                                 \n\t"
        /* position 63 is significant by implication */
        "mov %2, %%"REG_a"                      \n\t"
        "movl %%edi, (%%"REG_a")                \n\t"
        "4:                                     \n\t"
        /* coeff_count = (index_cursor + minusindex) >> 2 */
        "addl %4, %%eax                         \n\t"
        "shr $2, %%eax                          \n\t"

        /* write CABAC state back to the context */
        "movl %%esi, "RANGE "(%3)               \n\t"
        "movl %%ebx, "LOW "(%3)                 \n\t"
        :"=&a"(coeff_count),"+m"(last), "+m"(index)\
        :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)\
        : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"\
    );
    return coeff_count;
}
#endif /* defined(ARCH_X86) && defined(CONFIG_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS) */
1287 | 794 |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
795 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
796 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
797 * @return the number of bytes read or 0 if no end |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
798 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
799 static int get_cabac_terminate(CABACContext *c){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
800 c->range -= 2; |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
801 if(c->low < c->range<<(CABAC_BITS+1)){ |
2323 | 802 renorm_cabac_decoder_once(c); |
1287 | 803 return 0; |
804 }else{ | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
805 return c->bytestream - c->bytestream_start; |
2967 | 806 } |
1287 | 807 } |
808 | |
1290 | 809 /** |
810 * get (truncated) unnary binarization. | |
811 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
812 static int get_cabac_u(CABACContext *c, uint8_t * state, int max, int max_index, int truncated){ |
1290 | 813 int i; |
2967 | 814 |
815 for(i=0; i<max; i++){ | |
1290 | 816 if(get_cabac(c, state)==0) |
817 return i; | |
2967 | 818 |
1290 | 819 if(i< max_index) state++; |
820 } | |
821 | |
822 return truncated ? max : -1; | |
823 } | |
824 | |
825 /** | |
826 * get unary exp golomb k-th order binarization. | |
827 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
828 static int get_cabac_ueg(CABACContext *c, uint8_t * state, int max, int is_signed, int k, int max_index){ |
1290 | 829 int i, v; |
830 int m= 1<<k; | |
2967 | 831 |
832 if(get_cabac(c, state)==0) | |
1290 | 833 return 0; |
2967 | 834 |
1290 | 835 if(0 < max_index) state++; |
2967 | 836 |
837 for(i=1; i<max; i++){ | |
1290 | 838 if(get_cabac(c, state)==0){ |
839 if(is_signed && get_cabac_bypass(c)){ | |
840 return -i; | |
841 }else | |
842 return i; | |
843 } | |
844 | |
845 if(i < max_index) state++; | |
846 } | |
2967 | 847 |
1290 | 848 while(get_cabac_bypass(c)){ |
849 i+= m; | |
850 m+= m; | |
851 } | |
2967 | 852 |
1290 | 853 v=0; |
854 while(m>>=1){ | |
855 v+= v + get_cabac_bypass(c); | |
856 } | |
857 i += v; | |
858 | |
859 if(is_signed && get_cabac_bypass(c)){ | |
860 return -i; | |
861 }else | |
862 return i; | |
863 } | |
4975 | 864 |
865 #endif /* CABAC_H */ |