Mercurial > libavcodec.hg
annotate cabac.h @ 12043:f9a0bd0888a4 libavcodec
mpegaudio: call ff_mpegaudiodec_init_mmx() only from float decoder
The mmx code is floating-point only, and this function does not know
from which decoder it is called. Without this change, the integer
decoder only "works" because the size of the context struct is smaller
in this case, and the mmx init function writes the function pointer
outside the allocated context.
author | mru |
---|---|
date | Thu, 01 Jul 2010 23:21:17 +0000 |
parents | 7dd2a45249a9 |
children |
rev | line source |
---|---|
1287 | 1 /* |
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
1287 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
1287 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
1287 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1287 | 20 */ |
2967 | 21 |
1287 | 22 /** |
11644
7dd2a45249a9
Remove explicit filename from Doxygen @file commands.
diego
parents:
10071
diff
changeset
|
23 * @file |
1287 | 24 * Context Adaptive Binary Arithmetic Coder. |
25 */ | |
26 | |
7760 | 27 #ifndef AVCODEC_CABAC_H |
28 #define AVCODEC_CABAC_H | |
4975 | 29 |
9411
4cb7c65fc775
Split bitstream.h, put the bitstream writer stuff in the new file
stefano
parents:
8718
diff
changeset
|
30 #include "put_bits.h" |
1287 | 31 |
3284
a224d9752912
don't force asserts in release builds. 2% faster h264.
lorenm
parents:
3036
diff
changeset
|
32 //#undef NDEBUG |
1287 | 33 #include <assert.h> |
6763 | 34 #include "libavutil/x86_cpu.h" |
1287 | 35 |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
36 #define CABAC_BITS 16 |
2323 | 37 #define CABAC_MASK ((1<<CABAC_BITS)-1) |
3984 | 38 #define BRANCHLESS_CABAC_DECODER 1 |
4039 | 39 //#define ARCH_X86_DISABLED 1 |
2323 | 40 |
1287 | 41 typedef struct CABACContext{ |
42 int low; | |
43 int range; | |
44 int outstanding_count; | |
45 #ifdef STRICT_LIMITS | |
46 int symCount; | |
47 #endif | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
48 const uint8_t *bytestream_start; |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
49 const uint8_t *bytestream; |
2116 | 50 const uint8_t *bytestream_end; |
1287 | 51 PutBitContext pb; |
52 }CABACContext; | |
53 | |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
54 extern uint8_t ff_h264_mlps_state[4*64]; |
4039 | 55 extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
56 extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
57 extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
58 extern const uint8_t ff_h264_norm_shift[512]; |
2323 | 59 |
1287 | 60 |
61 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size); | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
62 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
63 void ff_init_cabac_states(CABACContext *c); |
1287 | 64 |
65 | |
66 static inline void put_cabac_bit(CABACContext *c, int b){ | |
2967 | 67 put_bits(&c->pb, 1, b); |
68 for(;c->outstanding_count; c->outstanding_count--){ | |
1287 | 69 put_bits(&c->pb, 1, 1-b); |
70 } | |
71 } | |
72 | |
73 static inline void renorm_cabac_encoder(CABACContext *c){ | |
74 while(c->range < 0x100){ | |
75 //FIXME optimize | |
76 if(c->low<0x100){ | |
77 put_cabac_bit(c, 0); | |
78 }else if(c->low<0x200){ | |
79 c->outstanding_count++; | |
80 c->low -= 0x100; | |
81 }else{ | |
82 put_cabac_bit(c, 1); | |
83 c->low -= 0x200; | |
84 } | |
2967 | 85 |
1287 | 86 c->range+= c->range; |
87 c->low += c->low; | |
88 } | |
89 } | |
90 | |
6170
8cbfc9d75833
Put some disabled functions that are only used in the test program in cabac.c
diego
parents:
5973
diff
changeset
|
91 #ifdef TEST |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
92 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
93 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state]; |
2967 | 94 |
1287 | 95 if(bit == ((*state)&1)){ |
96 c->range -= RangeLPS; | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
97 *state= ff_h264_mps_state[*state]; |
1287 | 98 }else{ |
99 c->low += c->range - RangeLPS; | |
100 c->range = RangeLPS; | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
101 *state= ff_h264_lps_state[*state]; |
1287 | 102 } |
2967 | 103 |
1287 | 104 renorm_cabac_encoder(c); |
105 | |
106 #ifdef STRICT_LIMITS | |
107 c->symCount++; | |
108 #endif | |
109 } | |
110 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
111 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){ |
1287 | 112 assert(c->range > RangeLPS); |
113 | |
114 if(!bit){ | |
115 c->range -= RangeLPS; | |
116 }else{ | |
117 c->low += c->range - RangeLPS; | |
118 c->range = RangeLPS; | |
119 } | |
120 | |
121 renorm_cabac_encoder(c); | |
122 | |
123 #ifdef STRICT_LIMITS | |
124 c->symCount++; | |
125 #endif | |
126 } | |
127 | |
1290 | 128 /** |
129 * @param bit 0 -> write zero bit, !=0 write one bit | |
130 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
131 static void put_cabac_bypass(CABACContext *c, int bit){ |
1287 | 132 c->low += c->low; |
133 | |
134 if(bit){ | |
135 c->low += c->range; | |
136 } | |
137 //FIXME optimize | |
138 if(c->low<0x200){ | |
139 put_cabac_bit(c, 0); | |
140 }else if(c->low<0x400){ | |
141 c->outstanding_count++; | |
142 c->low -= 0x200; | |
143 }else{ | |
144 put_cabac_bit(c, 1); | |
145 c->low -= 0x400; | |
146 } | |
2967 | 147 |
1287 | 148 #ifdef STRICT_LIMITS |
149 c->symCount++; | |
150 #endif | |
151 } | |
152 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
153 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
154 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
155 * @return the number of bytes written |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
156 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
157 static int put_cabac_terminate(CABACContext *c, int bit){ |
1287 | 158 c->range -= 2; |
159 | |
160 if(!bit){ | |
161 renorm_cabac_encoder(c); | |
162 }else{ | |
163 c->low += c->range; | |
164 c->range= 2; | |
2967 | 165 |
1287 | 166 renorm_cabac_encoder(c); |
167 | |
168 assert(c->low <= 0x1FF); | |
169 put_cabac_bit(c, c->low>>9); | |
170 put_bits(&c->pb, 2, ((c->low>>7)&3)|1); | |
2967 | 171 |
1287 | 172 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong |
173 } | |
2967 | 174 |
1287 | 175 #ifdef STRICT_LIMITS |
176 c->symCount++; | |
177 #endif | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
178 |
1787 | 179 return (put_bits_count(&c->pb)+7)>>3; |
1287 | 180 } |
181 | |
1290 | 182 /** |
183 * put (truncated) unary binarization. | |
184 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
185 static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ |
1290 | 186 int i; |
2967 | 187 |
1290 | 188 assert(v <= max); |
2967 | 189 |
1290 | 190 #if 1 |
191 for(i=0; i<v; i++){ | |
192 put_cabac(c, state, 1); | |
193 if(i < max_index) state++; | |
194 } | |
195 if(truncated==0 || v<max) | |
196 put_cabac(c, state, 0); | |
197 #else | |
198 if(v <= max_index){ | |
199 for(i=0; i<v; i++){ | |
200 put_cabac(c, state+i, 1); | |
201 } | |
202 if(truncated==0 || v<max) | |
203 put_cabac(c, state+i, 0); | |
204 }else{ | |
205 for(i=0; i<=max_index; i++){ | |
206 put_cabac(c, state+i, 1); | |
207 } | |
208 for(; i<v; i++){ | |
209 put_cabac(c, state+max_index, 1); | |
210 } | |
211 if(truncated==0 || v<max) | |
212 put_cabac(c, state+max_index, 0); | |
213 } | |
214 #endif | |
215 } | |
216 | |
217 /** | |
218 * put unary exp golomb k-th order binarization. | |
219 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
220 static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ |
1290 | 221 int i; |
2967 | 222 |
1290 | 223 if(v==0) |
224 put_cabac(c, state, 0); | |
225 else{ | |
1298 | 226 const int sign= v < 0; |
2967 | 227 |
4001 | 228 if(is_signed) v= FFABS(v); |
2967 | 229 |
1290 | 230 if(v<max){ |
231 for(i=0; i<v; i++){ | |
232 put_cabac(c, state, 1); | |
233 if(i < max_index) state++; | |
234 } | |
235 | |
236 put_cabac(c, state, 0); | |
237 }else{ | |
238 int m= 1<<k; | |
239 | |
240 for(i=0; i<max; i++){ | |
241 put_cabac(c, state, 1); | |
242 if(i < max_index) state++; | |
243 } | |
244 | |
245 v -= max; | |
246 while(v >= m){ //FIXME optimize | |
247 put_cabac_bypass(c, 1); | |
248 v-= m; | |
249 m+= m; | |
250 } | |
251 put_cabac_bypass(c, 0); | |
252 while(m>>=1){ | |
253 put_cabac_bypass(c, v&m); | |
254 } | |
255 } | |
256 | |
257 if(is_signed) | |
258 put_cabac_bypass(c, sign); | |
259 } | |
260 } | |
6170
8cbfc9d75833
Put some disabled functions that are only used in the test program in cabac.c
diego
parents:
5973
diff
changeset
|
261 #endif /* TEST */ |
1290 | 262 |
2323 | 263 static void refill(CABACContext *c){ |
264 #if CABAC_BITS == 16 | |
3946 | 265 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); |
2323 | 266 #else |
267 c->low+= c->bytestream[0]<<1; | |
268 #endif | |
269 c->low -= CABAC_MASK; | |
270 c->bytestream+= CABAC_BITS/8; | |
271 } | |
272 | |
8590 | 273 #if ! ( ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS) ) |
2323 | 274 static void refill2(CABACContext *c){ |
275 int i, x; | |
276 | |
277 x= c->low ^ (c->low-1); | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
278 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; |
2323 | 279 |
280 x= -CABAC_MASK; | |
2967 | 281 |
2323 | 282 #if CABAC_BITS == 16 |
283 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); | |
284 #else | |
285 x+= c->bytestream[0]<<1; | |
286 #endif | |
2967 | 287 |
2323 | 288 c->low += x<<i; |
289 c->bytestream+= CABAC_BITS/8; | |
290 } | |
5811 | 291 #endif |
2323 | 292 |
1287 | 293 static inline void renorm_cabac_decoder(CABACContext *c){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
294 while(c->range < 0x100){ |
1287 | 295 c->range+= c->range; |
296 c->low+= c->low; | |
2323 | 297 if(!(c->low & CABAC_MASK)) |
298 refill(c); | |
1287 | 299 } |
300 } | |
301 | |
2323 | 302 static inline void renorm_cabac_decoder_once(CABACContext *c){ |
3951 | 303 #ifdef ARCH_X86_DISABLED |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
304 int temp; |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
305 #if 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
306 //P3:683 athlon:475 |
8031 | 307 __asm__( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
308 "lea -0x100(%0), %2 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
309 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
310 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
311 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
312 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
313 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
314 #elif 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
315 //P3:680 athlon:474 |
8031 | 316 __asm__( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
317 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
318 "setb %%cl \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
319 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
320 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
321 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
322 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
323 #elif 1 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
324 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
325 //P3:665 athlon:517 |
8031 | 326 __asm__( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
327 "lea -0x100(%0), %%eax \n\t" |
6425
8b119a5a0644
Use cltd instead of cdq asm mnemonic, ICC and gcc support both, but
reimar
parents:
6170
diff
changeset
|
328 "cltd \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
329 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
330 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
331 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
332 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
333 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
334 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
335 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
336 #elif 0 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
337 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
338 //P3:673 athlon:509 |
8031 | 339 __asm__( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
340 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
341 "sbb %%edx, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
342 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
343 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
344 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
345 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
346 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
347 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
348 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
349 #else |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
350 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
351 //P3:677 athlon:511 |
8031 | 352 __asm__( |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
353 "cmp $0x100, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
354 "lea (%0, %0), %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
355 "lea (%1, %1), %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
356 "cmovb %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
357 "cmovb %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
358 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
359 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
360 #endif |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
361 #else |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
362 //P3:675 athlon:476 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
363 int shift= (uint32_t)(c->range - 0x100)>>31; |
3642 | 364 c->range<<= shift; |
365 c->low <<= shift; | |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
366 #endif |
2323 | 367 if(!(c->low & CABAC_MASK)) |
368 refill(c); | |
369 } | |
370 | |
4908
777f250df232
Fix multiple "'inline/static' is not at beginning of declaration" warnings.
diego
parents:
4882
diff
changeset
|
371 static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){ |
3642 | 372 //FIXME gcc generates duplicate load/stores for c->low and c->range |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
373 #define LOW "0" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
374 #define RANGE "4" |
8590 | 375 #if ARCH_X86_64 |
4064 | 376 #define BYTESTART "16" |
377 #define BYTE "24" | |
378 #define BYTEEND "32" | |
379 #else | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
380 #define BYTESTART "12" |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
381 #define BYTE "16" |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
382 #define BYTEEND "20" |
4064 | 383 #endif |
8590 | 384 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS) |
4044
5ccdefd60f61
Fix PIC compilation, some defines were under #ifdef !PIC but used
diego
parents:
4043
diff
changeset
|
385 int bit; |
5ccdefd60f61
Fix PIC compilation, some defines were under #ifdef !PIC but used
diego
parents:
4043
diff
changeset
|
386 |
3984 | 387 #ifndef BRANCHLESS_CABAC_DECODER |
8031 | 388 __asm__ volatile( |
4035 | 389 "movzbl (%1), %0 \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
390 "movl "RANGE "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
391 "movl "RANGE "(%2), %%edx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
392 "andl $0xC0, %%ebx \n\t" |
4035 | 393 "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
394 "movl "LOW "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
395 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
396 "subl %%esi, %%edx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
397 "movl %%edx, %%ecx \n\t" |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
398 "shll $17, %%ecx \n\t" |
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
399 "cmpl %%ecx, %%ebx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
400 " ja 1f \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
401 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
402 #if 1 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
403 //athlon:4067 P3:4110 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
404 "lea -0x100(%%edx), %%ecx \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
405 "shr $31, %%ecx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
406 "shl %%cl, %%edx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
407 "shl %%cl, %%ebx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
408 #else |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
409 //athlon:4057 P3:4130 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
410 "cmp $0x100, %%edx \n\t" //FIXME avoidable |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
411 "setb %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
412 "shl %%cl, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
413 "shl %%cl, %%ebx \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
414 #endif |
4035 | 415 "movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
416 "movb %%cl, (%1) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
417 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
418 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
419 " jnz 2f \n\t" |
4064 | 420 "mov "BYTE "(%2), %%"REG_S" \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
421 "subl $0xFFFF, %%ebx \n\t" |
4064 | 422 "movzwl (%%"REG_S"), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
423 "bswap %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
424 "shrl $15, %%ecx \n\t" |
4064 | 425 "add $2, %%"REG_S" \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
426 "addl %%ecx, %%ebx \n\t" |
4064 | 427 "mov %%"REG_S", "BYTE "(%2) \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
428 "jmp 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
429 "1: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
430 //eax:state ebx:low, edx:range, esi:RangeLPS |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
431 "subl %%ecx, %%ebx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
432 "movl %%esi, %%edx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
433 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
434 "shll %%cl, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
435 "shll %%cl, %%edx \n\t" |
4035 | 436 "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
437 "movb %%cl, (%1) \n\t" |
4064 | 438 "add $1, %0 \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
439 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
440 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
441 |
4064 | 442 "mov "BYTE "(%2), %%"REG_c" \n\t" |
443 "movzwl (%%"REG_c"), %%esi \n\t" | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
444 "bswap %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
445 "shrl $15, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
446 "subl $0xFFFF, %%esi \n\t" |
4064 | 447 "add $2, %%"REG_c" \n\t" |
448 "mov %%"REG_c", "BYTE "(%2) \n\t" | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
449 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
450 "leal -1(%%ebx), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
451 "xorl %%ebx, %%ecx \n\t" |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
452 "shrl $15, %%ecx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
453 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3994
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
454 "neg %%ecx \n\t" |
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
455 "add $7, %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
456 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
457 "shll %%cl , %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
458 "addl %%esi, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
459 "2: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
460 "movl %%edx, "RANGE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
461 "movl %%ebx, "LOW "(%2) \n\t" |
5408 | 462 :"=&a"(bit) //FIXME this is fragile: gcc either runs out of registers or miscompiles it (for example if "+a"(bit) or "+m"(*state) is used) |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
463 :"r"(state), "r"(c) |
4064 | 464 : "%"REG_c, "%ebx", "%edx", "%"REG_S, "memory" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
465 ); |
3982
af16271634c2
moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop
michael
parents:
3981
diff
changeset
|
466 bit&=1; |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
467 #else /* BRANCHLESS_CABAC_DECODER */ |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
468 |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
469 |
8590 | 470 #if HAVE_FAST_CMOV |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
471 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
472 "mov "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
473 "shl $17 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
474 "cmp "low" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
475 "cmova %%ecx , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
476 "sbb %%ecx , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
477 "and %%ecx , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
478 "sub "tmp" , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
479 "xor %%ecx , "ret" \n\t" |
4418
4cceb7c877af
rename CMOV_IS_FAST to HAVE_FAST_CMOV and simplify configure
mru
parents:
4345
diff
changeset
|
480 #else /* HAVE_FAST_CMOV */ |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
481 #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
482 "mov "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
483 "shl $17 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
484 "sub "low" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
485 "sar $31 , "tmp" \n\t" /*lps_mask*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
486 "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
487 "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
488 "add %%ecx , "range" \n\t" /*new range*/\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
489 "shl $17 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
490 "and "tmp" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
491 "sub %%ecx , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
492 "xor "tmp" , "ret" \n\t" |
4418
4cceb7c877af
rename CMOV_IS_FAST to HAVE_FAST_CMOV and simplify configure
mru
parents:
4345
diff
changeset
|
493 #endif /* HAVE_FAST_CMOV */ |
3975 | 494 |
495 | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
496 #define BRANCHLESS_GET_CABAC(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
497 "movzbl "statep" , "ret" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
498 "mov "range" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
499 "and $0xC0 , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
500 "movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
501 "sub "range" , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
502 BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
503 "movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
504 "shl %%cl , "range" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
505 "movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
506 "mov "tmpbyte" , "statep" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
507 "shl %%cl , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
508 "test "lowword" , "lowword" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
509 " jnz 1f \n\t"\ |
4064 | 510 "mov "BYTE"("cabac"), %%"REG_c" \n\t"\ |
511 "movzwl (%%"REG_c") , "tmp" \n\t"\ | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
512 "bswap "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
513 "shr $15 , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
514 "sub $0xFFFF , "tmp" \n\t"\ |
4064 | 515 "add $2 , %%"REG_c" \n\t"\ |
516 "mov %%"REG_c" , "BYTE "("cabac") \n\t"\ | |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
517 "lea -1("low") , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
518 "xor "low" , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
519 "shr $15 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
520 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
521 "neg %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
522 "add $7 , %%ecx \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
523 "shl %%cl , "tmp" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
524 "add "tmp" , "low" \n\t"\ |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
525 "1: \n\t" |
3975 | 526 |
8031 | 527 __asm__ volatile( |
4046
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
528 "movl "RANGE "(%2), %%esi \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
529 "movl "LOW "(%2), %%ebx \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
530 BRANCHLESS_GET_CABAC("%0", "%2", "(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl") |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
531 "movl %%esi, "RANGE "(%2) \n\t" |
8bbc695c9603
factorize get_cabac asm (0.5% slower but its much cleaner)
michael
parents:
4044
diff
changeset
|
532 "movl %%ebx, "LOW "(%2) \n\t" |
3975 | 533 |
534 :"=&a"(bit) | |
535 :"r"(state), "r"(c) | |
4064 | 536 : "%"REG_c, "%ebx", "%edx", "%esi", "memory" |
3975 | 537 ); |
3981
9854f686ba79
move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster)
michael
parents:
3980
diff
changeset
|
538 bit&=1; |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
539 #endif /* BRANCHLESS_CABAC_DECODER */ |
8590 | 540 #else /* ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS) */ |
3642 | 541 int s = *state; |
4039 | 542 int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; |
5083
ce36118abbbb
rename attribute_unused to av_unused and moves its declaration to common.h
benoit
parents:
4975
diff
changeset
|
543 int bit, lps_mask av_unused; |
2967 | 544 |
1287 | 545 c->range -= RangeLPS; |
3984 | 546 #ifndef BRANCHLESS_CABAC_DECODER |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
547 if(c->low < (c->range<<(CABAC_BITS+1))){ |
3642 | 548 bit= s&1; |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
549 *state= ff_h264_mps_state[s]; |
2323 | 550 renorm_cabac_decoder_once(c); |
1287 | 551 }else{ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
552 bit= ff_h264_norm_shift[RangeLPS]; |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
553 c->low -= (c->range<<(CABAC_BITS+1)); |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
554 *state= ff_h264_lps_state[s]; |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
555 c->range = RangeLPS<<bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
556 c->low <<= bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
557 bit= (s&1)^1; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
558 |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
559 if(!(c->low & CABAC_MASK)){ |
2323 | 560 refill2(c); |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
561 } |
1287 | 562 } |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
563 #else /* BRANCHLESS_CABAC_DECODER */ |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
564 lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; |
2967 | 565 |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
566 c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask; |
2323 | 567 c->range += (RangeLPS - c->range) & lps_mask; |
2967 | 568 |
3974 | 569 s^=lps_mask; |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
570 *state= (ff_h264_mlps_state+128)[s]; |
3974 | 571 bit= s&1; |
2967 | 572 |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
573 lps_mask= ff_h264_norm_shift[c->range]; |
2323 | 574 c->range<<= lps_mask; |
575 c->low <<= lps_mask; | |
576 if(!(c->low & CABAC_MASK)) | |
577 refill2(c); | |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
578 #endif /* BRANCHLESS_CABAC_DECODER */ |
8590 | 579 #endif /* ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS) */ |
2967 | 580 return bit; |
1287 | 581 } |
582 | |
7775
490ee89408c5
Silence a couple of 'defined but not used' warnings by adding an av_unused
diego
parents:
7760
diff
changeset
|
583 static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){ |
4008
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
584 return get_cabac_inline(c,state); |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
585 } |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
586 |
7775
490ee89408c5
Silence a couple of 'defined but not used' warnings by adding an av_unused
diego
parents:
7760
diff
changeset
|
587 static int av_unused get_cabac(CABACContext *c, uint8_t * const state){ |
4008
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
588 return get_cabac_inline(c,state); |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
589 } |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
590 |
7775
490ee89408c5
Silence a couple of 'defined but not used' warnings by adding an av_unused
diego
parents:
7760
diff
changeset
|
591 static int av_unused get_cabac_bypass(CABACContext *c){ |
4040 | 592 #if 0 //not faster |
593 int bit; | |
8031 | 594 __asm__ volatile( |
4040 | 595 "movl "RANGE "(%1), %%ebx \n\t" |
596 "movl "LOW "(%1), %%eax \n\t" | |
597 "shl $17, %%ebx \n\t" | |
598 "add %%eax, %%eax \n\t" | |
599 "sub %%ebx, %%eax \n\t" | |
6425
8b119a5a0644
Use cltd instead of cdq asm mnemonic, ICC and gcc support both, but
reimar
parents:
6170
diff
changeset
|
600 "cltd \n\t" |
4040 | 601 "and %%edx, %%ebx \n\t" |
602 "add %%ebx, %%eax \n\t" | |
603 "test %%ax, %%ax \n\t" | |
604 " jnz 1f \n\t" | |
4064 | 605 "movl "BYTE "(%1), %%"REG_b" \n\t" |
4040 | 606 "subl $0xFFFF, %%eax \n\t" |
4064 | 607 "movzwl (%%"REG_b"), %%ecx \n\t" |
4040 | 608 "bswap %%ecx \n\t" |
609 "shrl $15, %%ecx \n\t" | |
4064 | 610 "addl $2, %%"REG_b" \n\t" |
4040 | 611 "addl %%ecx, %%eax \n\t" |
4064 | 612 "movl %%"REG_b", "BYTE "(%1) \n\t" |
4040 | 613 "1: \n\t" |
614 "movl %%eax, "LOW "(%1) \n\t" | |
615 | |
616 :"=&d"(bit) | |
617 :"r"(c) | |
4064 | 618 : "%eax", "%"REG_b, "%ecx", "memory" |
4040 | 619 ); |
620 return bit+1; | |
621 #else | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
622 int range; |
1287 | 623 c->low += c->low; |
624 | |
2323 | 625 if(!(c->low & CABAC_MASK)) |
626 refill(c); | |
2967 | 627 |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
628 range= c->range<<(CABAC_BITS+1); |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
629 if(c->low < range){ |
1287 | 630 return 0; |
631 }else{ | |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
632 c->low -= range; |
1287 | 633 return 1; |
634 } | |
4040 | 635 #endif |
1287 | 636 } |
4040 | 637 |
638 | |
4283
d6f83e2f8804
rename always_inline to av_always_inline and move to common.h
mru
parents:
4241
diff
changeset
|
639 static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ |
10071
6ac0d4957d35
Replace #ifdef PIC checks with the more appropriate HAVE_EBX_AVAILABLE/HAVE_7REGS.
reimar
parents:
9411
diff
changeset
|
640 #if ARCH_X86 && HAVE_EBX_AVAILABLE |
8031 | 641 __asm__ volatile( |
4040 | 642 "movl "RANGE "(%1), %%ebx \n\t" |
643 "movl "LOW "(%1), %%eax \n\t" | |
644 "shl $17, %%ebx \n\t" | |
645 "add %%eax, %%eax \n\t" | |
646 "sub %%ebx, %%eax \n\t" | |
6425
8b119a5a0644
Use cltd instead of cdq asm mnemonic, ICC and gcc support both, but
reimar
parents:
6170
diff
changeset
|
647 "cltd \n\t" |
4040 | 648 "and %%edx, %%ebx \n\t" |
649 "add %%ebx, %%eax \n\t" | |
650 "xor %%edx, %%ecx \n\t" | |
651 "sub %%edx, %%ecx \n\t" | |
652 "test %%ax, %%ax \n\t" | |
653 " jnz 1f \n\t" | |
4064 | 654 "mov "BYTE "(%1), %%"REG_b" \n\t" |
4040 | 655 "subl $0xFFFF, %%eax \n\t" |
4064 | 656 "movzwl (%%"REG_b"), %%edx \n\t" |
4040 | 657 "bswap %%edx \n\t" |
658 "shrl $15, %%edx \n\t" | |
4064 | 659 "add $2, %%"REG_b" \n\t" |
4040 | 660 "addl %%edx, %%eax \n\t" |
4064 | 661 "mov %%"REG_b", "BYTE "(%1) \n\t" |
4040 | 662 "1: \n\t" |
663 "movl %%eax, "LOW "(%1) \n\t" | |
664 | |
665 :"+c"(val) | |
666 :"r"(c) | |
4064 | 667 : "%eax", "%"REG_b, "%edx", "memory" |
4040 | 668 ); |
669 return val; | |
670 #else | |
671 int range, mask; | |
672 c->low += c->low; | |
673 | |
674 if(!(c->low & CABAC_MASK)) | |
675 refill(c); | |
676 | |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
677 range= c->range<<(CABAC_BITS+1); |
4040 | 678 c->low -= range; |
679 mask= c->low >> 31; | |
680 range &= mask; | |
681 c->low += range; | |
682 return (val^mask)-mask; | |
683 #endif | |
684 } | |
685 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
686 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
687 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
688 * @return the number of bytes read, or 0 if the end has not been reached |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
689 */ |
7775
490ee89408c5
Silence a couple of 'defined but not used' warnings by adding an av_unused
diego
parents:
7760
diff
changeset
|
690 static int av_unused get_cabac_terminate(CABACContext *c){ |
4024
d550343b5dac
shift CABACContext.range right, this reduces the number of shifts needed in get_cabac() and is slightly faster on P3 (and should be much faster on P4 as the P4 except the more recent variants lacks an integer shifter and so shifts have ~10 times longer latency then simple operations like adds)
michael
parents:
4014
diff
changeset
|
691 c->range -= 2; |
4345
88967250d718
replace a few hardcoded numbers with their correct named ones
michael
parents:
4283
diff
changeset
|
692 if(c->low < c->range<<(CABAC_BITS+1)){ |
2323 | 693 renorm_cabac_decoder_once(c); |
1287 | 694 return 0; |
695 }else{ | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
696 return c->bytestream - c->bytestream_start; |
2967 | 697 } |
1287 | 698 } |
699 | |
5811 | 700 #if 0 |
1290 | 701 /** |
5686 | 702 * Get (truncated) unary binarization. |
1290 | 703 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
704 static int get_cabac_u(CABACContext *c, uint8_t * state, int max, int max_index, int truncated){ |
1290 | 705 int i; |
2967 | 706 |
707 for(i=0; i<max; i++){ | |
1290 | 708 if(get_cabac(c, state)==0) |
709 return i; | |
2967 | 710 |
1290 | 711 if(i< max_index) state++; |
712 } | |
713 | |
714 return truncated ? max : -1; | |
715 } | |
716 | |
717 /** | |
718 * Get unary exp-Golomb k-th order binarization. | |
719 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
720 static int get_cabac_ueg(CABACContext *c, uint8_t * state, int max, int is_signed, int k, int max_index){ |
1290 | 721 int i, v; |
722 int m= 1<<k; | |
2967 | 723 |
724 if(get_cabac(c, state)==0) | |
1290 | 725 return 0; |
2967 | 726 |
1290 | 727 if(0 < max_index) state++; |
2967 | 728 |
729 for(i=1; i<max; i++){ | |
1290 | 730 if(get_cabac(c, state)==0){ |
731 if(is_signed && get_cabac_bypass(c)){ | |
732 return -i; | |
733 }else | |
734 return i; | |
735 } | |
736 | |
737 if(i < max_index) state++; | |
738 } | |
2967 | 739 |
1290 | 740 while(get_cabac_bypass(c)){ |
741 i+= m; | |
742 m+= m; | |
743 } | |
2967 | 744 |
1290 | 745 v=0; |
746 while(m>>=1){ | |
747 v+= v + get_cabac_bypass(c); | |
748 } | |
749 i += v; | |
750 | |
751 if(is_signed && get_cabac_bypass(c)){ | |
752 return -i; | |
753 }else | |
754 return i; | |
755 } | |
5811 | 756 #endif /* 0 */ |
4975 | 757 |
7760 | 758 #endif /* AVCODEC_CABAC_H */ |