Mercurial > libavcodec.hg
annotate cabac.h @ 4021:56a9e98b06a7 libavcodec
Ignore blocks with no samples and flags (but usually with MD5 sum)
author | kostya |
---|---|
date | Sun, 15 Oct 2006 04:50:19 +0000 |
parents | b2582438effe |
children | d550343b5dac |
rev | line source |
---|---|
1287 | 1 /* |
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
1287 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
1287 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
1287 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1287 | 20 * |
21 */ | |
2967 | 22 |
1287 | 23 /** |
24 * @file cabac.h | |
25 * Context Adaptive Binary Arithmetic Coder. | |
26 */ | |
27 | |
28 | |
3284
a224d9752912
don't force asserts in release builds. 2% faster h264.
lorenm
parents:
3036
diff
changeset
|
29 //#undef NDEBUG |
1287 | 30 #include <assert.h> |
31 | |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
/** Number of bitstream bits pulled in per refill(); refill() advances the input by CABAC_BITS/8 bytes. */
#define CABAC_BITS 16
/** Mask covering the low CABAC_BITS bits of CABACContext.low; the decoder refills when these bits are all zero. */
#define CABAC_MASK ((1 << CABAC_BITS) - 1)
/** Selects the branchless x86 decoder; get_cabac_inline() only tests whether this is defined (#ifndef). */
#define BRANCHLESS_CABAC_DECODER 1
/** NOTE(review): not referenced in the part of the file reviewed here; presumably enables cmov-based code paths -- confirm. */
#define CMOV_IS_FAST 1
2323 | 36 |
1287 | 37 typedef struct CABACContext{ |
38 int low; | |
39 int range; | |
40 int outstanding_count; | |
41 #ifdef STRICT_LIMITS | |
42 int symCount; | |
43 #endif | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
44 const uint8_t *bytestream_start; |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
45 const uint8_t *bytestream; |
2116 | 46 const uint8_t *bytestream_end; |
1287 | 47 PutBitContext pb; |
48 }CABACContext; | |
49 | |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
/* Lookup tables shared by the CABAC coder; they are only declared here. */
extern uint8_t ff_h264_mlps_state[4 * 64];
extern uint8_t ff_h264_lps_range[2 * 65][4];    ///< rangeTabLPS
extern uint8_t ff_h264_mps_state[2 * 64];       ///< transIdxMPS
extern uint8_t ff_h264_lps_state[2 * 64];       ///< transIdxLPS
extern const uint8_t ff_h264_norm_shift[128];
2323 | 55 |
1287 | 56 |
57 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size); | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
58 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
59 void ff_init_cabac_states(CABACContext *c); |
1287 | 60 |
61 | |
62 static inline void put_cabac_bit(CABACContext *c, int b){ | |
2967 | 63 put_bits(&c->pb, 1, b); |
64 for(;c->outstanding_count; c->outstanding_count--){ | |
1287 | 65 put_bits(&c->pb, 1, 1-b); |
66 } | |
67 } | |
68 | |
69 static inline void renorm_cabac_encoder(CABACContext *c){ | |
70 while(c->range < 0x100){ | |
71 //FIXME optimize | |
72 if(c->low<0x100){ | |
73 put_cabac_bit(c, 0); | |
74 }else if(c->low<0x200){ | |
75 c->outstanding_count++; | |
76 c->low -= 0x100; | |
77 }else{ | |
78 put_cabac_bit(c, 1); | |
79 c->low -= 0x200; | |
80 } | |
2967 | 81 |
1287 | 82 c->range+= c->range; |
83 c->low += c->low; | |
84 } | |
85 } | |
86 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
87 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ |
3991
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
88 int RangeLPS= ff_h264_lps_range[*state][c->range>>6]; |
2967 | 89 |
1287 | 90 if(bit == ((*state)&1)){ |
91 c->range -= RangeLPS; | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
92 *state= ff_h264_mps_state[*state]; |
1287 | 93 }else{ |
94 c->low += c->range - RangeLPS; | |
95 c->range = RangeLPS; | |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
96 *state= ff_h264_lps_state[*state]; |
1287 | 97 } |
2967 | 98 |
1287 | 99 renorm_cabac_encoder(c); |
100 | |
101 #ifdef STRICT_LIMITS | |
102 c->symCount++; | |
103 #endif | |
104 } | |
105 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
106 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){ |
1287 | 107 assert(c->range > RangeLPS); |
108 | |
109 if(!bit){ | |
110 c->range -= RangeLPS; | |
111 }else{ | |
112 c->low += c->range - RangeLPS; | |
113 c->range = RangeLPS; | |
114 } | |
115 | |
116 renorm_cabac_encoder(c); | |
117 | |
118 #ifdef STRICT_LIMITS | |
119 c->symCount++; | |
120 #endif | |
121 } | |
122 | |
1290 | 123 /** |
124 * @param bit 0 -> write zero bit, !=0 write one bit | |
125 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
126 static void put_cabac_bypass(CABACContext *c, int bit){ |
1287 | 127 c->low += c->low; |
128 | |
129 if(bit){ | |
130 c->low += c->range; | |
131 } | |
132 //FIXME optimize | |
133 if(c->low<0x200){ | |
134 put_cabac_bit(c, 0); | |
135 }else if(c->low<0x400){ | |
136 c->outstanding_count++; | |
137 c->low -= 0x200; | |
138 }else{ | |
139 put_cabac_bit(c, 1); | |
140 c->low -= 0x400; | |
141 } | |
2967 | 142 |
1287 | 143 #ifdef STRICT_LIMITS |
144 c->symCount++; | |
145 #endif | |
146 } | |
147 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
148 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
149 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
150 * @return the number of bytes written |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
151 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
152 static int put_cabac_terminate(CABACContext *c, int bit){ |
1287 | 153 c->range -= 2; |
154 | |
155 if(!bit){ | |
156 renorm_cabac_encoder(c); | |
157 }else{ | |
158 c->low += c->range; | |
159 c->range= 2; | |
2967 | 160 |
1287 | 161 renorm_cabac_encoder(c); |
162 | |
163 assert(c->low <= 0x1FF); | |
164 put_cabac_bit(c, c->low>>9); | |
165 put_bits(&c->pb, 2, ((c->low>>7)&3)|1); | |
2967 | 166 |
1287 | 167 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong |
168 } | |
2967 | 169 |
1287 | 170 #ifdef STRICT_LIMITS |
171 c->symCount++; | |
172 #endif | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
173 |
1787 | 174 return (put_bits_count(&c->pb)+7)>>3; |
1287 | 175 } |
176 | |
1290 | 177 /** |
178 * put (truncated) unary binarization. | |
179 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
180 static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ |
1290 | 181 int i; |
2967 | 182 |
1290 | 183 assert(v <= max); |
2967 | 184 |
1290 | 185 #if 1 |
186 for(i=0; i<v; i++){ | |
187 put_cabac(c, state, 1); | |
188 if(i < max_index) state++; | |
189 } | |
190 if(truncated==0 || v<max) | |
191 put_cabac(c, state, 0); | |
192 #else | |
193 if(v <= max_index){ | |
194 for(i=0; i<v; i++){ | |
195 put_cabac(c, state+i, 1); | |
196 } | |
197 if(truncated==0 || v<max) | |
198 put_cabac(c, state+i, 0); | |
199 }else{ | |
200 for(i=0; i<=max_index; i++){ | |
201 put_cabac(c, state+i, 1); | |
202 } | |
203 for(; i<v; i++){ | |
204 put_cabac(c, state+max_index, 1); | |
205 } | |
206 if(truncated==0 || v<max) | |
207 put_cabac(c, state+max_index, 0); | |
208 } | |
209 #endif | |
210 } | |
211 | |
212 /** | |
213 * put unary exp golomb k-th order binarization. | |
214 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
215 static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ |
1290 | 216 int i; |
2967 | 217 |
1290 | 218 if(v==0) |
219 put_cabac(c, state, 0); | |
220 else{ | |
1298 | 221 const int sign= v < 0; |
2967 | 222 |
4001 | 223 if(is_signed) v= FFABS(v); |
2967 | 224 |
1290 | 225 if(v<max){ |
226 for(i=0; i<v; i++){ | |
227 put_cabac(c, state, 1); | |
228 if(i < max_index) state++; | |
229 } | |
230 | |
231 put_cabac(c, state, 0); | |
232 }else{ | |
233 int m= 1<<k; | |
234 | |
235 for(i=0; i<max; i++){ | |
236 put_cabac(c, state, 1); | |
237 if(i < max_index) state++; | |
238 } | |
239 | |
240 v -= max; | |
241 while(v >= m){ //FIXME optimize | |
242 put_cabac_bypass(c, 1); | |
243 v-= m; | |
244 m+= m; | |
245 } | |
246 put_cabac_bypass(c, 0); | |
247 while(m>>=1){ | |
248 put_cabac_bypass(c, v&m); | |
249 } | |
250 } | |
251 | |
252 if(is_signed) | |
253 put_cabac_bypass(c, sign); | |
254 } | |
255 } | |
256 | |
2323 | 257 static void refill(CABACContext *c){ |
258 #if CABAC_BITS == 16 | |
3946 | 259 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); |
2323 | 260 #else |
261 c->low+= c->bytestream[0]<<1; | |
262 #endif | |
263 c->low -= CABAC_MASK; | |
264 c->bytestream+= CABAC_BITS/8; | |
265 } | |
266 | |
267 static void refill2(CABACContext *c){ | |
268 int i, x; | |
269 | |
270 x= c->low ^ (c->low-1); | |
3964 | 271 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS+1)]; |
2323 | 272 |
273 x= -CABAC_MASK; | |
2967 | 274 |
2323 | 275 #if CABAC_BITS == 16 |
276 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); | |
277 #else | |
278 x+= c->bytestream[0]<<1; | |
279 #endif | |
2967 | 280 |
2323 | 281 c->low += x<<i; |
282 c->bytestream+= CABAC_BITS/8; | |
283 } | |
284 | |
1287 | 285 static inline void renorm_cabac_decoder(CABACContext *c){ |
2323 | 286 while(c->range < (0x200 << CABAC_BITS)){ |
1287 | 287 c->range+= c->range; |
288 c->low+= c->low; | |
2323 | 289 if(!(c->low & CABAC_MASK)) |
290 refill(c); | |
1287 | 291 } |
292 } | |
293 | |
2323 | 294 static inline void renorm_cabac_decoder_once(CABACContext *c){ |
3951 | 295 #ifdef ARCH_X86_DISABLED |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
296 int temp; |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
297 #if 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
298 //P3:683 athlon:475 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
299 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
300 "lea -0x2000000(%0), %2 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
301 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
302 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
303 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
304 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
305 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
306 #elif 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
307 //P3:680 athlon:474 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
308 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
309 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
310 "setb %%cl \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
311 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
312 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
313 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
314 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
315 #elif 1 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
316 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
317 //P3:665 athlon:517 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
318 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
319 "lea -0x2000000(%0), %%eax \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
320 "cdq \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
321 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
322 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
323 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
324 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
325 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
326 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
327 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
328 #elif 0 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
329 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
330 //P3:673 athlon:509 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
331 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
332 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
333 "sbb %%edx, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
334 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
335 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
336 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
337 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
338 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
339 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
340 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
341 #else |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
342 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
343 //P3:677 athlon:511 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
344 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
345 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
346 "lea (%0, %0), %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
347 "lea (%1, %1), %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
348 "cmovb %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
349 "cmovb %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
350 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
351 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
352 #endif |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
353 #else |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
354 //P3:675 athlon:476 |
3642 | 355 int shift= (uint32_t)(c->range - (0x200 << CABAC_BITS))>>31; |
356 c->range<<= shift; | |
357 c->low <<= shift; | |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
358 #endif |
2323 | 359 if(!(c->low & CABAC_MASK)) |
360 refill(c); | |
361 } | |
362 | |
4008
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
363 static int always_inline get_cabac_inline(CABACContext *c, uint8_t * const state){ |
3642 | 364 //FIXME gcc generates duplicate load/stores for c->low and c->range |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
365 #ifdef ARCH_X86 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
366 int bit; |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
367 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
368 #define LOW "0" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
369 #define RANGE "4" |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
370 #define BYTESTART "12" |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
371 #define BYTE "16" |
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
372 #define BYTEEND "20" |
3984 | 373 #ifndef BRANCHLESS_CABAC_DECODER |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
374 asm volatile( |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
375 "movzbl (%1), %%eax \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
376 "movl "RANGE "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
377 "movl "RANGE "(%2), %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
378 "shrl $23, %%ebx \n\t" |
3992 | 379 "movzbl "MANGLE(ff_h264_lps_range)"(%%ebx, %%eax, 4), %%esi\n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
380 "shll $17, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
381 "movl "LOW "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
382 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
383 "subl %%esi, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
384 "cmpl %%edx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
385 " ja 1f \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
386 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
387 #if 1 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
388 //athlon:4067 P3:4110 |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
389 "lea -0x2000000(%%edx), %%ecx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
390 "shr $31, %%ecx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
391 "shl %%cl, %%edx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
392 "shl %%cl, %%ebx \n\t" |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
393 #else |
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
394 //athlon:4057 P3:4130 |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
395 "cmp $0x2000000, %%edx \n\t" //FIXME avoidable |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
396 "setb %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
397 "shl %%cl, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
398 "shl %%cl, %%ebx \n\t" |
3999
6cbad3675632
slightly faster on P3 slightly slower on athlon and probably faster on P4
michael
parents:
3996
diff
changeset
|
399 #endif |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
400 "movzbl "MANGLE(ff_h264_mps_state)"(%%eax), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
401 "movb %%cl, (%1) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
402 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
403 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
404 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
405 "movl "BYTE "(%2), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
406 "subl $0xFFFF, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
407 "movzwl (%%esi), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
408 "bswap %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
409 "shrl $15, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
410 "addl $2, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
411 "addl %%ecx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
412 "movl %%esi, "BYTE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
413 "jmp 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
414 "1: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
415 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
416 "subl %%edx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
417 "movl %%esi, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
418 "shr $19, %%esi \n\t" |
3996
c4ff7d570f19
moving lps state transition code a little up in the branched asm code (1% faster on P3)
michael
parents:
3995
diff
changeset
|
419 "movzbl "MANGLE(ff_h264_lps_state)"(%%eax), %%ecx \n\t" |
c4ff7d570f19
moving lps state transition code a little up in the branched asm code (1% faster on P3)
michael
parents:
3995
diff
changeset
|
420 "movb %%cl, (%1) \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
421 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
422 "shll %%cl, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
423 "shll %%cl, %%edx \n\t" |
3978 | 424 "addl $1, %%eax \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
425 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
426 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
427 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
428 "movl "BYTE "(%2), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
429 "movzwl (%%ecx), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
430 "bswap %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
431 "shrl $15, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
432 "subl $0xFFFF, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
433 "addl $2, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
434 "movl %%ecx, "BYTE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
435 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
436 "leal -1(%%ebx), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
437 "xorl %%ebx, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
438 "shrl $17, %%ecx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
439 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3994
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
440 "neg %%ecx \n\t" |
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
441 "add $7, %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
442 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
443 "shll %%cl , %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
444 "addl %%esi, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
445 "2: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
446 "movl %%edx, "RANGE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
447 "movl %%ebx, "LOW "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
448 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
449 :"r"(state), "r"(c) |
4012
f8c649ac09dd
add "memory" to the clobber list we change memory so we need it, this also fixes some problems with gcc svn
michael
parents:
4008
diff
changeset
|
450 : "%ecx", "%ebx", "%edx", "%esi", "memory" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
451 ); |
3982
af16271634c2
moving another bit&1 out, this is as fast as with it in there, but it makes more sense with it outside of the loop
michael
parents:
3981
diff
changeset
|
452 bit&=1; |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
453 #else /* BRANCHLESS_CABAC_DECODER */ |
3975 | 454 asm volatile( |
455 "movzbl (%1), %%eax \n\t" | |
456 "movl "RANGE "(%2), %%ebx \n\t" | |
457 "movl "RANGE "(%2), %%edx \n\t" | |
458 "shrl $23, %%ebx \n\t" | |
3992 | 459 "movzbl "MANGLE(ff_h264_lps_range)"(%%ebx, %%eax, 4), %%esi\n\t" |
3975 | 460 "shll $17, %%esi \n\t" |
461 "movl "LOW "(%2), %%ebx \n\t" | |
462 //eax:state ebx:low, edx:range, esi:RangeLPS | |
463 "subl %%esi, %%edx \n\t" | |
3980 | 464 #ifdef CMOV_IS_FAST //FIXME actually define this somewhere |
465 "cmpl %%ebx, %%edx \n\t" | |
466 "cmova %%edx, %%esi \n\t" | |
467 "sbbl %%ecx, %%ecx \n\t" | |
468 "andl %%ecx, %%edx \n\t" | |
469 "subl %%edx, %%ebx \n\t" | |
470 "xorl %%ecx, %%eax \n\t" | |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
471 #else /* CMOV_IS_FAST */ |
3975 | 472 "movl %%edx, %%ecx \n\t" |
473 "subl %%ebx, %%edx \n\t" | |
474 "sarl $31, %%edx \n\t" //lps_mask | |
475 "subl %%ecx, %%esi \n\t" //RangeLPS - range | |
476 "andl %%edx, %%esi \n\t" //(RangeLPS - range)&lps_mask | |
477 "addl %%ecx, %%esi \n\t" //new range | |
478 "andl %%edx, %%ecx \n\t" | |
479 "subl %%ecx, %%ebx \n\t" | |
3980 | 480 "xorl %%edx, %%eax \n\t" |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
481 #endif /* CMOV_IS_FAST */ |
3975 | 482 |
483 //eax:state ebx:low edx:mask esi:range | |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
484 "movzbl "MANGLE(ff_h264_mlps_state)"+128(%%eax), %%ecx \n\t" |
3975 | 485 "movb %%cl, (%1) \n\t" |
486 | |
487 "movl %%esi, %%edx \n\t" | |
488 //eax:bit ebx:low edx:range esi:range | |
489 | |
490 "shr $19, %%esi \n\t" | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
491 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3995
b00c06477dff
write cabac low and range variables as early as possible to prevent stalls from reading them before they where written, the P4 is said to disslike that alot, on P3 its 2% faster (START/STOP_TIMER over decode_residual)
michael
parents:
3994
diff
changeset
|
492 "shll %%cl, %%edx \n\t" |
b00c06477dff
write cabac low and range variables as early as possible to prevent stalls from reading them before they where written, the P4 is said to disslike that alot, on P3 its 2% faster (START/STOP_TIMER over decode_residual)
michael
parents:
3994
diff
changeset
|
493 "movl %%edx, "RANGE "(%2) \n\t" |
3975 | 494 "shll %%cl, %%ebx \n\t" |
3995
b00c06477dff
write cabac low and range variables as early as possible to prevent stalls from reading them before they where written, the P4 is said to disslike that alot, on P3 its 2% faster (START/STOP_TIMER over decode_residual)
michael
parents:
3994
diff
changeset
|
495 "movl %%ebx, "LOW "(%2) \n\t" |
3975 | 496 "test %%bx, %%bx \n\t" |
497 " jnz 1f \n\t" | |
498 | |
499 "movl "BYTE "(%2), %%ecx \n\t" | |
500 "movzwl (%%ecx), %%esi \n\t" | |
501 "bswap %%esi \n\t" | |
502 "shrl $15, %%esi \n\t" | |
503 "subl $0xFFFF, %%esi \n\t" | |
504 "addl $2, %%ecx \n\t" | |
505 "movl %%ecx, "BYTE "(%2) \n\t" | |
506 | |
507 "leal -1(%%ebx), %%ecx \n\t" | |
508 "xorl %%ebx, %%ecx \n\t" | |
509 "shrl $17, %%ecx \n\t" | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
510 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3994
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
511 "neg %%ecx \n\t" |
2734b228fc87
use ecx instead of cl (no speed change on P3 but might avoid partial register stalls on some cpus)
michael
parents:
3993
diff
changeset
|
512 "add $7, %%ecx \n\t" |
3975 | 513 |
514 "shll %%cl , %%esi \n\t" | |
515 "addl %%esi, %%ebx \n\t" | |
3995
b00c06477dff
write cabac low and range variables as early as possible to prevent stalls from reading them before they where written, the P4 is said to disslike that alot, on P3 its 2% faster (START/STOP_TIMER over decode_residual)
michael
parents:
3994
diff
changeset
|
516 "movl %%ebx, "LOW "(%2) \n\t" |
3975 | 517 "1: \n\t" |
518 :"=&a"(bit) | |
519 :"r"(state), "r"(c) | |
4012
f8c649ac09dd
add "memory" to the clobber list we change memory so we need it, this also fixes some problems with gcc svn
michael
parents:
4008
diff
changeset
|
520 : "%ecx", "%ebx", "%edx", "%esi", "memory" |
3975 | 521 ); |
3981
9854f686ba79
move the &1 out of the asm so gcc can optimize it away in inlined cases (yes this is slightly faster)
michael
parents:
3980
diff
changeset
|
522 bit&=1; |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
523 #endif /* BRANCHLESS_CABAC_DECODER */ |
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
524 #else /* ARCH_X86 */ |
3642 | 525 int s = *state; |
3991
72bae00a317f
make lps_range a global table its constant anyway (saves 1 addition for accessing it)
michael
parents:
3990
diff
changeset
|
526 int RangeLPS= ff_h264_lps_range[s][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1); |
2522
e25782262d7d
kill warnings patch by (Måns Rullgård <mru inprovide com>)
michael
parents:
2323
diff
changeset
|
527 int bit, lps_mask attribute_unused; |
2967 | 528 |
1287 | 529 c->range -= RangeLPS; |
3984 | 530 #ifndef BRANCHLESS_CABAC_DECODER |
1287 | 531 if(c->low < c->range){ |
3642 | 532 bit= s&1; |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
533 *state= ff_h264_mps_state[s]; |
2323 | 534 renorm_cabac_decoder_once(c); |
1287 | 535 }else{ |
3964 | 536 bit= ff_h264_norm_shift[RangeLPS>>19]; |
1287 | 537 c->low -= c->range; |
3993
8b7c59b7af01
make state transition tables global as they are constant and the code is slightly faster that way
michael
parents:
3992
diff
changeset
|
538 *state= ff_h264_lps_state[s]; |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
539 c->range = RangeLPS<<bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
540 c->low <<= bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
541 bit= (s&1)^1; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
542 |
2323 | 543 if(!(c->low & 0xFFFF)){ |
544 refill2(c); | |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
545 } |
1287 | 546 } |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
547 #else /* BRANCHLESS_CABAC_DECODER */ |
2323 | 548 lps_mask= (c->range - c->low)>>31; |
2967 | 549 |
2323 | 550 c->low -= c->range & lps_mask; |
551 c->range += (RangeLPS - c->range) & lps_mask; | |
2967 | 552 |
3974 | 553 s^=lps_mask; |
4014
b2582438effe
dehack *ps_state indexing in the branchless decoder
michael
parents:
4012
diff
changeset
|
554 *state= (ff_h264_mlps_state+128)[s]; |
3974 | 555 bit= s&1; |
2967 | 556 |
3970 | 557 lps_mask= ff_h264_norm_shift[c->range>>(CABAC_BITS+3)]; |
2323 | 558 c->range<<= lps_mask; |
559 c->low <<= lps_mask; | |
560 if(!(c->low & CABAC_MASK)) | |
561 refill2(c); | |
4002
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
562 #endif /* BRANCHLESS_CABAC_DECODER */ |
ec426fa57dfe
adds some useful comments after some of the #else, #elseif,
gpoirier
parents:
4001
diff
changeset
|
563 #endif /* ARCH_X86 */ |
2967 | 564 return bit; |
1287 | 565 } |
566 | |
4008
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
567 static int __attribute((noinline)) get_cabac_noinline(CABACContext *c, uint8_t * const state){ |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
568 return get_cabac_inline(c,state); |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
569 } |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
570 |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
571 static int get_cabac(CABACContext *c, uint8_t * const state){ |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
572 return get_cabac_inline(c,state); |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
573 } |
b636f3d59283
prevent "mb level" get_cabac() calls from being inlined (3% faster decode_mb_cabac() on P3)
michael
parents:
4002
diff
changeset
|
574 |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
575 static int get_cabac_bypass(CABACContext *c){ |
1287 | 576 c->low += c->low; |
577 | |
2323 | 578 if(!(c->low & CABAC_MASK)) |
579 refill(c); | |
2967 | 580 |
1287 | 581 if(c->low < c->range){ |
582 return 0; | |
583 }else{ | |
584 c->low -= c->range; | |
585 return 1; | |
586 } | |
587 } | |
588 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
589 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
590 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
591 * @return the number of bytes read or 0 if no end |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
592 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
593 static int get_cabac_terminate(CABACContext *c){ |
2323 | 594 c->range -= 4<<CABAC_BITS; |
1287 | 595 if(c->low < c->range){ |
2323 | 596 renorm_cabac_decoder_once(c); |
1287 | 597 return 0; |
598 }else{ | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
599 return c->bytestream - c->bytestream_start; |
2967 | 600 } |
1287 | 601 } |
602 | |
1290 | 603 /** |
604 * get (truncated) unnary binarization. | |
605 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
606 static int get_cabac_u(CABACContext *c, uint8_t * state, int max, int max_index, int truncated){ |
1290 | 607 int i; |
2967 | 608 |
609 for(i=0; i<max; i++){ | |
1290 | 610 if(get_cabac(c, state)==0) |
611 return i; | |
2967 | 612 |
1290 | 613 if(i< max_index) state++; |
614 } | |
615 | |
616 return truncated ? max : -1; | |
617 } | |
618 | |
619 /** | |
620 * get unary exp golomb k-th order binarization. | |
621 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
622 static int get_cabac_ueg(CABACContext *c, uint8_t * state, int max, int is_signed, int k, int max_index){ |
1290 | 623 int i, v; |
624 int m= 1<<k; | |
2967 | 625 |
626 if(get_cabac(c, state)==0) | |
1290 | 627 return 0; |
2967 | 628 |
1290 | 629 if(0 < max_index) state++; |
2967 | 630 |
631 for(i=1; i<max; i++){ | |
1290 | 632 if(get_cabac(c, state)==0){ |
633 if(is_signed && get_cabac_bypass(c)){ | |
634 return -i; | |
635 }else | |
636 return i; | |
637 } | |
638 | |
639 if(i < max_index) state++; | |
640 } | |
2967 | 641 |
1290 | 642 while(get_cabac_bypass(c)){ |
643 i+= m; | |
644 m+= m; | |
645 } | |
2967 | 646 |
1290 | 647 v=0; |
648 while(m>>=1){ | |
649 v+= v + get_cabac_bypass(c); | |
650 } | |
651 i += v; | |
652 | |
653 if(is_signed && get_cabac_bypass(c)){ | |
654 return -i; | |
655 }else | |
656 return i; | |
657 } |