Mercurial > libavcodec.hg
annotate cabac.h @ 3980:5afe4253a220 libavcodec
replace a few and/sub/... by cmov
this is faster on P3, should be faster on AMD, and should be slower on P4
It's disabled by default (benchmarks welcome so we know when to enable it)
author | michael |
---|---|
date | Tue, 10 Oct 2006 01:08:39 +0000 |
parents | ce16f66a48ad |
children | 9854f686ba79 |
rev | line source |
---|---|
1287 | 1 /* |
2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
5 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
6 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
7 * FFmpeg is free software; you can redistribute it and/or |
1287 | 8 * modify it under the terms of the GNU Lesser General Public |
9 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
10 * version 2.1 of the License, or (at your option) any later version. |
1287 | 11 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
12 * FFmpeg is distributed in the hope that it will be useful, |
1287 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3946
diff
changeset
|
18 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2967
diff
changeset
|
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
1287 | 20 * |
21 */ | |
2967 | 22 |
1287 | 23 /** |
24 * @file cabac.h | |
25 * Context Adaptive Binary Arithmetic Coder. | |
26 */ | |
27 | |
28 | |
3284
a224d9752912
don't force asserts in release builds. 2% faster h264.
lorenm
parents:
3036
diff
changeset
|
29 //#undef NDEBUG |
1287 | 30 #include <assert.h> |
31 | |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
32 #define CABAC_BITS 16 |
2323 | 33 #define CABAC_MASK ((1<<CABAC_BITS)-1) |
3975 | 34 #define BRANCHLESS_CABAD 1 |
2323 | 35 |
1287 | 36 typedef struct CABACContext{ |
37 int low; | |
38 int range; | |
39 int outstanding_count; | |
40 #ifdef STRICT_LIMITS | |
41 int symCount; | |
42 #endif | |
3976
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
43 uint8_t lps_range[2*65][4]; ///< rangeTabLPS |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
44 uint8_t lps_state[2*64]; ///< transIdxLPS |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
45 uint8_t mps_state[2*64]; ///< transIdxMPS |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
46 const uint8_t *bytestream_start; |
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
47 const uint8_t *bytestream; |
2116 | 48 const uint8_t *bytestream_end; |
1287 | 49 PutBitContext pb; |
50 }CABACContext; | |
51 | |
1301 | 52 extern const uint8_t ff_h264_lps_range[64][4]; |
53 extern const uint8_t ff_h264_mps_state[64]; | |
54 extern const uint8_t ff_h264_lps_state[64]; | |
3964 | 55 extern const uint8_t ff_h264_norm_shift[128]; |
2323 | 56 |
1287 | 57 |
58 void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size); | |
2024
f65d87bfdd5a
some of the warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1787
diff
changeset
|
59 void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); |
2967 | 60 void ff_init_cabac_states(CABACContext *c, uint8_t const (*lps_range)[4], |
1287 | 61 uint8_t const *mps_state, uint8_t const *lps_state, int state_count); |
62 | |
63 | |
64 static inline void put_cabac_bit(CABACContext *c, int b){ | |
2967 | 65 put_bits(&c->pb, 1, b); |
66 for(;c->outstanding_count; c->outstanding_count--){ | |
1287 | 67 put_bits(&c->pb, 1, 1-b); |
68 } | |
69 } | |
70 | |
71 static inline void renorm_cabac_encoder(CABACContext *c){ | |
72 while(c->range < 0x100){ | |
73 //FIXME optimize | |
74 if(c->low<0x100){ | |
75 put_cabac_bit(c, 0); | |
76 }else if(c->low<0x200){ | |
77 c->outstanding_count++; | |
78 c->low -= 0x100; | |
79 }else{ | |
80 put_cabac_bit(c, 1); | |
81 c->low -= 0x200; | |
82 } | |
2967 | 83 |
1287 | 84 c->range+= c->range; |
85 c->low += c->low; | |
86 } | |
87 } | |
88 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
89 static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ |
2323 | 90 int RangeLPS= c->lps_range[*state][c->range>>6]; |
2967 | 91 |
1287 | 92 if(bit == ((*state)&1)){ |
93 c->range -= RangeLPS; | |
94 *state= c->mps_state[*state]; | |
95 }else{ | |
96 c->low += c->range - RangeLPS; | |
97 c->range = RangeLPS; | |
98 *state= c->lps_state[*state]; | |
99 } | |
2967 | 100 |
1287 | 101 renorm_cabac_encoder(c); |
102 | |
103 #ifdef STRICT_LIMITS | |
104 c->symCount++; | |
105 #endif | |
106 } | |
107 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
108 static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){ |
1287 | 109 assert(c->range > RangeLPS); |
110 | |
111 if(!bit){ | |
112 c->range -= RangeLPS; | |
113 }else{ | |
114 c->low += c->range - RangeLPS; | |
115 c->range = RangeLPS; | |
116 } | |
117 | |
118 renorm_cabac_encoder(c); | |
119 | |
120 #ifdef STRICT_LIMITS | |
121 c->symCount++; | |
122 #endif | |
123 } | |
124 | |
1290 | 125 /** |
126 * @param bit 0 -> write zero bit, !=0 write one bit | |
127 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
128 static void put_cabac_bypass(CABACContext *c, int bit){ |
1287 | 129 c->low += c->low; |
130 | |
131 if(bit){ | |
132 c->low += c->range; | |
133 } | |
134 //FIXME optimize | |
135 if(c->low<0x200){ | |
136 put_cabac_bit(c, 0); | |
137 }else if(c->low<0x400){ | |
138 c->outstanding_count++; | |
139 c->low -= 0x200; | |
140 }else{ | |
141 put_cabac_bit(c, 1); | |
142 c->low -= 0x400; | |
143 } | |
2967 | 144 |
1287 | 145 #ifdef STRICT_LIMITS |
146 c->symCount++; | |
147 #endif | |
148 } | |
149 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
150 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
151 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
152 * @return the number of bytes written |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
153 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
154 static int put_cabac_terminate(CABACContext *c, int bit){ |
1287 | 155 c->range -= 2; |
156 | |
157 if(!bit){ | |
158 renorm_cabac_encoder(c); | |
159 }else{ | |
160 c->low += c->range; | |
161 c->range= 2; | |
2967 | 162 |
1287 | 163 renorm_cabac_encoder(c); |
164 | |
165 assert(c->low <= 0x1FF); | |
166 put_cabac_bit(c, c->low>>9); | |
167 put_bits(&c->pb, 2, ((c->low>>7)&3)|1); | |
2967 | 168 |
1287 | 169 flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong |
170 } | |
2967 | 171 |
1287 | 172 #ifdef STRICT_LIMITS |
173 c->symCount++; | |
174 #endif | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
175 |
1787 | 176 return (put_bits_count(&c->pb)+7)>>3; |
1287 | 177 } |
178 | |
1290 | 179 /** |
180 * put (truncated) unary binarization. | |
181 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
182 static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ |
1290 | 183 int i; |
2967 | 184 |
1290 | 185 assert(v <= max); |
2967 | 186 |
1290 | 187 #if 1 |
188 for(i=0; i<v; i++){ | |
189 put_cabac(c, state, 1); | |
190 if(i < max_index) state++; | |
191 } | |
192 if(truncated==0 || v<max) | |
193 put_cabac(c, state, 0); | |
194 #else | |
195 if(v <= max_index){ | |
196 for(i=0; i<v; i++){ | |
197 put_cabac(c, state+i, 1); | |
198 } | |
199 if(truncated==0 || v<max) | |
200 put_cabac(c, state+i, 0); | |
201 }else{ | |
202 for(i=0; i<=max_index; i++){ | |
203 put_cabac(c, state+i, 1); | |
204 } | |
205 for(; i<v; i++){ | |
206 put_cabac(c, state+max_index, 1); | |
207 } | |
208 if(truncated==0 || v<max) | |
209 put_cabac(c, state+max_index, 0); | |
210 } | |
211 #endif | |
212 } | |
213 | |
214 /** | |
215 * put unary exp golomb k-th order binarization. | |
216 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
217 static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ |
1290 | 218 int i; |
2967 | 219 |
1290 | 220 if(v==0) |
221 put_cabac(c, state, 0); | |
222 else{ | |
1298 | 223 const int sign= v < 0; |
2967 | 224 |
1298 | 225 if(is_signed) v= ABS(v); |
2967 | 226 |
1290 | 227 if(v<max){ |
228 for(i=0; i<v; i++){ | |
229 put_cabac(c, state, 1); | |
230 if(i < max_index) state++; | |
231 } | |
232 | |
233 put_cabac(c, state, 0); | |
234 }else{ | |
235 int m= 1<<k; | |
236 | |
237 for(i=0; i<max; i++){ | |
238 put_cabac(c, state, 1); | |
239 if(i < max_index) state++; | |
240 } | |
241 | |
242 v -= max; | |
243 while(v >= m){ //FIXME optimize | |
244 put_cabac_bypass(c, 1); | |
245 v-= m; | |
246 m+= m; | |
247 } | |
248 put_cabac_bypass(c, 0); | |
249 while(m>>=1){ | |
250 put_cabac_bypass(c, v&m); | |
251 } | |
252 } | |
253 | |
254 if(is_signed) | |
255 put_cabac_bypass(c, sign); | |
256 } | |
257 } | |
258 | |
2323 | 259 static void refill(CABACContext *c){ |
260 #if CABAC_BITS == 16 | |
3946 | 261 c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); |
2323 | 262 #else |
263 c->low+= c->bytestream[0]<<1; | |
264 #endif | |
265 c->low -= CABAC_MASK; | |
266 c->bytestream+= CABAC_BITS/8; | |
267 } | |
268 | |
269 static void refill2(CABACContext *c){ | |
270 int i, x; | |
271 | |
272 x= c->low ^ (c->low-1); | |
3964 | 273 i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS+1)]; |
2323 | 274 |
275 x= -CABAC_MASK; | |
2967 | 276 |
2323 | 277 #if CABAC_BITS == 16 |
278 x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); | |
279 #else | |
280 x+= c->bytestream[0]<<1; | |
281 #endif | |
2967 | 282 |
2323 | 283 c->low += x<<i; |
284 c->bytestream+= CABAC_BITS/8; | |
285 } | |
286 | |
1287 | 287 static inline void renorm_cabac_decoder(CABACContext *c){ |
2323 | 288 while(c->range < (0x200 << CABAC_BITS)){ |
1287 | 289 c->range+= c->range; |
290 c->low+= c->low; | |
2323 | 291 if(!(c->low & CABAC_MASK)) |
292 refill(c); | |
1287 | 293 } |
294 } | |
295 | |
2323 | 296 static inline void renorm_cabac_decoder_once(CABACContext *c){ |
3951 | 297 #ifdef ARCH_X86_DISABLED |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
298 int temp; |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
299 #if 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
300 //P3:683 athlon:475 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
301 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
302 "lea -0x2000000(%0), %2 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
303 "shr $31, %2 \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
304 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
305 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
306 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
307 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
308 #elif 0 |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
309 //P3:680 athlon:474 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
310 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
311 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
312 "setb %%cl \n\t" //FIXME 31->63 for x86-64 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
313 "shl %%cl, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
314 "shl %%cl, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
315 : "+r"(c->range), "+r"(c->low), "+c"(temp) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
316 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
317 #elif 1 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
318 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
319 //P3:665 athlon:517 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
320 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
321 "lea -0x2000000(%0), %%eax \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
322 "cdq \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
323 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
324 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
325 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
326 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
327 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
328 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
329 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
330 #elif 0 |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
331 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
332 //P3:673 athlon:509 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
333 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
334 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
335 "sbb %%edx, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
336 "mov %0, %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
337 "and %%edx, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
338 "and %1, %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
339 "add %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
340 "add %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
341 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
342 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
343 #else |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
344 int temp2; |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
345 //P3:677 athlon:511 |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
346 asm( |
3948
3edbf131ee44
refill cabac variables in 16bit steps, 3% faster get_cabac()
michael
parents:
3947
diff
changeset
|
347 "cmp $0x2000000, %0 \n\t" |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
348 "lea (%0, %0), %%eax \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
349 "lea (%1, %1), %%edx \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
350 "cmovb %%eax, %0 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
351 "cmovb %%edx, %1 \n\t" |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
352 : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
353 ); |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
354 #endif |
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
355 #else |
3950
900d21b85dd6
renorm_cabac_decoder_once START/STOP_TIMER scores for athlon
michael
parents:
3948
diff
changeset
|
356 //P3:675 athlon:476 |
3642 | 357 int shift= (uint32_t)(c->range - (0x200 << CABAC_BITS))>>31; |
358 c->range<<= shift; | |
359 c->low <<= shift; | |
3943
811a9b0d9f32
several x86 renorm_cabac_decoder_once optimizations
michael
parents:
3928
diff
changeset
|
360 #endif |
2323 | 361 if(!(c->low & CABAC_MASK)) |
362 refill(c); | |
363 } | |
364 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
365 static int get_cabac(CABACContext *c, uint8_t * const state){ |
3642 | 366 //FIXME gcc generates duplicate load/stores for c->low and c->range |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
367 #ifdef ARCH_X86 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
368 int bit; |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
369 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
370 #define LOW "0" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
371 #define RANGE "4" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
372 #define LPS_RANGE "12" |
3976
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
373 #define LPS_STATE "12+2*65*4" |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
374 #define MPS_STATE "12+2*65*4+2*64" |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
375 #define BYTESTART "12+2*65*4+4*64" |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
376 #define BYTE "16+2*65*4+4*64" |
27e90123b346
reverse remainder of the failed attempt to optimize *state=c->mps_state[s]
michael
parents:
3975
diff
changeset
|
377 #define BYTEEND "20+2*65*4+4*64" |
3975 | 378 #ifndef BRANCHLESS_CABAD |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
379 asm volatile( |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
380 "movzbl (%1), %%eax \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
381 "movl "RANGE "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
382 "movl "RANGE "(%2), %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
383 "shrl $23, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
384 "leal "LPS_RANGE"(%2, %%eax, 4), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
385 "movzbl (%%ebx, %%esi), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
386 "shll $17, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
387 "movl "LOW "(%2), %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
388 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
389 "subl %%esi, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
390 "cmpl %%edx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
391 " ja 1f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
392 "cmp $0x2000000, %%edx \n\t" //FIXME avoidable |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
393 "setb %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
394 "shl %%cl, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
395 "shl %%cl, %%ebx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
396 "movzbl "MPS_STATE"(%2, %%eax), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
397 "movb %%cl, (%1) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
398 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
399 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
400 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
401 "movl "BYTE "(%2), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
402 "subl $0xFFFF, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
403 "movzwl (%%esi), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
404 "bswap %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
405 "shrl $15, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
406 "addl $2, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
407 "addl %%ecx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
408 "movl %%esi, "BYTE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
409 "jmp 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
410 "1: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
411 //eax:state ebx:low, edx:range, esi:RangeLPS |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
412 "subl %%edx, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
413 "movl %%esi, %%edx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
414 "shr $19, %%esi \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
415 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
416 "shll %%cl, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
417 "shll %%cl, %%edx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
418 "movzbl "LPS_STATE"(%2, %%eax), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
419 "movb %%cl, (%1) \n\t" |
3978 | 420 "addl $1, %%eax \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
421 "test %%bx, %%bx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
422 " jnz 2f \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
423 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
424 "movl "BYTE "(%2), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
425 "movzwl (%%ecx), %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
426 "bswap %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
427 "shrl $15, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
428 "subl $0xFFFF, %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
429 "addl $2, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
430 "movl %%ecx, "BYTE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
431 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
432 "leal -1(%%ebx), %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
433 "xorl %%ebx, %%ecx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
434 "shrl $17, %%ecx \n\t" |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
435 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
436 "neg %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
437 "add $7, %%cl \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
438 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
439 "shll %%cl , %%esi \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
440 "addl %%esi, %%ebx \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
441 "2: \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
442 "movl %%edx, "RANGE "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
443 "movl %%ebx, "LOW "(%2) \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
444 "andl $1, %%eax \n\t" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
445 |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
446 :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or misscompiles it (for example if "+a"(bit) or "+m"(*state) is used |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
447 :"r"(state), "r"(c) |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
448 : "%ecx", "%ebx", "%edx", "%esi" |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
449 ); |
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
450 #else |
3975 | 451 asm volatile( |
452 "movzbl (%1), %%eax \n\t" | |
453 "movl "RANGE "(%2), %%ebx \n\t" | |
454 "movl "RANGE "(%2), %%edx \n\t" | |
455 "shrl $23, %%ebx \n\t" | |
456 "leal "LPS_RANGE"(%2, %%eax, 4), %%esi \n\t" | |
457 "movzbl (%%ebx, %%esi), %%esi \n\t" | |
458 "shll $17, %%esi \n\t" | |
459 "movl "LOW "(%2), %%ebx \n\t" | |
460 //eax:state ebx:low, edx:range, esi:RangeLPS | |
461 "subl %%esi, %%edx \n\t" | |
3980 | 462 #ifdef CMOV_IS_FAST //FIXME actually define this somewhere |
463 "cmpl %%ebx, %%edx \n\t" | |
464 "cmova %%edx, %%esi \n\t" | |
465 "sbbl %%ecx, %%ecx \n\t" | |
466 "andl %%ecx, %%edx \n\t" | |
467 "subl %%edx, %%ebx \n\t" | |
468 "xorl %%ecx, %%eax \n\t" | |
469 #else | |
3975 | 470 "movl %%edx, %%ecx \n\t" |
471 "subl %%ebx, %%edx \n\t" | |
472 "sarl $31, %%edx \n\t" //lps_mask | |
473 "subl %%ecx, %%esi \n\t" //RangeLPS - range | |
474 "andl %%edx, %%esi \n\t" //(RangeLPS - range)&lps_mask | |
475 "addl %%ecx, %%esi \n\t" //new range | |
476 "andl %%edx, %%ecx \n\t" | |
477 "subl %%ecx, %%ebx \n\t" | |
3980 | 478 "xorl %%edx, %%eax \n\t" |
479 #endif | |
3975 | 480 |
481 //eax:state ebx:low edx:mask esi:range | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
482 "movzbl "MPS_STATE"(%2, %%eax), %%ecx \n\t" |
3975 | 483 "movb %%cl, (%1) \n\t" |
484 | |
485 "movl %%esi, %%edx \n\t" | |
486 //eax:bit ebx:low edx:range esi:range | |
487 | |
488 "shr $19, %%esi \n\t" | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
489 "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
3975 | 490 "shll %%cl, %%ebx \n\t" |
491 "shll %%cl, %%edx \n\t" | |
492 "test %%bx, %%bx \n\t" | |
493 " jnz 1f \n\t" | |
494 | |
495 "movl "BYTE "(%2), %%ecx \n\t" | |
496 "movzwl (%%ecx), %%esi \n\t" | |
497 "bswap %%esi \n\t" | |
498 "shrl $15, %%esi \n\t" | |
499 "subl $0xFFFF, %%esi \n\t" | |
500 "addl $2, %%ecx \n\t" | |
501 "movl %%ecx, "BYTE "(%2) \n\t" | |
502 | |
503 "leal -1(%%ebx), %%ecx \n\t" | |
504 "xorl %%ebx, %%ecx \n\t" | |
505 "shrl $17, %%ecx \n\t" | |
3979
ce16f66a48ad
reading 8bit mem into a 8bit register needs 2 uops on P4, 8bit->32bit with zero extension needs just 1
michael
parents:
3978
diff
changeset
|
506 "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
3975 | 507 "neg %%cl \n\t" |
508 "add $7, %%cl \n\t" | |
509 | |
510 "shll %%cl , %%esi \n\t" | |
511 "addl %%esi, %%ebx \n\t" | |
512 "1: \n\t" | |
513 "movl %%edx, "RANGE "(%2) \n\t" | |
514 "movl %%ebx, "LOW "(%2) \n\t" | |
515 "andl $1, %%eax \n\t" | |
516 :"=&a"(bit) | |
517 :"r"(state), "r"(c) | |
518 : "%ecx", "%ebx", "%edx", "%esi" | |
519 ); | |
520 #endif | |
521 #else | |
3642 | 522 int s = *state; |
523 int RangeLPS= c->lps_range[s][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1); | |
2522
e25782262d7d
kill warnings patch by (M«©ns Rullg«©rd <mru inprovide com>)
michael
parents:
2323
diff
changeset
|
524 int bit, lps_mask attribute_unused; |
2967 | 525 |
1287 | 526 c->range -= RangeLPS; |
3974 | 527 #ifndef BRANCHLESS_CABAD |
1287 | 528 if(c->low < c->range){ |
3642 | 529 bit= s&1; |
3971
e8a6d5c1ab0b
drop failed attempt to optimize *state= c->mps_state[s];
michael
parents:
3970
diff
changeset
|
530 *state= c->mps_state[s]; |
2323 | 531 renorm_cabac_decoder_once(c); |
1287 | 532 }else{ |
3964 | 533 bit= ff_h264_norm_shift[RangeLPS>>19]; |
1287 | 534 c->low -= c->range; |
3642 | 535 *state= c->lps_state[s]; |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
536 c->range = RangeLPS<<bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
537 c->low <<= bit; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
538 bit= (s&1)^1; |
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
539 |
2323 | 540 if(!(c->low & 0xFFFF)){ |
541 refill2(c); | |
3956
0910f2844f9a
branchless renormalization (1% faster get_cabac) old branchless renormalization wasnt faster because gcc was scared of the shift variable (missusing bit variable now)
michael
parents:
3955
diff
changeset
|
542 } |
1287 | 543 } |
2323 | 544 #else |
545 lps_mask= (c->range - c->low)>>31; | |
2967 | 546 |
2323 | 547 c->low -= c->range & lps_mask; |
548 c->range += (RangeLPS - c->range) & lps_mask; | |
2967 | 549 |
3974 | 550 s^=lps_mask; |
551 *state= c->mps_state[s]; | |
552 bit= s&1; | |
2967 | 553 |
3970 | 554 lps_mask= ff_h264_norm_shift[c->range>>(CABAC_BITS+3)]; |
2323 | 555 c->range<<= lps_mask; |
556 c->low <<= lps_mask; | |
557 if(!(c->low & CABAC_MASK)) | |
558 refill2(c); | |
559 #endif | |
3969
fc6e0942353b
first try of a handwritten get_cabac() for x86, this is 10-20% faster on P3 depening on if you try to subtract the START/STOP_TIMER overhead
michael
parents:
3967
diff
changeset
|
560 #endif |
2967 | 561 return bit; |
1287 | 562 } |
563 | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
564 static int get_cabac_bypass(CABACContext *c){ |
1287 | 565 c->low += c->low; |
566 | |
2323 | 567 if(!(c->low & CABAC_MASK)) |
568 refill(c); | |
2967 | 569 |
1287 | 570 if(c->low < c->range){ |
571 return 0; | |
572 }else{ | |
573 c->low -= c->range; | |
574 return 1; | |
575 } | |
576 } | |
577 | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
578 /** |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
579 * |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
580 * @return the number of bytes read or 0 if no end |
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
581 */ |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
582 static int get_cabac_terminate(CABACContext *c){ |
2323 | 583 c->range -= 4<<CABAC_BITS; |
1287 | 584 if(c->low < c->range){ |
2323 | 585 renorm_cabac_decoder_once(c); |
1287 | 586 return 0; |
587 }else{ | |
1300
e18667d1e94d
FFV1 codec (our very simple lossless intra only codec, compresses much better then huffyuv)
michaelni
parents:
1298
diff
changeset
|
588 return c->bytestream - c->bytestream_start; |
2967 | 589 } |
1287 | 590 } |
591 | |
1290 | 592 /** |
593 * get (truncated) unary binarization. | |
594 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
595 static int get_cabac_u(CABACContext *c, uint8_t * state, int max, int max_index, int truncated){ |
1290 | 596 int i; |
2967 | 597 |
598 for(i=0; i<max; i++){ | |
1290 | 599 if(get_cabac(c, state)==0) |
600 return i; | |
2967 | 601 |
1290 | 602 if(i< max_index) state++; |
603 } | |
604 | |
605 return truncated ? max : -1; | |
606 } | |
607 | |
608 /** | |
609 * get unary exp golomb k-th order binarization. | |
610 */ | |
3928
987fffdf6ae7
don't try to inline cabac functions. gcc ignored the hint anyway, and forcing it would make h264 slower.
lorenm
parents:
3642
diff
changeset
|
611 static int get_cabac_ueg(CABACContext *c, uint8_t * state, int max, int is_signed, int k, int max_index){ |
1290 | 612 int i, v; |
613 int m= 1<<k; | |
2967 | 614 |
615 if(get_cabac(c, state)==0) | |
1290 | 616 return 0; |
2967 | 617 |
1290 | 618 if(0 < max_index) state++; |
2967 | 619 |
620 for(i=1; i<max; i++){ | |
1290 | 621 if(get_cabac(c, state)==0){ |
622 if(is_signed && get_cabac_bypass(c)){ | |
623 return -i; | |
624 }else | |
625 return i; | |
626 } | |
627 | |
628 if(i < max_index) state++; | |
629 } | |
2967 | 630 |
1290 | 631 while(get_cabac_bypass(c)){ |
632 i+= m; | |
633 m+= m; | |
634 } | |
2967 | 635 |
1290 | 636 v=0; |
637 while(m>>=1){ | |
638 v+= v + get_cabac_bypass(c); | |
639 } | |
640 i += v; | |
641 | |
642 if(is_signed && get_cabac_bypass(c)){ | |
643 return -i; | |
644 }else | |
645 return i; | |
646 } |