Mercurial > libavcodec.hg
comparison h264_direct.c @ 11140:9004c61e3aa0 libavcodec
Restructure spatial direct MV generation so that the zero predictor shortcut
is closer to the top.
50-130 CPU cycles faster, depending on which kind of direct MBs are measured.
author | michael |
---|---|
date | Sat, 13 Feb 2010 13:02:25 +0000 |
parents | d988e0a6f391 |
children | e81668fc717a |
comparison
equal
deleted
inserted
replaced
11139:d988e0a6f391 | 11140:9004c61e3aa0 |
---|---|
149 const int16_t (*l1mv0)[2], (*l1mv1)[2]; | 149 const int16_t (*l1mv0)[2], (*l1mv1)[2]; |
150 const int8_t *l1ref0, *l1ref1; | 150 const int8_t *l1ref0, *l1ref1; |
151 const int is_b8x8 = IS_8X8(*mb_type); | 151 const int is_b8x8 = IS_8X8(*mb_type); |
152 unsigned int sub_mb_type; | 152 unsigned int sub_mb_type; |
153 int i8, i4; | 153 int i8, i4; |
154 int ref[2]; | |
155 int mv[2]; | |
156 int list; | |
154 | 157 |
155 assert(h->ref_list[1][0].reference&3); | 158 assert(h->ref_list[1][0].reference&3); |
156 | 159 |
157 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) | 160 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM) |
161 | |
162 *mb_type |= MB_TYPE_L0L1; | |
163 sub_mb_type |= MB_TYPE_L0L1; | |
164 | |
165 /* ref = min(neighbors) */ | |
166 for(list=0; list<2; list++){ | |
167 int left_ref = h->ref_cache[list][scan8[0] - 1]; | |
168 int top_ref = h->ref_cache[list][scan8[0] - 8]; | |
169 int refc = h->ref_cache[list][scan8[0] - 8 + 4]; | |
170 const int16_t *C= h->mv_cache[list][ scan8[0] - 8 + 4]; | |
171 if(refc == PART_NOT_AVAILABLE){ | |
172 refc = h->ref_cache[list][scan8[0] - 8 - 1]; | |
173 C = h-> mv_cache[list][scan8[0] - 8 - 1]; | |
174 } | |
175 ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); | |
176 if(ref[list] >= 0){ | |
177 //this is just pred_motion() but with the cases removed that cannot happen for direct blocks | |
178 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ]; | |
179 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ]; | |
180 | |
181 int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); | |
182 if(match_count > 1){ //most common | |
183 mv[list]= (mid_pred(A[0], B[0], C[0])&0xFFFF) | |
184 +(mid_pred(A[1], B[1], C[1])<<16); | |
185 }else { | |
186 assert(match_count==1); | |
187 if(left_ref==ref[list]){ | |
188 mv[list]= *(uint32_t*)A; | |
189 }else if(top_ref==ref[list]){ | |
190 mv[list]= *(uint32_t*)B; | |
191 }else{ | |
192 mv[list]= *(uint32_t*)C; | |
193 } | |
194 } | |
195 }else{ | |
196 int mask= ~(MB_TYPE_L0 << (2*list)); | |
197 mv[list] = 0; | |
198 ref[list] = -1; | |
199 if(!is_b8x8) | |
200 *mb_type &= mask; | |
201 sub_mb_type &= mask; | |
202 } | |
203 } | |
204 if(ref[0] < 0 && ref[1] < 0){ | |
205 ref[0] = ref[1] = 0; | |
206 if(!is_b8x8) | |
207 *mb_type |= MB_TYPE_L0L1; | |
208 sub_mb_type |= MB_TYPE_L0L1; | |
209 } | |
210 | |
211 if(!is_b8x8 && (mv[0]|mv[1]) == 0){ | |
212 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); | |
213 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); | |
214 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); | |
215 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); | |
216 *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; | |
217 return; | |
218 } | |
158 | 219 |
159 if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL | 220 if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL |
160 if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL | 221 if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL |
161 mb_xy= s->mb_x + ((s->mb_y&~1) + h->col_parity)*s->mb_stride; | 222 mb_xy= s->mb_x + ((s->mb_y&~1) + h->col_parity)*s->mb_stride; |
162 b8_stride = 0; | 223 b8_stride = 0; |
170 mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy]; | 231 mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy]; |
171 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride]; | 232 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride]; |
172 b8_stride *= 3; | 233 b8_stride *= 3; |
173 b4_stride *= 6; | 234 b4_stride *= 6; |
174 | 235 |
175 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ | 236 sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ |
176 if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA) | 237 if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA) |
177 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA) | 238 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA) |
178 && !is_b8x8){ | 239 && !is_b8x8){ |
179 *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */ | 240 *mb_type |= MB_TYPE_16x8 |MB_TYPE_DIRECT2; /* B_16x8 */ |
180 }else{ | 241 }else{ |
181 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; | 242 *mb_type |= MB_TYPE_8x8; |
182 } | 243 } |
183 }else{ // AFR/FR -> AFR/FR | 244 }else{ // AFR/FR -> AFR/FR |
184 single_col: | 245 single_col: |
185 mb_type_col[0] = | 246 mb_type_col[0] = |
186 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy]; | 247 mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy]; |
187 | 248 |
188 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ | 249 sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */ |
189 if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ | 250 if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ |
190 *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */ | 251 *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */ |
191 }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ | 252 }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){ |
192 *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); | 253 *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16)); |
193 }else{ | 254 }else{ |
194 if(!h->sps.direct_8x8_inference_flag){ | 255 if(!h->sps.direct_8x8_inference_flag){ |
195 /* FIXME save sub mb types from previous frames (or derive from MVs) | 256 /* FIXME save sub mb types from previous frames (or derive from MVs) |
196 * so we know exactly what block size to use */ | 257 * so we know exactly what block size to use */ |
197 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */ | 258 sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */ |
198 } | 259 } |
199 *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1; | 260 *mb_type |= MB_TYPE_8x8; |
200 } | 261 } |
201 } | 262 } |
202 } | 263 } |
203 | 264 |
204 l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]]; | 265 l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]]; |
212 l1mv0 += 2*b4_stride; | 273 l1mv0 += 2*b4_stride; |
213 l1mv1 += 2*b4_stride; | 274 l1mv1 += 2*b4_stride; |
214 } | 275 } |
215 } | 276 } |
216 | 277 |
217 { | 278 |
218 int ref[2]; | 279 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){ |
219 int mv[2]; | |
220 int list; | |
221 | |
222 /* ref = min(neighbors) */ | |
223 for(list=0; list<2; list++){ | |
224 int left_ref = h->ref_cache[list][scan8[0] - 1]; | |
225 int top_ref = h->ref_cache[list][scan8[0] - 8]; | |
226 int refc = h->ref_cache[list][scan8[0] - 8 + 4]; | |
227 const int16_t *C= h->mv_cache[list][ scan8[0] - 8 + 4]; | |
228 if(refc == PART_NOT_AVAILABLE){ | |
229 refc = h->ref_cache[list][scan8[0] - 8 - 1]; | |
230 C = h-> mv_cache[list][scan8[0] - 8 - 1]; | |
231 } | |
232 ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); | |
233 if(ref[list] >= 0){ | |
234 //this is just pred_motion() but with the cases removed that cannot happen for direct blocks | |
235 const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ]; | |
236 const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ]; | |
237 | |
238 int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]); | |
239 if(match_count > 1){ //most common | |
240 mv[list]= (mid_pred(A[0], B[0], C[0])&0xFFFF) | |
241 +(mid_pred(A[1], B[1], C[1])<<16); | |
242 }else { | |
243 assert(match_count==1); | |
244 if(left_ref==ref[list]){ | |
245 mv[list]= *(uint32_t*)A; | |
246 }else if(top_ref==ref[list]){ | |
247 mv[list]= *(uint32_t*)B; | |
248 }else{ | |
249 mv[list]= *(uint32_t*)C; | |
250 } | |
251 } | |
252 }else{ | |
253 int mask= ~(MB_TYPE_L0 << (2*list)); | |
254 mv[list] = 0; | |
255 ref[list] = -1; | |
256 if(!is_b8x8) | |
257 *mb_type &= mask; | |
258 sub_mb_type &= mask; | |
259 } | |
260 } | |
261 if(ref[0] < 0 && ref[1] < 0){ | |
262 ref[0] = ref[1] = 0; | |
263 if(!is_b8x8) | |
264 *mb_type |= MB_TYPE_L0L1; | |
265 sub_mb_type |= MB_TYPE_L0L1; | |
266 } | |
267 | |
268 if(!is_b8x8 && (mv[0]|mv[1]) == 0){ | |
269 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1); | |
270 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1); | |
271 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4); | |
272 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4); | |
273 *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2; | |
274 }else if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){ | |
275 int n=0; | 280 int n=0; |
276 for(i8=0; i8<4; i8++){ | 281 for(i8=0; i8<4; i8++){ |
277 int x8 = i8&1; | 282 int x8 = i8&1; |
278 int y8 = i8>>1; | 283 int y8 = i8>>1; |
279 int xy8 = x8+y8*b8_stride; | 284 int xy8 = x8+y8*b8_stride; |