comparison h264_direct.c @ 11140:9004c61e3aa0 libavcodec

Restructure spatial direct MV generation so that the zero-predictor shortcut is closer to the top. 50-130 CPU cycles faster depending on which kind of direct MBs are measured. A condensed sketch of the reordered flow follows the changeset metadata below.
author michael
date Sat, 13 Feb 2010 13:02:25 +0000
parents d988e0a6f391
children e81668fc717a
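
For orientation before the diff: the sketch below condenses what the reordered code does into a toy, single-list C function. It is not the libavcodec implementation; NeighborCtx, mid_pred3() and spatial_direct_pred() are illustrative stand-ins, and the real function works on both reference lists (L0/L1), handles the is_b8x8 case, and fills the ref/mv caches with fill_rectangle() before returning early. The point it illustrates is the one the description states: compute ref = min(neighbors) and the median-MV predictor first, and when the predicted MV is zero, return before any of the co-located (temporal) setup that the rest of the diff deals with.

/*
 * Minimal, self-contained sketch (NOT the libavcodec code) of the idea in this
 * changeset: derive the spatial direct ref/mv predictor first and take the
 * all-zero-MV shortcut before any co-located (temporal) setup is done.
 * NeighborCtx, mid_pred3() and spatial_direct_pred() are made-up stand-ins.
 */
#include <stdio.h>
#include <stdint.h>

typedef struct {
    int     ref[3];     /* left, top, topright (or topleft fallback) refs */
    int16_t mv[3][2];   /* matching motion vectors                        */
} NeighborCtx;          /* hypothetical stand-in for h->ref_cache/mv_cache */

static int mid_pred3(int a, int b, int c)
{   /* median of three, same role as libavcodec's mid_pred() */
    if (a > b) { int t = a; a = b; b = t; }
    if (b > c) { b = c; }
    return a > b ? a : b;
}

/* Returns 1 when the zero-MV shortcut applies, i.e. the caller may fill the
 * whole 16x16 block with the predicted ref and mv=0 and skip the temporal
 * (co-located) work entirely. */
static int spatial_direct_pred(const NeighborCtx *n, int *ref_out, int32_t *mv_out)
{
    int left = n->ref[0], top = n->ref[1], c = n->ref[2];

    /* ref = min(neighbors); the unsigned casts push "not available" (negative)
     * refs above every valid one, mirroring the FFMIN3 trick in the patch. */
    int ref = left;
    if ((unsigned)top < (unsigned)ref) ref = top;
    if ((unsigned)c   < (unsigned)ref) ref = c;

    if (ref < 0) {          /* no usable neighbor at all: the patch falls   */
        *ref_out = 0;       /* back to ref 0, mv 0, which also hits the     */
        *mv_out  = 0;       /* zero-MV shortcut                             */
        return 1;
    }

    int match = (left == ref) + (top == ref) + (c == ref);
    int mx, my;
    if (match > 1) {        /* most common case: median of the three MVs */
        mx = mid_pred3(n->mv[0][0], n->mv[1][0], n->mv[2][0]);
        my = mid_pred3(n->mv[0][1], n->mv[1][1], n->mv[2][1]);
    } else {                /* exactly one neighbor uses the minimum ref */
        int i = (left == ref) ? 0 : (top == ref) ? 1 : 2;
        mx = n->mv[i][0];
        my = n->mv[i][1];
    }
    *ref_out = ref;
    *mv_out  = (mx & 0xFFFF) + (my << 16);  /* packed mv, as in the patch */
    return *mv_out == 0;    /* shortcut when both components are zero */
}

int main(void)
{
    NeighborCtx n = { .ref = {0, 0, 0}, .mv = {{0,0},{0,0},{0,0}} };
    int ref; int32_t mv;
    if (spatial_direct_pred(&n, &ref, &mv))
        printf("zero-MV shortcut taken: ref=%d mv=0x%08x\n", ref, (unsigned)mv);
    return 0;
}

Taking that early return before l1mv0/l1ref0 are even set up is presumably where the quoted 50-130 cycle saving comes from: all-zero spatial direct macroblocks never need to touch the co-located data.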
--- h264_direct.c @ 11139:d988e0a6f391 (parent)
+++ h264_direct.c @ 11140:9004c61e3aa0
@@ -149,14 +149,75 @@
     const int16_t (*l1mv0)[2], (*l1mv1)[2];
     const int8_t *l1ref0, *l1ref1;
     const int is_b8x8 = IS_8X8(*mb_type);
     unsigned int sub_mb_type;
     int i8, i4;
+    int ref[2];
+    int mv[2];
+    int list;
 
     assert(h->ref_list[1][0].reference&3);
 
 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
+
+    *mb_type |= MB_TYPE_L0L1;
+    sub_mb_type |= MB_TYPE_L0L1;
+
+    /* ref = min(neighbors) */
+    for(list=0; list<2; list++){
+        int left_ref = h->ref_cache[list][scan8[0] - 1];
+        int top_ref = h->ref_cache[list][scan8[0] - 8];
+        int refc = h->ref_cache[list][scan8[0] - 8 + 4];
+        const int16_t *C= h->mv_cache[list][ scan8[0] - 8 + 4];
+        if(refc == PART_NOT_AVAILABLE){
+            refc = h->ref_cache[list][scan8[0] - 8 - 1];
+            C = h-> mv_cache[list][scan8[0] - 8 - 1];
+        }
+        ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc);
+        if(ref[list] >= 0){
+            //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
+            const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
+            const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
+
+            int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
+            if(match_count > 1){ //most common
+                mv[list]= (mid_pred(A[0], B[0], C[0])&0xFFFF)
+                         +(mid_pred(A[1], B[1], C[1])<<16);
+            }else {
+                assert(match_count==1);
+                if(left_ref==ref[list]){
+                    mv[list]= *(uint32_t*)A;
+                }else if(top_ref==ref[list]){
+                    mv[list]= *(uint32_t*)B;
+                }else{
+                    mv[list]= *(uint32_t*)C;
+                }
+            }
+        }else{
+            int mask= ~(MB_TYPE_L0 << (2*list));
+            mv[list] = 0;
+            ref[list] = -1;
+            if(!is_b8x8)
+                *mb_type &= mask;
+            sub_mb_type &= mask;
+        }
+    }
+    if(ref[0] < 0 && ref[1] < 0){
+        ref[0] = ref[1] = 0;
+        if(!is_b8x8)
+            *mb_type |= MB_TYPE_L0L1;
+        sub_mb_type |= MB_TYPE_L0L1;
+    }
+
+    if(!is_b8x8 && (mv[0]|mv[1]) == 0){
+        fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
+        fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
+        fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
+        fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
+        *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
+        return;
+    }
 
     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
         if(!IS_INTERLACED(*mb_type)){ // AFR/FR -> AFL/FL
             mb_xy= s->mb_x + ((s->mb_y&~1) + h->col_parity)*s->mb_stride;
             b8_stride = 0;
@@ -170,35 +231,35 @@
             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
             b8_stride *= 3;
             b4_stride *= 6;
 
-            sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+            sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
             if( (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
                 && !is_b8x8){
-                *mb_type |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
+                *mb_type |= MB_TYPE_16x8 |MB_TYPE_DIRECT2; /* B_16x8 */
             }else{
-                *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
+                *mb_type |= MB_TYPE_8x8;
             }
         }else{ // AFR/FR -> AFR/FR
 single_col:
             mb_type_col[0] =
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
 
-            sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
+            sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
             if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
-                *mb_type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
+                *mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */
             }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
-                *mb_type |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
+                *mb_type |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
             }else{
                 if(!h->sps.direct_8x8_inference_flag){
                     /* FIXME save sub mb types from previous frames (or derive from MVs)
                      * so we know exactly what block size to use */
-                    sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
+                    sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */
                 }
-                *mb_type |= MB_TYPE_8x8|MB_TYPE_L0L1;
+                *mb_type |= MB_TYPE_8x8;
             }
         }
     }
 
     l1mv0 = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
@@ -212 +273 @@
             l1mv0 += 2*b4_stride;
             l1mv1 += 2*b4_stride;
         }
     }
 
-    {
-        int ref[2];
-        int mv[2];
-        int list;
-
-        /* ref = min(neighbors) */
-        for(list=0; list<2; list++){
-            int left_ref = h->ref_cache[list][scan8[0] - 1];
-            int top_ref = h->ref_cache[list][scan8[0] - 8];
-            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
-            const int16_t *C= h->mv_cache[list][ scan8[0] - 8 + 4];
-            if(refc == PART_NOT_AVAILABLE){
-                refc = h->ref_cache[list][scan8[0] - 8 - 1];
-                C = h-> mv_cache[list][scan8[0] - 8 - 1];
-            }
-            ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc);
-            if(ref[list] >= 0){
-                //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
-                const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];
-                const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
-
-                int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
-                if(match_count > 1){ //most common
-                    mv[list]= (mid_pred(A[0], B[0], C[0])&0xFFFF)
-                             +(mid_pred(A[1], B[1], C[1])<<16);
-                }else {
-                    assert(match_count==1);
-                    if(left_ref==ref[list]){
-                        mv[list]= *(uint32_t*)A;
-                    }else if(top_ref==ref[list]){
-                        mv[list]= *(uint32_t*)B;
-                    }else{
-                        mv[list]= *(uint32_t*)C;
-                    }
-                }
-            }else{
-                int mask= ~(MB_TYPE_L0 << (2*list));
-                mv[list] = 0;
-                ref[list] = -1;
-                if(!is_b8x8)
-                    *mb_type &= mask;
-                sub_mb_type &= mask;
-            }
-        }
-        if(ref[0] < 0 && ref[1] < 0){
-            ref[0] = ref[1] = 0;
-            if(!is_b8x8)
-                *mb_type |= MB_TYPE_L0L1;
-            sub_mb_type |= MB_TYPE_L0L1;
-        }
-
-        if(!is_b8x8 && (mv[0]|mv[1]) == 0){
-            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
-            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
-            fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
-            fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
-            *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
-        }else if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
+
+    if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
             int n=0;
             for(i8=0; i8<4; i8++){
                 int x8 = i8&1;
                 int y8 = i8>>1;
                 int xy8 = x8+y8*b8_stride;