comparison h264.c @ 3315:cfd452a6560b libavcodec

h264: faster fill_rectangle()
author lorenm
date Sun, 28 May 2006 22:28:08 +0000
parents 9637da0a9c1b
children 7278f730af27
comparison
equal deleted inserted replaced
3314:aea2230e6033 3315:cfd452a6560b
396 * @param size the size of val (1 or 4), should be a constant 396 * @param size the size of val (1 or 4), should be a constant
397 */ 397 */
398 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ 398 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
399 uint8_t *p= (uint8_t*)vp; 399 uint8_t *p= (uint8_t*)vp;
400 assert(size==1 || size==4); 400 assert(size==1 || size==4);
401 assert(w<=4);
401 402
402 w *= size; 403 w *= size;
403 stride *= size; 404 stride *= size;
404 405
405 assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); 406 assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
406 assert((stride&(w-1))==0); 407 assert((stride&(w-1))==0);
407 //FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit ver of it 408 if(w==2){
408 if(w==2 && h==2){ 409 const uint16_t v= size==4 ? val : val*0x0101;
409 *(uint16_t*)(p + 0)= 410 *(uint16_t*)(p + 0*stride)= v;
410 *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101; 411 if(h==1) return;
411 }else if(w==2 && h==4){ 412 *(uint16_t*)(p + 1*stride)= v;
412 *(uint16_t*)(p + 0*stride)= 413 if(h==2) return;
413 *(uint16_t*)(p + 1*stride)=
414 *(uint16_t*)(p + 2*stride)= 414 *(uint16_t*)(p + 2*stride)=
415 *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101; 415 *(uint16_t*)(p + 3*stride)= v;
416 }else if(w==4 && h==1){ 416 }else if(w==4){
417 *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101; 417 const uint32_t v= size==4 ? val : val*0x01010101;
418 }else if(w==4 && h==2){ 418 *(uint32_t*)(p + 0*stride)= v;
419 *(uint32_t*)(p + 0*stride)= 419 if(h==1) return;
420 *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101; 420 *(uint32_t*)(p + 1*stride)= v;
421 }else if(w==4 && h==4){ 421 if(h==2) return;
422 *(uint32_t*)(p + 0*stride)=
423 *(uint32_t*)(p + 1*stride)=
424 *(uint32_t*)(p + 2*stride)= 422 *(uint32_t*)(p + 2*stride)=
425 *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101; 423 *(uint32_t*)(p + 3*stride)= v;
426 }else if(w==8 && h==1){ 424 }else if(w==8){
427 *(uint32_t*)(p + 0)= 425 //gcc can't optimize 64bit math on x86_32
428 *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101; 426 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
429 }else if(w==8 && h==2){ 427 const uint64_t v= val*0x0100000001ULL;
430 *(uint32_t*)(p + 0 + 0*stride)= 428 *(uint64_t*)(p + 0*stride)= v;
431 *(uint32_t*)(p + 4 + 0*stride)= 429 if(h==1) return;
432 *(uint32_t*)(p + 0 + 1*stride)= 430 *(uint64_t*)(p + 1*stride)= v;
433 *(uint32_t*)(p + 4 + 1*stride)= size==4 ? val : val*0x01010101; 431 if(h==2) return;
434 }else if(w==8 && h==4){
435 *(uint64_t*)(p + 0*stride)=
436 *(uint64_t*)(p + 1*stride)=
437 *(uint64_t*)(p + 2*stride)= 432 *(uint64_t*)(p + 2*stride)=
438 *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL; 433 *(uint64_t*)(p + 3*stride)= v;
439 }else if(w==16 && h==2){ 434 }else if(w==16){
435 const uint64_t v= val*0x0100000001ULL;
440 *(uint64_t*)(p + 0+0*stride)= 436 *(uint64_t*)(p + 0+0*stride)=
441 *(uint64_t*)(p + 8+0*stride)= 437 *(uint64_t*)(p + 8+0*stride)=
442 *(uint64_t*)(p + 0+1*stride)= 438 *(uint64_t*)(p + 0+1*stride)=
443 *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL; 439 *(uint64_t*)(p + 8+1*stride)= v;
444 }else if(w==16 && h==4){ 440 if(h==2) return;
445 *(uint64_t*)(p + 0+0*stride)=
446 *(uint64_t*)(p + 8+0*stride)=
447 *(uint64_t*)(p + 0+1*stride)=
448 *(uint64_t*)(p + 8+1*stride)=
449 *(uint64_t*)(p + 0+2*stride)= 441 *(uint64_t*)(p + 0+2*stride)=
450 *(uint64_t*)(p + 8+2*stride)= 442 *(uint64_t*)(p + 8+2*stride)=
451 *(uint64_t*)(p + 0+3*stride)= 443 *(uint64_t*)(p + 0+3*stride)=
452 *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL; 444 *(uint64_t*)(p + 8+3*stride)= v;
445 #else
446 *(uint32_t*)(p + 0+0*stride)=
447 *(uint32_t*)(p + 4+0*stride)= val;
448 if(h==1) return;
449 *(uint32_t*)(p + 0+1*stride)=
450 *(uint32_t*)(p + 4+1*stride)= val;
451 if(h==2) return;
452 *(uint32_t*)(p + 0+2*stride)=
453 *(uint32_t*)(p + 4+2*stride)=
454 *(uint32_t*)(p + 0+3*stride)=
455 *(uint32_t*)(p + 4+3*stride)= val;
456 }else if(w==16){
457 *(uint32_t*)(p + 0+0*stride)=
458 *(uint32_t*)(p + 4+0*stride)=
459 *(uint32_t*)(p + 8+0*stride)=
460 *(uint32_t*)(p +12+0*stride)=
461 *(uint32_t*)(p + 0+1*stride)=
462 *(uint32_t*)(p + 4+1*stride)=
463 *(uint32_t*)(p + 8+1*stride)=
464 *(uint32_t*)(p +12+1*stride)= val;
465 if(h==2) return;
466 *(uint32_t*)(p + 0+2*stride)=
467 *(uint32_t*)(p + 4+2*stride)=
468 *(uint32_t*)(p + 8+2*stride)=
469 *(uint32_t*)(p +12+2*stride)=
470 *(uint32_t*)(p + 0+3*stride)=
471 *(uint32_t*)(p + 4+3*stride)=
472 *(uint32_t*)(p + 8+3*stride)=
473 *(uint32_t*)(p +12+3*stride)= val;
474 #endif
453 }else 475 }else
454 assert(0); 476 assert(0);
477 assert(h==4);
455 } 478 }
456 479
457 static void fill_caches(H264Context *h, int mb_type, int for_deblock){ 480 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
458 MpegEncContext * const s = &h->s; 481 MpegEncContext * const s = &h->s;
459 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride; 482 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;