Mercurial > libavcodec.hg
comparison h264.c @ 3315:cfd452a6560b libavcodec
h264: faster fill_rectangle()
author | lorenm |
---|---|
date | Sun, 28 May 2006 22:28:08 +0000 |
parents | 9637da0a9c1b |
children | 7278f730af27 |
comparison legend:
- equal
- deleted
- inserted
- replaced
3314:aea2230e6033 | 3315:cfd452a6560b |
---|---|
396 * @param size the size of val (1 or 4), should be a constant | 396 * @param size the size of val (1 or 4), should be a constant |
397 */ | 397 */ |
398 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ | 398 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ |
399 uint8_t *p= (uint8_t*)vp; | 399 uint8_t *p= (uint8_t*)vp; |
400 assert(size==1 || size==4); | 400 assert(size==1 || size==4); |
| 401 assert(w<=4); |
401 | 402 |
402 w *= size; | 403 w *= size; |
403 stride *= size; | 404 stride *= size; |
404 | 405 |
405 assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); | 406 assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0); |
406 assert((stride&(w-1))==0); | 407 assert((stride&(w-1))==0); |
407 //FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit ver of it | 408 if(w==2){ |
408 if(w==2 && h==2){ | 409 const uint16_t v= size==4 ? val : val*0x0101; |
409 *(uint16_t*)(p + 0)= | 410 *(uint16_t*)(p + 0*stride)= v; |
410 *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101; | 411 if(h==1) return; |
411 }else if(w==2 && h==4){ | 412 *(uint16_t*)(p + 1*stride)= v; |
412 *(uint16_t*)(p + 0*stride)= | 413 if(h==2) return; |
413 *(uint16_t*)(p + 1*stride)= | |
414 *(uint16_t*)(p + 2*stride)= | 414 *(uint16_t*)(p + 2*stride)= |
415 *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101; | 415 *(uint16_t*)(p + 3*stride)= v; |
416 }else if(w==4 && h==1){ | 416 }else if(w==4){ |
417 *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101; | 417 const uint32_t v= size==4 ? val : val*0x01010101; |
418 }else if(w==4 && h==2){ | 418 *(uint32_t*)(p + 0*stride)= v; |
419 *(uint32_t*)(p + 0*stride)= | 419 if(h==1) return; |
420 *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101; | 420 *(uint32_t*)(p + 1*stride)= v; |
421 }else if(w==4 && h==4){ | 421 if(h==2) return; |
422 *(uint32_t*)(p + 0*stride)= | |
423 *(uint32_t*)(p + 1*stride)= | |
424 *(uint32_t*)(p + 2*stride)= | 422 *(uint32_t*)(p + 2*stride)= |
425 *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101; | 423 *(uint32_t*)(p + 3*stride)= v; |
426 }else if(w==8 && h==1){ | 424 }else if(w==8){ |
427 *(uint32_t*)(p + 0)= | 425 //gcc can't optimize 64bit math on x86_32 |
428 *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101; | 426 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64) |
429 }else if(w==8 && h==2){ | 427 const uint64_t v= val*0x0100000001ULL; |
430 *(uint32_t*)(p + 0 + 0*stride)= | 428 *(uint64_t*)(p + 0*stride)= v; |
431 *(uint32_t*)(p + 4 + 0*stride)= | 429 if(h==1) return; |
432 *(uint32_t*)(p + 0 + 1*stride)= | 430 *(uint64_t*)(p + 1*stride)= v; |
433 *(uint32_t*)(p + 4 + 1*stride)= size==4 ? val : val*0x01010101; | 431 if(h==2) return; |
434 }else if(w==8 && h==4){ | |
435 *(uint64_t*)(p + 0*stride)= | |
436 *(uint64_t*)(p + 1*stride)= | |
437 *(uint64_t*)(p + 2*stride)= | 432 *(uint64_t*)(p + 2*stride)= |
438 *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL; | 433 *(uint64_t*)(p + 3*stride)= v; |
439 }else if(w==16 && h==2){ | 434 }else if(w==16){ |
| 435 const uint64_t v= val*0x0100000001ULL; |
440 *(uint64_t*)(p + 0+0*stride)= | 436 *(uint64_t*)(p + 0+0*stride)= |
441 *(uint64_t*)(p + 8+0*stride)= | 437 *(uint64_t*)(p + 8+0*stride)= |
442 *(uint64_t*)(p + 0+1*stride)= | 438 *(uint64_t*)(p + 0+1*stride)= |
443 *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL; | 439 *(uint64_t*)(p + 8+1*stride)= v; |
444 }else if(w==16 && h==4){ | 440 if(h==2) return; |
445 *(uint64_t*)(p + 0+0*stride)= | |
446 *(uint64_t*)(p + 8+0*stride)= | |
447 *(uint64_t*)(p + 0+1*stride)= | |
448 *(uint64_t*)(p + 8+1*stride)= | |
449 *(uint64_t*)(p + 0+2*stride)= | 441 *(uint64_t*)(p + 0+2*stride)= |
450 *(uint64_t*)(p + 8+2*stride)= | 442 *(uint64_t*)(p + 8+2*stride)= |
451 *(uint64_t*)(p + 0+3*stride)= | 443 *(uint64_t*)(p + 0+3*stride)= |
452 *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL; | 444 *(uint64_t*)(p + 8+3*stride)= v; |
| 445 #else |
| 446 *(uint32_t*)(p + 0+0*stride)= |
| 447 *(uint32_t*)(p + 4+0*stride)= val; |
| 448 if(h==1) return; |
| 449 *(uint32_t*)(p + 0+1*stride)= |
| 450 *(uint32_t*)(p + 4+1*stride)= val; |
| 451 if(h==2) return; |
| 452 *(uint32_t*)(p + 0+2*stride)= |
| 453 *(uint32_t*)(p + 4+2*stride)= |
| 454 *(uint32_t*)(p + 0+3*stride)= |
| 455 *(uint32_t*)(p + 4+3*stride)= val; |
| 456 }else if(w==16){ |
| 457 *(uint32_t*)(p + 0+0*stride)= |
| 458 *(uint32_t*)(p + 4+0*stride)= |
| 459 *(uint32_t*)(p + 8+0*stride)= |
| 460 *(uint32_t*)(p +12+0*stride)= |
| 461 *(uint32_t*)(p + 0+1*stride)= |
| 462 *(uint32_t*)(p + 4+1*stride)= |
| 463 *(uint32_t*)(p + 8+1*stride)= |
| 464 *(uint32_t*)(p +12+1*stride)= val; |
| 465 if(h==2) return; |
| 466 *(uint32_t*)(p + 0+2*stride)= |
| 467 *(uint32_t*)(p + 4+2*stride)= |
| 468 *(uint32_t*)(p + 8+2*stride)= |
| 469 *(uint32_t*)(p +12+2*stride)= |
| 470 *(uint32_t*)(p + 0+3*stride)= |
| 471 *(uint32_t*)(p + 4+3*stride)= |
| 472 *(uint32_t*)(p + 8+3*stride)= |
| 473 *(uint32_t*)(p +12+3*stride)= val; |
| 474 #endif |
453 }else | 475 }else |
454 assert(0); | 476 assert(0); |
| 477 assert(h==4); |
455 } | 478 } |
456 | 479 |
457 static void fill_caches(H264Context *h, int mb_type, int for_deblock){ | 480 static void fill_caches(H264Context *h, int mb_type, int for_deblock){ |
458 MpegEncContext * const s = &h->s; | 481 MpegEncContext * const s = &h->s; |
459 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride; | 482 const int mb_xy= s->mb_x + s->mb_y*s->mb_stride; |