Mercurial > libavcodec.hg
comparison dsputil.c @ 385:7ac7a48fbe5e libavcodec
new hopefully faster MC
author | michaelni |
---|---|
date | Thu, 16 May 2002 23:29:09 +0000 |
parents | 9c6f056f0e41 |
children | fce0a2520551 |
comparison
equal
deleted
inserted
replaced
384:d442918c4698 | 385:7ac7a48fbe5e |
---|---|
14 * | 14 * |
15 * You should have received a copy of the GNU General Public License | 15 * You should have received a copy of the GNU General Public License |
16 * along with this program; if not, write to the Free Software | 16 * along with this program; if not, write to the Free Software |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
18 * | 18 * |
19 * gmc & q-pel support by Michael Niedermayer <michaelni@gmx.at> | 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at> |
20 */ | 20 */ |
21 #include <stdlib.h> | 21 #include <stdlib.h> |
22 #include <stdio.h> | 22 #include <stdio.h> |
23 #include <math.h> | 23 #include <math.h> |
24 #include "avcodec.h" | 24 #include "avcodec.h" |
250 pix += line_size; | 250 pix += line_size; |
251 p += 8; | 251 p += 8; |
252 } | 252 } |
253 } | 253 } |
254 | 254 |
255 //FIXME someone with an alignment-picky CPU should change these | |
256 | |
257 #define LD32(a) (*((uint32_t*)(a))) | |
258 #define LD64(a) (*((uint64_t*)(a))) | |
259 | |
260 #if 0 | |
261 | |
262 #define PIXOP2(OPNAME, OP) \ | |
263 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
264 {\ | |
265 int i;\ | |
266 for(i=0; i<h; i++){\ | |
267 OP(*((uint64_t*)block), LD64(pixels));\ | |
268 pixels+=line_size;\ | |
269 block +=line_size;\ | |
270 }\ | |
271 }\ | |
272 \ | |
273 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
274 {\ | |
275 int i;\ | |
276 for(i=0; i<h; i++){\ | |
277 const uint64_t a= LD64(pixels );\ | |
278 const uint64_t b= LD64(pixels+1);\ | |
279 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ | |
280 pixels+=line_size;\ | |
281 block +=line_size;\ | |
282 }\ | |
283 }\ | |
284 \ | |
285 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
286 {\ | |
287 int i;\ | |
288 for(i=0; i<h; i++){\ | |
289 const uint64_t a= LD64(pixels );\ | |
290 const uint64_t b= LD64(pixels+1);\ | |
291 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ | |
292 pixels+=line_size;\ | |
293 block +=line_size;\ | |
294 }\ | |
295 }\ | |
296 \ | |
297 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
298 {\ | |
299 int i;\ | |
300 for(i=0; i<h; i++){\ | |
301 const uint64_t a= LD64(pixels );\ | |
302 const uint64_t b= LD64(pixels+line_size);\ | |
303 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ | |
304 pixels+=line_size;\ | |
305 block +=line_size;\ | |
306 }\ | |
307 }\ | |
308 \ | |
309 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
310 {\ | |
311 int i;\ | |
312 for(i=0; i<h; i++){\ | |
313 const uint64_t a= LD64(pixels );\ | |
314 const uint64_t b= LD64(pixels+line_size);\ | |
315 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\ | |
316 pixels+=line_size;\ | |
317 block +=line_size;\ | |
318 }\ | |
319 }\ | |
320 \ | |
321 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
322 {\ | |
323 int i;\ | |
324 const uint64_t a= LD64(pixels );\ | |
325 const uint64_t b= LD64(pixels+1);\ | |
326 uint64_t l0= (a&0x0303030303030303ULL)\ | |
327 + (b&0x0303030303030303ULL)\ | |
328 + 0x0202020202020202ULL;\ | |
329 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
330 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
331 uint64_t l1,h1;\ | |
332 \ | |
333 pixels+=line_size;\ | |
334 for(i=0; i<h; i+=2){\ | |
335 uint64_t a= LD64(pixels );\ | |
336 uint64_t b= LD64(pixels+1);\ | |
337 l1= (a&0x0303030303030303ULL)\ | |
338 + (b&0x0303030303030303ULL);\ | |
339 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
340 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
341 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
342 pixels+=line_size;\ | |
343 block +=line_size;\ | |
344 a= LD64(pixels );\ | |
345 b= LD64(pixels+1);\ | |
346 l0= (a&0x0303030303030303ULL)\ | |
347 + (b&0x0303030303030303ULL)\ | |
348 + 0x0202020202020202ULL;\ | |
349 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
350 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
351 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
352 pixels+=line_size;\ | |
353 block +=line_size;\ | |
354 }\ | |
355 }\ | |
356 \ | |
357 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
358 {\ | |
359 int i;\ | |
360 const uint64_t a= LD64(pixels );\ | |
361 const uint64_t b= LD64(pixels+1);\ | |
362 uint64_t l0= (a&0x0303030303030303ULL)\ | |
363 + (b&0x0303030303030303ULL)\ | |
364 + 0x0101010101010101ULL;\ | |
365 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
366 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
367 uint64_t l1,h1;\ | |
368 \ | |
369 pixels+=line_size;\ | |
370 for(i=0; i<h; i+=2){\ | |
371 uint64_t a= LD64(pixels );\ | |
372 uint64_t b= LD64(pixels+1);\ | |
373 l1= (a&0x0303030303030303ULL)\ | |
374 + (b&0x0303030303030303ULL);\ | |
375 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
376 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
377 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
378 pixels+=line_size;\ | |
379 block +=line_size;\ | |
380 a= LD64(pixels );\ | |
381 b= LD64(pixels+1);\ | |
382 l0= (a&0x0303030303030303ULL)\ | |
383 + (b&0x0303030303030303ULL)\ | |
384 + 0x0101010101010101ULL;\ | |
385 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ | |
386 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ | |
387 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ | |
388 pixels+=line_size;\ | |
389 block +=line_size;\ | |
390 }\ | |
391 }\ | |
392 \ | |
393 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ | |
394 OPNAME ## _pixels,\ | |
395 OPNAME ## _pixels_x2,\ | |
396 OPNAME ## _pixels_y2,\ | |
397 OPNAME ## _pixels_xy2,\ | |
398 };\ | |
399 \ | |
400 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ | |
401 OPNAME ## _pixels,\ | |
402 OPNAME ## _no_rnd_pixels_x2,\ | |
403 OPNAME ## _no_rnd_pixels_y2,\ | |
404 OPNAME ## _no_rnd_pixels_xy2,\ | |
405 }; | |
406 | |
407 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) ) | |
408 #else // end of (disabled) 64 bit variant; 32 bit variant follows | |
409 | |
410 #define PIXOP2(OPNAME, OP) \ | |
411 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
412 {\ | |
413 int i;\ | |
414 for(i=0; i<h; i++){\ | |
415 OP(*((uint32_t*)(block )), LD32(pixels ));\ | |
416 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\ | |
417 pixels+=line_size;\ | |
418 block +=line_size;\ | |
419 }\ | |
420 }\ | |
421 \ | |
422 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
423 {\ | |
424 int i;\ | |
425 for(i=0; i<h; i++){\ | |
426 int j;\ | |
427 for(j=0; j<2; j++){\ | |
428 const uint32_t a= LD32(pixels );\ | |
429 const uint32_t b= LD32(pixels+1);\ | |
430 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\ | |
431 pixels+=4;\ | |
432 block +=4;\ | |
433 }\ | |
434 pixels+=line_size-8;\ | |
435 block +=line_size-8;\ | |
436 }\ | |
437 }\ | |
438 \ | |
439 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
440 {\ | |
441 int i;\ | |
442 for(i=0; i<h; i++){\ | |
443 int j;\ | |
444 for(j=0; j<2; j++){\ | |
445 const uint32_t a= LD32(pixels );\ | |
446 const uint32_t b= LD32(pixels+1);\ | |
447 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\ | |
448 pixels+=4;\ | |
449 block +=4;\ | |
450 }\ | |
451 pixels+=line_size-8;\ | |
452 block +=line_size-8;\ | |
453 }\ | |
454 }\ | |
455 \ | |
456 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
457 {\ | |
458 int i;\ | |
459 for(i=0; i<h; i++){\ | |
460 int j;\ | |
461 for(j=0; j<2; j++){\ | |
462 const uint32_t a= LD32(pixels );\ | |
463 const uint32_t b= LD32(pixels+line_size);\ | |
464 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\ | |
465 pixels+=4;\ | |
466 block +=4;\ | |
467 }\ | |
468 pixels+=line_size-8;\ | |
469 block +=line_size-8;\ | |
470 }\ | |
471 }\ | |
472 \ | |
473 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
474 {\ | |
475 int i;\ | |
476 for(i=0; i<h; i++){\ | |
477 int j;\ | |
478 for(j=0; j<2; j++){\ | |
479 const uint32_t a= LD32(pixels );\ | |
480 const uint32_t b= LD32(pixels+line_size);\ | |
481 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\ | |
482 pixels+=4;\ | |
483 block +=4;\ | |
484 }\ | |
485 pixels+=line_size-8;\ | |
486 block +=line_size-8;\ | |
487 }\ | |
488 }\ | |
489 \ | |
490 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
491 {\ | |
492 int j;\ | |
493 for(j=0; j<2; j++){\ | |
494 int i;\ | |
495 const uint32_t a= LD32(pixels );\ | |
496 const uint32_t b= LD32(pixels+1);\ | |
497 uint32_t l0= (a&0x03030303UL)\ | |
498 + (b&0x03030303UL)\ | |
499 + 0x02020202UL;\ | |
500 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ | |
501 + ((b&0xFCFCFCFCUL)>>2);\ | |
502 uint32_t l1,h1;\ | |
503 \ | |
504 pixels+=line_size;\ | |
505 for(i=0; i<h; i+=2){\ | |
506 uint32_t a= LD32(pixels );\ | |
507 uint32_t b= LD32(pixels+1);\ | |
508 l1= (a&0x03030303UL)\ | |
509 + (b&0x03030303UL);\ | |
510 h1= ((a&0xFCFCFCFCUL)>>2)\ | |
511 + ((b&0xFCFCFCFCUL)>>2);\ | |
512 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
513 pixels+=line_size;\ | |
514 block +=line_size;\ | |
515 a= LD32(pixels );\ | |
516 b= LD32(pixels+1);\ | |
517 l0= (a&0x03030303UL)\ | |
518 + (b&0x03030303UL)\ | |
519 + 0x02020202UL;\ | |
520 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
521 + ((b&0xFCFCFCFCUL)>>2);\ | |
522 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
523 pixels+=line_size;\ | |
524 block +=line_size;\ | |
525 }\ | |
526 pixels+=4-line_size*(h+1);\ | |
527 block +=4-line_size*h;\ | |
528 }\ | |
529 }\ | |
530 \ | |
531 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ | |
532 {\ | |
533 int j;\ | |
534 for(j=0; j<2; j++){\ | |
535 int i;\ | |
536 const uint32_t a= LD32(pixels );\ | |
537 const uint32_t b= LD32(pixels+1);\ | |
538 uint32_t l0= (a&0x03030303UL)\ | |
539 + (b&0x03030303UL)\ | |
540 + 0x01010101UL;\ | |
541 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ | |
542 + ((b&0xFCFCFCFCUL)>>2);\ | |
543 uint32_t l1,h1;\ | |
544 \ | |
545 pixels+=line_size;\ | |
546 for(i=0; i<h; i+=2){\ | |
547 uint32_t a= LD32(pixels );\ | |
548 uint32_t b= LD32(pixels+1);\ | |
549 l1= (a&0x03030303UL)\ | |
550 + (b&0x03030303UL);\ | |
551 h1= ((a&0xFCFCFCFCUL)>>2)\ | |
552 + ((b&0xFCFCFCFCUL)>>2);\ | |
553 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
554 pixels+=line_size;\ | |
555 block +=line_size;\ | |
556 a= LD32(pixels );\ | |
557 b= LD32(pixels+1);\ | |
558 l0= (a&0x03030303UL)\ | |
559 + (b&0x03030303UL)\ | |
560 + 0x01010101UL;\ | |
561 h0= ((a&0xFCFCFCFCUL)>>2)\ | |
562 + ((b&0xFCFCFCFCUL)>>2);\ | |
563 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ | |
564 pixels+=line_size;\ | |
565 block +=line_size;\ | |
566 }\ | |
567 pixels+=4-line_size*(h+1);\ | |
568 block +=4-line_size*h;\ | |
569 }\ | |
570 }\ | |
571 \ | |
572 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ | |
573 OPNAME ## _pixels,\ | |
574 OPNAME ## _pixels_x2,\ | |
575 OPNAME ## _pixels_y2,\ | |
576 OPNAME ## _pixels_xy2,\ | |
577 };\ | |
578 \ | |
579 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ | |
580 OPNAME ## _pixels,\ | |
581 OPNAME ## _no_rnd_pixels_x2,\ | |
582 OPNAME ## _no_rnd_pixels_y2,\ | |
583 OPNAME ## _no_rnd_pixels_xy2,\ | |
584 }; | |
585 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) | |
586 #endif | |
587 | |
588 #define op_put(a, b) a = b | |
589 | |
590 PIXOP2(avg, op_avg) | |
591 PIXOP2(put, op_put) | |
592 #undef op_avg | |
593 #undef op_put | |
594 | |
595 /* FIXME this stuff could be removed as it's not really used anymore */ | |
255 #define PIXOP(BTYPE, OPNAME, OP, INCR) \ | 596 #define PIXOP(BTYPE, OPNAME, OP, INCR) \ |
256 \ | 597 \ |
257 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \ | 598 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \ |
258 { \ | 599 { \ |
259 BTYPE *p; \ | 600 BTYPE *p; \ |
354 | 695 |
355 /* rounding primitives */ | 696 /* rounding primitives */ |
356 #define avg2(a,b) ((a+b+1)>>1) | 697 #define avg2(a,b) ((a+b+1)>>1) |
357 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) | 698 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) |
358 | 699 |
359 #define op_put(a, b) a = b | |
360 #define op_avg(a, b) a = avg2(a, b) | 700 #define op_avg(a, b) a = avg2(a, b) |
361 #define op_sub(a, b) a -= b | 701 #define op_sub(a, b) a -= b |
362 | |
363 PIXOP(UINT8, put, op_put, line_size) | |
364 PIXOP(UINT8, avg, op_avg, line_size) | |
365 | 702 |
366 PIXOP(DCTELEM, sub, op_sub, 8) | 703 PIXOP(DCTELEM, sub, op_sub, 8) |
367 | 704 |
368 /* not rounding primitives */ | 705 /* not rounding primitives */ |
369 #undef avg2 | 706 #undef avg2 |
370 #undef avg4 | 707 #undef avg4 |
371 #define avg2(a,b) ((a+b)>>1) | 708 #define avg2(a,b) ((a+b)>>1) |
372 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2) | 709 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2) |
373 | 710 |
374 PIXOP(UINT8, put_no_rnd, op_put, line_size) | |
375 PIXOP(UINT8, avg_no_rnd, op_avg, line_size) | |
376 | |
377 /* motion estimation */ | 711 /* motion estimation */ |
378 | 712 |
379 #undef avg2 | 713 #undef avg2 |
380 #undef avg4 | 714 #undef avg4 |
381 #define avg2(a,b) ((a+b+1)>>1) | 715 #define avg2(a,b) ((a+b+1)>>1) |
382 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) | 716 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) |
717 | |
718 /* end of removable stuff */ | |
383 | 719 |
384 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder) | 720 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder) |
385 { | 721 { |
386 const int A=(16-x16)*(16-y16); | 722 const int A=(16-x16)*(16-y16); |
387 const int B=( x16)*(16-y16); | 723 const int B=( x16)*(16-y16); |