comparison dsputil.c @ 385:7ac7a48fbe5e libavcodec

new hopefully faster MC
author michaelni
date Thu, 16 May 2002 23:29:09 +0000
parents 9c6f056f0e41
children fce0a2520551
comparison
equal deleted inserted replaced
384:d442918c4698 385:7ac7a48fbe5e
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 * 18 *
19 * gmc & q-pel support by Michael Niedermayer <michaelni@gmx.at> 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 */ 20 */
21 #include <stdlib.h> 21 #include <stdlib.h>
22 #include <stdio.h> 22 #include <stdio.h>
23 #include <math.h> 23 #include <math.h>
24 #include "avcodec.h" 24 #include "avcodec.h"
250 pix += line_size; 250 pix += line_size;
251 p += 8; 251 p += 8;
252 } 252 }
253 } 253 }
254 254
255 //FIXME someone with an alignment picky cpu should change these
256
257 #define LD32(a) (*((uint32_t*)(a)))
258 #define LD64(a) (*((uint64_t*)(a)))
259
260 #if 0
261
262 #define PIXOP2(OPNAME, OP) \
263 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
264 {\
265 int i;\
266 for(i=0; i<h; i++){\
267 OP(*((uint64_t*)block), LD64(pixels));\
268 pixels+=line_size;\
269 block +=line_size;\
270 }\
271 }\
272 \
273 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
274 {\
275 int i;\
276 for(i=0; i<h; i++){\
277 const uint64_t a= LD64(pixels );\
278 const uint64_t b= LD64(pixels+1);\
279 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
280 pixels+=line_size;\
281 block +=line_size;\
282 }\
283 }\
284 \
285 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
286 {\
287 int i;\
288 for(i=0; i<h; i++){\
289 const uint64_t a= LD64(pixels );\
290 const uint64_t b= LD64(pixels+1);\
291 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
292 pixels+=line_size;\
293 block +=line_size;\
294 }\
295 }\
296 \
297 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
298 {\
299 int i;\
300 for(i=0; i<h; i++){\
301 const uint64_t a= LD64(pixels );\
302 const uint64_t b= LD64(pixels+line_size);\
303 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
304 pixels+=line_size;\
305 block +=line_size;\
306 }\
307 }\
308 \
309 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
310 {\
311 int i;\
312 for(i=0; i<h; i++){\
313 const uint64_t a= LD64(pixels );\
314 const uint64_t b= LD64(pixels+line_size);\
315 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
316 pixels+=line_size;\
317 block +=line_size;\
318 }\
319 }\
320 \
321 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
322 {\
323 int i;\
324 const uint64_t a= LD64(pixels );\
325 const uint64_t b= LD64(pixels+1);\
326 uint64_t l0= (a&0x0303030303030303ULL)\
327 + (b&0x0303030303030303ULL)\
328 + 0x0202020202020202ULL;\
329 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
330 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
331 uint64_t l1,h1;\
332 \
333 pixels+=line_size;\
334 for(i=0; i<h; i+=2){\
335 uint64_t a= LD64(pixels );\
336 uint64_t b= LD64(pixels+1);\
337 l1= (a&0x0303030303030303ULL)\
338 + (b&0x0303030303030303ULL);\
339 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
340 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
341 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
342 pixels+=line_size;\
343 block +=line_size;\
344 a= LD64(pixels );\
345 b= LD64(pixels+1);\
346 l0= (a&0x0303030303030303ULL)\
347 + (b&0x0303030303030303ULL)\
348 + 0x0202020202020202ULL;\
349 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
350 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
351 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
352 pixels+=line_size;\
353 block +=line_size;\
354 }\
355 }\
356 \
357 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
358 {\
359 int i;\
360 const uint64_t a= LD64(pixels );\
361 const uint64_t b= LD64(pixels+1);\
362 uint64_t l0= (a&0x0303030303030303ULL)\
363 + (b&0x0303030303030303ULL)\
364 + 0x0101010101010101ULL;\
365 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
366 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
367 uint64_t l1,h1;\
368 \
369 pixels+=line_size;\
370 for(i=0; i<h; i+=2){\
371 uint64_t a= LD64(pixels );\
372 uint64_t b= LD64(pixels+1);\
373 l1= (a&0x0303030303030303ULL)\
374 + (b&0x0303030303030303ULL);\
375 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
376 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
377 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
378 pixels+=line_size;\
379 block +=line_size;\
380 a= LD64(pixels );\
381 b= LD64(pixels+1);\
382 l0= (a&0x0303030303030303ULL)\
383 + (b&0x0303030303030303ULL)\
384 + 0x0101010101010101ULL;\
385 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
386 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
387 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
388 pixels+=line_size;\
389 block +=line_size;\
390 }\
391 }\
392 \
393 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
394 OPNAME ## _pixels,\
395 OPNAME ## _pixels_x2,\
396 OPNAME ## _pixels_y2,\
397 OPNAME ## _pixels_xy2,\
398 };\
399 \
400 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
401 OPNAME ## _pixels,\
402 OPNAME ## _no_rnd_pixels_x2,\
403 OPNAME ## _no_rnd_pixels_y2,\
404 OPNAME ## _no_rnd_pixels_xy2,\
405 };
406
407 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
408 #else // end of the 64 bit variant; the 32 bit variant follows
409
410 #define PIXOP2(OPNAME, OP) \
411 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
412 {\
413 int i;\
414 for(i=0; i<h; i++){\
415 OP(*((uint32_t*)(block )), LD32(pixels ));\
416 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
417 pixels+=line_size;\
418 block +=line_size;\
419 }\
420 }\
421 \
422 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
423 {\
424 int i;\
425 for(i=0; i<h; i++){\
426 int j;\
427 for(j=0; j<2; j++){\
428 const uint32_t a= LD32(pixels );\
429 const uint32_t b= LD32(pixels+1);\
430 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
431 pixels+=4;\
432 block +=4;\
433 }\
434 pixels+=line_size-8;\
435 block +=line_size-8;\
436 }\
437 }\
438 \
439 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
440 {\
441 int i;\
442 for(i=0; i<h; i++){\
443 int j;\
444 for(j=0; j<2; j++){\
445 const uint32_t a= LD32(pixels );\
446 const uint32_t b= LD32(pixels+1);\
447 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
448 pixels+=4;\
449 block +=4;\
450 }\
451 pixels+=line_size-8;\
452 block +=line_size-8;\
453 }\
454 }\
455 \
456 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
457 {\
458 int i;\
459 for(i=0; i<h; i++){\
460 int j;\
461 for(j=0; j<2; j++){\
462 const uint32_t a= LD32(pixels );\
463 const uint32_t b= LD32(pixels+line_size);\
464 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
465 pixels+=4;\
466 block +=4;\
467 }\
468 pixels+=line_size-8;\
469 block +=line_size-8;\
470 }\
471 }\
472 \
473 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
474 {\
475 int i;\
476 for(i=0; i<h; i++){\
477 int j;\
478 for(j=0; j<2; j++){\
479 const uint32_t a= LD32(pixels );\
480 const uint32_t b= LD32(pixels+line_size);\
481 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
482 pixels+=4;\
483 block +=4;\
484 }\
485 pixels+=line_size-8;\
486 block +=line_size-8;\
487 }\
488 }\
489 \
490 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
491 {\
492 int j;\
493 for(j=0; j<2; j++){\
494 int i;\
495 const uint32_t a= LD32(pixels );\
496 const uint32_t b= LD32(pixels+1);\
497 uint32_t l0= (a&0x03030303UL)\
498 + (b&0x03030303UL)\
499 + 0x02020202UL;\
500 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
501 + ((b&0xFCFCFCFCUL)>>2);\
502 uint32_t l1,h1;\
503 \
504 pixels+=line_size;\
505 for(i=0; i<h; i+=2){\
506 uint32_t a= LD32(pixels );\
507 uint32_t b= LD32(pixels+1);\
508 l1= (a&0x03030303UL)\
509 + (b&0x03030303UL);\
510 h1= ((a&0xFCFCFCFCUL)>>2)\
511 + ((b&0xFCFCFCFCUL)>>2);\
512 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
513 pixels+=line_size;\
514 block +=line_size;\
515 a= LD32(pixels );\
516 b= LD32(pixels+1);\
517 l0= (a&0x03030303UL)\
518 + (b&0x03030303UL)\
519 + 0x02020202UL;\
520 h0= ((a&0xFCFCFCFCUL)>>2)\
521 + ((b&0xFCFCFCFCUL)>>2);\
522 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
523 pixels+=line_size;\
524 block +=line_size;\
525 }\
526 pixels+=4-line_size*(h+1);\
527 block +=4-line_size*h;\
528 }\
529 }\
530 \
531 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
532 {\
533 int j;\
534 for(j=0; j<2; j++){\
535 int i;\
536 const uint32_t a= LD32(pixels );\
537 const uint32_t b= LD32(pixels+1);\
538 uint32_t l0= (a&0x03030303UL)\
539 + (b&0x03030303UL)\
540 + 0x01010101UL;\
541 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
542 + ((b&0xFCFCFCFCUL)>>2);\
543 uint32_t l1,h1;\
544 \
545 pixels+=line_size;\
546 for(i=0; i<h; i+=2){\
547 uint32_t a= LD32(pixels );\
548 uint32_t b= LD32(pixels+1);\
549 l1= (a&0x03030303UL)\
550 + (b&0x03030303UL);\
551 h1= ((a&0xFCFCFCFCUL)>>2)\
552 + ((b&0xFCFCFCFCUL)>>2);\
553 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
554 pixels+=line_size;\
555 block +=line_size;\
556 a= LD32(pixels );\
557 b= LD32(pixels+1);\
558 l0= (a&0x03030303UL)\
559 + (b&0x03030303UL)\
560 + 0x01010101UL;\
561 h0= ((a&0xFCFCFCFCUL)>>2)\
562 + ((b&0xFCFCFCFCUL)>>2);\
563 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
564 pixels+=line_size;\
565 block +=line_size;\
566 }\
567 pixels+=4-line_size*(h+1);\
568 block +=4-line_size*h;\
569 }\
570 }\
571 \
572 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
573 OPNAME ## _pixels,\
574 OPNAME ## _pixels_x2,\
575 OPNAME ## _pixels_y2,\
576 OPNAME ## _pixels_xy2,\
577 };\
578 \
579 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
580 OPNAME ## _pixels,\
581 OPNAME ## _no_rnd_pixels_x2,\
582 OPNAME ## _no_rnd_pixels_y2,\
583 OPNAME ## _no_rnd_pixels_xy2,\
584 };
585 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
586 #endif
587
588 #define op_put(a, b) a = b
589
590 PIXOP2(avg, op_avg)
591 PIXOP2(put, op_put)
592 #undef op_avg
593 #undef op_put
594
595 /* FIXME this stuff could be removed as it's not really used anymore */
255 #define PIXOP(BTYPE, OPNAME, OP, INCR) \ 596 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
256 \ 597 \
257 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \ 598 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
258 { \ 599 { \
259 BTYPE *p; \ 600 BTYPE *p; \
354 695
355 /* rounding primitives */ 696 /* rounding primitives */
356 #define avg2(a,b) ((a+b+1)>>1) 697 #define avg2(a,b) ((a+b+1)>>1)
357 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) 698 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
358 699
359 #define op_put(a, b) a = b
360 #define op_avg(a, b) a = avg2(a, b) 700 #define op_avg(a, b) a = avg2(a, b)
361 #define op_sub(a, b) a -= b 701 #define op_sub(a, b) a -= b
362
363 PIXOP(UINT8, put, op_put, line_size)
364 PIXOP(UINT8, avg, op_avg, line_size)
365 702
366 PIXOP(DCTELEM, sub, op_sub, 8) 703 PIXOP(DCTELEM, sub, op_sub, 8)
367 704
368 /* not rounding primitives */ 705 /* not rounding primitives */
369 #undef avg2 706 #undef avg2
370 #undef avg4 707 #undef avg4
371 #define avg2(a,b) ((a+b)>>1) 708 #define avg2(a,b) ((a+b)>>1)
372 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2) 709 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
373 710
374 PIXOP(UINT8, put_no_rnd, op_put, line_size)
375 PIXOP(UINT8, avg_no_rnd, op_avg, line_size)
376
377 /* motion estimation */ 711 /* motion estimation */
378 712
379 #undef avg2 713 #undef avg2
380 #undef avg4 714 #undef avg4
381 #define avg2(a,b) ((a+b+1)>>1) 715 #define avg2(a,b) ((a+b+1)>>1)
382 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) 716 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
717
718 /* end of removable stuff */
383 719
384 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder) 720 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
385 { 721 {
386 const int A=(16-x16)*(16-y16); 722 const int A=(16-x16)*(16-y16);
387 const int B=( x16)*(16-y16); 723 const int B=( x16)*(16-y16);