comparison i386/dsputil_mmx.c @ 444:a5edef76dac6 libavcodec

* new mmx code - based upon http://aggregate.org/MAGIC; for now it's rather a sneak preview (new functions are nearly 100% faster)
author kabi
date Wed, 29 May 2002 14:29:48 +0000
parents fe58fe638f9b
children 62c01dbdc1e0
comparison
equal deleted inserted replaced
443:63467327c06c 444:a5edef76dac6
47 /* external functions, from idct_mmx.c */ 47 /* external functions, from idct_mmx.c */
48 void ff_mmx_idct(DCTELEM *block); 48 void ff_mmx_idct(DCTELEM *block);
49 void ff_mmxext_idct(DCTELEM *block); 49 void ff_mmxext_idct(DCTELEM *block);
50 50
51 /* pixel operations */ 51 /* pixel operations */
52 static const uint64_t mm_bfe __attribute__ ((aligned(8))) = 0xfefefefefefefefeULL;
52 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; 53 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
53 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; 54 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
54 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; 55 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
55 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; 56 //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
56 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; 57 //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
60 61
61 #ifndef PIC 62 #ifndef PIC
62 #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone)) 63 #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
63 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) 64 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
64 #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t" 65 #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
66 #define MOVQ_BFE(regd) "movq "MANGLE(mm_bfe)", "#regd" \n\t"
65 #else 67 #else
66 // for shared library it's better to use this way for accessing constants 68 // for shared library it's better to use this way for accessing constants
67 // pcmpeqd -> -1 69 // pcmpeqd -> -1
68 #define MOVQ_WONE(regd) \ 70 #define MOVQ_WONE(regd) \
69 __asm __volatile ( \ 71 __asm __volatile ( \
78 80
79 #define MOVQ_BONE(regd) \ 81 #define MOVQ_BONE(regd) \
80 "pcmpeqd " #regd ", " #regd " \n\t" \ 82 "pcmpeqd " #regd ", " #regd " \n\t" \
81 "psrlw $15, " #regd " \n\t"\ 83 "psrlw $15, " #regd " \n\t"\
82 "packuswb " #regd ", " #regd " \n\t" 84 "packuswb " #regd ", " #regd " \n\t"
85
86 #define MOVQ_BFE(regd) \
87 "pcmpeqd " #regd ", " #regd " \n\t"\
88 "paddb " #regd ", " #regd " \n\t"
83 #endif 89 #endif
90
91 // using mm6 as temporary and for the output result
92 // first argument is unmodified and second is trashed
93 // mm7 is supposed to contain 0xfefefefefefefefe
94 #define PAVG_MMX_NO_RND(rega, regb) \
95 "movq " #rega ", %%mm6 \n\t"\
96 "pand " #regb ", %%mm6 \n\t"\
97 "pxor " #rega ", " #regb " \n\t"\
98 "pand %%mm7, " #regb " \n\t"\
99 "psrlq $1, " #regb " \n\t"\
100 "paddb " #regb ", %%mm6 \n\t"
101
102 #define PAVG_MMX(rega, regb) \
103 "movq " #rega ", %%mm6 \n\t"\
104 "por " #regb ", %%mm6 \n\t"\
105 "pxor " #rega ", " #regb " \n\t"\
106 "pand %%mm7, " #regb " \n\t"\
107 "psrlq $1, " #regb " \n\t"\
108 "psubb " #regb ", %%mm6 \n\t"
84 109
85 110
86 /***********************************/ 111 /***********************************/
87 /* 3Dnow specific */ 112 /* 3Dnow specific */
88 113
289 : "r"(line_size) 314 : "r"(line_size)
290 : "%eax", "memory" 315 : "%eax", "memory"
291 ); 316 );
292 } 317 }
293 318
319 // will have to be checked if it's better to have bigger
320 // unrolled code also on Celerons - for now yes
321 #define LONG_UNROLL 1
294 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) 322 static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
295 { 323 {
324 #if 0
296 UINT8 *p; 325 UINT8 *p;
297 const UINT8 *pix; 326 const UINT8 *pix;
298 p = block; 327 p = block;
299 pix = pixels; 328 pix = pixels;
300 MOVQ_ZERO(mm7); 329 MOVQ_ZERO(mm7);
301 MOVQ_WONE(mm4); 330 MOVQ_WONE(mm4);
302 JUMPALIGN(); 331 JUMPALIGN();
303 do { 332 do {
304 __asm __volatile( 333 __asm __volatile(
305 "movq %1, %%mm0\n\t" 334 "movq %1, %%mm0\n\t"
306 "movq 1%1, %%mm1\n\t" 335 "movq 1%1, %%mm1\n\t"
307 "movq %%mm0, %%mm2\n\t" 336 "movq %%mm0, %%mm2\n\t"
308 "movq %%mm1, %%mm3\n\t" 337 "movq %%mm1, %%mm3\n\t"
309 "punpcklbw %%mm7, %%mm0\n\t" 338 "punpcklbw %%mm7, %%mm0\n\t"
318 "psrlw $1, %%mm2\n\t" 347 "psrlw $1, %%mm2\n\t"
319 "packuswb %%mm2, %%mm0\n\t" 348 "packuswb %%mm2, %%mm0\n\t"
320 "movq %%mm0, %0\n\t" 349 "movq %%mm0, %0\n\t"
321 :"=m"(*p) 350 :"=m"(*p)
322 :"m"(*pix) 351 :"m"(*pix)
323 :"memory"); 352 :"memory");
324 pix += line_size; p += line_size; 353 pix += line_size; p += line_size;
325 } while (--h); 354 } while (--h);
355 #else
356 __asm __volatile(
357 MOVQ_BFE(%%mm7)
358 "lea (%3, %3), %%eax \n\t"
359 ".balign 8 \n\t"
360 "1: \n\t"
361 "movq (%1), %%mm0 \n\t"
362 "movq (%1, %3), %%mm2 \n\t"
363 "movq 1(%1), %%mm1 \n\t"
364 "movq 1(%1, %3), %%mm3 \n\t"
365 PAVG_MMX(%%mm0, %%mm1)
366 "movq %%mm6, (%2) \n\t"
367 PAVG_MMX(%%mm2, %%mm3)
368 "movq %%mm6, (%2, %3) \n\t"
369 "addl %%eax, %1 \n\t"
370 "addl %%eax, %2 \n\t"
371 #if LONG_UNROLL
372 "movq (%1), %%mm0 \n\t"
373 "movq (%1, %3), %%mm2 \n\t"
374 "movq 1(%1), %%mm1 \n\t"
375 "movq 1(%1, %3), %%mm3 \n\t"
376 PAVG_MMX(%%mm0, %%mm1)
377 "movq %%mm6, (%2) \n\t"
378 PAVG_MMX(%%mm2, %%mm3)
379 "movq %%mm6, (%2, %3) \n\t"
380 "addl %%eax, %1 \n\t"
381 "addl %%eax, %2 \n\t"
382 "subl $4, %0 \n\t"
383 #else
384 "subl $2, %0 \n\t"
385 #endif
386 "jnz 1b \n\t"
387 :"+g"(h), "+S"(pixels), "+D"(block)
388 :"r"(line_size)
389 :"eax", "memory");
390 #endif
326 } 391 }
327 392
328 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) 393 static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
329 { 394 {
395 #if 0
330 UINT8 *p; 396 UINT8 *p;
331 const UINT8 *pix; 397 const UINT8 *pix;
332 p = block; 398 p = block;
333 pix = pixels; 399 pix = pixels;
334 MOVQ_ZERO(mm7); 400 MOVQ_ZERO(mm7);
357 "m"(*(pix+line_size)) 423 "m"(*(pix+line_size))
358 :"memory"); 424 :"memory");
359 pix += line_size; 425 pix += line_size;
360 p += line_size; 426 p += line_size;
361 } while (--h); 427 } while (--h);
428 #else
429 __asm __volatile(
430 MOVQ_BFE(%%mm7)
431 "lea (%3, %3), %%eax \n\t"
432 "movq (%1), %%mm0 \n\t"
433 ".balign 8 \n\t"
434 "1: \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq (%1, %%eax),%%mm2 \n\t"
437 PAVG_MMX(%%mm1, %%mm0)
438 "movq %%mm6, (%2) \n\t"
439 PAVG_MMX(%%mm2, %%mm1)
440 "movq %%mm6, (%2, %3) \n\t"
441 "addl %%eax, %1 \n\t"
442 "addl %%eax, %2 \n\t"
443 #ifdef LONG_UNROLL
444 "movq (%1, %3), %%mm1 \n\t"
445 "movq (%1, %%eax),%%mm0 \n\t"
446 PAVG_MMX(%%mm1, %%mm2)
447 "movq %%mm6, (%2) \n\t"
448 PAVG_MMX(%%mm0, %%mm1)
449 "movq %%mm6, (%2, %3) \n\t"
450 "addl %%eax, %1 \n\t"
451 "addl %%eax, %2 \n\t"
452 "subl $4, %0 \n\t"
453 #else
454 "subl $2, %0 \n\t"
455 #endif
456 "jnz 1b \n\t"
457 :"+g"(h), "+S"(pixels), "+D"(block)
458 :"r"(line_size)
459 :"eax", "memory");
460 #endif
461
462
362 } 463 }
363 464
364 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) 465 static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
365 { 466 {
366 UINT8 *p; 467 UINT8 *p;