comparison alpha/dsputil_alpha.c @ 509:cab79946302f libavcodec

Implement put_pixels_clamped and add_pixels_clamped in assembler. This allows better scheduling of the memory accesses and is portable across all compilers.
author mellum
date Mon, 01 Jul 2002 04:26:07 +0000
parents 7a976bf93394
children fa4425cf6b31
comparing 508:8f9fa4ec9cbb with 509:cab79946302f
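
The two routines in question take an 8x8 block of IDCT output (16-bit coefficients) and either store it to, or accumulate it onto, a grid of unsigned bytes, clamping every value to 0..255. For orientation, a minimal portable C sketch of those semantics, distilled from the #if 0 reference code kept in the diff below (the int16_t definition of DCTELEM and the names clamp255, put_pixels_clamped_ref and add_pixels_clamped_ref are assumptions for illustration, not part of the change):

#include <stdint.h>

typedef int16_t DCTELEM;        /* assumption: 16-bit IDCT coefficient */

static uint8_t clamp255(int v)  /* hypothetical helper */
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Store an 8x8 coefficient block as clamped unsigned bytes. */
static void put_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] = clamp255(block[j]);
        pixels += line_size;
        block += 8;
    }
}

/* Accumulate an 8x8 coefficient block onto existing pixels, clamped. */
static void add_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] = clamp255(pixels[j] + block[j]);
        pixels += line_size;
        block += 8;
    }
}

The MVI versions below do the same work eight values at a time, using maxsw4/minsw4 for the clamp and pkwb/unpkbw to pack and unpack the byte lanes.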
--- a/alpha/dsputil_alpha.c
+++ b/alpha/dsputil_alpha.c
@@ -20,68 +20,90 @@
 #include "asm.h"
 #include "../dsputil.h"

 void simple_idct_axp(DCTELEM *block);

-static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes. */
+static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                                   int line_size)
 {
     int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

     ASM_ACCEPT_MVI;

     do {
-        UINT64 shorts;
+        uint64_t shorts0, shorts1;

-        shorts = ldq(block);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels);
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);

-        shorts = ldq(block + 4);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);

         pixels += line_size;
         block += 8;
     } while (--i);
 }

-static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                            int line_size)
 {
-    int i = 8;
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-). */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask = zap(-1, 0x33);
+    signmask ^= signmask >> 1;          /* 0x8000800080008000 */

     ASM_ACCEPT_MVI;

     do {
-        UINT64 shorts;
-
-        shorts = ldq(block);
-        shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
-        shorts += unpkbw(ldl(pixels));
-        shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
-        shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
-        shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
-        shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
-        stl(pkwb(shorts), pixels);
-
-        /* next 4 */
-        shorts = ldq(block + 4);
-        shorts &= ~WORD_VEC(0x8000);
-        shorts += unpkbw(ldl(pixels + 4));
-        shorts &= ~WORD_VEC(0x8000);
-        shorts = minuw4(shorts, WORD_VEC(0x4000));
-        shorts &= ~WORD_VEC(0x4000);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0 = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw). */
+        signs0 = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4. */
+        pix1 = unpkbw(ldl(pixels + 4));
+        signs1 = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);

         pixels += line_size;
         block += 8;
-    } while (--i);
+    } while (--h);
 }
+#endif

 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
    Since the immediate result could be greater than 255, we do the
    shift first. The result is too low by one if the bytes were both
    odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
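
The identity in the comment above is easy to check exhaustively in portable C. A small sketch, assuming BYTE_VEC(x) replicates the byte x into all eight lanes (its definition lies outside this hunk); avg2_bytes is a hypothetical name:

#include <assert.h>
#include <stdint.h>

#define BYTE_VEC(x) ((uint64_t)(x) * 0x0101010101010101ULL)

static uint64_t avg2_bytes(uint64_t l1, uint64_t l2)
{
    /* Shift each operand first so no byte lane can overflow, then
       repair the bit that is lost when both bytes are odd.  Per lane
       the sum is at most 0x7f + 0x7f + 0x01 = 0xff, so no carry can
       cross into the neighbouring byte. */
    return ((l1 >> 1) & BYTE_VEC(0x7f))
         + ((l2 >> 1) & BYTE_VEC(0x7f))
         + ((l1 & l2) & BYTE_VEC(0x01));
}

int main(void)
{
    unsigned a, b;

    /* Exhaustive check of one lane; the lanes are independent. */
    for (a = 0; a < 256; a++)
        for (b = 0; b < 256; b++)
            assert((avg2_bytes(a, b) & 0xff) == ((a + b) >> 1));
    return 0;
}

This is the truncating average; the rounding form (b1 + b2 + 1) >> 1 would use (l1 | l2) & BYTE_VEC(0x01) for the correction term instead.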
@@ -220,9 +242,9 @@
     put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
     put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;

     /* amask clears all bits that correspond to present features. */
     if (amask(AMASK_MVI) == 0) {
-        put_pixels_clamped = put_pixels_clamped_axp;
-        add_pixels_clamped = add_pixels_clamped_axp;
+        put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        add_pixels_clamped = add_pixels_clamped_mvi_asm;
     }
 }
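
The constants in add_pixels_clamped_mvi are generated with the Alpha zap instruction, which clears each byte of its first operand whose corresponding bit in the 8-bit mask is set, so the function needs no constant loads from memory. A self-contained sketch of the two constants and of the sign-bit splitting that makes the packed 16-bit add safe (zap_emu and paddw_emu are hypothetical names modelling the hardware behaviour):

#include <stdint.h>
#include <stdio.h>

/* Model of Alpha zap: bit i of mask set -> byte i of x cleared. */
static uint64_t zap_emu(uint64_t x, unsigned mask)
{
    int i;

    for (i = 0; i < 8; i++)
        if (mask & (1u << i))
            x &= ~((uint64_t)0xff << (8 * i));
    return x;
}

/* Lane-safe add of four packed 16-bit words, as in the loop body:
   strip the sign bits so no lane can carry into its neighbour, add,
   then XOR the sign bits back (XOR equals adding 0x8000 mod 2^16).
   Assumes b's lanes are at most 0xff, as delivered by unpkbw. */
static uint64_t paddw_emu(uint64_t a, uint64_t b, uint64_t signmask)
{
    uint64_t signs = a & signmask;

    a &= ~signmask;
    a += b;
    return a ^ signs;
}

int main(void)
{
    uint64_t clampmask = zap_emu(~(uint64_t)0, 0xaa);
    uint64_t signmask  = zap_emu(~(uint64_t)0, 0x33);

    signmask ^= signmask >> 1;
    printf("clampmask = %016llx\n", (unsigned long long)clampmask);
    /* prints 00ff00ff00ff00ff */
    printf("signmask  = %016llx\n", (unsigned long long)signmask);
    /* prints 8000800080008000 */

    /* -2 + 3 in every 16-bit lane -> 0x0001 in every lane. */
    printf("paddw     = %016llx\n", (unsigned long long)
           paddw_emu(0xfffefffefffefffeULL, 0x0003000300030003ULL, signmask));
    return 0;
}

Splitting out the sign bits works here because the unpacked pixel lanes are at most 0xff, so the masked add cannot carry across a 16-bit lane boundary.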