alpha/dsputil_alpha.c @ 509:cab79946302f (libavcodec)
Implement put_pixels_clamped and add_pixels_clamped in assembly. This
allows better scheduling of the memory accesses and is portable across
all compilers.
author:   mellum
date:     Mon, 01 Jul 2002 04:26:07 +0000
parents:  7a976bf93394
children: fa4425cf6b31
comparison of 508:8f9fa4ec9cbb with 509:cab79946302f
@@ -20,68 +20,90 @@
 #include "asm.h"
 #include "../dsputil.h"
 
 void simple_idct_axp(DCTELEM *block);
 
-static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes. */
+static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                                   int line_size)
 {
     int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
 
     ASM_ACCEPT_MVI;
 
     do {
-        UINT64 shorts;
+        uint64_t shorts0, shorts1;
 
-        shorts = ldq(block);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels);
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
 
-        shorts = ldq(block + 4);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
 
         pixels += line_size;
         block += 8;
     } while (--i);
 }
 
-static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
+void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                             int line_size)
 {
-    int i = 8;
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-). */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask = zap(-1, 0x33);
+    signmask ^= signmask >> 1; /* 0x8000800080008000 */
 
     ASM_ACCEPT_MVI;
 
     do {
-        UINT64 shorts;
-
-        shorts = ldq(block);
-        shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
-        shorts += unpkbw(ldl(pixels));
-        shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
-        shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
-        shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
-        shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
-        stl(pkwb(shorts), pixels);
-
-        /* next 4 */
-        shorts = ldq(block + 4);
-        shorts &= ~WORD_VEC(0x8000);
-        shorts += unpkbw(ldl(pixels + 4));
-        shorts &= ~WORD_VEC(0x8000);
-        shorts = minuw4(shorts, WORD_VEC(0x4000));
-        shorts &= ~WORD_VEC(0x4000);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
-
-        pixels += line_size;
-        block += 8;
-    } while (--i);
-}
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0 = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw). */
+        signs0 = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4. */
+        pix1 = unpkbw(ldl(pixels + 4));
+        signs1 = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
+}
+#endif
 
 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
    Since the immediate result could be greater than 255, we do the
    shift first. The result is too low by one if the bytes were both
    odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
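A minimal, portable sketch of the byte-averaging trick the comment above
describes; BYTE_VEC is spelled out as a multiplication here so the example
is self-contained, while the real sources provide their own definition.
A naive (l1 + l2) >> 1 would let carries leak between byte lanes.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-in for BYTE_VEC: replicate one byte value
       into all eight byte lanes of a 64-bit word. */
    #define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)

    /* (b1 + b2) >> 1 on eight packed bytes.  Shifting each operand
       first keeps every intermediate lane below 256; the result is
       then one too low exactly when both bytes were odd, which is
       what (l1 & l2) & BYTE_VEC(0x01) adds back. */
    static uint64_t avg8_bytes(uint64_t l1, uint64_t l2)
    {
        return ((l1 >> 1) & BYTE_VEC(0x7f))
             + ((l2 >> 1) & BYTE_VEC(0x7f))
             + (l1 & l2 & BYTE_VEC(0x01));
    }

    int main(void)
    {
        /* Byte 0: (254 + 255) >> 1 = 254; byte 1: (3 + 5) >> 1 = 4. */
        printf("%016llx\n",
               (unsigned long long)avg8_bytes(0x03feULL, 0x05ffULL));
        /* prints 00000000000004fe */
        return 0;
    }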
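The rewritten add_pixels_clamped_mvi in the hunk above adds four signed
16-bit DCT coefficients to four unpacked pixel bytes inside one 64-bit
register. Here is a portable sketch of that signed-subword-add (MMX paddw)
trick, with the masks written as literals rather than generated via zap
as the original does:

    #include <stdint.h>
    #include <stdio.h>

    /* Add four packed signed 16-bit lanes (coeffs) to four byte values
       zero-extended to 16 bits (pix) without a carry ever crossing a
       lane boundary: save the lane sign bits, add with the sign bits
       cleared (so each lane stays below 2^16), then xor the signs back,
       which is addition modulo 2^16 for the top bit of each lane. */
    static uint64_t paddw_sketch(uint64_t coeffs, uint64_t pix)
    {
        const uint64_t signmask = 0x8000800080008000ULL;
        uint64_t signs = coeffs & signmask;
        coeffs &= ~signmask;
        coeffs += pix;      /* max per lane: 0x7fff + 0xff < 0x10000 */
        return coeffs ^ signs;
    }

    int main(void)
    {
        uint64_t coeffs = (0x012cULL << 16) | 0xfffeULL; /* 300, -2 */
        uint64_t pix    = (0x00c8ULL << 16) | 0x0003ULL; /* 200,  3 */
        printf("%016llx\n",
               (unsigned long long)paddw_sketch(coeffs, pix));
        /* prints 0000000001f40001, i.e. lanes 500 and 1 */
        return 0;
    }

The real routine then clamps each lane to [0, 255] with maxsw4/minsw4 and
repacks the words to bytes with pkwb; its clampmask and signmask constants
are built with zap instructions instead of being loaded from the constant
pool, which is what keeps the function a leaf function, as its comment notes.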
@@ -220,9 +242,9 @@
     put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
     put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
 
     /* amask clears all bits that correspond to present features. */
     if (amask(AMASK_MVI) == 0) {
-        put_pixels_clamped = put_pixels_clamped_axp;
-        add_pixels_clamped = add_pixels_clamped_axp;
+        put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        add_pixels_clamped = add_pixels_clamped_mvi_asm;
     }
 }
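For completeness, a hedged sketch of the feature test driving the dispatch
above, assuming GCC inline assembly on Alpha; the amask_sketch name and the
MVI bit value are illustrative assumptions, since the real amask and
AMASK_MVI definitions come from asm.h.

    #include <stdint.h>

    /* amask clears those bits of its operand that correspond to
       implemented architecture extensions, so a result of zero means
       "all requested features are present".  The MVI extension is
       assumed to be bit 8 here. */
    #define AMASK_MVI_SKETCH (1 << 8)

    static inline uint64_t amask_sketch(uint64_t features)
    {
        uint64_t unimplemented;
        __asm__ ("amask %1, %0" : "=r" (unimplemented) : "rI" (features));
        return unimplemented;
    }

    /* Usage, mirroring the init code above:
     *
     *     if (amask_sketch(AMASK_MVI_SKETCH) == 0) {
     *         put_pixels_clamped = put_pixels_clamped_mvi_asm;
     *         add_pixels_clamped = add_pixels_clamped_mvi_asm;
     *     }
     */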