libavcodec: comparison of ppc/h264_template_altivec.c @ 5530:cd266411b11a
Use shorter types vec_"type" instead of the overly long vector "type".
Part 1 of H.264 luma interpolation 8x8 for AltiVec, contributed by
Mauricio Alvarez % lokifo A gmail P com %
Original thread:
Date: Jun 26, 2007 8:15 PM
Subject: Re: [FFmpeg-devel] [PATCH] h264 luma interpolation 8x8 for altivec
author    gpoirier
date      Sun, 12 Aug 2007 13:50:06 +0000
parents   41cabe79ba25
children  861eb234e6ba
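The shorthand vec_*_t types and the zero-vector helpers (LOAD_ZERO, zero_u8v, zero_s16v) on the new side of the diff come from a shared AltiVec header; the following is a minimal sketch of definitions consistent with how they are used here, not a verified copy of that header:

    /* Shorthand vector types (sketch; the real definitions live in a
     * common AltiVec header shared by the PPC code). */
    #define vec_u8_t  vector unsigned char
    #define vec_s8_t  vector signed char
    #define vec_u16_t vector unsigned short
    #define vec_s16_t vector signed short
    #define vec_u32_t vector unsigned int
    #define vec_s32_t vector signed int

    /* One zero vector declared per function, reinterpreted as needed,
     * replacing the old per-function "vzero" constant. */
    #define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8(0)
    #define zero_u8v  ((vec_u8_t)  zerov)
    #define zero_s16v ((vec_s16_t) zerov)
    #define zero_s32v ((vec_s32_t) zerov)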
5529:af68496af656 | 5530:cd266411b11a |
---|---|
25 {((8 - x) * (8 - y)), | 25 {((8 - x) * (8 - y)), |
26 ((x) * (8 - y)), | 26 ((x) * (8 - y)), |
27 ((8 - x) * (y)), | 27 ((8 - x) * (y)), |
28 ((x) * (y))}; | 28 ((x) * (y))}; |
29 register int i; | 29 register int i; |
30 vector unsigned char fperm; | 30 vec_u8_t fperm; |
31 const vector signed int vABCD = vec_ld(0, ABCD); | 31 const vec_s32_t vABCD = vec_ld(0, ABCD); |
32 const vector signed short vA = vec_splat((vector signed short)vABCD, 1); | 32 const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); |
33 const vector signed short vB = vec_splat((vector signed short)vABCD, 3); | 33 const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); |
34 const vector signed short vC = vec_splat((vector signed short)vABCD, 5); | 34 const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); |
35 const vector signed short vD = vec_splat((vector signed short)vABCD, 7); | 35 const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); |
36 const vector signed int vzero = vec_splat_s32(0); | 36 LOAD_ZERO; |
37 const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); | 37 const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); |
38 const vector unsigned short v6us = vec_splat_u16(6); | 38 const vec_u16_t v6us = vec_splat_u16(6); |
39 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; | 39 register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; |
40 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; | 40 register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; |
41 | 41 |
42 vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; | 42 vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; |
43 vector unsigned char vsrc0uc, vsrc1uc; | 43 vec_u8_t vsrc0uc, vsrc1uc; |
44 vector signed short vsrc0ssH, vsrc1ssH; | 44 vec_s16_t vsrc0ssH, vsrc1ssH; |
45 vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; | 45 vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; |
46 vector signed short vsrc2ssH, vsrc3ssH, psum; | 46 vec_s16_t vsrc2ssH, vsrc3ssH, psum; |
47 vector unsigned char vdst, ppsum, vfdst, fsum; | 47 vec_u8_t vdst, ppsum, vfdst, fsum; |
48 | 48 |
49 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); | 49 POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); |
50 | 50 |
51 if (((unsigned long)dst) % 16 == 0) { | 51 if (((unsigned long)dst) % 16 == 0) { |
52 fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, | 52 fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, |
53 0x14, 0x15, 0x16, 0x17, | 53 0x14, 0x15, 0x16, 0x17, |
54 0x08, 0x09, 0x0A, 0x0B, | 54 0x08, 0x09, 0x0A, 0x0B, |
55 0x0C, 0x0D, 0x0E, 0x0F); | 55 0x0C, 0x0D, 0x0E, 0x0F); |
56 } else { | 56 } else { |
57 fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, | 57 fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, |
58 0x04, 0x05, 0x06, 0x07, | 58 0x04, 0x05, 0x06, 0x07, |
59 0x18, 0x19, 0x1A, 0x1B, | 59 0x18, 0x19, 0x1A, 0x1B, |
60 0x1C, 0x1D, 0x1E, 0x1F); | 60 0x1C, 0x1D, 0x1E, 0x1F); |
61 } | 61 } |
62 | 62 |
63 vsrcAuc = vec_ld(0, src); | 63 vsrcAuc = vec_ld(0, src); |
64 | 64 |
65 if (loadSecond) | 65 if (loadSecond) |
71 if (reallyBadAlign) | 71 if (reallyBadAlign) |
72 vsrc1uc = vsrcBuc; | 72 vsrc1uc = vsrcBuc; |
73 else | 73 else |
74 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); | 74 vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); |
75 | 75 |
76 vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | 76 vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); |
77 (vector unsigned char)vsrc0uc); | 77 vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); |
78 vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | |
79 (vector unsigned char)vsrc1uc); | |
80 | 78 |
81 if (!loadSecond) {// -> !reallyBadAlign | 79 if (!loadSecond) {// -> !reallyBadAlign |
82 for (i = 0 ; i < h ; i++) { | 80 for (i = 0 ; i < h ; i++) { |
83 | 81 |
84 | 82 |
85 vsrcCuc = vec_ld(stride + 0, src); | 83 vsrcCuc = vec_ld(stride + 0, src); |
86 | 84 |
87 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); | 85 vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); |
88 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); | 86 vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); |
89 | 87 |
90 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | 88 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc); |
91 (vector unsigned char)vsrc2uc); | 89 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc); |
92 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | |
93 (vector unsigned char)vsrc3uc); | |
94 | 90 |
95 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); | 91 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
96 psum = vec_mladd(vB, vsrc1ssH, psum); | 92 psum = vec_mladd(vB, vsrc1ssH, psum); |
97 psum = vec_mladd(vC, vsrc2ssH, psum); | 93 psum = vec_mladd(vC, vsrc2ssH, psum); |
98 psum = vec_mladd(vD, vsrc3ssH, psum); | 94 psum = vec_mladd(vD, vsrc3ssH, psum); |
99 psum = vec_add(v32ss, psum); | 95 psum = vec_add(v32ss, psum); |
100 psum = vec_sra(psum, v6us); | 96 psum = vec_sra(psum, v6us); |
101 | 97 |
102 vdst = vec_ld(0, dst); | 98 vdst = vec_ld(0, dst); |
103 ppsum = (vector unsigned char)vec_packsu(psum, psum); | 99 ppsum = (vec_u8_t)vec_packsu(psum, psum); |
104 vfdst = vec_perm(vdst, ppsum, fperm); | 100 vfdst = vec_perm(vdst, ppsum, fperm); |
105 | 101 |
106 OP_U8_ALTIVEC(fsum, vfdst, vdst); | 102 OP_U8_ALTIVEC(fsum, vfdst, vdst); |
107 | 103 |
108 vec_st(fsum, 0, dst); | 104 vec_st(fsum, 0, dst); |
112 | 108 |
113 dst += stride; | 109 dst += stride; |
114 src += stride; | 110 src += stride; |
115 } | 111 } |
116 } else { | 112 } else { |
117 vector unsigned char vsrcDuc; | 113 vec_u8_t vsrcDuc; |
118 for (i = 0 ; i < h ; i++) { | 114 for (i = 0 ; i < h ; i++) { |
119 vsrcCuc = vec_ld(stride + 0, src); | 115 vsrcCuc = vec_ld(stride + 0, src); |
120 vsrcDuc = vec_ld(stride + 16, src); | 116 vsrcDuc = vec_ld(stride + 16, src); |
121 | 117 |
122 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); | 118 vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); |
123 if (reallyBadAlign) | 119 if (reallyBadAlign) |
124 vsrc3uc = vsrcDuc; | 120 vsrc3uc = vsrcDuc; |
125 else | 121 else |
126 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); | 122 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); |
127 | 123 |
128 vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | 124 vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc); |
129 (vector unsigned char)vsrc2uc); | 125 vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc); |
130 vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, | |
131 (vector unsigned char)vsrc3uc); | |
132 | 126 |
133 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); | 127 psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); |
134 psum = vec_mladd(vB, vsrc1ssH, psum); | 128 psum = vec_mladd(vB, vsrc1ssH, psum); |
135 psum = vec_mladd(vC, vsrc2ssH, psum); | 129 psum = vec_mladd(vC, vsrc2ssH, psum); |
136 psum = vec_mladd(vD, vsrc3ssH, psum); | 130 psum = vec_mladd(vD, vsrc3ssH, psum); |
137 psum = vec_add(v32ss, psum); | 131 psum = vec_add(v32ss, psum); |
138 psum = vec_sr(psum, v6us); | 132 psum = vec_sr(psum, v6us); |
139 | 133 |
140 vdst = vec_ld(0, dst); | 134 vdst = vec_ld(0, dst); |
141 ppsum = (vector unsigned char)vec_pack(psum, psum); | 135 ppsum = (vec_u8_t)vec_pack(psum, psum); |
142 vfdst = vec_perm(vdst, ppsum, fperm); | 136 vfdst = vec_perm(vdst, ppsum, fperm); |
143 | 137 |
144 OP_U8_ALTIVEC(fsum, vfdst, vdst); | 138 OP_U8_ALTIVEC(fsum, vfdst, vdst); |
145 | 139 |
146 vec_st(fsum, 0, dst); | 140 vec_st(fsum, 0, dst); |
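For reference, a plain-C sketch of what the vectorized chroma loop above computes per output pixel (a hypothetical helper, not part of this file; the put variant is shown, while the avg variant additionally averages with the existing dst value via OP_U8_ALTIVEC):

    /* 8-wide H.264 chroma MC: bilinear interpolation with 1/8-pel weights
     * A..D, rounded by +32 and scaled back by >>6. */
    static void chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               int stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B = (    x) * (8 - y);
        const int C = (8 - x) * (    y);
        const int D = (    x) * (    y);
        int i, j;
        for (i = 0; i < h; i++) {
            for (j = 0; j < 8; j++)
                dst[j] = (A * src[j]          + B * src[j + 1] +
                          C * src[j + stride] + D * src[j + stride + 1]
                          + 32) >> 6;
            dst += stride;
            src += stride;
        }
    }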
158 /* this code assumes stride % 16 == 0 */ | 152 /* this code assumes stride % 16 == 0 */ |
159 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | 153 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
160 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); | 154 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); |
161 register int i; | 155 register int i; |
162 | 156 |
163 const vector signed int vzero = vec_splat_s32(0); | 157 LOAD_ZERO; |
164 const vector unsigned char permM2 = vec_lvsl(-2, src); | 158 const vec_u8_t permM2 = vec_lvsl(-2, src); |
165 const vector unsigned char permM1 = vec_lvsl(-1, src); | 159 const vec_u8_t permM1 = vec_lvsl(-1, src); |
166 const vector unsigned char permP0 = vec_lvsl(+0, src); | 160 const vec_u8_t permP0 = vec_lvsl(+0, src); |
167 const vector unsigned char permP1 = vec_lvsl(+1, src); | 161 const vec_u8_t permP1 = vec_lvsl(+1, src); |
168 const vector unsigned char permP2 = vec_lvsl(+2, src); | 162 const vec_u8_t permP2 = vec_lvsl(+2, src); |
169 const vector unsigned char permP3 = vec_lvsl(+3, src); | 163 const vec_u8_t permP3 = vec_lvsl(+3, src); |
170 const vector signed short v5ss = vec_splat_s16(5); | 164 const vec_s16_t v5ss = vec_splat_s16(5); |
171 const vector unsigned short v5us = vec_splat_u16(5); | 165 const vec_u16_t v5us = vec_splat_u16(5); |
172 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 166 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
173 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | 167 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
174 const vector unsigned char dstperm = vec_lvsr(0, dst); | 168 const vec_u8_t dstperm = vec_lvsr(0, dst); |
175 const vector unsigned char neg1 = | 169 const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1); |
176 (const vector unsigned char) vec_splat_s8(-1); | 170 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm); |
177 | 171 |
178 const vector unsigned char dstmask = | 172 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
179 vec_perm((const vector unsigned char)vzero, | |
180 neg1, dstperm); | |
181 | |
182 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | |
183 | 173 |
184 register int align = ((((unsigned long)src) - 2) % 16); | 174 register int align = ((((unsigned long)src) - 2) % 16); |
185 | 175 |
186 vector signed short srcP0A, srcP0B, srcP1A, srcP1B, | 176 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, |
187 srcP2A, srcP2B, srcP3A, srcP3B, | 177 srcP2A, srcP2B, srcP3A, srcP3B, |
188 srcM1A, srcM1B, srcM2A, srcM2B, | 178 srcM1A, srcM1B, srcM2A, srcM2B, |
189 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | 179 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
190 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | 180 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
191 psumA, psumB, sumA, sumB; | 181 psumA, psumB, sumA, sumB; |
192 | 182 |
193 vector unsigned char sum, dst1, dst2, vdst, fsum, | 183 vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2; |
194 rsum, fdst1, fdst2; | |
195 | 184 |
196 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); | 185 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); |
197 | 186 |
198 for (i = 0 ; i < 16 ; i ++) { | 187 for (i = 0 ; i < 16 ; i ++) { |
199 vector unsigned char srcR1 = vec_ld(-2, src); | 188 vec_u8_t srcR1 = vec_ld(-2, src); |
200 vector unsigned char srcR2 = vec_ld(14, src); | 189 vec_u8_t srcR2 = vec_ld(14, src); |
201 | 190 |
202 switch (align) { | 191 switch (align) { |
203 default: { | 192 default: { |
204 srcM2 = vec_perm(srcR1, srcR2, permM2); | 193 srcM2 = vec_perm(srcR1, srcR2, permM2); |
205 srcM1 = vec_perm(srcR1, srcR2, permM1); | 194 srcM1 = vec_perm(srcR1, srcR2, permM1); |
215 srcP1 = vec_perm(srcR1, srcR2, permP1); | 204 srcP1 = vec_perm(srcR1, srcR2, permP1); |
216 srcP2 = vec_perm(srcR1, srcR2, permP2); | 205 srcP2 = vec_perm(srcR1, srcR2, permP2); |
217 srcP3 = srcR2; | 206 srcP3 = srcR2; |
218 } break; | 207 } break; |
219 case 12: { | 208 case 12: { |
220 vector unsigned char srcR3 = vec_ld(30, src); | 209 vec_u8_t srcR3 = vec_ld(30, src); |
221 srcM2 = vec_perm(srcR1, srcR2, permM2); | 210 srcM2 = vec_perm(srcR1, srcR2, permM2); |
222 srcM1 = vec_perm(srcR1, srcR2, permM1); | 211 srcM1 = vec_perm(srcR1, srcR2, permM1); |
223 srcP0 = vec_perm(srcR1, srcR2, permP0); | 212 srcP0 = vec_perm(srcR1, srcR2, permP0); |
224 srcP1 = vec_perm(srcR1, srcR2, permP1); | 213 srcP1 = vec_perm(srcR1, srcR2, permP1); |
225 srcP2 = srcR2; | 214 srcP2 = srcR2; |
226 srcP3 = vec_perm(srcR2, srcR3, permP3); | 215 srcP3 = vec_perm(srcR2, srcR3, permP3); |
227 } break; | 216 } break; |
228 case 13: { | 217 case 13: { |
229 vector unsigned char srcR3 = vec_ld(30, src); | 218 vec_u8_t srcR3 = vec_ld(30, src); |
230 srcM2 = vec_perm(srcR1, srcR2, permM2); | 219 srcM2 = vec_perm(srcR1, srcR2, permM2); |
231 srcM1 = vec_perm(srcR1, srcR2, permM1); | 220 srcM1 = vec_perm(srcR1, srcR2, permM1); |
232 srcP0 = vec_perm(srcR1, srcR2, permP0); | 221 srcP0 = vec_perm(srcR1, srcR2, permP0); |
233 srcP1 = srcR2; | 222 srcP1 = srcR2; |
234 srcP2 = vec_perm(srcR2, srcR3, permP2); | 223 srcP2 = vec_perm(srcR2, srcR3, permP2); |
235 srcP3 = vec_perm(srcR2, srcR3, permP3); | 224 srcP3 = vec_perm(srcR2, srcR3, permP3); |
236 } break; | 225 } break; |
237 case 14: { | 226 case 14: { |
238 vector unsigned char srcR3 = vec_ld(30, src); | 227 vec_u8_t srcR3 = vec_ld(30, src); |
239 srcM2 = vec_perm(srcR1, srcR2, permM2); | 228 srcM2 = vec_perm(srcR1, srcR2, permM2); |
240 srcM1 = vec_perm(srcR1, srcR2, permM1); | 229 srcM1 = vec_perm(srcR1, srcR2, permM1); |
241 srcP0 = srcR2; | 230 srcP0 = srcR2; |
242 srcP1 = vec_perm(srcR2, srcR3, permP1); | 231 srcP1 = vec_perm(srcR2, srcR3, permP1); |
243 srcP2 = vec_perm(srcR2, srcR3, permP2); | 232 srcP2 = vec_perm(srcR2, srcR3, permP2); |
244 srcP3 = vec_perm(srcR2, srcR3, permP3); | 233 srcP3 = vec_perm(srcR2, srcR3, permP3); |
245 } break; | 234 } break; |
246 case 15: { | 235 case 15: { |
247 vector unsigned char srcR3 = vec_ld(30, src); | 236 vec_u8_t srcR3 = vec_ld(30, src); |
248 srcM2 = vec_perm(srcR1, srcR2, permM2); | 237 srcM2 = vec_perm(srcR1, srcR2, permM2); |
249 srcM1 = srcR2; | 238 srcM1 = srcR2; |
250 srcP0 = vec_perm(srcR2, srcR3, permP0); | 239 srcP0 = vec_perm(srcR2, srcR3, permP0); |
251 srcP1 = vec_perm(srcR2, srcR3, permP1); | 240 srcP1 = vec_perm(srcR2, srcR3, permP1); |
252 srcP2 = vec_perm(srcR2, srcR3, permP2); | 241 srcP2 = vec_perm(srcR2, srcR3, permP2); |
253 srcP3 = vec_perm(srcR2, srcR3, permP3); | 242 srcP3 = vec_perm(srcR2, srcR3, permP3); |
254 } break; | 243 } break; |
255 } | 244 } |
256 | 245 |
257 srcP0A = (vector signed short) | 246 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); |
258 vec_mergeh((vector unsigned char)vzero, srcP0); | 247 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); |
259 srcP0B = (vector signed short) | 248 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); |
260 vec_mergel((vector unsigned char)vzero, srcP0); | 249 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); |
261 srcP1A = (vector signed short) | 250 |
262 vec_mergeh((vector unsigned char)vzero, srcP1); | 251 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); |
263 srcP1B = (vector signed short) | 252 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); |
264 vec_mergel((vector unsigned char)vzero, srcP1); | 253 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); |
265 | 254 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); |
266 srcP2A = (vector signed short) | 255 |
267 vec_mergeh((vector unsigned char)vzero, srcP2); | 256 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); |
268 srcP2B = (vector signed short) | 257 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); |
269 vec_mergel((vector unsigned char)vzero, srcP2); | 258 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); |
270 srcP3A = (vector signed short) | 259 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); |
271 vec_mergeh((vector unsigned char)vzero, srcP3); | |
272 srcP3B = (vector signed short) | |
273 vec_mergel((vector unsigned char)vzero, srcP3); | |
274 | |
275 srcM1A = (vector signed short) | |
276 vec_mergeh((vector unsigned char)vzero, srcM1); | |
277 srcM1B = (vector signed short) | |
278 vec_mergel((vector unsigned char)vzero, srcM1); | |
279 srcM2A = (vector signed short) | |
280 vec_mergeh((vector unsigned char)vzero, srcM2); | |
281 srcM2B = (vector signed short) | |
282 vec_mergel((vector unsigned char)vzero, srcM2); | |
283 | 260 |
284 sum1A = vec_adds(srcP0A, srcP1A); | 261 sum1A = vec_adds(srcP0A, srcP1A); |
285 sum1B = vec_adds(srcP0B, srcP1B); | 262 sum1B = vec_adds(srcP0B, srcP1B); |
286 sum2A = vec_adds(srcM1A, srcP2A); | 263 sum2A = vec_adds(srcM1A, srcP2A); |
287 sum2B = vec_adds(srcM1B, srcP2B); | 264 sum2B = vec_adds(srcM1B, srcP2B); |
289 sum3B = vec_adds(srcM2B, srcP3B); | 266 sum3B = vec_adds(srcM2B, srcP3B); |
290 | 267 |
291 pp1A = vec_mladd(sum1A, v20ss, v16ss); | 268 pp1A = vec_mladd(sum1A, v20ss, v16ss); |
292 pp1B = vec_mladd(sum1B, v20ss, v16ss); | 269 pp1B = vec_mladd(sum1B, v20ss, v16ss); |
293 | 270 |
294 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); | 271 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); |
295 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); | 272 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); |
296 | 273 |
297 pp3A = vec_add(sum3A, pp1A); | 274 pp3A = vec_add(sum3A, pp1A); |
298 pp3B = vec_add(sum3B, pp1B); | 275 pp3B = vec_add(sum3B, pp1B); |
299 | 276 |
300 psumA = vec_sub(pp3A, pp2A); | 277 psumA = vec_sub(pp3A, pp2A); |
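As a plain-C reference for the loop above, one output pixel of the horizontal 6-tap half-pel filter looks roughly like this (hypothetical helper; taps (1, -5, 20, 20, -5, 1) with +16 rounding and a >>5 scale, matching v20ss, v5ss, v16ss and v5us):

    static inline uint8_t lowpass6_h_ref(const uint8_t *s)
    {
        int v = 20 * (s[0]  + s[1])     /* centre pair, weight 20 */
              -  5 * (s[-1] + s[2])     /* inner pair, weight -5  */
              +      (s[-2] + s[3]);    /* outer pair, weight 1   */
        v = (v + 16) >> 5;              /* round and scale        */
        return v < 0 ? 0 : v > 255 ? 255 : v;  /* saturate to 8 bits */
    }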
328 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { | 305 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { |
329 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); | 306 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); |
330 | 307 |
331 register int i; | 308 register int i; |
332 | 309 |
333 const vector signed int vzero = vec_splat_s32(0); | 310 LOAD_ZERO; |
334 const vector unsigned char perm = vec_lvsl(0, src); | 311 const vec_u8_t perm = vec_lvsl(0, src); |
335 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 312 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
336 const vector unsigned short v5us = vec_splat_u16(5); | 313 const vec_u16_t v5us = vec_splat_u16(5); |
337 const vector signed short v5ss = vec_splat_s16(5); | 314 const vec_s16_t v5ss = vec_splat_s16(5); |
338 const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); | 315 const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); |
339 const vector unsigned char dstperm = vec_lvsr(0, dst); | 316 const vec_u8_t dstperm = vec_lvsr(0, dst); |
340 const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); | 317 const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1); |
341 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); | 318 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm); |
342 | 319 |
343 uint8_t *srcbis = src - (srcStride * 2); | 320 uint8_t *srcbis = src - (srcStride * 2); |
344 | 321 |
345 const vector unsigned char srcM2a = vec_ld(0, srcbis); | 322 const vec_u8_t srcM2a = vec_ld(0, srcbis); |
346 const vector unsigned char srcM2b = vec_ld(16, srcbis); | 323 const vec_u8_t srcM2b = vec_ld(16, srcbis); |
347 const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); | 324 const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); |
348 // srcbis += srcStride; | 325 // srcbis += srcStride; |
349 const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride); | 326 const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); |
350 const vector unsigned char srcM1b = vec_ld(16, srcbis); | 327 const vec_u8_t srcM1b = vec_ld(16, srcbis); |
351 const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); | 328 const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); |
352 // srcbis += srcStride; | 329 // srcbis += srcStride; |
353 const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride); | 330 const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); |
354 const vector unsigned char srcP0b = vec_ld(16, srcbis); | 331 const vec_u8_t srcP0b = vec_ld(16, srcbis); |
355 const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); | 332 const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); |
356 // srcbis += srcStride; | 333 // srcbis += srcStride; |
357 const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride); | 334 const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); |
358 const vector unsigned char srcP1b = vec_ld(16, srcbis); | 335 const vec_u8_t srcP1b = vec_ld(16, srcbis); |
359 const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); | 336 const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); |
360 // srcbis += srcStride; | 337 // srcbis += srcStride; |
361 const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride); | 338 const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); |
362 const vector unsigned char srcP2b = vec_ld(16, srcbis); | 339 const vec_u8_t srcP2b = vec_ld(16, srcbis); |
363 const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); | 340 const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); |
364 // srcbis += srcStride; | 341 // srcbis += srcStride; |
365 | 342 |
366 vector signed short srcM2ssA = (vector signed short) | 343 vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); |
367 vec_mergeh((vector unsigned char)vzero, srcM2); | 344 vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); |
368 vector signed short srcM2ssB = (vector signed short) | 345 vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); |
369 vec_mergel((vector unsigned char)vzero, srcM2); | 346 vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); |
370 vector signed short srcM1ssA = (vector signed short) | 347 vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); |
371 vec_mergeh((vector unsigned char)vzero, srcM1); | 348 vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); |
372 vector signed short srcM1ssB = (vector signed short) | 349 vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); |
373 vec_mergel((vector unsigned char)vzero, srcM1); | 350 vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); |
374 vector signed short srcP0ssA = (vector signed short) | 351 vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); |
375 vec_mergeh((vector unsigned char)vzero, srcP0); | 352 vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); |
376 vector signed short srcP0ssB = (vector signed short) | 353 |
377 vec_mergel((vector unsigned char)vzero, srcP0); | 354 vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, |
378 vector signed short srcP1ssA = (vector signed short) | |
379 vec_mergeh((vector unsigned char)vzero, srcP1); | |
380 vector signed short srcP1ssB = (vector signed short) | |
381 vec_mergel((vector unsigned char)vzero, srcP1); | |
382 vector signed short srcP2ssA = (vector signed short) | |
383 vec_mergeh((vector unsigned char)vzero, srcP2); | |
384 vector signed short srcP2ssB = (vector signed short) | |
385 vec_mergel((vector unsigned char)vzero, srcP2); | |
386 | |
387 vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, | |
388 psumA, psumB, sumA, sumB, | 355 psumA, psumB, sumA, sumB, |
389 srcP3ssA, srcP3ssB, | 356 srcP3ssA, srcP3ssB, |
390 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; | 357 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; |
391 | 358 |
392 vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, | 359 vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3; |
393 srcP3a, srcP3b, srcP3; | |
394 | 360 |
395 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); | 361 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); |
396 | 362 |
397 for (i = 0 ; i < 16 ; i++) { | 363 for (i = 0 ; i < 16 ; i++) { |
398 srcP3a = vec_ld(0, srcbis += srcStride); | 364 srcP3a = vec_ld(0, srcbis += srcStride); |
399 srcP3b = vec_ld(16, srcbis); | 365 srcP3b = vec_ld(16, srcbis); |
400 srcP3 = vec_perm(srcP3a, srcP3b, perm); | 366 srcP3 = vec_perm(srcP3a, srcP3b, perm); |
401 srcP3ssA = (vector signed short) | 367 srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); |
402 vec_mergeh((vector unsigned char)vzero, srcP3); | 368 srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); |
403 srcP3ssB = (vector signed short) | |
404 vec_mergel((vector unsigned char)vzero, srcP3); | |
405 // srcbis += srcStride; | 369 // srcbis += srcStride; |
406 | 370 |
407 sum1A = vec_adds(srcP0ssA, srcP1ssA); | 371 sum1A = vec_adds(srcP0ssA, srcP1ssA); |
408 sum1B = vec_adds(srcP0ssB, srcP1ssB); | 372 sum1B = vec_adds(srcP0ssB, srcP1ssB); |
409 sum2A = vec_adds(srcM1ssA, srcP2ssA); | 373 sum2A = vec_adds(srcM1ssA, srcP2ssA); |
423 srcP2ssB = srcP3ssB; | 387 srcP2ssB = srcP3ssB; |
424 | 388 |
425 pp1A = vec_mladd(sum1A, v20ss, v16ss); | 389 pp1A = vec_mladd(sum1A, v20ss, v16ss); |
426 pp1B = vec_mladd(sum1B, v20ss, v16ss); | 390 pp1B = vec_mladd(sum1B, v20ss, v16ss); |
427 | 391 |
428 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); | 392 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); |
429 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); | 393 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); |
430 | 394 |
431 pp3A = vec_add(sum3A, pp1A); | 395 pp3A = vec_add(sum3A, pp1A); |
432 pp3B = vec_add(sum3B, pp1B); | 396 pp3B = vec_add(sum3B, pp1B); |
433 | 397 |
434 psumA = vec_sub(pp3A, pp2A); | 398 psumA = vec_sub(pp3A, pp2A); |
459 | 423 |
460 /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ | 424 /* this code assumes stride % 16 == 0 *and* tmp is properly aligned */ |
461 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { | 425 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { |
462 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); | 426 POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
463 register int i; | 427 register int i; |
464 const vector signed int vzero = vec_splat_s32(0); | 428 LOAD_ZERO; |
465 const vector unsigned char permM2 = vec_lvsl(-2, src); | 429 const vec_u8_t permM2 = vec_lvsl(-2, src); |
466 const vector unsigned char permM1 = vec_lvsl(-1, src); | 430 const vec_u8_t permM1 = vec_lvsl(-1, src); |
467 const vector unsigned char permP0 = vec_lvsl(+0, src); | 431 const vec_u8_t permP0 = vec_lvsl(+0, src); |
468 const vector unsigned char permP1 = vec_lvsl(+1, src); | 432 const vec_u8_t permP1 = vec_lvsl(+1, src); |
469 const vector unsigned char permP2 = vec_lvsl(+2, src); | 433 const vec_u8_t permP2 = vec_lvsl(+2, src); |
470 const vector unsigned char permP3 = vec_lvsl(+3, src); | 434 const vec_u8_t permP3 = vec_lvsl(+3, src); |
471 const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); | 435 const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); |
472 const vector unsigned int v10ui = vec_splat_u32(10); | 436 const vec_u32_t v10ui = vec_splat_u32(10); |
473 const vector signed short v5ss = vec_splat_s16(5); | 437 const vec_s16_t v5ss = vec_splat_s16(5); |
474 const vector signed short v1ss = vec_splat_s16(1); | 438 const vec_s16_t v1ss = vec_splat_s16(1); |
475 const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); | 439 const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); |
476 const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); | 440 const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); |
477 | 441 |
478 register int align = ((((unsigned long)src) - 2) % 16); | 442 register int align = ((((unsigned long)src) - 2) % 16); |
479 | 443 |
480 const vector unsigned char neg1 = (const vector unsigned char) | 444 const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1); |
481 vec_splat_s8(-1); | 445 |
482 | 446 vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, |
483 vector signed short srcP0A, srcP0B, srcP1A, srcP1B, | |
484 srcP2A, srcP2B, srcP3A, srcP3B, | 447 srcP2A, srcP2B, srcP3A, srcP3B, |
485 srcM1A, srcM1B, srcM2A, srcM2B, | 448 srcM1A, srcM1B, srcM2A, srcM2B, |
486 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, | 449 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, |
487 pp1A, pp1B, pp2A, pp2B, psumA, psumB; | 450 pp1A, pp1B, pp2A, pp2B, psumA, psumB; |
488 | 451 |
489 const vector unsigned char dstperm = vec_lvsr(0, dst); | 452 const vec_u8_t dstperm = vec_lvsr(0, dst); |
490 | 453 |
491 const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); | 454 const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm); |
492 | 455 |
493 const vector unsigned char mperm = (const vector unsigned char) | 456 const vec_u8_t mperm = (const vec_u8_t) |
494 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, | 457 AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, |
495 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); | 458 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); |
496 int16_t *tmpbis = tmp; | 459 int16_t *tmpbis = tmp; |
497 | 460 |
498 vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, | 461 vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, |
499 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, | 462 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, |
500 tmpP2ssA, tmpP2ssB; | 463 tmpP2ssA, tmpP2ssB; |
501 | 464 |
502 vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, | 465 vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, |
503 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, | 466 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, |
504 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, | 467 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, |
505 ssumAe, ssumAo, ssumBe, ssumBo; | 468 ssumAe, ssumAo, ssumBe, ssumBo; |
506 vector unsigned char fsum, sumv, sum, dst1, dst2, vdst, | 469 vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2; |
507 rsum, fdst1, fdst2; | 470 vec_s16_t ssume, ssumo; |
508 vector signed short ssume, ssumo; | |
509 | 471 |
510 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); | 472 POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); |
511 src -= (2 * srcStride); | 473 src -= (2 * srcStride); |
512 for (i = 0 ; i < 21 ; i ++) { | 474 for (i = 0 ; i < 21 ; i ++) { |
513 vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; | 475 vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; |
514 vector unsigned char srcR1 = vec_ld(-2, src); | 476 vec_u8_t srcR1 = vec_ld(-2, src); |
515 vector unsigned char srcR2 = vec_ld(14, src); | 477 vec_u8_t srcR2 = vec_ld(14, src); |
516 | 478 |
517 switch (align) { | 479 switch (align) { |
518 default: { | 480 default: { |
519 srcM2 = vec_perm(srcR1, srcR2, permM2); | 481 srcM2 = vec_perm(srcR1, srcR2, permM2); |
520 srcM1 = vec_perm(srcR1, srcR2, permM1); | 482 srcM1 = vec_perm(srcR1, srcR2, permM1); |
530 srcP1 = vec_perm(srcR1, srcR2, permP1); | 492 srcP1 = vec_perm(srcR1, srcR2, permP1); |
531 srcP2 = vec_perm(srcR1, srcR2, permP2); | 493 srcP2 = vec_perm(srcR1, srcR2, permP2); |
532 srcP3 = srcR2; | 494 srcP3 = srcR2; |
533 } break; | 495 } break; |
534 case 12: { | 496 case 12: { |
535 vector unsigned char srcR3 = vec_ld(30, src); | 497 vec_u8_t srcR3 = vec_ld(30, src); |
536 srcM2 = vec_perm(srcR1, srcR2, permM2); | 498 srcM2 = vec_perm(srcR1, srcR2, permM2); |
537 srcM1 = vec_perm(srcR1, srcR2, permM1); | 499 srcM1 = vec_perm(srcR1, srcR2, permM1); |
538 srcP0 = vec_perm(srcR1, srcR2, permP0); | 500 srcP0 = vec_perm(srcR1, srcR2, permP0); |
539 srcP1 = vec_perm(srcR1, srcR2, permP1); | 501 srcP1 = vec_perm(srcR1, srcR2, permP1); |
540 srcP2 = srcR2; | 502 srcP2 = srcR2; |
541 srcP3 = vec_perm(srcR2, srcR3, permP3); | 503 srcP3 = vec_perm(srcR2, srcR3, permP3); |
542 } break; | 504 } break; |
543 case 13: { | 505 case 13: { |
544 vector unsigned char srcR3 = vec_ld(30, src); | 506 vec_u8_t srcR3 = vec_ld(30, src); |
545 srcM2 = vec_perm(srcR1, srcR2, permM2); | 507 srcM2 = vec_perm(srcR1, srcR2, permM2); |
546 srcM1 = vec_perm(srcR1, srcR2, permM1); | 508 srcM1 = vec_perm(srcR1, srcR2, permM1); |
547 srcP0 = vec_perm(srcR1, srcR2, permP0); | 509 srcP0 = vec_perm(srcR1, srcR2, permP0); |
548 srcP1 = srcR2; | 510 srcP1 = srcR2; |
549 srcP2 = vec_perm(srcR2, srcR3, permP2); | 511 srcP2 = vec_perm(srcR2, srcR3, permP2); |
550 srcP3 = vec_perm(srcR2, srcR3, permP3); | 512 srcP3 = vec_perm(srcR2, srcR3, permP3); |
551 } break; | 513 } break; |
552 case 14: { | 514 case 14: { |
553 vector unsigned char srcR3 = vec_ld(30, src); | 515 vec_u8_t srcR3 = vec_ld(30, src); |
554 srcM2 = vec_perm(srcR1, srcR2, permM2); | 516 srcM2 = vec_perm(srcR1, srcR2, permM2); |
555 srcM1 = vec_perm(srcR1, srcR2, permM1); | 517 srcM1 = vec_perm(srcR1, srcR2, permM1); |
556 srcP0 = srcR2; | 518 srcP0 = srcR2; |
557 srcP1 = vec_perm(srcR2, srcR3, permP1); | 519 srcP1 = vec_perm(srcR2, srcR3, permP1); |
558 srcP2 = vec_perm(srcR2, srcR3, permP2); | 520 srcP2 = vec_perm(srcR2, srcR3, permP2); |
559 srcP3 = vec_perm(srcR2, srcR3, permP3); | 521 srcP3 = vec_perm(srcR2, srcR3, permP3); |
560 } break; | 522 } break; |
561 case 15: { | 523 case 15: { |
562 vector unsigned char srcR3 = vec_ld(30, src); | 524 vec_u8_t srcR3 = vec_ld(30, src); |
563 srcM2 = vec_perm(srcR1, srcR2, permM2); | 525 srcM2 = vec_perm(srcR1, srcR2, permM2); |
564 srcM1 = srcR2; | 526 srcM1 = srcR2; |
565 srcP0 = vec_perm(srcR2, srcR3, permP0); | 527 srcP0 = vec_perm(srcR2, srcR3, permP0); |
566 srcP1 = vec_perm(srcR2, srcR3, permP1); | 528 srcP1 = vec_perm(srcR2, srcR3, permP1); |
567 srcP2 = vec_perm(srcR2, srcR3, permP2); | 529 srcP2 = vec_perm(srcR2, srcR3, permP2); |
568 srcP3 = vec_perm(srcR2, srcR3, permP3); | 530 srcP3 = vec_perm(srcR2, srcR3, permP3); |
569 } break; | 531 } break; |
570 } | 532 } |
571 | 533 |
572 srcP0A = (vector signed short) | 534 srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); |
573 vec_mergeh((vector unsigned char)vzero, srcP0); | 535 srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); |
574 srcP0B = (vector signed short) | 536 srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); |
575 vec_mergel((vector unsigned char)vzero, srcP0); | 537 srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); |
576 srcP1A = (vector signed short) | 538 |
577 vec_mergeh((vector unsigned char)vzero, srcP1); | 539 srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); |
578 srcP1B = (vector signed short) | 540 srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); |
579 vec_mergel((vector unsigned char)vzero, srcP1); | 541 srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); |
580 | 542 srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); |
581 srcP2A = (vector signed short) | 543 |
582 vec_mergeh((vector unsigned char)vzero, srcP2); | 544 srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); |
583 srcP2B = (vector signed short) | 545 srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); |
584 vec_mergel((vector unsigned char)vzero, srcP2); | 546 srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); |
585 srcP3A = (vector signed short) | 547 srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); |
586 vec_mergeh((vector unsigned char)vzero, srcP3); | |
587 srcP3B = (vector signed short) | |
588 vec_mergel((vector unsigned char)vzero, srcP3); | |
589 | |
590 srcM1A = (vector signed short) | |
591 vec_mergeh((vector unsigned char)vzero, srcM1); | |
592 srcM1B = (vector signed short) | |
593 vec_mergel((vector unsigned char)vzero, srcM1); | |
594 srcM2A = (vector signed short) | |
595 vec_mergeh((vector unsigned char)vzero, srcM2); | |
596 srcM2B = (vector signed short) | |
597 vec_mergel((vector unsigned char)vzero, srcM2); | |
598 | 548 |
599 sum1A = vec_adds(srcP0A, srcP1A); | 549 sum1A = vec_adds(srcP0A, srcP1A); |
600 sum1B = vec_adds(srcP0B, srcP1B); | 550 sum1B = vec_adds(srcP0B, srcP1B); |
601 sum2A = vec_adds(srcM1A, srcP2A); | 551 sum2A = vec_adds(srcM1A, srcP2A); |
602 sum2B = vec_adds(srcM1B, srcP2B); | 552 sum2B = vec_adds(srcM1B, srcP2B); |
604 sum3B = vec_adds(srcM2B, srcP3B); | 554 sum3B = vec_adds(srcM2B, srcP3B); |
605 | 555 |
606 pp1A = vec_mladd(sum1A, v20ss, sum3A); | 556 pp1A = vec_mladd(sum1A, v20ss, sum3A); |
607 pp1B = vec_mladd(sum1B, v20ss, sum3B); | 557 pp1B = vec_mladd(sum1B, v20ss, sum3B); |
608 | 558 |
609 pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); | 559 pp2A = vec_mladd(sum2A, v5ss, zero_s16v); |
610 pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); | 560 pp2B = vec_mladd(sum2B, v5ss, zero_s16v); |
611 | 561 |
612 psumA = vec_sub(pp1A, pp2A); | 562 psumA = vec_sub(pp1A, pp2A); |
613 psumB = vec_sub(pp1B, pp2B); | 563 psumB = vec_sub(pp1B, pp2B); |
614 | 564 |
615 vec_st(psumA, 0, tmp); | 565 vec_st(psumA, 0, tmp); |
634 tmpP2ssA = vec_ld(0, tmpbis); | 584 tmpP2ssA = vec_ld(0, tmpbis); |
635 tmpP2ssB = vec_ld(16, tmpbis); | 585 tmpP2ssB = vec_ld(16, tmpbis); |
636 tmpbis += tmpStride; | 586 tmpbis += tmpStride; |
637 | 587 |
638 for (i = 0 ; i < 16 ; i++) { | 588 for (i = 0 ; i < 16 ; i++) { |
639 const vector signed short tmpP3ssA = vec_ld(0, tmpbis); | 589 const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); |
640 const vector signed short tmpP3ssB = vec_ld(16, tmpbis); | 590 const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); |
641 | 591 |
642 const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); | 592 const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); |
643 const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); | 593 const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); |
644 const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); | 594 const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); |
645 const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); | 595 const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); |
646 const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); | 596 const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); |
647 const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); | 597 const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); |
648 | 598 |
649 tmpbis += tmpStride; | 599 tmpbis += tmpStride; |
650 | 600 |
651 tmpM2ssA = tmpM1ssA; | 601 tmpM2ssA = tmpM1ssA; |
652 tmpM2ssB = tmpM1ssB; | 602 tmpM2ssB = tmpM1ssB; |
667 pp2Ae = vec_mule(sum2A, v5ss); | 617 pp2Ae = vec_mule(sum2A, v5ss); |
668 pp2Ao = vec_mulo(sum2A, v5ss); | 618 pp2Ao = vec_mulo(sum2A, v5ss); |
669 pp2Be = vec_mule(sum2B, v5ss); | 619 pp2Be = vec_mule(sum2B, v5ss); |
670 pp2Bo = vec_mulo(sum2B, v5ss); | 620 pp2Bo = vec_mulo(sum2B, v5ss); |
671 | 621 |
672 pp3Ae = vec_sra((vector signed int)sum3A, v16ui); | 622 pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); |
673 pp3Ao = vec_mulo(sum3A, v1ss); | 623 pp3Ao = vec_mulo(sum3A, v1ss); |
674 pp3Be = vec_sra((vector signed int)sum3B, v16ui); | 624 pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); |
675 pp3Bo = vec_mulo(sum3B, v1ss); | 625 pp3Bo = vec_mulo(sum3B, v1ss); |
676 | 626 |
677 pp1cAe = vec_add(pp1Ae, v512si); | 627 pp1cAe = vec_add(pp1Ae, v512si); |
678 pp1cAo = vec_add(pp1Ao, v512si); | 628 pp1cAo = vec_add(pp1Ao, v512si); |
679 pp1cBe = vec_add(pp1Be, v512si); | 629 pp1cBe = vec_add(pp1Be, v512si); |
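For reference, the second (vertical) pass of the hv filter above runs the same 6-tap kernel over the 16-bit intermediates stored in tmp, but with the wider +512/>>10 rounding (v512si, v10ui) that folds in the scaling left over from the first, unrounded horizontal pass; a plain-C sketch of one output pixel (hypothetical helper, names illustrative):

    static inline uint8_t lowpass6_v_tmp_ref(const int16_t *t, int tmpStride)
    {
        int v = 20 * (t[0]              + t[tmpStride])
              -  5 * (t[-tmpStride]     + t[2 * tmpStride])
              +      (t[-2 * tmpStride] + t[3 * tmpStride]);
        v = (v + 512) >> 10;                    /* combined rounding/scale */
        return v < 0 ? 0 : v > 255 ? 255 : v;   /* saturate to 8 bits      */
    }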