libavcodec: ppc/h264_template_altivec.c @ 6064:62d040333d51

Add C/B == 0 cases, 2% slower on CELL but should address Issue299 eventually

author    lu_zero
date      Sat, 22 Dec 2007 23:10:02 +0000
parents   47ed1b9610b1
children  180976fd652e
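The change reads most easily against the H.264 chroma interpolation weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, which always sum to 64. The else-branch rewritten below is taken when the cross weight D is zero: one of x, y is zero, so one of B, C vanishes as well, and the new vE = vec_add(vB, vC) picks up whichever weight survives, collapsing the filter to two taps. A minimal scalar sketch of that case (the helper name and signature are hypothetical, not part of the commit):

#include <stdint.h>

/* Hypothetical scalar model of the D == 0 edge case this commit vectorizes.
 * With x*y == 0, E = B + C is the single surviving cross weight, and the
 * second tap is either the horizontal neighbour (y == 0) or the next row
 * (x == 0).  A + E == 64, so the +32 rounding and >> 6 stay unchanged. */
static void chroma_mc8_edge_sketch(uint8_t *dst, const uint8_t *src,
                                   int stride, int h, int x, int y)
{
    const int A    = (8 - x) * (8 - y);
    const int E    = x * (8 - y) + (8 - x) * y;  /* == B + C, one term is 0 */
    const int step = y ? stride : 1;             /* vertical vs horizontal tap */

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (A * src[j] + E * src[j + step] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}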
--- ppc/h264_template_altivec.c    6063:47ed1b9610b1
+++ ppc/h264_template_altivec.c    6064:62d040333d51
@@ -50,27 +50,25 @@
 \
         dst += stride;\
         src += stride;
 
 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
-        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
+\
+        vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
+        vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
 \
         psum = vec_mladd(vA, vsrc0ssH, v32ss);\
-        psum = vec_mladd(vB, vsrc1ssH, psum);\
-        psum = vec_mladd(vC, vsrc2ssH, psum);\
+        psum = vec_mladd(vE, vsrc1ssH, psum);\
         psum = vec_sr(psum, v6us);\
 \
         vdst = vec_ld(0, dst);\
         ppsum = (vec_u8_t)vec_pack(psum, psum);\
         vfdst = vec_perm(vdst, ppsum, fperm);\
 \
         OP_U8_ALTIVEC(fsum, vfdst, vdst);\
 \
         vec_st(fsum, 0, dst);\
-\
-        vsrc0ssH = vsrc1ssH;\
-        vsrc1ssH = vsrc2ssH;\
 \
         dst += stride;\
         src += stride;
 
 void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
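Per byte, the rewritten CHROMA_MC8_ALTIVEC_CORE_SIMPLE now evaluates a plain two-tap filter: vec_mladd does the elementwise 16-bit multiply-adds, vec_sr(psum, v6us) is the >> 6, and vec_pack/vec_perm/vec_st narrow and store the eight result bytes. A scalar model for the "put" flavour of OP_U8_ALTIVEC (the "avg" flavour additionally averages with the bytes already in dst; the helper below is illustrative only):

#include <stdint.h>

/* Illustrative scalar equivalent of one CHROMA_MC8_ALTIVEC_CORE_SIMPLE pass
 * ("put" variant): src0 is the current row, src1 the second tap chosen by
 * the caller, A and E the weights held in vA and vE above. */
static void core_simple_row_sketch(uint8_t *dst, const uint8_t *src0,
                                   const uint8_t *src1, int A, int E)
{
    for (int k = 0; k < 8; k++)
        dst[k] = (A * src0[k] + E * src1[k] + 32) >> 6;
}

Note that the macro no longer rotates vsrc0ssH/vsrc1ssH at the end of each pass; that bookkeeping moves out to the callers as the vsrc0uc = vsrc1uc; lines added in the next hunk.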
@@ -153,26 +151,51 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
 
                 CHROMA_MC8_ALTIVEC_CORE
             }
         }
     } else {
+        const vec_s16_t vE = vec_add(vB, vC);
+        if (ABCD[2]) { // x == 0 B == 0
         if (!loadSecond) {// -> !reallyBadAlign
             for (i = 0 ; i < h ; i++) {
                 vsrcCuc = vec_ld(stride + 0, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
-
+                vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+
+                vsrc0uc = vsrc1uc;
             }
         } else {
             vec_u8_t vsrcDuc;
             for (i = 0 ; i < h ; i++) {
-                vsrcCuc = vec_ld(stride + 0, src);
-                vsrcDuc = vec_ld(stride + 16, src);
-                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
-
+                vsrcCuc = vec_ld(0, src);
+                vsrcDuc = vec_ld(15, src);
+                vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+
+                vsrc0uc = vsrc1uc;
             }
+        }
+        } else { // y == 0 C == 0
+        if (!loadSecond) {// -> !reallyBadAlign
+            for (i = 0 ; i < h ; i++) {
+                CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+
+                vsrcCuc = vec_ld(0, src);
+                vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
+                vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
+            }
+        } else {
+            vec_u8_t vsrcDuc;
+            for (i = 0 ; i < h ; i++) {
+                CHROMA_MC8_ALTIVEC_CORE_SIMPLE
+
+                vsrcCuc = vec_ld(0, src);
+                vsrcDuc = vec_ld(15, src);
+                vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
+                vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
+            }
+        }
         }
     }
     POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
 }
 
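In the loadSecond paths the two halves are now fetched as vec_ld(0, src) / vec_ld(15, src) rather than at offsets stride + 0 / stride + 16. vec_ld rounds its effective address down to a 16-byte boundary, so the misaligned-read idiom behind vsrcCuc/vsrcDuc/vsrcperm0 looks roughly like this sketch (hypothetical stand-alone helper; vector unsigned char is the file's vec_u8_t):

#include <stdint.h>
#include <altivec.h>

/* Sketch of the AltiVec misaligned-load idiom used in the loadSecond paths
 * (helper name is illustrative).  vec_ld truncates its effective address to
 * a 16-byte boundary, so two loads plus a vec_perm with the vec_lvsl shuffle
 * reassemble the unaligned 16 bytes starting at src. */
static vector unsigned char load_unaligned_16(const uint8_t *src)
{
    vector unsigned char lo   = vec_ld(0,  src);
    vector unsigned char hi   = vec_ld(15, src);
    vector unsigned char perm = vec_lvsl(0, src);
    return vec_perm(lo, hi, perm);
}

With the high half taken at offset 15, a src that happens to be 16-byte aligned makes both loads hit the same block, so nothing past src[15] is read; that is presumably the out-of-bounds access behind Issue299 that the commit message says this should eventually address.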