comparison sh4/qpel.c @ 1262:82e0e1b9c283 libavcodec

aligned dsputil (for sh4) patch by (BERO <bero at geocities dot co dot jp>)
author michaelni
date Wed, 14 May 2003 17:46:55 +0000
parents
children 2fa34e615c76
comparing 1261:362947395f5c with 1262:82e0e1b9c283
1 /*
2 this is optimized for SH-4, which has post-increment addressing (*p++);
3 on some CPUs indexed addressing (p[n]) may be faster than post-increment (*p++)
4 */
5
6 #define LD(adr) *(uint32_t*)(adr)
7
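/* LP() (aligned 32-bit load) and LD32() (possibly-unaligned 32-bit load) come
   from the surrounding sh4 dsputil code and are not defined in this file.
   For reference, a minimal sketch of the bytewise-average helpers used below,
   as they are typically defined in dsputil.h (an assumption, not part of this
   patch) -- four byte averages computed in one 32-bit word:

   static inline uint32_t rnd_avg2(uint32_t a, uint32_t b)
   {
       // per-byte (a+b+1)>>1; the 0xFE mask keeps the shift from
       // bleeding across byte boundaries
       return (a | b) - (((a ^ b) & 0xFEFEFEFEUL) >> 1);
   }
   static inline uint32_t no_rnd_avg2(uint32_t a, uint32_t b)
   {
       // per-byte (a+b)>>1, ties rounded down (the "no_rnd" variants)
       return (a & b) + (((a ^ b) & 0xFEFEFEFEUL) >> 1);
   }
*/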
8 #define PIXOP2(OPNAME, OP) \
9 /*static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
10 {\
11 do {\
12 OP(LP(dst ),no_rnd_avg2(LD32(src1 ),LD32(src2 )) ); \
13 OP(LP(dst+4),no_rnd_avg2(LD32(src1+4),LD32(src2+4)) ); \
14 src1+=src_stride1; \
15 src2+=src_stride2; \
16 dst+=dst_stride; \
17 } while(--h); \
18 }\
19 \
20 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
21 {\
22 do {\
23 OP(LP(dst ),rnd_avg2(LD32(src1 ),LD32(src2 )) ); \
24 OP(LP(dst+4),rnd_avg2(LD32(src1+4),LD32(src2+4)) ); \
25 src1+=src_stride1; \
26 src2+=src_stride2; \
27 dst+=dst_stride; \
28 } while(--h); \
29 }\
30 \
31 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
32 {\
33 do {\
34 OP(LP(dst ),rnd_avg2(LD32(src1 ),LD32(src2 )) ); \
35 src1+=src_stride1; \
36 src2+=src_stride2; \
37 dst+=dst_stride; \
38 } while(--h); \
39 }\
40 \
41 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
42 {\
43 do {\
44 OP(LP(dst ),no_rnd_avg2(LD32(src1 ),LD32(src2 )) ); \
45 OP(LP(dst+4),no_rnd_avg2(LD32(src1+4),LD32(src2+4)) ); \
46 OP(LP(dst+8),no_rnd_avg2(LD32(src1+8),LD32(src2+8)) ); \
47 OP(LP(dst+12),no_rnd_avg2(LD32(src1+12),LD32(src2+12)) ); \
48 src1+=src_stride1; \
49 src2+=src_stride2; \
50 dst+=dst_stride; \
51 } while(--h); \
52 }\
53 \
54 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
55 {\
56 do {\
57 OP(LP(dst ),rnd_avg2(LD32(src1 ),LD32(src2 )) ); \
58 OP(LP(dst+4),rnd_avg2(LD32(src1+4),LD32(src2+4)) ); \
59 OP(LP(dst+8),rnd_avg2(LD32(src1+8),LD32(src2+8)) ); \
60 OP(LP(dst+12),rnd_avg2(LD32(src1+12),LD32(src2+12)) ); \
61 src1+=src_stride1; \
62 src2+=src_stride2; \
63 dst+=dst_stride; \
64 } while(--h); \
65 }*/\
66 \
67 static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
68 {\
69 do {\
70 OP(LP(dst ),rnd_avg2(LP(src1 ),LP(src2 )) ); \
71 src1+=src_stride1; \
72 src2+=src_stride2; \
73 dst+=dst_stride; \
74 } while(--h); \
75 }\
76 \
77 static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
78 {\
79 do {\
80 OP(LP(dst ),rnd_avg2(LD32(src1 ),LP(src2 )) ); \
81 src1+=src_stride1; \
82 src2+=src_stride2; \
83 dst+=dst_stride; \
84 } while(--h); \
85 }\
86 \
87 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
88 {\
89 do {\
90 OP(LP(dst ),no_rnd_avg2(LD32(src1 ),LP(src2 )) ); \
91 OP(LP(dst+4),no_rnd_avg2(LD32(src1+4),LP(src2+4)) ); \
92 OP(LP(dst+8),no_rnd_avg2(LD32(src1+8),LP(src2+8)) ); \
93 OP(LP(dst+12),no_rnd_avg2(LD32(src1+12),LP(src2+12)) ); \
94 src1+=src_stride1; \
95 src2+=src_stride2; \
96 dst+=dst_stride; \
97 } while(--h); \
98 }\
99 \
100 static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
101 {\
102 do {\
103 OP(LP(dst ),rnd_avg2(LD32(src1 ),LP(src2 )) ); \
104 OP(LP(dst+4),rnd_avg2(LD32(src1+4),LP(src2+4)) ); \
105 OP(LP(dst+8),rnd_avg2(LD32(src1+8),LP(src2+8)) ); \
106 OP(LP(dst+12),rnd_avg2(LD32(src1+12),LP(src2+12)) ); \
107 src1+=src_stride1; \
108 src2+=src_stride2; \
109 dst+=dst_stride; \
110 } while(--h); \
111 }\
112 \
113 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
114 {\
115 do { /* only src2 is aligned */\
116 OP(LP(dst ),no_rnd_avg2(LD32(src1 ),LP(src2 )) ); \
117 OP(LP(dst+4),no_rnd_avg2(LD32(src1+4),LP(src2+4)) ); \
118 src1+=src_stride1; \
119 src2+=src_stride2; \
120 dst+=dst_stride; \
121 } while(--h); \
122 }\
123 \
124 static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
125 {\
126 do {\
127 OP(LP(dst ),rnd_avg2(LD32(src1 ),LP(src2 )) ); \
128 OP(LP(dst+4),rnd_avg2(LD32(src1+4),LP(src2+4)) ); \
129 src1+=src_stride1; \
130 src2+=src_stride2; \
131 dst+=dst_stride; \
132 } while(--h); \
133 }\
134 \
135 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
136 {\
137 do {\
138 OP(LP(dst ),no_rnd_avg2(LP(src1 ),LP(src2 )) ); \
139 OP(LP(dst+4),no_rnd_avg2(LP(src1+4),LP(src2+4)) ); \
140 src1+=src_stride1; \
141 src2+=src_stride2; \
142 dst+=dst_stride; \
143 } while(--h); \
144 }\
145 \
146 static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
147 {\
148 do {\
149 OP(LP(dst ),rnd_avg2(LP(src1 ),LP(src2 )) ); \
150 OP(LP(dst+4),rnd_avg2(LP(src1+4),LP(src2+4)) ); \
151 src1+=src_stride1; \
152 src2+=src_stride2; \
153 dst+=dst_stride; \
154 } while(--h); \
155 }\
156 \
157 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
158 {\
159 do {\
160 OP(LP(dst ),no_rnd_avg2(LP(src1 ),LP(src2 )) ); \
161 OP(LP(dst+4),no_rnd_avg2(LP(src1+4),LP(src2+4)) ); \
162 OP(LP(dst+8),no_rnd_avg2(LP(src1+8),LP(src2+8)) ); \
163 OP(LP(dst+12),no_rnd_avg2(LP(src1+12),LP(src2+12)) ); \
164 src1+=src_stride1; \
165 src2+=src_stride2; \
166 dst+=dst_stride; \
167 } while(--h); \
168 }\
169 \
170 static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
171 {\
172 do {\
173 OP(LP(dst ),rnd_avg2(LP(src1 ),LP(src2 )) ); \
174 OP(LP(dst+4),rnd_avg2(LP(src1+4),LP(src2+4)) ); \
175 OP(LP(dst+8),rnd_avg2(LP(src1+8),LP(src2+8)) ); \
176 OP(LP(dst+12),rnd_avg2(LP(src1+12),LP(src2+12)) ); \
177 src1+=src_stride1; \
178 src2+=src_stride2; \
179 dst+=dst_stride; \
180 } while(--h); \
181 }\
182 \
183 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
184 { OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
185 \
186 static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
187 { OPNAME ## _pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
188 \
189 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
190 { OPNAME ## _no_rnd_pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
191 \
192 static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
193 { OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
194 \
195 static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
196 do { \
197 uint32_t a0,a1,a2,a3; \
198 UNPACK(a0,a1,LP(src1),LP(src2)); \
199 UNPACK(a2,a3,LP(src3),LP(src4)); \
200 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
201 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
202 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
203 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
204 src1+=src_stride1;\
205 src2+=src_stride2;\
206 src3+=src_stride3;\
207 src4+=src_stride4;\
208 dst+=dst_stride;\
209 } while(--h); \
210 } \
211 \
212 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
213 do { \
214 uint32_t a0,a1,a2,a3; \
215 UNPACK(a0,a1,LP(src1),LP(src2)); \
216 UNPACK(a2,a3,LP(src3),LP(src4)); \
217 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
218 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
219 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
220 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
221 src1+=src_stride1;\
222 src2+=src_stride2;\
223 src3+=src_stride3;\
224 src4+=src_stride4;\
225 dst+=dst_stride;\
226 } while(--h); \
227 } \
228 \
229 static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
230 do { \
231 uint32_t a0,a1,a2,a3; /* only src1 is unaligned */\
232 UNPACK(a0,a1,LD32(src1),LP(src2)); \
233 UNPACK(a2,a3,LP(src3),LP(src4)); \
234 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
235 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
236 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
237 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
238 src1+=src_stride1;\
239 src2+=src_stride2;\
240 src3+=src_stride3;\
241 src4+=src_stride4;\
242 dst+=dst_stride;\
243 } while(--h); \
244 } \
245 \
246 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
247 do { \
248 uint32_t a0,a1,a2,a3; \
249 UNPACK(a0,a1,LD32(src1),LP(src2)); \
250 UNPACK(a2,a3,LP(src3),LP(src4)); \
251 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
252 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
253 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
254 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
255 src1+=src_stride1;\
256 src2+=src_stride2;\
257 src3+=src_stride3;\
258 src4+=src_stride4;\
259 dst+=dst_stride;\
260 } while(--h); \
261 } \
262 \
263 static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
264 do { \
265 uint32_t a0,a1,a2,a3; \
266 UNPACK(a0,a1,LP(src1),LP(src2)); \
267 UNPACK(a2,a3,LP(src3),LP(src4)); \
268 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
269 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
270 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
271 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
272 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \
273 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
274 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
275 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \
276 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
277 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
278 src1+=src_stride1;\
279 src2+=src_stride2;\
280 src3+=src_stride3;\
281 src4+=src_stride4;\
282 dst+=dst_stride;\
283 } while(--h); \
284 } \
285 \
286 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
287 do { \
288 uint32_t a0,a1,a2,a3; \
289 UNPACK(a0,a1,LP(src1),LP(src2)); \
290 UNPACK(a2,a3,LP(src3),LP(src4)); \
291 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
292 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
293 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
294 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
295 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \
296 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
297 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
298 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \
299 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
300 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
301 src1+=src_stride1;\
302 src2+=src_stride2;\
303 src3+=src_stride3;\
304 src4+=src_stride4;\
305 dst+=dst_stride;\
306 } while(--h); \
307 } \
308 \
309 static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
310 do { /* src1 is unaligned */\
311 uint32_t a0,a1,a2,a3; \
312 UNPACK(a0,a1,LD32(src1),LP(src2)); \
313 UNPACK(a2,a3,LP(src3),LP(src4)); \
314 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
315 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
316 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
317 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
318 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \
319 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
320 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
321 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \
322 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
323 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
324 src1+=src_stride1;\
325 src2+=src_stride2;\
326 src3+=src_stride3;\
327 src4+=src_stride4;\
328 dst+=dst_stride;\
329 } while(--h); \
330 } \
331 \
332 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
333 do { \
334 uint32_t a0,a1,a2,a3; \
335 UNPACK(a0,a1,LD32(src1),LP(src2)); \
336 UNPACK(a2,a3,LP(src3),LP(src4)); \
337 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
338 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
339 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
340 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
341 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \
342 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
343 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
344 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \
345 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
346 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
347 src1+=src_stride1;\
348 src2+=src_stride2;\
349 src3+=src_stride3;\
350 src4+=src_stride4;\
351 dst+=dst_stride;\
352 } while(--h); \
353 } \
354 \
355
356 #define op_avg(a, b) a = rnd_avg2(a,b)
357 #define op_put(a, b) a = b
358
359 PIXOP2(avg, op_avg)
360 PIXOP2(put, op_put)
361 #undef op_avg
362 #undef op_put
363
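/* PIXOP2(put, op_put) and PIXOP2(avg, op_avg) instantiate every function in
   the macro twice; e.g. the OP(...) line of put_pixels8_l2_aligned() expands
   to a plain store,
       LP(dst) = rnd_avg2(LP(src1), LP(src2));
   while the avg_ variant re-averages with what is already in dst.  UNPACK()
   and rnd_PACK()/no_rnd_PACK(), defined in the surrounding sh4 code (not in
   this file), presumably compute the per-byte four-way average
   (a+b+c+d+2)>>2 that the *_l4_* functions need. */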
364 #define avg2(a,b) ((a+b+1)>>1)
365 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
366
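/* These scalar helpers make the rounding convention explicit, e.g.
   avg2(1,2)     = (1+2+1)>>1     = 2   (ties round up)
   avg4(1,1,1,2) = (1+1+1+2+2)>>2 = 1 */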
367
368 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
369 {
370 const int A=(16-x16)*(16-y16);
371 const int B=( x16)*(16-y16);
372 const int C=(16-x16)*( y16);
373 const int D=( x16)*( y16);
374
375 do {
376 int t0,t1,t2,t3;
377 uint8_t *s0 = src;
378 uint8_t *s1 = src+stride;
379 t0 = *s0++; t2 = *s1++;
380 t1 = *s0++; t3 = *s1++;
381 dst[0]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
382 t0 = *s0++; t2 = *s1++;
383 dst[1]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
384 t1 = *s0++; t3 = *s1++;
385 dst[2]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
386 t0 = *s0++; t2 = *s1++;
387 dst[3]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
388 t1 = *s0++; t3 = *s1++;
389 dst[4]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
390 t0 = *s0++; t2 = *s1++;
391 dst[5]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
392 t1 = *s0++; t3 = *s1++;
393 dst[6]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
394 t0 = *s0++; t2 = *s1++;
395 dst[7]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
396 dst+= stride;
397 src+= stride;
398 }while(--h);
399 }
400
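/* gmc1_c() is 1/16-pel bilinear interpolation: the four weights satisfy
   A+B+C+D = ((16-x16)+x16) * ((16-y16)+y16) = 256, so the >>8 renormalizes
   the weighted sum; the t0..t3 shuffle reuses each loaded pixel for the two
   output positions that need it. */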
401 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
402 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
403 {
404 int y, vx, vy;
405 const int s= 1<<shift;
406
407 width--;
408 height--;
409
410 for(y=0; y<h; y++){
411 int x;
412
413 vx= ox;
414 vy= oy;
415 for(x=0; x<8; x++){ //XXX FIXME optimize
416 int src_x, src_y, frac_x, frac_y, index;
417
418 src_x= vx>>16;
419 src_y= vy>>16;
420 frac_x= src_x&(s-1);
421 frac_y= src_y&(s-1);
422 src_x>>=shift;
423 src_y>>=shift;
424
425 if((unsigned)src_x < width){
426 if((unsigned)src_y < height){
427 index= src_x + src_y*stride;
428 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
429 + src[index +1]* frac_x )*(s-frac_y)
430 + ( src[index+stride ]*(s-frac_x)
431 + src[index+stride+1]* frac_x )* frac_y
432 + r)>>(shift*2);
433 }else{
434 index= src_x + clip(src_y, 0, height)*stride;
435 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
436 + src[index +1]* frac_x )*s
437 + r)>>(shift*2);
438 }
439 }else{
440 if((unsigned)src_y < height){
441 index= clip(src_x, 0, width) + src_y*stride;
442 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
443 + src[index+stride ]* frac_y )*s
444 + r)>>(shift*2);
445 }else{
446 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
447 dst[y*stride + x]= src[index ];
448 }
449 }
450
451 vx+= dxx;
452 vy+= dyx;
453 }
454 ox += dxy;
455 oy += dyy;
456 }
457 }
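/* gmc_c() evaluates an affine warp in 16.16 fixed point: the source position
   for output pixel (x,y) is (ox + x*dxx + y*dxy, oy + x*dyx + y*dyy) >> shift,
   bilinearly filtered with the frac_x/frac_y remainders; the (unsigned)
   comparisons fold the <0 and >=limit border checks into a single test each. */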
458 #define H264_CHROMA_MC(OPNAME, OP)\
459 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
460 const int A=(8-x)*(8-y);\
461 const int B=( x)*(8-y);\
462 const int C=(8-x)*( y);\
463 const int D=( x)*( y);\
464 \
465 assert(x<8 && y<8 && x>=0 && y>=0);\
466 \
467 do {\
468 int t0,t1,t2,t3; \
469 uint8_t *s0 = src; \
470 uint8_t *s1 = src+stride; \
471 t0 = *s0++; t2 = *s1++; \
472 t1 = *s0++; t3 = *s1++; \
473 OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
474 t0 = *s0++; t2 = *s1++; \
475 OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
476 dst+= stride;\
477 src+= stride;\
478 }while(--h);\
479 }\
480 \
481 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
482 const int A=(8-x)*(8-y);\
483 const int B=( x)*(8-y);\
484 const int C=(8-x)*( y);\
485 const int D=( x)*( y);\
486 \
487 assert(x<8 && y<8 && x>=0 && y>=0);\
488 \
489 do {\
490 int t0,t1,t2,t3; \
491 uint8_t *s0 = src; \
492 uint8_t *s1 = src+stride; \
493 t0 = *s0++; t2 = *s1++; \
494 t1 = *s0++; t3 = *s1++; \
495 OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
496 t0 = *s0++; t2 = *s1++; \
497 OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
498 t1 = *s0++; t3 = *s1++; \
499 OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
500 t0 = *s0++; t2 = *s1++; \
501 OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
502 dst+= stride;\
503 src+= stride;\
504 }while(--h);\
505 }\
506 \
507 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
508 const int A=(8-x)*(8-y);\
509 const int B=( x)*(8-y);\
510 const int C=(8-x)*( y);\
511 const int D=( x)*( y);\
512 \
513 assert(x<8 && y<8 && x>=0 && y>=0);\
514 \
515 do {\
516 int t0,t1,t2,t3; \
517 uint8_t *s0 = src; \
518 uint8_t *s1 = src+stride; \
519 t0 = *s0++; t2 = *s1++; \
520 t1 = *s0++; t3 = *s1++; \
521 OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
522 t0 = *s0++; t2 = *s1++; \
523 OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
524 t1 = *s0++; t3 = *s1++; \
525 OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
526 t0 = *s0++; t2 = *s1++; \
527 OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
528 t1 = *s0++; t3 = *s1++; \
529 OP(dst[4], (A*t0 + B*t1 + C*t2 + D*t3));\
530 t0 = *s0++; t2 = *s1++; \
531 OP(dst[5], (A*t1 + B*t0 + C*t3 + D*t2));\
532 t1 = *s0++; t3 = *s1++; \
533 OP(dst[6], (A*t0 + B*t1 + C*t2 + D*t3));\
534 t0 = *s0++; t2 = *s1++; \
535 OP(dst[7], (A*t1 + B*t0 + C*t3 + D*t2));\
536 dst+= stride;\
537 src+= stride;\
538 }while(--h);\
539 }
540
541 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
542 #define op_put(a, b) a = (((b) + 32)>>6)
543
544 H264_CHROMA_MC(put_ , op_put)
545 H264_CHROMA_MC(avg_ , op_avg)
546 #undef op_avg
547 #undef op_put
548
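/* As in gmc1_c(), the chroma weights sum to A+B+C+D = 64, so op_put's
   ((b)+32)>>6 is a round-to-nearest divide by 64, and op_avg then averages
   the rounded prediction with the bytes already in dst.  Sanity check with
   x=y=4 (A=B=C=D=16) on an all-128 block: (16*128*4 + 32)>>6 = 128. */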
549 /* not yet optimized */
550 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
551 {
552 int i;
553 for(i=0; i<h; i++)
554 {
555 ST32(dst , LD32(src ));
556 dst+=dstStride;
557 src+=srcStride;
558 }
559 }
560
561 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
562 {
563 int i;
564 for(i=0; i<h; i++)
565 {
566 ST32(dst , LD32(src ));
567 ST32(dst+4 , LD32(src+4 ));
568 dst+=dstStride;
569 src+=srcStride;
570 }
571 }
572
573 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
574 {
575 int i;
576 for(i=0; i<h; i++)
577 {
578 ST32(dst , LD32(src ));
579 ST32(dst+4 , LD32(src+4 ));
580 ST32(dst+8 , LD32(src+8 ));
581 ST32(dst+12, LD32(src+12));
582 dst+=dstStride;
583 src+=srcStride;
584 }
585 }
586
587 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
588 {
589 int i;
590 for(i=0; i<h; i++)
591 {
592 ST32(dst , LD32(src ));
593 ST32(dst+4 , LD32(src+4 ));
594 ST32(dst+8 , LD32(src+8 ));
595 ST32(dst+12, LD32(src+12));
596 dst[16]= src[16];
597 dst+=dstStride;
598 src+=srcStride;
599 }
600 }
601
602 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
603 {
604 int i;
605 for(i=0; i<h; i++)
606 {
607 ST32(dst , LD32(src ));
608 ST32(dst+4 , LD32(src+4 ));
609 dst[8]= src[8];
610 dst+=dstStride;
611 src+=srcStride;
612 }
613 }
614 /* end not optimized */
615
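/* The copy_block*() helpers above are straight row copies via 32-bit
   loads/stores.  A portable equivalent (a sketch, assuming nothing about
   LD32/ST32 semantics) would be:

   #include <string.h>
   static inline void copy_block(uint8_t *dst, const uint8_t *src,
                                 int dstStride, int srcStride, int w, int h)
   {
       while (h--) {
           memcpy(dst, src, w);   // copy one row of w pixels
           dst += dstStride;
           src += srcStride;
       }
   }
*/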
616 #define QPEL_MC(r, OPNAME, RND, OP) \
617 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
618 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
619 do {\
620 uint8_t *s = src; \
621 int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
622 src0= *s++;\
623 src1= *s++;\
624 src2= *s++;\
625 src3= *s++;\
626 src4= *s++;\
627 OP(dst[0], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
628 src5= *s++;\
629 OP(dst[1], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
630 src6= *s++;\
631 OP(dst[2], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
632 src7= *s++;\
633 OP(dst[3], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
634 src8= *s++;\
635 OP(dst[4], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
636 OP(dst[5], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
637 OP(dst[6], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
638 OP(dst[7], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
639 dst+=dstStride;\
640 src+=srcStride;\
641 }while(--h);\
642 }\
643 \
644 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
645 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
646 int w=8;\
647 do{\
648 uint8_t *s = src, *d=dst;\
649 int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
650 src0 = *s; s+=srcStride; \
651 src1 = *s; s+=srcStride; \
652 src2 = *s; s+=srcStride; \
653 src3 = *s; s+=srcStride; \
654 src4 = *s; s+=srcStride; \
655 OP(*d, (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));d+=dstStride;\
656 src5 = *s; s+=srcStride; \
657 OP(*d, (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));d+=dstStride;\
658 src6 = *s; s+=srcStride; \
659 OP(*d, (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));d+=dstStride;\
660 src7 = *s; s+=srcStride; \
661 OP(*d, (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));d+=dstStride;\
662 src8 = *s; \
663 OP(*d, (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));d+=dstStride;\
664 OP(*d, (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));d+=dstStride;\
665 OP(*d, (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));d+=dstStride;\
666 OP(*d, (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
667 dst++;\
668 src++;\
669 }while(--w);\
670 }\
671 \
672 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
673 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
674 do {\
675 uint8_t *s = src;\
676 int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
677 int src9,src10,src11,src12,src13,src14,src15,src16;\
678 src0= *s++;\
679 src1= *s++;\
680 src2= *s++;\
681 src3= *s++;\
682 src4= *s++;\
683 OP(dst[ 0], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
684 src5= *s++;\
685 OP(dst[ 1], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
686 src6= *s++;\
687 OP(dst[ 2], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
688 src7= *s++;\
689 OP(dst[ 3], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
690 src8= *s++;\
691 OP(dst[ 4], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
692 src9= *s++;\
693 OP(dst[ 5], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
694 src10= *s++;\
695 OP(dst[ 6], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
696 src11= *s++;\
697 OP(dst[ 7], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
698 src12= *s++;\
699 OP(dst[ 8], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
700 src13= *s++;\
701 OP(dst[ 9], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
702 src14= *s++;\
703 OP(dst[10], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
704 src15= *s++;\
705 OP(dst[11], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
706 src16= *s++;\
707 OP(dst[12], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
708 OP(dst[13], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
709 OP(dst[14], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
710 OP(dst[15], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
711 dst+=dstStride;\
712 src+=srcStride;\
713 }while(--h);\
714 }\
715 \
716 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
717 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
718 int w=16;\
719 do {\
720 uint8_t *s = src, *d=dst;\
721 int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
722 int src9,src10,src11,src12,src13,src14,src15,src16;\
723 src0 = *s; s+=srcStride; \
724 src1 = *s; s+=srcStride; \
725 src2 = *s; s+=srcStride; \
726 src3 = *s; s+=srcStride; \
727 src4 = *s; s+=srcStride; \
728 OP(*d, (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));d+=dstStride;\
729 src5 = *s; s+=srcStride; \
730 OP(*d, (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));d+=dstStride;\
731 src6 = *s; s+=srcStride; \
732 OP(*d, (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));d+=dstStride;\
733 src7 = *s; s+=srcStride; \
734 OP(*d, (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));d+=dstStride;\
735 src8 = *s; s+=srcStride; \
736 OP(*d, (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));d+=dstStride;\
737 src9 = *s; s+=srcStride; \
738 OP(*d, (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));d+=dstStride;\
739 src10 = *s; s+=srcStride; \
740 OP(*d, (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));d+=dstStride;\
741 src11 = *s; s+=srcStride; \
742 OP(*d, (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));d+=dstStride;\
743 src12 = *s; s+=srcStride; \
744 OP(*d, (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));d+=dstStride;\
745 src13 = *s; s+=srcStride; \
746 OP(*d, (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));d+=dstStride;\
747 src14 = *s; s+=srcStride; \
748 OP(*d, (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));d+=dstStride;\
749 src15 = *s; s+=srcStride; \
750 OP(*d, (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));d+=dstStride;\
751 src16 = *s; \
752 OP(*d, (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));d+=dstStride;\
753 OP(*d, (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));d+=dstStride;\
754 OP(*d, (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));d+=dstStride;\
755 OP(*d, (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
756 dst++;\
757 src++;\
758 }while(--w);\
759 }\
760 \
761 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
762 OPNAME ## pixels8_c(dst, src, stride, 8);\
763 }\
764 \
765 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
766 uint8_t half[64];\
767 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
768 OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
769 }\
770 \
771 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
772 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
773 }\
774 \
775 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
776 uint8_t half[64];\
777 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
778 OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
779 }\
780 \
781 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
782 uint8_t full[16*9];\
783 uint8_t half[64];\
784 copy_block9(full, src, 16, stride, 9);\
785 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
786 OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
787 }\
788 \
789 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
790 uint8_t full[16*9];\
791 copy_block9(full, src, 16, stride, 9);\
792 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
793 }\
794 \
795 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
796 uint8_t full[16*9];\
797 uint8_t half[64];\
798 copy_block9(full, src, 16, stride, 9);\
799 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
800 OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
801 }\
802 static void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
803 uint8_t full[16*9];\
804 uint8_t halfH[72];\
805 uint8_t halfV[64];\
806 uint8_t halfHV[64];\
807 copy_block9(full, src, 16, stride, 9);\
808 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
809 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
810 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
811 OPNAME ## pixels8_l4_aligned(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
812 }\
813 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
814 uint8_t full[16*9];\
815 uint8_t halfH[72];\
816 uint8_t halfHV[64];\
817 copy_block9(full, src, 16, stride, 9);\
818 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
819 put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
820 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
821 OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
822 }\
823 static void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
824 uint8_t full[16*9];\
825 uint8_t halfH[72];\
826 uint8_t halfV[64];\
827 uint8_t halfHV[64];\
828 copy_block9(full, src, 16, stride, 9);\
829 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
830 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
831 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
832 OPNAME ## pixels8_l4_aligned0(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
833 }\
834 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
835 uint8_t full[16*9];\
836 uint8_t halfH[72];\
837 uint8_t halfHV[64];\
838 copy_block9(full, src, 16, stride, 9);\
839 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
840 put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
841 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
842 OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
843 }\
844 static void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
845 uint8_t full[16*9];\
846 uint8_t halfH[72];\
847 uint8_t halfV[64];\
848 uint8_t halfHV[64];\
849 copy_block9(full, src, 16, stride, 9);\
850 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
851 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
852 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
853 OPNAME ## pixels8_l4_aligned(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
854 }\
855 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
856 uint8_t full[16*9];\
857 uint8_t halfH[72];\
858 uint8_t halfHV[64];\
859 copy_block9(full, src, 16, stride, 9);\
860 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
861 put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
862 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
863 OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
864 }\
865 static void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
866 uint8_t full[16*9];\
867 uint8_t halfH[72];\
868 uint8_t halfV[64];\
869 uint8_t halfHV[64];\
870 copy_block9(full, src, 16, stride, 9);\
871 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
872 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
873 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
874 OPNAME ## pixels8_l4_aligned0(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
875 }\
876 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
877 uint8_t full[16*9];\
878 uint8_t halfH[72];\
879 uint8_t halfHV[64];\
880 copy_block9(full, src, 16, stride, 9);\
881 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
882 put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
883 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
884 OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
885 }\
886 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
887 uint8_t halfH[72];\
888 uint8_t halfHV[64];\
889 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
890 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
891 OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
892 }\
893 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
894 uint8_t halfH[72];\
895 uint8_t halfHV[64];\
896 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
897 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
898 OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
899 }\
900 static void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
901 uint8_t full[16*9];\
902 uint8_t halfH[72];\
903 uint8_t halfV[64];\
904 uint8_t halfHV[64];\
905 copy_block9(full, src, 16, stride, 9);\
906 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
907 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
908 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
909 OPNAME ## pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);\
910 }\
911 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
912 uint8_t full[16*9];\
913 uint8_t halfH[72];\
914 copy_block9(full, src, 16, stride, 9);\
915 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
916 put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
917 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
918 }\
919 static void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
920 uint8_t full[16*9];\
921 uint8_t halfH[72];\
922 uint8_t halfV[64];\
923 uint8_t halfHV[64];\
924 copy_block9(full, src, 16, stride, 9);\
925 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
926 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
927 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
928 OPNAME ## pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);\
929 }\
930 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
931 uint8_t full[16*9];\
932 uint8_t halfH[72];\
933 copy_block9(full, src, 16, stride, 9);\
934 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
935 put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
936 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
937 }\
938 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
939 uint8_t halfH[72];\
940 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
941 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
942 }\
943 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
944 OPNAME ## pixels16_c(dst, src, stride, 16);\
945 }\
946 \
947 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
948 uint8_t half[256];\
949 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
950 OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
951 }\
952 \
953 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
954 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
955 }\
956 \
957 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
958 uint8_t half[256];\
959 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
960 OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
961 }\
962 \
963 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
964 uint8_t full[24*17];\
965 uint8_t half[256];\
966 copy_block17(full, src, 24, stride, 17);\
967 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
968 OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
969 }\
970 \
971 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
972 uint8_t full[24*17];\
973 copy_block17(full, src, 24, stride, 17);\
974 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
975 }\
976 \
977 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
978 uint8_t full[24*17];\
979 uint8_t half[256];\
980 copy_block17(full, src, 24, stride, 17);\
981 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
982 OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
983 }\
984 static void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
985 uint8_t full[24*17];\
986 uint8_t halfH[272];\
987 uint8_t halfV[256];\
988 uint8_t halfHV[256];\
989 copy_block17(full, src, 24, stride, 17);\
990 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
991 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
992 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
993 OPNAME ## pixels16_l4_aligned(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
994 }\
995 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
996 uint8_t full[24*17];\
997 uint8_t halfH[272];\
998 uint8_t halfHV[256];\
999 copy_block17(full, src, 24, stride, 17);\
1000 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1001 put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
1002 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1003 OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
1004 }\
1005 static void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1006 uint8_t full[24*17];\
1007 uint8_t halfH[272];\
1008 uint8_t halfV[256];\
1009 uint8_t halfHV[256];\
1010 copy_block17(full, src, 24, stride, 17);\
1011 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1012 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1013 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1014 OPNAME ## pixels16_l4_aligned0(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1015 }\
1016 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1017 uint8_t full[24*17];\
1018 uint8_t halfH[272];\
1019 uint8_t halfHV[256];\
1020 copy_block17(full, src, 24, stride, 17);\
1021 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1022 put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
1023 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1024 OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
1025 }\
1026 static void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1027 uint8_t full[24*17];\
1028 uint8_t halfH[272];\
1029 uint8_t halfV[256];\
1030 uint8_t halfHV[256];\
1031 copy_block17(full, src, 24, stride, 17);\
1032 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1033 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1034 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1035 OPNAME ## pixels16_l4_aligned(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1036 }\
1037 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1038 uint8_t full[24*17];\
1039 uint8_t halfH[272];\
1040 uint8_t halfHV[256];\
1041 copy_block17(full, src, 24, stride, 17);\
1042 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1043 put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
1044 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1045 OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1046 }\
1047 static void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1048 uint8_t full[24*17];\
1049 uint8_t halfH[272];\
1050 uint8_t halfV[256];\
1051 uint8_t halfHV[256];\
1052 copy_block17(full, src, 24, stride, 17);\
1053 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1054 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1055 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1056 OPNAME ## pixels16_l4_aligned0(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1057 }\
1058 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1059 uint8_t full[24*17];\
1060 uint8_t halfH[272];\
1061 uint8_t halfHV[256];\
1062 copy_block17(full, src, 24, stride, 17);\
1063 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1064 put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
1065 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1066 OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1067 }\
1068 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1069 uint8_t halfH[272];\
1070 uint8_t halfHV[256];\
1071 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1072 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1073 OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
1074 }\
1075 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1076 uint8_t halfH[272];\
1077 uint8_t halfHV[256];\
1078 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1079 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1080 OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1081 }\
1082 static void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1083 uint8_t full[24*17];\
1084 uint8_t halfH[272];\
1085 uint8_t halfV[256];\
1086 uint8_t halfHV[256];\
1087 copy_block17(full, src, 24, stride, 17);\
1088 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1089 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1090 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1091 OPNAME ## pixels16_l2_aligned(dst, halfV, halfHV, stride, 16, 16, 16);\
1092 }\
1093 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1094 uint8_t full[24*17];\
1095 uint8_t halfH[272];\
1096 copy_block17(full, src, 24, stride, 17);\
1097 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1098 put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
1099 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1100 }\
1101 static void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1102 uint8_t full[24*17];\
1103 uint8_t halfH[272];\
1104 uint8_t halfV[256];\
1105 uint8_t halfHV[256];\
1106 copy_block17(full, src, 24, stride, 17);\
1107 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1108 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1109 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1110 OPNAME ## pixels16_l2_aligned(dst, halfV, halfHV, stride, 16, 16, 16);\
1111 }\
1112 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1113 uint8_t full[24*17];\
1114 uint8_t halfH[272];\
1115 copy_block17(full, src, 24, stride, 17);\
1116 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1117 put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
1118 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1119 }\
1120 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1121 uint8_t halfH[272];\
1122 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1123 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1124 }
1125
1126 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1127 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1128 #define op_put(a, b) a = cm[((b) + 16)>>5]
1129 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1130
1131 QPEL_MC(0, put_ , _ , op_put)
1132 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1133 QPEL_MC(0, avg_ , _ , op_avg)
1134 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1135 #undef op_avg
1136 #undef op_avg_no_rnd
1137 #undef op_put
1138 #undef op_put_no_rnd
1139
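/* The MPEG-4 qpel taps used above sum to 2*(20-6+3-1) = 32, so op_put's
   cm[((b)+16)>>5] is a rounded divide by 32 clipped through the cropTbl
   lookup (cm).  A flat row of 128s passes through unchanged:
   (128*32 + 16)>>5 = 128.  The no_rnd variants add 15 instead of 16,
   biasing ties downward (MPEG-4's no-rounding mode). */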
1140 #if 1
1141 #define H264_LOWPASS(OPNAME, OP, OP2) \
1142 static inline void OPNAME ## h264_qpel_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
1143 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1144 do {\
1145 int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
1146 uint8_t *s = src-2;\
1147 srcB = *s++;\
1148 srcA = *s++;\
1149 src0 = *s++;\
1150 src1 = *s++;\
1151 src2 = *s++;\
1152 src3 = *s++;\
1153 OP(dst[0], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1154 src4 = *s++;\
1155 OP(dst[1], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1156 src5 = *s++;\
1157 OP(dst[2], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1158 src6 = *s++;\
1159 OP(dst[3], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1160 if (w>4) { /* w is a compile-time constant after inlining, so this branch folds away */ \
1161 int src7,src8,src9,src10; \
1162 src7 = *s++;\
1163 OP(dst[4], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1164 src8 = *s++;\
1165 OP(dst[5], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1166 src9 = *s++;\
1167 OP(dst[6], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1168 src10 = *s++;\
1169 OP(dst[7], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1170 if (w>8) { \
1171 int src11,src12,src13,src14,src15,src16,src17,src18; \
1172 src11 = *s++;\
1173 OP(dst[8] , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
1174 src12 = *s++;\
1175 OP(dst[9] , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
1176 src13 = *s++;\
1177 OP(dst[10], (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
1178 src14 = *s++;\
1179 OP(dst[11], (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
1180 src15 = *s++;\
1181 OP(dst[12], (src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
1182 src16 = *s++;\
1183 OP(dst[13], (src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
1184 src17 = *s++;\
1185 OP(dst[14], (src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
1186 src18 = *s++;\
1187 OP(dst[15], (src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
1188 } \
1189 } \
1190 dst+=dstStride;\
1191 src+=srcStride;\
1192 }while(--h);\
1193 }\
1194 \
1195 static inline void OPNAME ## h264_qpel_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
1196 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1197 do{\
1198 int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
1199 uint8_t *s = src-2*srcStride,*d=dst;\
1200 srcB = *s; s+=srcStride;\
1201 srcA = *s; s+=srcStride;\
1202 src0 = *s; s+=srcStride;\
1203 src1 = *s; s+=srcStride;\
1204 src2 = *s; s+=srcStride;\
1205 src3 = *s; s+=srcStride;\
1206 OP(*d, (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));d+=dstStride;\
1207 src4 = *s; s+=srcStride;\
1208 OP(*d, (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));d+=dstStride;\
1209 src5 = *s; s+=srcStride;\
1210 OP(*d, (src2+src3)*20 - (src1+src4)*5 + (src0+src5));d+=dstStride;\
1211 src6 = *s; s+=srcStride;\
1212 OP(*d, (src3+src4)*20 - (src2+src5)*5 + (src1+src6));d+=dstStride;\
1213 if (h>4) { \
1214 int src7,src8,src9,src10; \
1215 src7 = *s; s+=srcStride;\
1216 OP(*d, (src4+src5)*20 - (src3+src6)*5 + (src2+src7));d+=dstStride;\
1217 src8 = *s; s+=srcStride;\
1218 OP(*d, (src5+src6)*20 - (src4+src7)*5 + (src3+src8));d+=dstStride;\
1219 src9 = *s; s+=srcStride;\
1220 OP(*d, (src6+src7)*20 - (src5+src8)*5 + (src4+src9));d+=dstStride;\
1221 src10 = *s; s+=srcStride;\
1222 OP(*d, (src7+src8)*20 - (src6+src9)*5 + (src5+src10));d+=dstStride;\
1223 if (h>8) { \
1224 int src11,src12,src13,src14,src15,src16,src17,src18; \
1225 src11 = *s; s+=srcStride;\
1226 OP(*d , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));d+=dstStride;\
1227 src12 = *s; s+=srcStride;\
1228 OP(*d , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));d+=dstStride;\
1229 src13 = *s; s+=srcStride;\
1230 OP(*d, (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));d+=dstStride;\
1231 src14 = *s; s+=srcStride;\
1232 OP(*d, (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));d+=dstStride;\
1233 src15 = *s; s+=srcStride;\
1234 OP(*d, (src12+src13)*20 - (src11+src14)*5 + (src10+src15));d+=dstStride;\
1235 src16 = *s; s+=srcStride;\
1236 OP(*d, (src13+src14)*20 - (src12+src15)*5 + (src11+src16));d+=dstStride;\
1237 src17 = *s; s+=srcStride;\
1238 OP(*d, (src14+src15)*20 - (src13+src16)*5 + (src12+src17));d+=dstStride;\
1239 src18 = *s; s+=srcStride;\
1240 OP(*d, (src15+src16)*20 - (src14+src17)*5 + (src13+src18));d+=dstStride;\
1241 } \
1242 } \
1243 dst++;\
1244 src++;\
1245 }while(--w);\
1246 }\
1247 \
1248 static inline void OPNAME ## h264_qpel_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride,int w,int h){\
1249 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1250 int i;\
1251 src -= 2*srcStride;\
1252 i= h+5; \
1253 do {\
1254 int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
1255 uint8_t *s = src-2;\
1256 srcB = *s++;\
1257 srcA = *s++;\
1258 src0 = *s++;\
1259 src1 = *s++;\
1260 src2 = *s++;\
1261 src3 = *s++;\
1262 tmp[0] = ((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1263 src4 = *s++;\
1264 tmp[1] = ((src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1265 src5 = *s++;\
1266 tmp[2] = ((src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1267 src6 = *s++;\
1268 tmp[3] = ((src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1269 if (w>4) { /* w is a compile-time constant after inlining, so this branch folds away */ \
1270 int src7,src8,src9,src10; \
1271 src7 = *s++;\
1272 tmp[4] = ((src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1273 src8 = *s++;\
1274 tmp[5] = ((src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1275 src9 = *s++;\
1276 tmp[6] = ((src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1277 src10 = *s++;\
1278 tmp[7] = ((src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1279 if (w>8) { \
1280 int src11,src12,src13,src14,src15,src16,src17,src18; \
1281 src11 = *s++;\
1282 tmp[8] = ((src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
1283 src12 = *s++;\
1284 tmp[9] = ((src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
1285 src13 = *s++;\
1286 tmp[10] = ((src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
1287 src14 = *s++;\
1288 tmp[11] = ((src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
1289 src15 = *s++;\
1290 tmp[12] = ((src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
1291 src16 = *s++;\
1292 tmp[13] = ((src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
1293 src17 = *s++;\
1294 tmp[14] = ((src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
1295 src18 = *s++;\
1296 tmp[15] = ((src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
1297 } \
1298 } \
1299 tmp+=tmpStride;\
1300 src+=srcStride;\
1301 }while(--i);\
1302 tmp -= tmpStride*(h+5-2);\
1303 i = w; \
1304 do {\
1305 int tmpB,tmpA,tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6;\
1306 int16_t *s = tmp-2*tmpStride; \
1307 uint8_t *d=dst;\
1308 tmpB = *s; s+=tmpStride;\
1309 tmpA = *s; s+=tmpStride;\
1310 tmp0 = *s; s+=tmpStride;\
1311 tmp1 = *s; s+=tmpStride;\
1312 tmp2 = *s; s+=tmpStride;\
1313 tmp3 = *s; s+=tmpStride;\
1314 OP2(*d, (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));d+=dstStride;\
1315 tmp4 = *s; s+=tmpStride;\
1316 OP2(*d, (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));d+=dstStride;\
1317 tmp5 = *s; s+=tmpStride;\
1318 OP2(*d, (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));d+=dstStride;\
1319 tmp6 = *s; s+=tmpStride;\
1320 OP2(*d, (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));d+=dstStride;\
1321 if (h>4) { \
1322 int tmp7,tmp8,tmp9,tmp10; \
1323 tmp7 = *s; s+=tmpStride;\
1324 OP2(*d, (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));d+=dstStride;\
1325 tmp8 = *s; s+=tmpStride;\
1326 OP2(*d, (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));d+=dstStride;\
1327 tmp9 = *s; s+=tmpStride;\
1328 OP2(*d, (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));d+=dstStride;\
1329 tmp10 = *s; s+=tmpStride;\
1330 OP2(*d, (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));d+=dstStride;\
1331 if (h>8) { \
1332 int tmp11,tmp12,tmp13,tmp14,tmp15,tmp16,tmp17,tmp18; \
1333 tmp11 = *s; s+=tmpStride;\
1334 OP2(*d , (tmp8 +tmp9 )*20 - (tmp7 +tmp10)*5 + (tmp6 +tmp11));d+=dstStride;\
1335 tmp12 = *s; s+=tmpStride;\
1336 OP2(*d , (tmp9 +tmp10)*20 - (tmp8 +tmp11)*5 + (tmp7 +tmp12));d+=dstStride;\
1337 tmp13 = *s; s+=tmpStride;\
1338 OP2(*d, (tmp10+tmp11)*20 - (tmp9 +tmp12)*5 + (tmp8 +tmp13));d+=dstStride;\
1339 tmp14 = *s; s+=tmpStride;\
1340 OP2(*d, (tmp11+tmp12)*20 - (tmp10+tmp13)*5 + (tmp9 +tmp14));d+=dstStride;\
1341 tmp15 = *s; s+=tmpStride;\
1342 OP2(*d, (tmp12+tmp13)*20 - (tmp11+tmp14)*5 + (tmp10+tmp15));d+=dstStride;\
1343 tmp16 = *s; s+=tmpStride;\
1344 OP2(*d, (tmp13+tmp14)*20 - (tmp12+tmp15)*5 + (tmp11+tmp16));d+=dstStride;\
1345 tmp17 = *s; s+=tmpStride;\
1346 OP2(*d, (tmp14+tmp15)*20 - (tmp13+tmp16)*5 + (tmp12+tmp17));d+=dstStride;\
1347 tmp18 = *s; s+=tmpStride;\
1348 OP2(*d, (tmp15+tmp16)*20 - (tmp14+tmp17)*5 + (tmp13+tmp18));d+=dstStride;\
1349 } \
1350 } \
1351 dst++;\
1352 tmp++;\
1353 }while(--i);\
1354 }\
\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,4,4);\
}\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,8,8);\
}\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,16,16);\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,4,4);\
}\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,8,8);\
}\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,16,16);\
}\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,4,4);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,8,8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,16,16);\
}\

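/*
 * H264_MC generates the 16 quarter-pel motion-compensation entry points for
 * one block size, named OPNAME ## h264_qpel ## SIZE ## _mcXY_c, where X and
 * Y are the horizontal and vertical quarter-pel offsets (0..3). Positions
 * built from two intermediates are combined with the *_l2_aligned averaging
 * helpers. For example (same expansion as the macro body below produces),
 * H264_MC(put_, 8) turns the half-pel horizontal case into:
 *
 *   static void put_h264_qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
 *       put_h264_qpel8_h_lowpass(dst, src, stride, stride);
 *   }
 */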
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

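/*
 * Rounding/clipping primitives plugged into the lowpass macros above; cm[]
 * is the clip-to-[0,255] table. The 6-tap (1,-5,20,20,-5,1) filter has gain
 * 2*(20-5+1) = 32, so a single pass normalizes with (b + 16) >> 5 and the
 * two-pass hv filter (gain 32*32 = 1024) with (b + 512) >> 10. Sanity check
 * on a flat area of value v: the filter yields 32*v, and (32*v + 16) >> 5 == v.
 * The avg variants additionally round-average the clipped result into the
 * pixel already in dst.
 */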
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

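/* Instantiate the filter family twice -- writing pixels (put_) and
   round-averaging into dst (avg_) -- then the 16 MC functions for each of
   the three block sizes: 2 * 3 * 16 = 96 entry points in total. */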
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif

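/*
 * WMV2 half-pel interpolation: a 4-tap (-1, 9, 9, -1)/16 filter with
 * rounding, i.e. dst[i] = clip((9*(s[i] + s[i+1]) - (s[i-1] + s[i+2]) + 8) >> 4).
 * Sanity check on a flat area of value v: (9*2v - 2v + 8) >> 4 == (16v + 8) >> 4 == v.
 */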
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    do{
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
        uint8_t *s = src;
        src_1 = s[-1];
        src0 = *s++;
        src1 = *s++;
        src2 = *s++;
        dst[0]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        src3 = *s++;
        dst[1]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
        src4 = *s++;
        dst[2]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
        src5 = *s++;
        dst[3]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
        src6 = *s++;
        dst[4]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
        src7 = *s++;
        dst[5]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
        src8 = *s++;
        dst[6]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
        src9 = *s;
        dst[7]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }while(--h);
}

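/* Same 4-tap filter applied down columns; w counts columns. The first tap
   reads one row above src, so callers must have that line available. */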
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    do{
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
        uint8_t *s = src, *d = dst;
        src_1 = *(s-srcStride);
        src0 = *s; s+=srcStride;
        src1 = *s; s+=srcStride;
        src2 = *s; s+=srcStride;
        *d= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; d+=dstStride;
        src3 = *s; s+=srcStride;
        *d= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; d+=dstStride;
        src4 = *s; s+=srcStride;
        *d= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; d+=dstStride;
        src5 = *s; s+=srcStride;
        *d= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; d+=dstStride;
        src6 = *s; s+=srcStride;
        *d= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; d+=dstStride;
        src7 = *s; s+=srcStride;
        *d= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; d+=dstStride;
        src8 = *s; s+=srcStride;
        *d= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; d+=dstStride;
        src9 = *s;
        *d= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
        src++;
        dst++;
    }while(--w);
}

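/*
 * WMV2 motion-compensation entry points, named like the qpel ones
 * (mcXY = X horizontal / Y vertical sub-pel step); apparently only the
 * eight positions WMV2's mspel mode signals are needed. The in-between
 * cases average the source with a filtered plane via the *_l2_aligned
 * helpers.
 */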
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

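/* The diagonal cases below first run the horizontal filter over 11 rows
   starting one line above the block (halfH is 8 wide by 11 high), so the
   vertical pass on halfH+8 finds its one-row-above context in halfH[0..7]. */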
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}