comparison sh4/qpel.c @ 2979:bfabfdf9ce55 libavcodec

COSMETICS: tabs --> spaces, some prettyprinting
author diego
date Thu, 22 Dec 2005 01:10:11 +0000
parents ef2149182f1c
children c537a97eec66
comparison
equal deleted inserted replaced
2978:403183bbb505 2979:bfabfdf9ce55
1 /* 1 /*
2 this is optimized for sh, which has post increment addressing (*p++) 2 this is optimized for sh, which has post increment addressing (*p++)
3 some CPUs may index (p[n]) faster than post increment (*p++) 3 some CPUs may index (p[n]) faster than post increment (*p++)
4 */ 4 */
5 5
6 #define LD(adr) *(uint32_t*)(adr) 6 #define LD(adr) *(uint32_t*)(adr)
7 7
8 #define PIXOP2(OPNAME, OP) \ 8 #define PIXOP2(OPNAME, OP) \
9 /*static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 9 /*static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
10 {\ 10 {\
11 do {\ 11 do {\
12 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ 12 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LD32(src2 )) ); \
13 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ 13 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
14 src1+=src_stride1; \ 14 src1+=src_stride1; \
15 src2+=src_stride2; \ 15 src2+=src_stride2; \
16 dst+=dst_stride; \ 16 dst+=dst_stride; \
17 } while(--h); \ 17 } while(--h); \
18 }\ 18 }\
19 \ 19 \
20 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 20 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
21 {\ 21 {\
22 do {\ 22 do {\
23 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ 23 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \
24 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ 24 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
25 src1+=src_stride1; \ 25 src1+=src_stride1; \
26 src2+=src_stride2; \ 26 src2+=src_stride2; \
27 dst+=dst_stride; \ 27 dst+=dst_stride; \
28 } while(--h); \ 28 } while(--h); \
29 }\ 29 }\
30 \ 30 \
31 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 31 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
32 {\ 32 {\
33 do {\ 33 do {\
34 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ 34 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \
35 src1+=src_stride1; \ 35 src1+=src_stride1; \
36 src2+=src_stride2; \ 36 src2+=src_stride2; \
37 dst+=dst_stride; \ 37 dst+=dst_stride; \
38 } while(--h); \ 38 } while(--h); \
39 }\ 39 }\
40 \ 40 \
41 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 41 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
42 {\ 42 {\
43 do {\ 43 do {\
44 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ 44 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LD32(src2 )) ); \
45 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ 45 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
46 OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \ 46 OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \
47 OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \ 47 OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \
48 src1+=src_stride1; \ 48 src1+=src_stride1; \
49 src2+=src_stride2; \ 49 src2+=src_stride2; \
50 dst+=dst_stride; \ 50 dst+=dst_stride; \
51 } while(--h); \ 51 } while(--h); \
52 }\ 52 }\
53 \ 53 \
54 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 54 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
55 {\ 55 {\
56 do {\ 56 do {\
57 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ 57 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \
58 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ 58 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
59 OP(LP(dst+8),rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \ 59 OP(LP(dst+8),rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \
60 OP(LP(dst+12),rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \ 60 OP(LP(dst+12),rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \
61 src1+=src_stride1; \ 61 src1+=src_stride1; \
62 src2+=src_stride2; \ 62 src2+=src_stride2; \
63 dst+=dst_stride; \ 63 dst+=dst_stride; \
64 } while(--h); \ 64 } while(--h); \
65 }*/\ 65 }*/\
66 \ 66 \
67 static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 67 static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
68 {\ 68 {\
69 do {\ 69 do {\
70 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ 70 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \
71 src1+=src_stride1; \ 71 src1+=src_stride1; \
72 src2+=src_stride2; \ 72 src2+=src_stride2; \
73 dst+=dst_stride; \ 73 dst+=dst_stride; \
74 } while(--h); \ 74 } while(--h); \
75 }\ 75 }\
76 \ 76 \
77 static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 77 static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
78 {\ 78 {\
79 do {\ 79 do {\
80 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ 80 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \
81 src1+=src_stride1; \ 81 src1+=src_stride1; \
82 src2+=src_stride2; \ 82 src2+=src_stride2; \
83 dst+=dst_stride; \ 83 dst+=dst_stride; \
84 } while(--h); \ 84 } while(--h); \
85 }\ 85 }\
86 \ 86 \
87 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 87 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
88 {\ 88 {\
89 do {\ 89 do {\
90 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LP(src2 )) ); \ 90 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LP(src2 )) ); \
91 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ 91 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
92 OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LP(src2+8)) ); \ 92 OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LP(src2+8)) ); \
93 OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LP(src2+12)) ); \ 93 OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LP(src2+12)) ); \
94 src1+=src_stride1; \ 94 src1+=src_stride1; \
95 src2+=src_stride2; \ 95 src2+=src_stride2; \
96 dst+=dst_stride; \ 96 dst+=dst_stride; \
97 } while(--h); \ 97 } while(--h); \
98 }\ 98 }\
99 \ 99 \
100 static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 100 static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
101 {\ 101 {\
102 do {\ 102 do {\
103 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ 103 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \
104 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ 104 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
105 OP(LP(dst+8),rnd_avg32(LD32(src1+8),LP(src2+8)) ); \ 105 OP(LP(dst+8),rnd_avg32(LD32(src1+8),LP(src2+8)) ); \
106 OP(LP(dst+12),rnd_avg32(LD32(src1+12),LP(src2+12)) ); \ 106 OP(LP(dst+12),rnd_avg32(LD32(src1+12),LP(src2+12)) ); \
107 src1+=src_stride1; \ 107 src1+=src_stride1; \
108 src2+=src_stride2; \ 108 src2+=src_stride2; \
109 dst+=dst_stride; \ 109 dst+=dst_stride; \
110 } while(--h); \ 110 } while(--h); \
111 }\ 111 }\
112 \ 112 \
113 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 113 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
114 {\ 114 {\
115 do { /* only src2 aligned */\ 115 do { /* only src2 aligned */\
116 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LP(src2 )) ); \ 116 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LP(src2 )) ); \
117 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ 117 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
118 src1+=src_stride1; \ 118 src1+=src_stride1; \
119 src2+=src_stride2; \ 119 src2+=src_stride2; \
120 dst+=dst_stride; \ 120 dst+=dst_stride; \
121 } while(--h); \ 121 } while(--h); \
122 }\ 122 }\
123 \ 123 \
124 static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 124 static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
125 {\ 125 {\
126 do {\ 126 do {\
127 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ 127 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \
128 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ 128 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
129 src1+=src_stride1; \ 129 src1+=src_stride1; \
130 src2+=src_stride2; \ 130 src2+=src_stride2; \
131 dst+=dst_stride; \ 131 dst+=dst_stride; \
132 } while(--h); \ 132 } while(--h); \
133 }\ 133 }\
134 \ 134 \
135 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 135 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
136 {\ 136 {\
137 do {\ 137 do {\
138 OP(LP(dst ),no_rnd_avg32(LP(src1 ),LP(src2 )) ); \ 138 OP(LP(dst ),no_rnd_avg32(LP(src1 ),LP(src2 )) ); \
139 OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \ 139 OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \
140 src1+=src_stride1; \ 140 src1+=src_stride1; \
141 src2+=src_stride2; \ 141 src2+=src_stride2; \
142 dst+=dst_stride; \ 142 dst+=dst_stride; \
143 } while(--h); \ 143 } while(--h); \
144 }\ 144 }\
145 \ 145 \
146 static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 146 static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
147 {\ 147 {\
148 do {\ 148 do {\
149 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ 149 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \
150 OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \ 150 OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \
151 src1+=src_stride1; \ 151 src1+=src_stride1; \
152 src2+=src_stride2; \ 152 src2+=src_stride2; \
153 dst+=dst_stride; \ 153 dst+=dst_stride; \
154 } while(--h); \ 154 } while(--h); \
155 }\ 155 }\
156 \ 156 \
157 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 157 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
158 {\ 158 {\
159 do {\ 159 do {\
160 OP(LP(dst ),no_rnd_avg32(LP(src1 ),LP(src2 )) ); \ 160 OP(LP(dst ),no_rnd_avg32(LP(src1 ),LP(src2 )) ); \
161 OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \ 161 OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \
162 OP(LP(dst+8),no_rnd_avg32(LP(src1+8),LP(src2+8)) ); \ 162 OP(LP(dst+8),no_rnd_avg32(LP(src1+8),LP(src2+8)) ); \
163 OP(LP(dst+12),no_rnd_avg32(LP(src1+12),LP(src2+12)) ); \ 163 OP(LP(dst+12),no_rnd_avg32(LP(src1+12),LP(src2+12)) ); \
164 src1+=src_stride1; \ 164 src1+=src_stride1; \
165 src2+=src_stride2; \ 165 src2+=src_stride2; \
166 dst+=dst_stride; \ 166 dst+=dst_stride; \
167 } while(--h); \ 167 } while(--h); \
168 }\ 168 }\
169 \ 169 \
170 static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 170 static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
171 {\ 171 {\
172 do {\ 172 do {\
173 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ 173 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \
174 OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \ 174 OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \
175 OP(LP(dst+8),rnd_avg32(LP(src1+8),LP(src2+8)) ); \ 175 OP(LP(dst+8),rnd_avg32(LP(src1+8),LP(src2+8)) ); \
176 OP(LP(dst+12),rnd_avg32(LP(src1+12),LP(src2+12)) ); \ 176 OP(LP(dst+12),rnd_avg32(LP(src1+12),LP(src2+12)) ); \
177 src1+=src_stride1; \ 177 src1+=src_stride1; \
178 src2+=src_stride2; \ 178 src2+=src_stride2; \
179 dst+=dst_stride; \ 179 dst+=dst_stride; \
180 } while(--h); \ 180 } while(--h); \
181 }\ 181 }\
182 \ 182 \
183 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 183 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
184 { OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \ 184 { OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
185 \ 185 \
191 \ 191 \
192 static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ 192 static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
193 { OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \ 193 { OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
194 \ 194 \
195 static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 195 static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
196 do { \ 196 do { \
197 uint32_t a0,a1,a2,a3; \ 197 uint32_t a0,a1,a2,a3; \
198 UNPACK(a0,a1,LP(src1),LP(src2)); \ 198 UNPACK(a0,a1,LP(src1),LP(src2)); \
199 UNPACK(a2,a3,LP(src3),LP(src4)); \ 199 UNPACK(a2,a3,LP(src3),LP(src4)); \
200 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ 200 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
201 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ 201 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
202 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ 202 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
203 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \ 203 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
204 src1+=src_stride1;\ 204 src1+=src_stride1;\
205 src2+=src_stride2;\ 205 src2+=src_stride2;\
206 src3+=src_stride3;\ 206 src3+=src_stride3;\
207 src4+=src_stride4;\ 207 src4+=src_stride4;\
208 dst+=dst_stride;\ 208 dst+=dst_stride;\
209 } while(--h); \ 209 } while(--h); \
210 } \ 210 } \
211 \ 211 \
212 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 212 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
213 do { \ 213 do { \
214 uint32_t a0,a1,a2,a3; \ 214 uint32_t a0,a1,a2,a3; \
215 UNPACK(a0,a1,LP(src1),LP(src2)); \ 215 UNPACK(a0,a1,LP(src1),LP(src2)); \
216 UNPACK(a2,a3,LP(src3),LP(src4)); \ 216 UNPACK(a2,a3,LP(src3),LP(src4)); \
217 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ 217 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
218 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ 218 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
219 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ 219 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
220 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ 220 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
221 src1+=src_stride1;\ 221 src1+=src_stride1;\
222 src2+=src_stride2;\ 222 src2+=src_stride2;\
223 src3+=src_stride3;\ 223 src3+=src_stride3;\
224 src4+=src_stride4;\ 224 src4+=src_stride4;\
225 dst+=dst_stride;\ 225 dst+=dst_stride;\
226 } while(--h); \ 226 } while(--h); \
227 } \ 227 } \
228 \ 228 \
229 static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 229 static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
230 do { \ 230 do { \
231 uint32_t a0,a1,a2,a3; /* src1 only not aligned */\ 231 uint32_t a0,a1,a2,a3; /* src1 only not aligned */\
232 UNPACK(a0,a1,LD32(src1),LP(src2)); \ 232 UNPACK(a0,a1,LD32(src1),LP(src2)); \
233 UNPACK(a2,a3,LP(src3),LP(src4)); \ 233 UNPACK(a2,a3,LP(src3),LP(src4)); \
234 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ 234 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
235 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ 235 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
236 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ 236 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
237 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \ 237 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
238 src1+=src_stride1;\ 238 src1+=src_stride1;\
239 src2+=src_stride2;\ 239 src2+=src_stride2;\
240 src3+=src_stride3;\ 240 src3+=src_stride3;\
241 src4+=src_stride4;\ 241 src4+=src_stride4;\
242 dst+=dst_stride;\ 242 dst+=dst_stride;\
243 } while(--h); \ 243 } while(--h); \
244 } \ 244 } \
245 \ 245 \
246 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 246 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
247 do { \ 247 do { \
248 uint32_t a0,a1,a2,a3; \ 248 uint32_t a0,a1,a2,a3; \
249 UNPACK(a0,a1,LD32(src1),LP(src2)); \ 249 UNPACK(a0,a1,LD32(src1),LP(src2)); \
250 UNPACK(a2,a3,LP(src3),LP(src4)); \ 250 UNPACK(a2,a3,LP(src3),LP(src4)); \
251 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ 251 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
252 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ 252 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
253 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ 253 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
254 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ 254 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
255 src1+=src_stride1;\ 255 src1+=src_stride1;\
256 src2+=src_stride2;\ 256 src2+=src_stride2;\
257 src3+=src_stride3;\ 257 src3+=src_stride3;\
258 src4+=src_stride4;\ 258 src4+=src_stride4;\
259 dst+=dst_stride;\ 259 dst+=dst_stride;\
260 } while(--h); \ 260 } while(--h); \
261 } \ 261 } \
262 \ 262 \
263 static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 263 static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
264 do { \ 264 do { \
265 uint32_t a0,a1,a2,a3; \ 265 uint32_t a0,a1,a2,a3; \
266 UNPACK(a0,a1,LP(src1),LP(src2)); \ 266 UNPACK(a0,a1,LP(src1),LP(src2)); \
267 UNPACK(a2,a3,LP(src3),LP(src4)); \ 267 UNPACK(a2,a3,LP(src3),LP(src4)); \
268 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ 268 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
269 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ 269 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
270 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ 270 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
271 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \ 271 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
272 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \ 272 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \
273 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ 273 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
274 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \ 274 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
275 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \ 275 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \
276 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ 276 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
277 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \ 277 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
278 src1+=src_stride1;\ 278 src1+=src_stride1;\
279 src2+=src_stride2;\ 279 src2+=src_stride2;\
280 src3+=src_stride3;\ 280 src3+=src_stride3;\
281 src4+=src_stride4;\ 281 src4+=src_stride4;\
282 dst+=dst_stride;\ 282 dst+=dst_stride;\
283 } while(--h); \ 283 } while(--h); \
284 } \ 284 } \
285 \ 285 \
286 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 286 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
287 do { \ 287 do { \
288 uint32_t a0,a1,a2,a3; \ 288 uint32_t a0,a1,a2,a3; \
289 UNPACK(a0,a1,LP(src1),LP(src2)); \ 289 UNPACK(a0,a1,LP(src1),LP(src2)); \
290 UNPACK(a2,a3,LP(src3),LP(src4)); \ 290 UNPACK(a2,a3,LP(src3),LP(src4)); \
291 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ 291 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
292 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ 292 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
293 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ 293 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
294 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ 294 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
295 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \ 295 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \
296 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ 296 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
297 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \ 297 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
298 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \ 298 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \
299 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ 299 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
300 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \ 300 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
301 src1+=src_stride1;\ 301 src1+=src_stride1;\
302 src2+=src_stride2;\ 302 src2+=src_stride2;\
303 src3+=src_stride3;\ 303 src3+=src_stride3;\
304 src4+=src_stride4;\ 304 src4+=src_stride4;\
305 dst+=dst_stride;\ 305 dst+=dst_stride;\
306 } while(--h); \ 306 } while(--h); \
307 } \ 307 } \
308 \ 308 \
309 static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 309 static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
310 do { /* src1 is unaligned */\ 310 do { /* src1 is unaligned */\
311 uint32_t a0,a1,a2,a3; \ 311 uint32_t a0,a1,a2,a3; \
312 UNPACK(a0,a1,LD32(src1),LP(src2)); \ 312 UNPACK(a0,a1,LD32(src1),LP(src2)); \
313 UNPACK(a2,a3,LP(src3),LP(src4)); \ 313 UNPACK(a2,a3,LP(src3),LP(src4)); \
314 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ 314 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
315 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ 315 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
316 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ 316 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
317 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \ 317 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
318 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \ 318 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \
319 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ 319 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
320 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \ 320 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
321 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \ 321 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \
322 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ 322 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
323 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \ 323 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
324 src1+=src_stride1;\ 324 src1+=src_stride1;\
325 src2+=src_stride2;\ 325 src2+=src_stride2;\
326 src3+=src_stride3;\ 326 src3+=src_stride3;\
327 src4+=src_stride4;\ 327 src4+=src_stride4;\
328 dst+=dst_stride;\ 328 dst+=dst_stride;\
329 } while(--h); \ 329 } while(--h); \
330 } \ 330 } \
331 \ 331 \
332 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ 332 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
333 do { \ 333 do { \
334 uint32_t a0,a1,a2,a3; \ 334 uint32_t a0,a1,a2,a3; \
335 UNPACK(a0,a1,LD32(src1),LP(src2)); \ 335 UNPACK(a0,a1,LD32(src1),LP(src2)); \
336 UNPACK(a2,a3,LP(src3),LP(src4)); \ 336 UNPACK(a2,a3,LP(src3),LP(src4)); \
337 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ 337 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
338 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ 338 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
339 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ 339 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
340 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ 340 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
341 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \ 341 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \
342 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ 342 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
343 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \ 343 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
344 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \ 344 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \
345 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ 345 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
346 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \ 346 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
347 src1+=src_stride1;\ 347 src1+=src_stride1;\
348 src2+=src_stride2;\ 348 src2+=src_stride2;\
349 src3+=src_stride3;\ 349 src3+=src_stride3;\
350 src4+=src_stride4;\ 350 src4+=src_stride4;\
351 dst+=dst_stride;\ 351 dst+=dst_stride;\
352 } while(--h); \ 352 } while(--h); \
353 } \ 353 } \
354 \ 354 \
355 355
356 #define op_avg(a, b) a = rnd_avg32(a,b) 356 #define op_avg(a, b) a = rnd_avg32(a,b)
357 #define op_put(a, b) a = b 357 #define op_put(a, b) a = b