Mercurial > libavcodec.hg
comparison sh4/qpel.c @ 2979:bfabfdf9ce55 libavcodec
COSMETICS: tabs --> spaces, some prettyprinting
author | diego |
---|---|
date | Thu, 22 Dec 2005 01:10:11 +0000 |
parents | ef2149182f1c |
children | c537a97eec66 |
comparison
equal
deleted
inserted
replaced
2978:403183bbb505 | 2979:bfabfdf9ce55 |
---|---|
1 /* | 1 /* |
2 This is optimized for SH, which has post-increment addressing (*p++). | 2 This is optimized for SH, which has post-increment addressing (*p++). |
3 On some CPUs, indexed access (p[n]) may be faster than post-increment (*p++). | 3 On some CPUs, indexed access (p[n]) may be faster than post-increment (*p++). |
4 */ | 4 */ |
5 | 5 |
6 #define LD(adr) *(uint32_t*)(adr) | 6 #define LD(adr) *(uint32_t*)(adr) |
7 | 7 |
8 #define PIXOP2(OPNAME, OP) \ | 8 #define PIXOP2(OPNAME, OP) \ |
9 /*static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 9 /*static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
10 {\ | 10 {\ |
11 do {\ | 11 do {\ |
12 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ | 12 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ |
13 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ | 13 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ |
14 src1+=src_stride1; \ | 14 src1+=src_stride1; \ |
15 src2+=src_stride2; \ | 15 src2+=src_stride2; \ |
16 dst+=dst_stride; \ | 16 dst+=dst_stride; \ |
17 } while(--h); \ | 17 } while(--h); \ |
18 }\ | 18 }\ |
19 \ | 19 \ |
20 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 20 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
21 {\ | 21 {\ |
22 do {\ | 22 do {\ |
23 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ | 23 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ |
24 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ | 24 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ |
25 src1+=src_stride1; \ | 25 src1+=src_stride1; \ |
26 src2+=src_stride2; \ | 26 src2+=src_stride2; \ |
27 dst+=dst_stride; \ | 27 dst+=dst_stride; \ |
28 } while(--h); \ | 28 } while(--h); \ |
29 }\ | 29 }\ |
30 \ | 30 \ |
31 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 31 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
32 {\ | 32 {\ |
33 do {\ | 33 do {\ |
34 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ | 34 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ |
35 src1+=src_stride1; \ | 35 src1+=src_stride1; \ |
36 src2+=src_stride2; \ | 36 src2+=src_stride2; \ |
37 dst+=dst_stride; \ | 37 dst+=dst_stride; \ |
38 } while(--h); \ | 38 } while(--h); \ |
39 }\ | 39 }\ |
40 \ | 40 \ |
41 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 41 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
42 {\ | 42 {\ |
43 do {\ | 43 do {\ |
44 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ | 44 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ |
45 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ | 45 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ |
46 OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \ | 46 OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \ |
47 OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \ | 47 OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \ |
48 src1+=src_stride1; \ | 48 src1+=src_stride1; \ |
49 src2+=src_stride2; \ | 49 src2+=src_stride2; \ |
50 dst+=dst_stride; \ | 50 dst+=dst_stride; \ |
51 } while(--h); \ | 51 } while(--h); \ |
52 }\ | 52 }\ |
53 \ | 53 \ |
54 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 54 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
55 {\ | 55 {\ |
56 do {\ | 56 do {\ |
57 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ | 57 OP(LP(dst ),rnd_avg32(LD32(src1 ),LD32(src2 )) ); \ |
58 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ | 58 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \ |
59 OP(LP(dst+8),rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \ | 59 OP(LP(dst+8),rnd_avg32(LD32(src1+8),LD32(src2+8)) ); \ |
60 OP(LP(dst+12),rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \ | 60 OP(LP(dst+12),rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \ |
61 src1+=src_stride1; \ | 61 src1+=src_stride1; \ |
62 src2+=src_stride2; \ | 62 src2+=src_stride2; \ |
63 dst+=dst_stride; \ | 63 dst+=dst_stride; \ |
64 } while(--h); \ | 64 } while(--h); \ |
65 }*/\ | 65 }*/\ |
66 \ | 66 \ |
67 static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 67 static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
68 {\ | 68 {\ |
69 do {\ | 69 do {\ |
70 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ | 70 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ |
71 src1+=src_stride1; \ | 71 src1+=src_stride1; \ |
72 src2+=src_stride2; \ | 72 src2+=src_stride2; \ |
73 dst+=dst_stride; \ | 73 dst+=dst_stride; \ |
74 } while(--h); \ | 74 } while(--h); \ |
75 }\ | 75 }\ |
76 \ | 76 \ |
77 static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 77 static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
78 {\ | 78 {\ |
79 do {\ | 79 do {\ |
80 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ | 80 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ |
81 src1+=src_stride1; \ | 81 src1+=src_stride1; \ |
82 src2+=src_stride2; \ | 82 src2+=src_stride2; \ |
83 dst+=dst_stride; \ | 83 dst+=dst_stride; \ |
84 } while(--h); \ | 84 } while(--h); \ |
85 }\ | 85 }\ |
86 \ | 86 \ |
87 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 87 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
88 {\ | 88 {\ |
89 do {\ | 89 do {\ |
90 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LP(src2 )) ); \ | 90 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LP(src2 )) ); \ |
91 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ | 91 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ |
92 OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LP(src2+8)) ); \ | 92 OP(LP(dst+8),no_rnd_avg32(LD32(src1+8),LP(src2+8)) ); \ |
93 OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LP(src2+12)) ); \ | 93 OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LP(src2+12)) ); \ |
94 src1+=src_stride1; \ | 94 src1+=src_stride1; \ |
95 src2+=src_stride2; \ | 95 src2+=src_stride2; \ |
96 dst+=dst_stride; \ | 96 dst+=dst_stride; \ |
97 } while(--h); \ | 97 } while(--h); \ |
98 }\ | 98 }\ |
99 \ | 99 \ |
100 static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 100 static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
101 {\ | 101 {\ |
102 do {\ | 102 do {\ |
103 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ | 103 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ |
104 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ | 104 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ |
105 OP(LP(dst+8),rnd_avg32(LD32(src1+8),LP(src2+8)) ); \ | 105 OP(LP(dst+8),rnd_avg32(LD32(src1+8),LP(src2+8)) ); \ |
106 OP(LP(dst+12),rnd_avg32(LD32(src1+12),LP(src2+12)) ); \ | 106 OP(LP(dst+12),rnd_avg32(LD32(src1+12),LP(src2+12)) ); \ |
107 src1+=src_stride1; \ | 107 src1+=src_stride1; \ |
108 src2+=src_stride2; \ | 108 src2+=src_stride2; \ |
109 dst+=dst_stride; \ | 109 dst+=dst_stride; \ |
110 } while(--h); \ | 110 } while(--h); \ |
111 }\ | 111 }\ |
112 \ | 112 \ |
113 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 113 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
114 {\ | 114 {\ |
115 do { /* only src2 is aligned: src1 is read via LD32, src2 via LP */\ | 115 do { /* only src2 is aligned: src1 is read via LD32, src2 via LP */\ |
116 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LP(src2 )) ); \ | 116 OP(LP(dst ),no_rnd_avg32(LD32(src1 ),LP(src2 )) ); \ |
117 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ | 117 OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ |
118 src1+=src_stride1; \ | 118 src1+=src_stride1; \ |
119 src2+=src_stride2; \ | 119 src2+=src_stride2; \ |
120 dst+=dst_stride; \ | 120 dst+=dst_stride; \ |
121 } while(--h); \ | 121 } while(--h); \ |
122 }\ | 122 }\ |
123 \ | 123 \ |
124 static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 124 static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
125 {\ | 125 {\ |
126 do {\ | 126 do {\ |
127 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ | 127 OP(LP(dst ),rnd_avg32(LD32(src1 ),LP(src2 )) ); \ |
128 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ | 128 OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \ |
129 src1+=src_stride1; \ | 129 src1+=src_stride1; \ |
130 src2+=src_stride2; \ | 130 src2+=src_stride2; \ |
131 dst+=dst_stride; \ | 131 dst+=dst_stride; \ |
132 } while(--h); \ | 132 } while(--h); \ |
133 }\ | 133 }\ |
134 \ | 134 \ |
135 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 135 static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
136 {\ | 136 {\ |
137 do {\ | 137 do {\ |
138 OP(LP(dst ),no_rnd_avg32(LP(src1 ),LP(src2 )) ); \ | 138 OP(LP(dst ),no_rnd_avg32(LP(src1 ),LP(src2 )) ); \ |
139 OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \ | 139 OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \ |
140 src1+=src_stride1; \ | 140 src1+=src_stride1; \ |
141 src2+=src_stride2; \ | 141 src2+=src_stride2; \ |
142 dst+=dst_stride; \ | 142 dst+=dst_stride; \ |
143 } while(--h); \ | 143 } while(--h); \ |
144 }\ | 144 }\ |
145 \ | 145 \ |
146 static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 146 static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
147 {\ | 147 {\ |
148 do {\ | 148 do {\ |
149 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ | 149 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ |
150 OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \ | 150 OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \ |
151 src1+=src_stride1; \ | 151 src1+=src_stride1; \ |
152 src2+=src_stride2; \ | 152 src2+=src_stride2; \ |
153 dst+=dst_stride; \ | 153 dst+=dst_stride; \ |
154 } while(--h); \ | 154 } while(--h); \ |
155 }\ | 155 }\ |
156 \ | 156 \ |
157 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 157 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
158 {\ | 158 {\ |
159 do {\ | 159 do {\ |
160 OP(LP(dst ),no_rnd_avg32(LP(src1 ),LP(src2 )) ); \ | 160 OP(LP(dst ),no_rnd_avg32(LP(src1 ),LP(src2 )) ); \ |
161 OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \ | 161 OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \ |
162 OP(LP(dst+8),no_rnd_avg32(LP(src1+8),LP(src2+8)) ); \ | 162 OP(LP(dst+8),no_rnd_avg32(LP(src1+8),LP(src2+8)) ); \ |
163 OP(LP(dst+12),no_rnd_avg32(LP(src1+12),LP(src2+12)) ); \ | 163 OP(LP(dst+12),no_rnd_avg32(LP(src1+12),LP(src2+12)) ); \ |
164 src1+=src_stride1; \ | 164 src1+=src_stride1; \ |
165 src2+=src_stride2; \ | 165 src2+=src_stride2; \ |
166 dst+=dst_stride; \ | 166 dst+=dst_stride; \ |
167 } while(--h); \ | 167 } while(--h); \ |
168 }\ | 168 }\ |
169 \ | 169 \ |
170 static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 170 static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
171 {\ | 171 {\ |
172 do {\ | 172 do {\ |
173 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ | 173 OP(LP(dst ),rnd_avg32(LP(src1 ),LP(src2 )) ); \ |
174 OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \ | 174 OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \ |
175 OP(LP(dst+8),rnd_avg32(LP(src1+8),LP(src2+8)) ); \ | 175 OP(LP(dst+8),rnd_avg32(LP(src1+8),LP(src2+8)) ); \ |
176 OP(LP(dst+12),rnd_avg32(LP(src1+12),LP(src2+12)) ); \ | 176 OP(LP(dst+12),rnd_avg32(LP(src1+12),LP(src2+12)) ); \ |
177 src1+=src_stride1; \ | 177 src1+=src_stride1; \ |
178 src2+=src_stride2; \ | 178 src2+=src_stride2; \ |
179 dst+=dst_stride; \ | 179 dst+=dst_stride; \ |
180 } while(--h); \ | 180 } while(--h); \ |
181 }\ | 181 }\ |
182 \ | 182 \ |
183 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 183 static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
184 { OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \ | 184 { OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \ |
185 \ | 185 \ |
191 \ | 191 \ |
192 static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ | 192 static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \ |
193 { OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \ | 193 { OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \ |
194 \ | 194 \ |
195 static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | 195 static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ |
196 do { \ | 196 do { \ |
197 uint32_t a0,a1,a2,a3; \ | 197 uint32_t a0,a1,a2,a3; \ |
198 UNPACK(a0,a1,LP(src1),LP(src2)); \ | 198 UNPACK(a0,a1,LP(src1),LP(src2)); \ |
199 UNPACK(a2,a3,LP(src3),LP(src4)); \ | 199 UNPACK(a2,a3,LP(src3),LP(src4)); \ |
200 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ | 200 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ |
201 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ | 201 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ |
202 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ | 202 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ |
203 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \ | 203 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \ |
204 src1+=src_stride1;\ | 204 src1+=src_stride1;\ |
205 src2+=src_stride2;\ | 205 src2+=src_stride2;\ |
206 src3+=src_stride3;\ | 206 src3+=src_stride3;\ |
207 src4+=src_stride4;\ | 207 src4+=src_stride4;\ |
208 dst+=dst_stride;\ | 208 dst+=dst_stride;\ |
209 } while(--h); \ | 209 } while(--h); \ |
210 } \ | 210 } \ |
211 \ | 211 \ |
212 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | 212 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ |
213 do { \ | 213 do { \ |
214 uint32_t a0,a1,a2,a3; \ | 214 uint32_t a0,a1,a2,a3; \ |
215 UNPACK(a0,a1,LP(src1),LP(src2)); \ | 215 UNPACK(a0,a1,LP(src1),LP(src2)); \ |
216 UNPACK(a2,a3,LP(src3),LP(src4)); \ | 216 UNPACK(a2,a3,LP(src3),LP(src4)); \ |
217 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ | 217 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ |
218 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ | 218 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ |
219 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ | 219 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ |
220 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ | 220 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ |
221 src1+=src_stride1;\ | 221 src1+=src_stride1;\ |
222 src2+=src_stride2;\ | 222 src2+=src_stride2;\ |
223 src3+=src_stride3;\ | 223 src3+=src_stride3;\ |
224 src4+=src_stride4;\ | 224 src4+=src_stride4;\ |
225 dst+=dst_stride;\ | 225 dst+=dst_stride;\ |
226 } while(--h); \ | 226 } while(--h); \ |
227 } \ | 227 } \ |
228 \ | 228 \ |
229 static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | 229 static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ |
230 do { \ | 230 do { \ |
231 uint32_t a0,a1,a2,a3; /* src1 only not aligned */\ | 231 uint32_t a0,a1,a2,a3; /* src1 only not aligned */\ |
232 UNPACK(a0,a1,LD32(src1),LP(src2)); \ | 232 UNPACK(a0,a1,LD32(src1),LP(src2)); \ |
233 UNPACK(a2,a3,LP(src3),LP(src4)); \ | 233 UNPACK(a2,a3,LP(src3),LP(src4)); \ |
234 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ | 234 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ |
235 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ | 235 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ |
236 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ | 236 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ |
237 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \ | 237 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \ |
238 src1+=src_stride1;\ | 238 src1+=src_stride1;\ |
239 src2+=src_stride2;\ | 239 src2+=src_stride2;\ |
240 src3+=src_stride3;\ | 240 src3+=src_stride3;\ |
241 src4+=src_stride4;\ | 241 src4+=src_stride4;\ |
242 dst+=dst_stride;\ | 242 dst+=dst_stride;\ |
243 } while(--h); \ | 243 } while(--h); \ |
244 } \ | 244 } \ |
245 \ | 245 \ |
246 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | 246 static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ |
247 do { \ | 247 do { \ |
248 uint32_t a0,a1,a2,a3; \ | 248 uint32_t a0,a1,a2,a3; \ |
249 UNPACK(a0,a1,LD32(src1),LP(src2)); \ | 249 UNPACK(a0,a1,LD32(src1),LP(src2)); \ |
250 UNPACK(a2,a3,LP(src3),LP(src4)); \ | 250 UNPACK(a2,a3,LP(src3),LP(src4)); \ |
251 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ | 251 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ |
252 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ | 252 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ |
253 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ | 253 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ |
254 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ | 254 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ |
255 src1+=src_stride1;\ | 255 src1+=src_stride1;\ |
256 src2+=src_stride2;\ | 256 src2+=src_stride2;\ |
257 src3+=src_stride3;\ | 257 src3+=src_stride3;\ |
258 src4+=src_stride4;\ | 258 src4+=src_stride4;\ |
259 dst+=dst_stride;\ | 259 dst+=dst_stride;\ |
260 } while(--h); \ | 260 } while(--h); \ |
261 } \ | 261 } \ |
262 \ | 262 \ |
263 static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | 263 static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ |
264 do { /* per row: one 4-byte word at each of offsets 0, 4, 8, 12; runs h rows, h must be >= 1 */ \ | 264 do { /* per row: one 4-byte word at each of offsets 0, 4, 8, 12; runs h rows, h must be >= 1 */ \ |
265 uint32_t a0,a1,a2,a3; \ | 265 uint32_t a0,a1,a2,a3; \ |
266 UNPACK(a0,a1,LP(src1),LP(src2)); \ | 266 UNPACK(a0,a1,LP(src1),LP(src2)); \ |
267 UNPACK(a2,a3,LP(src3),LP(src4)); \ | 267 UNPACK(a2,a3,LP(src3),LP(src4)); \ |
268 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ | 268 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ |
269 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ | 269 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ |
270 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ | 270 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ |
271 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); /* result of the +4 source words goes to dst+4 */ \ | 271 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); /* result of the +4 source words goes to dst+4 */ \ |
272 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \ | 272 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \ |
273 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ | 273 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ |
274 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \ | 274 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \ |
275 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \ | 275 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \ |
276 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ | 276 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ |
277 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \ | 277 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \ |
278 src1+=src_stride1;\ | 278 src1+=src_stride1;\ |
279 src2+=src_stride2;\ | 279 src2+=src_stride2;\ |
280 src3+=src_stride3;\ | 280 src3+=src_stride3;\ |
281 src4+=src_stride4;\ | 281 src4+=src_stride4;\ |
282 dst+=dst_stride;\ | 282 dst+=dst_stride;\ |
283 } while(--h); \ | 283 } while(--h); \ |
284 } \ | 284 } \ |
285 \ | 285 \ |
286 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | 286 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ |
287 do { \ | 287 do { \ |
288 uint32_t a0,a1,a2,a3; \ | 288 uint32_t a0,a1,a2,a3; \ |
289 UNPACK(a0,a1,LP(src1),LP(src2)); \ | 289 UNPACK(a0,a1,LP(src1),LP(src2)); \ |
290 UNPACK(a2,a3,LP(src3),LP(src4)); \ | 290 UNPACK(a2,a3,LP(src3),LP(src4)); \ |
291 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ | 291 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ |
292 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ | 292 UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \ |
293 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ | 293 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ |
294 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ | 294 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ |
295 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \ | 295 UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \ |
296 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ | 296 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ |
297 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \ | 297 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \ |
298 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \ | 298 UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \ |
299 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ | 299 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ |
300 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \ | 300 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \ |
301 src1+=src_stride1;\ | 301 src1+=src_stride1;\ |
302 src2+=src_stride2;\ | 302 src2+=src_stride2;\ |
303 src3+=src_stride3;\ | 303 src3+=src_stride3;\ |
304 src4+=src_stride4;\ | 304 src4+=src_stride4;\ |
305 dst+=dst_stride;\ | 305 dst+=dst_stride;\ |
306 } while(--h); \ | 306 } while(--h); \ |
307 } \ | 307 } \ |
308 \ | 308 \ |
309 static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | 309 static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ |
310 do { /* src1 is unaligned (read via LD32); per row: one 4-byte word at each of offsets 0, 4, 8, 12; h must be >= 1 */\ | 310 do { /* src1 is unaligned (read via LD32); per row: one 4-byte word at each of offsets 0, 4, 8, 12; h must be >= 1 */\ |
311 uint32_t a0,a1,a2,a3; \ | 311 uint32_t a0,a1,a2,a3; \ |
312 UNPACK(a0,a1,LD32(src1),LP(src2)); \ | 312 UNPACK(a0,a1,LD32(src1),LP(src2)); \ |
313 UNPACK(a2,a3,LP(src3),LP(src4)); \ | 313 UNPACK(a2,a3,LP(src3),LP(src4)); \ |
314 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ | 314 OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \ |
315 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ | 315 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ |
316 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ | 316 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ |
317 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); /* result of the +4 source words goes to dst+4 */ \ | 317 OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); /* result of the +4 source words goes to dst+4 */ \ |
318 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \ | 318 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \ |
319 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ | 319 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ |
320 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \ | 320 OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \ |
321 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \ | 321 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \ |
322 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ | 322 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ |
323 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \ | 323 OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \ |
324 src1+=src_stride1;\ | 324 src1+=src_stride1;\ |
325 src2+=src_stride2;\ | 325 src2+=src_stride2;\ |
326 src3+=src_stride3;\ | 326 src3+=src_stride3;\ |
327 src4+=src_stride4;\ | 327 src4+=src_stride4;\ |
328 dst+=dst_stride;\ | 328 dst+=dst_stride;\ |
329 } while(--h); \ | 329 } while(--h); \ |
330 } \ | 330 } \ |
331 \ | 331 \ |
332 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ | 332 static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\ |
333 do { \ | 333 do { \ |
334 uint32_t a0,a1,a2,a3; \ | 334 uint32_t a0,a1,a2,a3; \ |
335 UNPACK(a0,a1,LD32(src1),LP(src2)); \ | 335 UNPACK(a0,a1,LD32(src1),LP(src2)); \ |
336 UNPACK(a2,a3,LP(src3),LP(src4)); \ | 336 UNPACK(a2,a3,LP(src3),LP(src4)); \ |
337 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ | 337 OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \ |
338 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ | 338 UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \ |
339 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ | 339 UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \ |
340 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ | 340 OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \ |
341 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \ | 341 UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \ |
342 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ | 342 UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \ |
343 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \ | 343 OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \ |
344 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \ | 344 UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \ |
345 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ | 345 UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \ |
346 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \ | 346 OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \ |
347 src1+=src_stride1;\ | 347 src1+=src_stride1;\ |
348 src2+=src_stride2;\ | 348 src2+=src_stride2;\ |
349 src3+=src_stride3;\ | 349 src3+=src_stride3;\ |
350 src4+=src_stride4;\ | 350 src4+=src_stride4;\ |
351 dst+=dst_stride;\ | 351 dst+=dst_stride;\ |
352 } while(--h); \ | 352 } while(--h); \ |
353 } \ | 353 } \ |
354 \ | 354 \ |
355 | 355 |
356 #define op_avg(a, b) a = rnd_avg32(a,b) | 356 #define op_avg(a, b) a = rnd_avg32(a,b) |
357 #define op_put(a, b) a = b | 357 #define op_put(a, b) a = b |