comparison i386/dsputil_mmx_avg.h @ 441:c0de4d3c7d3c libavcodec

* optimized avg_* functions (except xy2)
* minor speedup for put_pixels_x2 & cleanup
author kabi
date Tue, 28 May 2002 16:35:58 +0000
parents 6ae275655a23
children 006965950f49
comparison
equal deleted inserted replaced
440:000aeeac27a2 441:c0de4d3c7d3c
17 * License along with this library; if not, write to the Free Software 17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * 19 *
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> 20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
22 * and improved by Zdenek Kabelac <kabi@users.sf.net>
22 */ 23 */
23 24
24 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm 25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
25 clobber bug - now it will work with 2.95.2 and also with -fPIC 26 clobber bug - now it will work with 2.95.2 and also with -fPIC
26 */ 27 */
27 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 28 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
28 { 29 {
29 __asm __volatile( 30 __asm __volatile(
30 "lea (%3, %3), %%eax \n\t" 31 "lea (%3, %3), %%eax \n\t"
32 "1: \n\t"
33 "movq (%1), %%mm0 \n\t"
34 "movq (%1, %3), %%mm1 \n\t"
35 PAVGB" 1(%1), %%mm0 \n\t"
36 PAVGB" 1(%1, %3), %%mm1 \n\t"
37 "movq %%mm0, (%2) \n\t"
38 "movq %%mm1, (%2, %3) \n\t"
39 "addl %%eax, %1 \n\t"
40 "addl %%eax, %2 \n\t"
41 "movq (%1), %%mm0 \n\t"
42 "movq (%1, %3), %%mm1 \n\t"
43 PAVGB" 1(%1), %%mm0 \n\t"
44 PAVGB" 1(%1, %3), %%mm1 \n\t"
45 "addl %%eax, %1 \n\t"
46 "movq %%mm0, (%2) \n\t"
47 "movq %%mm1, (%2, %3) \n\t"
48 "addl %%eax, %2 \n\t"
49 "subl $4, %0 \n\t"
50 "jnz 1b \n\t"
51 :"+g"(h), "+S"(pixels), "+D"(block)
52 :"r" (line_size)
53 :"%eax", "memory");
54 }
55
56 /* GL: this function does incorrect rounding if overflow */
57 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
58 {
59 __asm __volatile(
60 "lea (%3, %3), %%eax \n\t"
61 MOVQ_BONE(%%mm7)
31 "1: \n\t" 62 "1: \n\t"
32 "movq (%1), %%mm0 \n\t" 63 "movq (%1), %%mm0 \n\t"
33 "movq (%1, %3), %%mm2 \n\t" 64 "movq (%1, %3), %%mm2 \n\t"
34 "movq 1(%1), %%mm1 \n\t" 65 "movq 1(%1), %%mm1 \n\t"
35 "movq 1(%1, %3), %%mm3 \n\t" 66 "movq 1(%1, %3), %%mm3 \n\t"
36 "addl %%eax, %1 \n\t" 67 "addl %%eax, %1 \n\t"
68 "psubusb %%mm7, %%mm0 \n\t"
69 "psubusb %%mm7, %%mm2 \n\t"
37 PAVGB" %%mm1, %%mm0 \n\t" 70 PAVGB" %%mm1, %%mm0 \n\t"
38 PAVGB" %%mm3, %%mm2 \n\t" 71 PAVGB" %%mm3, %%mm2 \n\t"
39 "movq %%mm0, (%2) \n\t" 72 "movq %%mm0, (%2) \n\t"
40 "movq %%mm2, (%2, %3) \n\t" 73 "movq %%mm2, (%2, %3) \n\t"
41 "movq (%1), %%mm0 \n\t" 74 "movq (%1), %%mm0 \n\t"
42 "movq 1(%1), %%mm1 \n\t" 75 "movq 1(%1), %%mm1 \n\t"
43 "movq (%1, %3), %%mm2 \n\t" 76 "movq (%1, %3), %%mm2 \n\t"
44 "movq 1(%1, %3), %%mm3 \n\t" 77 "movq 1(%1, %3), %%mm3 \n\t"
45 "addl %%eax, %2 \n\t" 78 "addl %%eax, %2 \n\t"
46 "addl %%eax, %1 \n\t" 79 "addl %%eax, %1 \n\t"
80 "psubusb %%mm7, %%mm0 \n\t"
81 "psubusb %%mm7, %%mm2 \n\t"
47 PAVGB" %%mm1, %%mm0 \n\t" 82 PAVGB" %%mm1, %%mm0 \n\t"
48 PAVGB" %%mm3, %%mm2 \n\t" 83 PAVGB" %%mm3, %%mm2 \n\t"
49 "movq %%mm0, (%2) \n\t" 84 "movq %%mm0, (%2) \n\t"
50 "movq %%mm2, (%2, %3) \n\t" 85 "movq %%mm2, (%2, %3) \n\t"
51 "addl %%eax, %2 \n\t" 86 "addl %%eax, %2 \n\t"
52 "subl $4, %0 \n\t" 87 "subl $4, %0 \n\t"
53 " jnz 1b \n\t" 88 "jnz 1b \n\t"
54 :"+g"(h), "+S"(pixels), "+D"(block) 89 :"+g"(h), "+S"(pixels), "+D"(block)
55 :"c" (line_size) 90 :"r" (line_size)
56 :"%eax", "memory");
57 }
58
59 /* GL: this function does incorrect rounding if overflow */
60 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
61 {
62 __asm __volatile(
63 "lea (%3, %3), %%eax \n\t"
64 MOVQ_BONE(%%mm7)
65 "1: \n\t"
66 "movq (%1), %%mm0 \n\t"
67 "movq (%1, %3), %%mm2 \n\t"
68 "movq 1(%1), %%mm1 \n\t"
69 "movq 1(%1, %3), %%mm3 \n\t"
70 "addl %%eax, %1 \n\t"
71 "psubusb %%mm7, %%mm0 \n\t"
72 "psubusb %%mm7, %%mm2 \n\t"
73 PAVGB" %%mm1, %%mm0 \n\t"
74 PAVGB" %%mm3, %%mm2 \n\t"
75 "movq %%mm0, (%2) \n\t"
76 "movq %%mm2, (%2, %3) \n\t"
77 "movq (%1), %%mm0 \n\t"
78 "movq 1(%1), %%mm1 \n\t"
79 "movq (%1, %3), %%mm2 \n\t"
80 "movq 1(%1, %3), %%mm3 \n\t"
81 "addl %%eax, %2 \n\t"
82 "addl %%eax, %1 \n\t"
83 "psubusb %%mm7, %%mm0 \n\t"
84 "psubusb %%mm7, %%mm2 \n\t"
85 PAVGB" %%mm1, %%mm0 \n\t"
86 PAVGB" %%mm3, %%mm2 \n\t"
87 "movq %%mm0, (%2) \n\t"
88 "movq %%mm2, (%2, %3) \n\t"
89 "addl %%eax, %2 \n\t"
90 "subl $4, %0 \n\t"
91 "jnz 1b \n\t"
92 :"+g"(h), "+S"(pixels), "+D"(block)
93 :"c" (line_size)
94 :"%eax", "memory"); 91 :"%eax", "memory");
95 } 92 }
96 93
97 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 94 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
98 { 95 {
99 __asm __volatile( 96 __asm __volatile(
100 "lea (%3, %3), %%eax \n\t" 97 "lea (%3, %3), %%eax \n\t"
101 "movq (%1), %%mm0 \n\t" 98 "movq (%1), %%mm0 \n\t"
102 "subl %3, %2 \n\t" 99 "subl %3, %2 \n\t"
103 "1: \n\t" 100 "1: \n\t"
104 "movq (%1, %3), %%mm1 \n\t" 101 "movq (%1, %3), %%mm1 \n\t"
105 "movq (%1, %%eax), %%mm2 \n\t" 102 "movq (%1, %%eax), %%mm2 \n\t"
106 "addl %%eax, %1 \n\t" 103 "addl %%eax, %1 \n\t"
107 PAVGB" %%mm1, %%mm0 \n\t" 104 PAVGB" %%mm1, %%mm0 \n\t"
108 PAVGB" %%mm2, %%mm1 \n\t" 105 PAVGB" %%mm2, %%mm1 \n\t"
109 "movq %%mm0, (%2, %3) \n\t" 106 "movq %%mm0, (%2, %3) \n\t"
110 "movq %%mm1, (%2, %%eax) \n\t" 107 "movq %%mm1, (%2, %%eax) \n\t"
111 "movq (%1, %3), %%mm1 \n\t" 108 "movq (%1, %3), %%mm1 \n\t"
112 "movq (%1, %%eax), %%mm0 \n\t" 109 "movq (%1, %%eax), %%mm0 \n\t"
113 "addl %%eax, %2 \n\t" 110 "addl %%eax, %2 \n\t"
114 "addl %%eax, %1 \n\t" 111 "addl %%eax, %1 \n\t"
115 PAVGB" %%mm1, %%mm2 \n\t" 112 PAVGB" %%mm1, %%mm2 \n\t"
116 PAVGB" %%mm0, %%mm1 \n\t" 113 PAVGB" %%mm0, %%mm1 \n\t"
117 "movq %%mm2, (%2, %3) \n\t" 114 "movq %%mm2, (%2, %3) \n\t"
118 "movq %%mm1, (%2, %%eax) \n\t" 115 "movq %%mm1, (%2, %%eax) \n\t"
119 "addl %%eax, %2 \n\t" 116 "addl %%eax, %2 \n\t"
120 "subl $4, %0 \n\t" 117 "subl $4, %0 \n\t"
121 "jnz 1b \n\t" 118 "jnz 1b \n\t"
122 :"+g"(h), "+S"(pixels), "+D" (block) 119 :"+g"(h), "+S"(pixels), "+D" (block)
123 :"c"(line_size) 120 :"r" (line_size)
124 :"%eax", "memory"); 121 :"%eax", "memory");
125 } 122 }
126 123
127 /* GL: this function does incorrect rounding if overflow */ 124 /* GL: this function does incorrect rounding if overflow */
128 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 125 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
129 { 126 {
130 __asm __volatile( 127 __asm __volatile(
131 MOVQ_BONE(%%mm7) 128 MOVQ_BONE(%%mm7)
132 "lea (%3, %3), %%eax \n\t" 129 "lea (%3, %3), %%eax \n\t"
133 "movq (%1), %%mm0 \n\t" 130 "movq (%1), %%mm0 \n\t"
134 "subl %3, %2 \n\t" 131 "subl %3, %2 \n\t"
135 "1: \n\t" 132 "1: \n\t"
136 "movq (%1, %3), %%mm1 \n\t" 133 "movq (%1, %3), %%mm1 \n\t"
137 "movq (%1, %%eax), %%mm2 \n\t" 134 "movq (%1, %%eax), %%mm2 \n\t"
138 "addl %%eax, %1 \n\t" 135 "addl %%eax, %1 \n\t"
139 "psubusb %%mm7, %%mm1 \n\t" 136 "psubusb %%mm7, %%mm1 \n\t"
140 PAVGB" %%mm1, %%mm0 \n\t" 137 PAVGB" %%mm1, %%mm0 \n\t"
141 PAVGB" %%mm2, %%mm1 \n\t" 138 PAVGB" %%mm2, %%mm1 \n\t"
142 "movq %%mm0, (%2, %3) \n\t" 139 "movq %%mm0, (%2, %3) \n\t"
143 "movq %%mm1, (%2, %%eax) \n\t" 140 "movq %%mm1, (%2, %%eax) \n\t"
144 "movq (%1, %3), %%mm1 \n\t" 141 "movq (%1, %3), %%mm1 \n\t"
145 "movq (%1, %%eax), %%mm0 \n\t" 142 "movq (%1, %%eax), %%mm0 \n\t"
146 "addl %%eax, %2 \n\t" 143 "addl %%eax, %2 \n\t"
147 "addl %%eax, %1 \n\t" 144 "addl %%eax, %1 \n\t"
148 "psubusb %%mm7, %%mm1 \n\t" 145 "psubusb %%mm7, %%mm1 \n\t"
149 PAVGB" %%mm1, %%mm2 \n\t" 146 PAVGB" %%mm1, %%mm2 \n\t"
150 PAVGB" %%mm0, %%mm1 \n\t" 147 PAVGB" %%mm0, %%mm1 \n\t"
151 "movq %%mm2, (%2, %3) \n\t" 148 "movq %%mm2, (%2, %3) \n\t"
152 "movq %%mm1, (%2, %%eax) \n\t" 149 "movq %%mm1, (%2, %%eax) \n\t"
153 "addl %%eax, %2 \n\t" 150 "addl %%eax, %2 \n\t"
154 "subl $4, %0 \n\t" 151 "subl $4, %0 \n\t"
155 "jnz 1b \n\t" 152 "jnz 1b \n\t"
156 :"+g"(h), "+S"(pixels), "+D" (block) 153 :"+g"(h), "+S"(pixels), "+D" (block)
157 :"c"(line_size) 154 :"r" (line_size)
158 :"%eax", "memory"); 155 :"%eax", "memory");
159 } 156 }
160 157
161 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 158 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
162 { 159 {
163 __asm __volatile( 160 __asm __volatile(
164 "xorl %%eax, %%eax \n\t" 161 "lea (%3, %3), %%eax \n\t"
165 ".balign 16 \n\t" 162 "1: \n\t"
166 "1: \n\t" 163 "movq (%2), %%mm0 \n\t"
164 "movq (%2, %3), %%mm1 \n\t"
165 PAVGB" (%1), %%mm0 \n\t"
166 PAVGB" (%1, %3), %%mm1 \n\t"
167 "movq %%mm0, (%2) \n\t"
168 "movq %%mm1, (%2, %3) \n\t"
169 "addl %%eax, %1 \n\t"
170 "addl %%eax, %2 \n\t"
171 "movq (%2), %%mm0 \n\t"
172 "movq (%2, %3), %%mm1 \n\t"
173 PAVGB" (%1), %%mm0 \n\t"
174 PAVGB" (%1, %3), %%mm1 \n\t"
175 "addl %%eax, %1 \n\t"
176 "movq %%mm0, (%2) \n\t"
177 "movq %%mm1, (%2, %3) \n\t"
178 "addl %%eax, %2 \n\t"
179 "subl $4, %0 \n\t"
180 "jnz 1b \n\t"
181 :"+g"(h), "+S"(pixels), "+D"(block)
182 :"r" (line_size)
183 :"%eax", "memory");
184 }
185
186 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
187 {
188 __asm __volatile(
189 "lea (%3, %3), %%eax \n\t"
190 "1: \n\t"
191 "movq (%1), %%mm0 \n\t"
192 "movq (%1, %3), %%mm2 \n\t"
193 PAVGB" 1(%1), %%mm0 \n\t"
194 PAVGB" 1(%1, %3), %%mm2 \n\t"
195 PAVGB" (%2), %%mm0 \n\t"
196 PAVGB" (%2, %3), %%mm2 \n\t"
197 "addl %%eax, %1 \n\t"
198 "movq %%mm0, (%2) \n\t"
199 "movq %%mm2, (%2, %3) \n\t"
200 "movq (%1), %%mm0 \n\t"
201 "movq (%1, %3), %%mm2 \n\t"
202 PAVGB" 1(%1), %%mm0 \n\t"
203 PAVGB" 1(%1, %3), %%mm2 \n\t"
204 "addl %%eax, %2 \n\t"
205 "addl %%eax, %1 \n\t"
206 PAVGB" (%2), %%mm0 \n\t"
207 PAVGB" (%2, %3), %%mm2 \n\t"
208 "movq %%mm0, (%2) \n\t"
209 "movq %%mm2, (%2, %3) \n\t"
210 "addl %%eax, %2 \n\t"
211 "subl $4, %0 \n\t"
212 "jnz 1b \n\t"
213 :"+g"(h), "+S"(pixels), "+D"(block)
214 :"r" (line_size)
215 :"%eax", "memory");
216 }
217
218 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
219 {
220 __asm __volatile(
221 "lea (%3, %3), %%eax \n\t"
222 "movq (%1), %%mm0 \n\t"
223 "subl %3, %2 \n\t"
224 "1: \n\t"
225 "movq (%1, %3), %%mm1 \n\t"
226 "movq (%1, %%eax), %%mm2 \n\t"
227 "addl %%eax, %1 \n\t"
228 PAVGB" %%mm1, %%mm0 \n\t"
229 PAVGB" %%mm2, %%mm1 \n\t"
230 "movq (%2, %3), %%mm3 \n\t"
231 "movq (%2, %%eax), %%mm4 \n\t"
232 PAVGB" %%mm3, %%mm0 \n\t"
233 PAVGB" %%mm4, %%mm1 \n\t"
234 "movq %%mm0, (%2, %3) \n\t"
235 "movq %%mm1, (%2, %%eax) \n\t"
236 "movq (%1, %3), %%mm1 \n\t"
167 "movq (%1, %%eax), %%mm0 \n\t" 237 "movq (%1, %%eax), %%mm0 \n\t"
168 "movq (%2, %%eax), %%mm2 \n\t" 238 PAVGB" %%mm1, %%mm2 \n\t"
169 "movq (%3, %%eax), %%mm3 \n\t" 239 PAVGB" %%mm0, %%mm1 \n\t"
170 "movq (%4, %%eax), %%mm4 \n\t" 240 "addl %%eax, %2 \n\t"
171 PAVGB" %%mm3, %%mm0 \n\t" 241 "addl %%eax, %1 \n\t"
172 PAVGB" %%mm4, %%mm2 \n\t" 242 "movq (%2, %3), %%mm3 \n\t"
173 "movq %%mm0, (%3, %%eax) \n\t" 243 "movq (%2, %%eax), %%mm4 \n\t"
174 "movq %%mm2, (%4, %%eax) \n\t"
175 "addl %5, %%eax \n\t"
176 "movq (%1, %%eax), %%mm0 \n\t"
177 "movq (%2, %%eax), %%mm2 \n\t"
178 "movq (%3, %%eax), %%mm3 \n\t"
179 "movq (%4, %%eax), %%mm4 \n\t"
180 PAVGB" %%mm3, %%mm0 \n\t"
181 PAVGB" %%mm4, %%mm2 \n\t"
182 "movq %%mm0, (%3, %%eax) \n\t"
183 "movq %%mm2, (%4, %%eax) \n\t"
184 "addl %5, %%eax \n\t"
185 "subl $4, %0 \n\t"
186 " jnz 1b \n\t"
187 :"+g"(h)
188 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size),
189 "g"(line_size<<1)
190 :"%eax", "memory");
191 }
192
193 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
194 {
195 __asm __volatile(
196 "xorl %%eax, %%eax \n\t"
197 ".balign 16 \n\t"
198 "1: \n\t"
199 "movq (%1, %%eax), %%mm0 \n\t"
200 "movq 1(%1, %%eax), %%mm1 \n\t"
201 "movq (%2, %%eax), %%mm2 \n\t"
202 "movq 1(%2, %%eax), %%mm3 \n\t"
203 PAVGB" %%mm1, %%mm0 \n\t"
204 PAVGB" %%mm3, %%mm2 \n\t" 244 PAVGB" %%mm3, %%mm2 \n\t"
205 "movq (%3, %%eax), %%mm3 \n\t" 245 PAVGB" %%mm4, %%mm1 \n\t"
206 "movq (%4, %%eax), %%mm4 \n\t" 246 "movq %%mm2, (%2, %3) \n\t"
207 PAVGB" %%mm3, %%mm0 \n\t" 247 "movq %%mm1, (%2, %%eax) \n\t"
208 PAVGB" %%mm4, %%mm2 \n\t" 248 "addl %%eax, %2 \n\t"
209 "movq %%mm0, (%3, %%eax) \n\t" 249 "subl $4, %0 \n\t"
210 "movq %%mm2, (%4, %%eax) \n\t" 250 "jnz 1b \n\t"
211 "addl %5, %%eax \n\t" 251 :"+g"(h), "+S"(pixels), "+D"(block)
212 "movq (%1, %%eax), %%mm0 \n\t" 252 :"r" (line_size)
213 "movq 1(%1, %%eax), %%mm1 \n\t" 253 :"%eax", "memory");
214 "movq (%2, %%eax), %%mm2 \n\t" 254 }
215 "movq 1(%2, %%eax), %%mm3 \n\t" 255
216 PAVGB" %%mm1, %%mm0 \n\t" 256 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
217 PAVGB" %%mm3, %%mm2 \n\t" 257 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
218 "movq (%3, %%eax), %%mm3 \n\t" 258 {
219 "movq (%4, %%eax), %%mm4 \n\t" 259 __asm __volatile(
220 PAVGB" %%mm3, %%mm0 \n\t" 260 MOVQ_BONE(%%mm7)
221 PAVGB" %%mm4, %%mm2 \n\t" 261 "xorl %%eax, %%eax \n\t"
222 "movq %%mm0, (%3, %%eax) \n\t" 262 "movq (%1), %%mm0 \n\t"
223 "movq %%mm2, (%4, %%eax) \n\t" 263 "movq 1(%1), %%mm1 \n\t"
224 "addl %5, %%eax \n\t" 264 PAVGB" %%mm1, %%mm0 \n\t"
225 "subl $4, %0 \n\t" 265 ".balign 16 \n\t"
226 " jnz 1b \n\t" 266 "1: \n\t"
227 :"+g"(h)
228 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size),
229 "g"(line_size<<1)
230 :"%eax", "memory");
231 }
232
233 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
234 {
235 __asm __volatile(
236 "xorl %%eax, %%eax \n\t"
237 "movq (%1), %%mm0 \n\t"
238 ".balign 16 \n\t"
239 "1: \n\t"
240 "movq (%2, %%eax), %%mm1 \n\t" 267 "movq (%2, %%eax), %%mm1 \n\t"
241 "movq (%3, %%eax), %%mm2 \n\t" 268 "movq (%3, %%eax), %%mm2 \n\t"
269 "movq 1(%2, %%eax), %%mm3 \n\t"
270 "movq 1(%3, %%eax), %%mm4 \n\t"
271 "psubusb %%mm7, %%mm2 \n\t"
272 PAVGB" %%mm3, %%mm1 \n\t"
273 PAVGB" %%mm4, %%mm2 \n\t"
242 PAVGB" %%mm1, %%mm0 \n\t" 274 PAVGB" %%mm1, %%mm0 \n\t"
243 PAVGB" %%mm2, %%mm1 \n\t" 275 PAVGB" %%mm2, %%mm1 \n\t"
244 "movq (%4, %%eax), %%mm3 \n\t" 276 "movq (%4, %%eax), %%mm3 \n\t"
245 "movq (%5, %%eax), %%mm4 \n\t" 277 "movq (%5, %%eax), %%mm4 \n\t"
246 PAVGB" %%mm3, %%mm0 \n\t" 278 PAVGB" %%mm3, %%mm0 \n\t"
247 PAVGB" %%mm4, %%mm1 \n\t" 279 PAVGB" %%mm4, %%mm1 \n\t"
248 "movq %%mm0, (%4, %%eax) \n\t" 280 "movq %%mm0, (%4, %%eax) \n\t"
249 "movq %%mm1, (%5, %%eax) \n\t" 281 "movq %%mm1, (%5, %%eax) \n\t"
250 "addl %6, %%eax \n\t" 282 "addl %6, %%eax \n\t"
251 "movq (%2, %%eax), %%mm1 \n\t"
252 "movq (%3, %%eax), %%mm0 \n\t"
253 PAVGB" %%mm1, %%mm2 \n\t"
254 PAVGB" %%mm0, %%mm1 \n\t"
255 "movq (%4, %%eax), %%mm3 \n\t"
256 "movq (%5, %%eax), %%mm4 \n\t"
257 PAVGB" %%mm3, %%mm2 \n\t"
258 PAVGB" %%mm4, %%mm1 \n\t"
259 "movq %%mm2, (%4, %%eax) \n\t"
260 "movq %%mm1, (%5, %%eax) \n\t"
261 "addl %6, %%eax \n\t"
262 "subl $4, %0 \n\t"
263 " jnz 1b \n\t"
264 :"+g"(h)
265 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
266 "r" (block+line_size), "g"(line_size<<1)
267 :"%eax", "memory");
268 }
269
270 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter
271 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
272 {
273 __asm __volatile(
274 MOVQ_BONE(%%mm7)
275 "xorl %%eax, %%eax \n\t"
276 "movq (%1), %%mm0 \n\t"
277 "movq 1(%1), %%mm1 \n\t"
278 PAVGB" %%mm1, %%mm0 \n\t"
279 ".balign 16 \n\t"
280 "1: \n\t"
281 "movq (%2, %%eax), %%mm1 \n\t"
282 "movq (%3, %%eax), %%mm2 \n\t"
283 "movq 1(%2, %%eax), %%mm3 \n\t"
284 "movq 1(%3, %%eax), %%mm4 \n\t"
285 "psubusb %%mm7, %%mm2 \n\t"
286 PAVGB" %%mm3, %%mm1 \n\t"
287 PAVGB" %%mm4, %%mm2 \n\t"
288 PAVGB" %%mm1, %%mm0 \n\t"
289 PAVGB" %%mm2, %%mm1 \n\t"
290 "movq (%4, %%eax), %%mm3 \n\t"
291 "movq (%5, %%eax), %%mm4 \n\t"
292 PAVGB" %%mm3, %%mm0 \n\t"
293 PAVGB" %%mm4, %%mm1 \n\t"
294 "movq %%mm0, (%4, %%eax) \n\t"
295 "movq %%mm1, (%5, %%eax) \n\t"
296 "addl %6, %%eax \n\t"
297 "movq (%2, %%eax), %%mm1 \n\t" 283 "movq (%2, %%eax), %%mm1 \n\t"
298 "movq (%3, %%eax), %%mm0 \n\t" 284 "movq (%3, %%eax), %%mm0 \n\t"
299 "movq 1(%2, %%eax), %%mm3 \n\t" 285 "movq 1(%2, %%eax), %%mm3 \n\t"
300 "movq 1(%3, %%eax), %%mm4 \n\t" 286 "movq 1(%3, %%eax), %%mm4 \n\t"
301 PAVGB" %%mm3, %%mm1 \n\t" 287 PAVGB" %%mm3, %%mm1 \n\t"
306 "movq (%5, %%eax), %%mm4 \n\t" 292 "movq (%5, %%eax), %%mm4 \n\t"
307 PAVGB" %%mm3, %%mm2 \n\t" 293 PAVGB" %%mm3, %%mm2 \n\t"
308 PAVGB" %%mm4, %%mm1 \n\t" 294 PAVGB" %%mm4, %%mm1 \n\t"
309 "movq %%mm2, (%4, %%eax) \n\t" 295 "movq %%mm2, (%4, %%eax) \n\t"
310 "movq %%mm1, (%5, %%eax) \n\t" 296 "movq %%mm1, (%5, %%eax) \n\t"
311 "addl %6, %%eax \n\t" 297 "addl %6, %%eax \n\t"
312 "subl $4, %0 \n\t" 298 "subl $4, %0 \n\t"
313 " jnz 1b \n\t" 299 " jnz 1b \n\t"
314 :"+g"(h) 300 :"+g"(h)
315 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), 301 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
316 "r" (block+line_size), "g"(line_size<<1) 302 "r" (block+line_size), "g"(line_size<<1)
317 :"%eax", "memory"); 303 :"%eax", "memory");
318 } 304 }