Mercurial > libavcodec.hg
comparison i386/dsputil_mmx_avg.h @ 441:c0de4d3c7d3c libavcodec
* optimized avg_* functions (except xy2)
* minor speedup for put_pixels_x2 & cleanup
author | kabi |
---|---|
date | Tue, 28 May 2002 16:35:58 +0000 |
parents | 6ae275655a23 |
children | 006965950f49 |
comparison
equal
deleted
inserted
replaced
440:000aeeac27a2 | 441:c0de4d3c7d3c |
---|---|
17 * License along with this library; if not, write to the Free Software | 17 * License along with this library; if not, write to the Free Software |
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
19 * | 19 * |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | 20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
22 * and improved by Zdenek Kabelac <kabi@users.sf.net> | |
22 */ | 23 */ |
23 | 24 |
24 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm | 25 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
25 clobber bug - now it will work with 2.95.2 and also with -fPIC | 26 clobber bug - now it will work with 2.95.2 and also with -fPIC |
26 */ | 27 */ |
27 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 28 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
28 { | 29 { |
29 __asm __volatile( | 30 __asm __volatile( |
30 "lea (%3, %3), %%eax \n\t" | 31 "lea (%3, %3), %%eax \n\t" |
32 "1: \n\t" | |
33 "movq (%1), %%mm0 \n\t" | |
34 "movq (%1, %3), %%mm1 \n\t" | |
35 PAVGB" 1(%1), %%mm0 \n\t" | |
36 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
37 "movq %%mm0, (%2) \n\t" | |
38 "movq %%mm1, (%2, %3) \n\t" | |
39 "addl %%eax, %1 \n\t" | |
40 "addl %%eax, %2 \n\t" | |
41 "movq (%1), %%mm0 \n\t" | |
42 "movq (%1, %3), %%mm1 \n\t" | |
43 PAVGB" 1(%1), %%mm0 \n\t" | |
44 PAVGB" 1(%1, %3), %%mm1 \n\t" | |
45 "addl %%eax, %1 \n\t" | |
46 "movq %%mm0, (%2) \n\t" | |
47 "movq %%mm1, (%2, %3) \n\t" | |
48 "addl %%eax, %2 \n\t" | |
49 "subl $4, %0 \n\t" | |
50 "jnz 1b \n\t" | |
51 :"+g"(h), "+S"(pixels), "+D"(block) | |
52 :"r" (line_size) | |
53 :"%eax", "memory"); | |
54 } | |
55 | |
56 /* GL: this function rounds incorrectly if overflow occurs */ | |
57 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
58 { | |
59 __asm __volatile( | |
60 "lea (%3, %3), %%eax \n\t" | |
61 MOVQ_BONE(%%mm7) | |
31 "1: \n\t" | 62 "1: \n\t" |
32 "movq (%1), %%mm0 \n\t" | 63 "movq (%1), %%mm0 \n\t" |
33 "movq (%1, %3), %%mm2 \n\t" | 64 "movq (%1, %3), %%mm2 \n\t" |
34 "movq 1(%1), %%mm1 \n\t" | 65 "movq 1(%1), %%mm1 \n\t" |
35 "movq 1(%1, %3), %%mm3 \n\t" | 66 "movq 1(%1, %3), %%mm3 \n\t" |
36 "addl %%eax, %1 \n\t" | 67 "addl %%eax, %1 \n\t" |
68 "psubusb %%mm7, %%mm0 \n\t" | |
69 "psubusb %%mm7, %%mm2 \n\t" | |
37 PAVGB" %%mm1, %%mm0 \n\t" | 70 PAVGB" %%mm1, %%mm0 \n\t" |
38 PAVGB" %%mm3, %%mm2 \n\t" | 71 PAVGB" %%mm3, %%mm2 \n\t" |
39 "movq %%mm0, (%2) \n\t" | 72 "movq %%mm0, (%2) \n\t" |
40 "movq %%mm2, (%2, %3) \n\t" | 73 "movq %%mm2, (%2, %3) \n\t" |
41 "movq (%1), %%mm0 \n\t" | 74 "movq (%1), %%mm0 \n\t" |
42 "movq 1(%1), %%mm1 \n\t" | 75 "movq 1(%1), %%mm1 \n\t" |
43 "movq (%1, %3), %%mm2 \n\t" | 76 "movq (%1, %3), %%mm2 \n\t" |
44 "movq 1(%1, %3), %%mm3 \n\t" | 77 "movq 1(%1, %3), %%mm3 \n\t" |
45 "addl %%eax, %2 \n\t" | 78 "addl %%eax, %2 \n\t" |
46 "addl %%eax, %1 \n\t" | 79 "addl %%eax, %1 \n\t" |
80 "psubusb %%mm7, %%mm0 \n\t" | |
81 "psubusb %%mm7, %%mm2 \n\t" | |
47 PAVGB" %%mm1, %%mm0 \n\t" | 82 PAVGB" %%mm1, %%mm0 \n\t" |
48 PAVGB" %%mm3, %%mm2 \n\t" | 83 PAVGB" %%mm3, %%mm2 \n\t" |
49 "movq %%mm0, (%2) \n\t" | 84 "movq %%mm0, (%2) \n\t" |
50 "movq %%mm2, (%2, %3) \n\t" | 85 "movq %%mm2, (%2, %3) \n\t" |
51 "addl %%eax, %2 \n\t" | 86 "addl %%eax, %2 \n\t" |
52 "subl $4, %0 \n\t" | 87 "subl $4, %0 \n\t" |
53 " jnz 1b \n\t" | 88 "jnz 1b \n\t" |
54 :"+g"(h), "+S"(pixels), "+D"(block) | 89 :"+g"(h), "+S"(pixels), "+D"(block) |
55 :"c" (line_size) | 90 :"r" (line_size) |
56 :"%eax", "memory"); | |
57 } | |
58 | |
59 /* GL: this function rounds incorrectly if overflow occurs */ | |
60 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
61 { | |
62 __asm __volatile( | |
63 "lea (%3, %3), %%eax \n\t" | |
64 MOVQ_BONE(%%mm7) | |
65 "1: \n\t" | |
66 "movq (%1), %%mm0 \n\t" | |
67 "movq (%1, %3), %%mm2 \n\t" | |
68 "movq 1(%1), %%mm1 \n\t" | |
69 "movq 1(%1, %3), %%mm3 \n\t" | |
70 "addl %%eax, %1 \n\t" | |
71 "psubusb %%mm7, %%mm0 \n\t" | |
72 "psubusb %%mm7, %%mm2 \n\t" | |
73 PAVGB" %%mm1, %%mm0 \n\t" | |
74 PAVGB" %%mm3, %%mm2 \n\t" | |
75 "movq %%mm0, (%2) \n\t" | |
76 "movq %%mm2, (%2, %3) \n\t" | |
77 "movq (%1), %%mm0 \n\t" | |
78 "movq 1(%1), %%mm1 \n\t" | |
79 "movq (%1, %3), %%mm2 \n\t" | |
80 "movq 1(%1, %3), %%mm3 \n\t" | |
81 "addl %%eax, %2 \n\t" | |
82 "addl %%eax, %1 \n\t" | |
83 "psubusb %%mm7, %%mm0 \n\t" | |
84 "psubusb %%mm7, %%mm2 \n\t" | |
85 PAVGB" %%mm1, %%mm0 \n\t" | |
86 PAVGB" %%mm3, %%mm2 \n\t" | |
87 "movq %%mm0, (%2) \n\t" | |
88 "movq %%mm2, (%2, %3) \n\t" | |
89 "addl %%eax, %2 \n\t" | |
90 "subl $4, %0 \n\t" | |
91 "jnz 1b \n\t" | |
92 :"+g"(h), "+S"(pixels), "+D"(block) | |
93 :"c" (line_size) | |
94 :"%eax", "memory"); | 91 :"%eax", "memory"); |
95 } | 92 } |
96 | 93 |
97 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 94 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
98 { | 95 { |
99 __asm __volatile( | 96 __asm __volatile( |
100 "lea (%3, %3), %%eax \n\t" | 97 "lea (%3, %3), %%eax \n\t" |
101 "movq (%1), %%mm0 \n\t" | 98 "movq (%1), %%mm0 \n\t" |
102 "subl %3, %2 \n\t" | 99 "subl %3, %2 \n\t" |
103 "1: \n\t" | 100 "1: \n\t" |
104 "movq (%1, %3), %%mm1 \n\t" | 101 "movq (%1, %3), %%mm1 \n\t" |
105 "movq (%1, %%eax), %%mm2 \n\t" | 102 "movq (%1, %%eax), %%mm2 \n\t" |
106 "addl %%eax, %1 \n\t" | 103 "addl %%eax, %1 \n\t" |
107 PAVGB" %%mm1, %%mm0 \n\t" | 104 PAVGB" %%mm1, %%mm0 \n\t" |
108 PAVGB" %%mm2, %%mm1 \n\t" | 105 PAVGB" %%mm2, %%mm1 \n\t" |
109 "movq %%mm0, (%2, %3) \n\t" | 106 "movq %%mm0, (%2, %3) \n\t" |
110 "movq %%mm1, (%2, %%eax) \n\t" | 107 "movq %%mm1, (%2, %%eax) \n\t" |
111 "movq (%1, %3), %%mm1 \n\t" | 108 "movq (%1, %3), %%mm1 \n\t" |
112 "movq (%1, %%eax), %%mm0 \n\t" | 109 "movq (%1, %%eax), %%mm0 \n\t" |
113 "addl %%eax, %2 \n\t" | 110 "addl %%eax, %2 \n\t" |
114 "addl %%eax, %1 \n\t" | 111 "addl %%eax, %1 \n\t" |
115 PAVGB" %%mm1, %%mm2 \n\t" | 112 PAVGB" %%mm1, %%mm2 \n\t" |
116 PAVGB" %%mm0, %%mm1 \n\t" | 113 PAVGB" %%mm0, %%mm1 \n\t" |
117 "movq %%mm2, (%2, %3) \n\t" | 114 "movq %%mm2, (%2, %3) \n\t" |
118 "movq %%mm1, (%2, %%eax) \n\t" | 115 "movq %%mm1, (%2, %%eax) \n\t" |
119 "addl %%eax, %2 \n\t" | 116 "addl %%eax, %2 \n\t" |
120 "subl $4, %0 \n\t" | 117 "subl $4, %0 \n\t" |
121 "jnz 1b \n\t" | 118 "jnz 1b \n\t" |
122 :"+g"(h), "+S"(pixels), "+D" (block) | 119 :"+g"(h), "+S"(pixels), "+D" (block) |
123 :"c"(line_size) | 120 :"r" (line_size) |
124 :"%eax", "memory"); | 121 :"%eax", "memory"); |
125 } | 122 } |
126 | 123 |
127 /* GL: this function rounds incorrectly if overflow occurs */ | 124 /* GL: this function rounds incorrectly if overflow occurs */ |
128 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 125 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
129 { | 126 { |
130 __asm __volatile( | 127 __asm __volatile( |
131 MOVQ_BONE(%%mm7) | 128 MOVQ_BONE(%%mm7) |
132 "lea (%3, %3), %%eax \n\t" | 129 "lea (%3, %3), %%eax \n\t" |
133 "movq (%1), %%mm0 \n\t" | 130 "movq (%1), %%mm0 \n\t" |
134 "subl %3, %2 \n\t" | 131 "subl %3, %2 \n\t" |
135 "1: \n\t" | 132 "1: \n\t" |
136 "movq (%1, %3), %%mm1 \n\t" | 133 "movq (%1, %3), %%mm1 \n\t" |
137 "movq (%1, %%eax), %%mm2 \n\t" | 134 "movq (%1, %%eax), %%mm2 \n\t" |
138 "addl %%eax, %1 \n\t" | 135 "addl %%eax, %1 \n\t" |
139 "psubusb %%mm7, %%mm1 \n\t" | 136 "psubusb %%mm7, %%mm1 \n\t" |
140 PAVGB" %%mm1, %%mm0 \n\t" | 137 PAVGB" %%mm1, %%mm0 \n\t" |
141 PAVGB" %%mm2, %%mm1 \n\t" | 138 PAVGB" %%mm2, %%mm1 \n\t" |
142 "movq %%mm0, (%2, %3) \n\t" | 139 "movq %%mm0, (%2, %3) \n\t" |
143 "movq %%mm1, (%2, %%eax) \n\t" | 140 "movq %%mm1, (%2, %%eax) \n\t" |
144 "movq (%1, %3), %%mm1 \n\t" | 141 "movq (%1, %3), %%mm1 \n\t" |
145 "movq (%1, %%eax), %%mm0 \n\t" | 142 "movq (%1, %%eax), %%mm0 \n\t" |
146 "addl %%eax, %2 \n\t" | 143 "addl %%eax, %2 \n\t" |
147 "addl %%eax, %1 \n\t" | 144 "addl %%eax, %1 \n\t" |
148 "psubusb %%mm7, %%mm1 \n\t" | 145 "psubusb %%mm7, %%mm1 \n\t" |
149 PAVGB" %%mm1, %%mm2 \n\t" | 146 PAVGB" %%mm1, %%mm2 \n\t" |
150 PAVGB" %%mm0, %%mm1 \n\t" | 147 PAVGB" %%mm0, %%mm1 \n\t" |
151 "movq %%mm2, (%2, %3) \n\t" | 148 "movq %%mm2, (%2, %3) \n\t" |
152 "movq %%mm1, (%2, %%eax) \n\t" | 149 "movq %%mm1, (%2, %%eax) \n\t" |
153 "addl %%eax, %2 \n\t" | 150 "addl %%eax, %2 \n\t" |
154 "subl $4, %0 \n\t" | 151 "subl $4, %0 \n\t" |
155 "jnz 1b \n\t" | 152 "jnz 1b \n\t" |
156 :"+g"(h), "+S"(pixels), "+D" (block) | 153 :"+g"(h), "+S"(pixels), "+D" (block) |
157 :"c"(line_size) | 154 :"r" (line_size) |
158 :"%eax", "memory"); | 155 :"%eax", "memory"); |
159 } | 156 } |
160 | 157 |
161 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 158 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
162 { | 159 { |
163 __asm __volatile( | 160 __asm __volatile( |
164 "xorl %%eax, %%eax \n\t" | 161 "lea (%3, %3), %%eax \n\t" |
165 ".balign 16 \n\t" | 162 "1: \n\t" |
166 "1: \n\t" | 163 "movq (%2), %%mm0 \n\t" |
164 "movq (%2, %3), %%mm1 \n\t" | |
165 PAVGB" (%1), %%mm0 \n\t" | |
166 PAVGB" (%1, %3), %%mm1 \n\t" | |
167 "movq %%mm0, (%2) \n\t" | |
168 "movq %%mm1, (%2, %3) \n\t" | |
169 "addl %%eax, %1 \n\t" | |
170 "addl %%eax, %2 \n\t" | |
171 "movq (%2), %%mm0 \n\t" | |
172 "movq (%2, %3), %%mm1 \n\t" | |
173 PAVGB" (%1), %%mm0 \n\t" | |
174 PAVGB" (%1, %3), %%mm1 \n\t" | |
175 "addl %%eax, %1 \n\t" | |
176 "movq %%mm0, (%2) \n\t" | |
177 "movq %%mm1, (%2, %3) \n\t" | |
178 "addl %%eax, %2 \n\t" | |
179 "subl $4, %0 \n\t" | |
180 "jnz 1b \n\t" | |
181 :"+g"(h), "+S"(pixels), "+D"(block) | |
182 :"r" (line_size) | |
183 :"%eax", "memory"); | |
184 } | |
185 | |
186 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
187 { | |
188 __asm __volatile( | |
189 "lea (%3, %3), %%eax \n\t" | |
190 "1: \n\t" | |
191 "movq (%1), %%mm0 \n\t" | |
192 "movq (%1, %3), %%mm2 \n\t" | |
193 PAVGB" 1(%1), %%mm0 \n\t" | |
194 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
195 PAVGB" (%2), %%mm0 \n\t" | |
196 PAVGB" (%2, %3), %%mm2 \n\t" | |
197 "addl %%eax, %1 \n\t" | |
198 "movq %%mm0, (%2) \n\t" | |
199 "movq %%mm2, (%2, %3) \n\t" | |
200 "movq (%1), %%mm0 \n\t" | |
201 "movq (%1, %3), %%mm2 \n\t" | |
202 PAVGB" 1(%1), %%mm0 \n\t" | |
203 PAVGB" 1(%1, %3), %%mm2 \n\t" | |
204 "addl %%eax, %2 \n\t" | |
205 "addl %%eax, %1 \n\t" | |
206 PAVGB" (%2), %%mm0 \n\t" | |
207 PAVGB" (%2, %3), %%mm2 \n\t" | |
208 "movq %%mm0, (%2) \n\t" | |
209 "movq %%mm2, (%2, %3) \n\t" | |
210 "addl %%eax, %2 \n\t" | |
211 "subl $4, %0 \n\t" | |
212 "jnz 1b \n\t" | |
213 :"+g"(h), "+S"(pixels), "+D"(block) | |
214 :"r" (line_size) | |
215 :"%eax", "memory"); | |
216 } | |
217 | |
218 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
219 { | |
220 __asm __volatile( | |
221 "lea (%3, %3), %%eax \n\t" | |
222 "movq (%1), %%mm0 \n\t" | |
223 "subl %3, %2 \n\t" | |
224 "1: \n\t" | |
225 "movq (%1, %3), %%mm1 \n\t" | |
226 "movq (%1, %%eax), %%mm2 \n\t" | |
227 "addl %%eax, %1 \n\t" | |
228 PAVGB" %%mm1, %%mm0 \n\t" | |
229 PAVGB" %%mm2, %%mm1 \n\t" | |
230 "movq (%2, %3), %%mm3 \n\t" | |
231 "movq (%2, %%eax), %%mm4 \n\t" | |
232 PAVGB" %%mm3, %%mm0 \n\t" | |
233 PAVGB" %%mm4, %%mm1 \n\t" | |
234 "movq %%mm0, (%2, %3) \n\t" | |
235 "movq %%mm1, (%2, %%eax) \n\t" | |
236 "movq (%1, %3), %%mm1 \n\t" | |
167 "movq (%1, %%eax), %%mm0 \n\t" | 237 "movq (%1, %%eax), %%mm0 \n\t" |
168 "movq (%2, %%eax), %%mm2 \n\t" | 238 PAVGB" %%mm1, %%mm2 \n\t" |
169 "movq (%3, %%eax), %%mm3 \n\t" | 239 PAVGB" %%mm0, %%mm1 \n\t" |
170 "movq (%4, %%eax), %%mm4 \n\t" | 240 "addl %%eax, %2 \n\t" |
171 PAVGB" %%mm3, %%mm0 \n\t" | 241 "addl %%eax, %1 \n\t" |
172 PAVGB" %%mm4, %%mm2 \n\t" | 242 "movq (%2, %3), %%mm3 \n\t" |
173 "movq %%mm0, (%3, %%eax) \n\t" | 243 "movq (%2, %%eax), %%mm4 \n\t" |
174 "movq %%mm2, (%4, %%eax) \n\t" | |
175 "addl %5, %%eax \n\t" | |
176 "movq (%1, %%eax), %%mm0 \n\t" | |
177 "movq (%2, %%eax), %%mm2 \n\t" | |
178 "movq (%3, %%eax), %%mm3 \n\t" | |
179 "movq (%4, %%eax), %%mm4 \n\t" | |
180 PAVGB" %%mm3, %%mm0 \n\t" | |
181 PAVGB" %%mm4, %%mm2 \n\t" | |
182 "movq %%mm0, (%3, %%eax) \n\t" | |
183 "movq %%mm2, (%4, %%eax) \n\t" | |
184 "addl %5, %%eax \n\t" | |
185 "subl $4, %0 \n\t" | |
186 " jnz 1b \n\t" | |
187 :"+g"(h) | |
188 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size), | |
189 "g"(line_size<<1) | |
190 :"%eax", "memory"); | |
191 } | |
192 | |
193 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
194 { | |
195 __asm __volatile( | |
196 "xorl %%eax, %%eax \n\t" | |
197 ".balign 16 \n\t" | |
198 "1: \n\t" | |
199 "movq (%1, %%eax), %%mm0 \n\t" | |
200 "movq 1(%1, %%eax), %%mm1 \n\t" | |
201 "movq (%2, %%eax), %%mm2 \n\t" | |
202 "movq 1(%2, %%eax), %%mm3 \n\t" | |
203 PAVGB" %%mm1, %%mm0 \n\t" | |
204 PAVGB" %%mm3, %%mm2 \n\t" | 244 PAVGB" %%mm3, %%mm2 \n\t" |
205 "movq (%3, %%eax), %%mm3 \n\t" | 245 PAVGB" %%mm4, %%mm1 \n\t" |
206 "movq (%4, %%eax), %%mm4 \n\t" | 246 "movq %%mm2, (%2, %3) \n\t" |
207 PAVGB" %%mm3, %%mm0 \n\t" | 247 "movq %%mm1, (%2, %%eax) \n\t" |
208 PAVGB" %%mm4, %%mm2 \n\t" | 248 "addl %%eax, %2 \n\t" |
209 "movq %%mm0, (%3, %%eax) \n\t" | 249 "subl $4, %0 \n\t" |
210 "movq %%mm2, (%4, %%eax) \n\t" | 250 "jnz 1b \n\t" |
211 "addl %5, %%eax \n\t" | 251 :"+g"(h), "+S"(pixels), "+D"(block) |
212 "movq (%1, %%eax), %%mm0 \n\t" | 252 :"r" (line_size) |
213 "movq 1(%1, %%eax), %%mm1 \n\t" | 253 :"%eax", "memory"); |
214 "movq (%2, %%eax), %%mm2 \n\t" | 254 } |
215 "movq 1(%2, %%eax), %%mm3 \n\t" | 255 |
216 PAVGB" %%mm1, %%mm0 \n\t" | 256 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter |
217 PAVGB" %%mm3, %%mm2 \n\t" | 257 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
218 "movq (%3, %%eax), %%mm3 \n\t" | 258 { |
219 "movq (%4, %%eax), %%mm4 \n\t" | 259 __asm __volatile( |
220 PAVGB" %%mm3, %%mm0 \n\t" | 260 MOVQ_BONE(%%mm7) |
221 PAVGB" %%mm4, %%mm2 \n\t" | 261 "xorl %%eax, %%eax \n\t" |
222 "movq %%mm0, (%3, %%eax) \n\t" | 262 "movq (%1), %%mm0 \n\t" |
223 "movq %%mm2, (%4, %%eax) \n\t" | 263 "movq 1(%1), %%mm1 \n\t" |
224 "addl %5, %%eax \n\t" | 264 PAVGB" %%mm1, %%mm0 \n\t" |
225 "subl $4, %0 \n\t" | 265 ".balign 16 \n\t" |
226 " jnz 1b \n\t" | 266 "1: \n\t" |
227 :"+g"(h) | |
228 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size), | |
229 "g"(line_size<<1) | |
230 :"%eax", "memory"); | |
231 } | |
232 | |
233 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
234 { | |
235 __asm __volatile( | |
236 "xorl %%eax, %%eax \n\t" | |
237 "movq (%1), %%mm0 \n\t" | |
238 ".balign 16 \n\t" | |
239 "1: \n\t" | |
240 "movq (%2, %%eax), %%mm1 \n\t" | 267 "movq (%2, %%eax), %%mm1 \n\t" |
241 "movq (%3, %%eax), %%mm2 \n\t" | 268 "movq (%3, %%eax), %%mm2 \n\t" |
269 "movq 1(%2, %%eax), %%mm3 \n\t" | |
270 "movq 1(%3, %%eax), %%mm4 \n\t" | |
271 "psubusb %%mm7, %%mm2 \n\t" | |
272 PAVGB" %%mm3, %%mm1 \n\t" | |
273 PAVGB" %%mm4, %%mm2 \n\t" | |
242 PAVGB" %%mm1, %%mm0 \n\t" | 274 PAVGB" %%mm1, %%mm0 \n\t" |
243 PAVGB" %%mm2, %%mm1 \n\t" | 275 PAVGB" %%mm2, %%mm1 \n\t" |
244 "movq (%4, %%eax), %%mm3 \n\t" | 276 "movq (%4, %%eax), %%mm3 \n\t" |
245 "movq (%5, %%eax), %%mm4 \n\t" | 277 "movq (%5, %%eax), %%mm4 \n\t" |
246 PAVGB" %%mm3, %%mm0 \n\t" | 278 PAVGB" %%mm3, %%mm0 \n\t" |
247 PAVGB" %%mm4, %%mm1 \n\t" | 279 PAVGB" %%mm4, %%mm1 \n\t" |
248 "movq %%mm0, (%4, %%eax) \n\t" | 280 "movq %%mm0, (%4, %%eax) \n\t" |
249 "movq %%mm1, (%5, %%eax) \n\t" | 281 "movq %%mm1, (%5, %%eax) \n\t" |
250 "addl %6, %%eax \n\t" | 282 "addl %6, %%eax \n\t" |
251 "movq (%2, %%eax), %%mm1 \n\t" | |
252 "movq (%3, %%eax), %%mm0 \n\t" | |
253 PAVGB" %%mm1, %%mm2 \n\t" | |
254 PAVGB" %%mm0, %%mm1 \n\t" | |
255 "movq (%4, %%eax), %%mm3 \n\t" | |
256 "movq (%5, %%eax), %%mm4 \n\t" | |
257 PAVGB" %%mm3, %%mm2 \n\t" | |
258 PAVGB" %%mm4, %%mm1 \n\t" | |
259 "movq %%mm2, (%4, %%eax) \n\t" | |
260 "movq %%mm1, (%5, %%eax) \n\t" | |
261 "addl %6, %%eax \n\t" | |
262 "subl $4, %0 \n\t" | |
263 " jnz 1b \n\t" | |
264 :"+g"(h) | |
265 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), | |
266 "r" (block+line_size), "g"(line_size<<1) | |
267 :"%eax", "memory"); | |
268 } | |
269 | |
270 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter | |
271 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
272 { | |
273 __asm __volatile( | |
274 MOVQ_BONE(%%mm7) | |
275 "xorl %%eax, %%eax \n\t" | |
276 "movq (%1), %%mm0 \n\t" | |
277 "movq 1(%1), %%mm1 \n\t" | |
278 PAVGB" %%mm1, %%mm0 \n\t" | |
279 ".balign 16 \n\t" | |
280 "1: \n\t" | |
281 "movq (%2, %%eax), %%mm1 \n\t" | |
282 "movq (%3, %%eax), %%mm2 \n\t" | |
283 "movq 1(%2, %%eax), %%mm3 \n\t" | |
284 "movq 1(%3, %%eax), %%mm4 \n\t" | |
285 "psubusb %%mm7, %%mm2 \n\t" | |
286 PAVGB" %%mm3, %%mm1 \n\t" | |
287 PAVGB" %%mm4, %%mm2 \n\t" | |
288 PAVGB" %%mm1, %%mm0 \n\t" | |
289 PAVGB" %%mm2, %%mm1 \n\t" | |
290 "movq (%4, %%eax), %%mm3 \n\t" | |
291 "movq (%5, %%eax), %%mm4 \n\t" | |
292 PAVGB" %%mm3, %%mm0 \n\t" | |
293 PAVGB" %%mm4, %%mm1 \n\t" | |
294 "movq %%mm0, (%4, %%eax) \n\t" | |
295 "movq %%mm1, (%5, %%eax) \n\t" | |
296 "addl %6, %%eax \n\t" | |
297 "movq (%2, %%eax), %%mm1 \n\t" | 283 "movq (%2, %%eax), %%mm1 \n\t" |
298 "movq (%3, %%eax), %%mm0 \n\t" | 284 "movq (%3, %%eax), %%mm0 \n\t" |
299 "movq 1(%2, %%eax), %%mm3 \n\t" | 285 "movq 1(%2, %%eax), %%mm3 \n\t" |
300 "movq 1(%3, %%eax), %%mm4 \n\t" | 286 "movq 1(%3, %%eax), %%mm4 \n\t" |
301 PAVGB" %%mm3, %%mm1 \n\t" | 287 PAVGB" %%mm3, %%mm1 \n\t" |
306 "movq (%5, %%eax), %%mm4 \n\t" | 292 "movq (%5, %%eax), %%mm4 \n\t" |
307 PAVGB" %%mm3, %%mm2 \n\t" | 293 PAVGB" %%mm3, %%mm2 \n\t" |
308 PAVGB" %%mm4, %%mm1 \n\t" | 294 PAVGB" %%mm4, %%mm1 \n\t" |
309 "movq %%mm2, (%4, %%eax) \n\t" | 295 "movq %%mm2, (%4, %%eax) \n\t" |
310 "movq %%mm1, (%5, %%eax) \n\t" | 296 "movq %%mm1, (%5, %%eax) \n\t" |
311 "addl %6, %%eax \n\t" | 297 "addl %6, %%eax \n\t" |
312 "subl $4, %0 \n\t" | 298 "subl $4, %0 \n\t" |
313 " jnz 1b \n\t" | 299 " jnz 1b \n\t" |
314 :"+g"(h) | 300 :"+g"(h) |
315 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), | 301 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), |
316 "r" (block+line_size), "g"(line_size<<1) | 302 "r" (block+line_size), "g"(line_size<<1) |
317 :"%eax", "memory"); | 303 :"%eax", "memory"); |
318 } | 304 } |