Mercurial > libavcodec.hg
annotate i386/dsputil_mmx_avg.h @ 416:ca1f2c0e44ef libavcodec
* fixed constraints and avoid usage of scale index access
author | kabi |
---|---|
date | Thu, 23 May 2002 10:10:14 +0000 |
parents | f56e4d08e082 |
children | 718a22dc121f |
rev | line source |
---|---|
0 | 1 /* |
2 * DSP utils : average functions are compiled twice for 3dnow/mmx2 | |
3 * Copyright (c) 2000, 2001 Gerard Lantau. | |
386 | 4 * Copyright (c) 2002 Michael Niedermayer |
0 | 5 * |
6 * This program is free software; you can redistribute it and/or modify | |
7 * it under the terms of the GNU General Public License as published by | |
8 * the Free Software Foundation; either version 2 of the License, or | |
9 * (at your option) any later version. | |
10 * | |
11 * This program is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 * GNU General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU General Public License | |
17 * along with this program; if not, write to the Free Software | |
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
19 * | |
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
386 | 21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
0 | 22 */ |
387 | 23 |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
24 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
25 clobber bug - now it will work with 2.95.2 and also with -fPIC |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
26 */ |
0 | 27 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
28 { | |
386 | 29 __asm __volatile( |
30 "xorl %%eax, %%eax \n\t" | |
31 ".balign 16 \n\t" | |
32 "1: \n\t" | |
33 "movq (%1, %%eax), %%mm0 \n\t" | |
34 "movq 1(%1, %%eax), %%mm1 \n\t" | |
35 "movq (%2, %%eax), %%mm2 \n\t" | |
36 "movq 1(%2, %%eax), %%mm3 \n\t" | |
37 PAVGB" %%mm1, %%mm0 \n\t" | |
38 PAVGB" %%mm3, %%mm2 \n\t" | |
39 "movq %%mm0, (%3, %%eax) \n\t" | |
40 "movq %%mm2, (%4, %%eax) \n\t" | |
41 "addl %5, %%eax \n\t" | |
42 "movq (%1, %%eax), %%mm0 \n\t" | |
43 "movq 1(%1, %%eax), %%mm1 \n\t" | |
44 "movq (%2, %%eax), %%mm2 \n\t" | |
45 "movq 1(%2, %%eax), %%mm3 \n\t" | |
46 PAVGB" %%mm1, %%mm0 \n\t" | |
47 PAVGB" %%mm3, %%mm2 \n\t" | |
48 "movq %%mm0, (%3, %%eax) \n\t" | |
49 "movq %%mm2, (%4, %%eax) \n\t" | |
50 "addl %5, %%eax \n\t" | |
51 "subl $4, %0 \n\t" | |
52 " jnz 1b \n\t" | |
53 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
54 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
55 "g"(line_size<<1) |
386 | 56 :"%eax", "memory"); |
57 } | |
58 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
59 /* GL: this function does incorrect rounding if overflow */ |
386 | 60 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
61 { | |
0 | 62 __asm __volatile( |
386 | 63 "xorl %%eax, %%eax \n\t" |
387 | 64 MOVQ_BONE(%%mm7) |
386 | 65 ".balign 16 \n\t" |
66 "1: \n\t" | |
67 "movq (%1, %%eax), %%mm0 \n\t" | |
68 "movq 1(%1, %%eax), %%mm1 \n\t" | |
69 "movq (%2, %%eax), %%mm2 \n\t" | |
70 "movq 1(%2, %%eax), %%mm3 \n\t" | |
71 "psubusb %%mm7, %%mm0 \n\t" | |
72 "psubusb %%mm7, %%mm2 \n\t" | |
73 PAVGB" %%mm1, %%mm0 \n\t" | |
74 PAVGB" %%mm3, %%mm2 \n\t" | |
75 "movq %%mm0, (%3, %%eax) \n\t" | |
76 "movq %%mm2, (%4, %%eax) \n\t" | |
77 "addl %5, %%eax \n\t" | |
78 "movq (%1, %%eax), %%mm0 \n\t" | |
79 "movq 1(%1, %%eax), %%mm1 \n\t" | |
80 "movq (%2, %%eax), %%mm2 \n\t" | |
81 "movq 1(%2, %%eax), %%mm3 \n\t" | |
82 "psubusb %%mm7, %%mm0 \n\t" | |
83 "psubusb %%mm7, %%mm2 \n\t" | |
84 PAVGB" %%mm1, %%mm0 \n\t" | |
85 PAVGB" %%mm3, %%mm2 \n\t" | |
86 "movq %%mm0, (%3, %%eax) \n\t" | |
87 "movq %%mm2, (%4, %%eax) \n\t" | |
88 "addl %5, %%eax \n\t" | |
89 "subl $4, %0 \n\t" | |
90 " jnz 1b \n\t" | |
91 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
92 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
93 "r"(line_size<<1) |
386 | 94 :"%eax", "memory"); |
0 | 95 } |
96 | |
97 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
98 { | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
99 #if 1 |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
100 // Michael - measure me |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
101 __asm __volatile( |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
102 "lea (%3, %3), %%eax \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
103 "movq (%1), %%mm0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
104 "subl %3, %2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
105 ".balign 16 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
106 "1: \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
107 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
108 "movq (%1, %%eax), %%mm2 \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
109 PAVGB" %%mm1, %%mm0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
110 PAVGB" %%mm2, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
111 "addl %%eax, %1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
112 "movq %%mm0, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
113 "movq %%mm1, (%2, %%eax) \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
114 "movq (%1, %3), %%mm1 \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
115 "movq (%1, %%eax), %%mm0 \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
116 PAVGB" %%mm1, %%mm2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
117 PAVGB" %%mm0, %%mm1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
118 "addl %%eax, %2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
119 "addl %%eax, %1 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
120 "movq %%mm2, (%2, %3) \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
121 "movq %%mm1, (%2, %%eax) \n\t" |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
122 "addl %%eax, %2 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
123 "subl $4, %0 \n\t" |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
124 "jnz 1b \n\t" |
416
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
125 :"+g"(h), "+D"(pixels), "+S" (block) |
ca1f2c0e44ef
* fixed contrains and avoid usage of scale index access
kabi
parents:
414
diff
changeset
|
126 :"c"(line_size) |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
127 :"%eax", "memory"); |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
128 #else |
414
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
129 // kabi measure me |
0 | 130 __asm __volatile( |
414
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
131 "movq (%2), %%mm0 \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
132 "addl %1, %2 \n\t" |
386 | 133 "xorl %%eax, %%eax \n\t" |
414
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
134 "leal (%1, %2), %%edi \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
135 "leal (%1, %3), %%esi \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
136 "addl %1, %1 \n\t" |
386 | 137 ".balign 16 \n\t" |
138 "1: \n\t" | |
414
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
139 "movq (%2 , %%eax), %%mm1 \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
140 "movq (%%edi, %%eax), %%mm2 \n\t" |
386 | 141 PAVGB" %%mm1, %%mm0 \n\t" |
142 PAVGB" %%mm2, %%mm1 \n\t" | |
414
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
143 "movq %%mm0, (%3 , %%eax) \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
144 "movq %%mm1, (%%esi, %%eax) \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
145 "addl %1, %%eax \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
146 "movq (%2 , %%eax), %%mm1 \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
147 "movq (%%edi, %%eax), %%mm0 \n\t" |
386 | 148 PAVGB" %%mm1, %%mm2 \n\t" |
149 PAVGB" %%mm0, %%mm1 \n\t" | |
414
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
150 "movq %%mm2, (%3 , %%eax) \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
151 "movq %%mm1, (%%esi, %%eax) \n\t" |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
152 "addl %1, %%eax \n\t" |
386 | 153 "subl $4, %0 \n\t" |
154 " jnz 1b \n\t" | |
414
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
155 :"+g"(h), "+r"(line_size), "+r"(pixels) |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
156 : "r" (block) |
f56e4d08e082
using 1 operand less and slightly faster put_pixels_y2
michaelni
parents:
413
diff
changeset
|
157 : "%eax", "%esi", "%edi", "memory"); |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
158 #endif |
386 | 159 } |
160 | |
389
f874493a1970
tried to avoid gcc 2.95.2 bug by puting explicit register constraints - added comment about rounding bug in some functions (need to correct or suppress them for regression tests)
glantau
parents:
387
diff
changeset
|
161 /* GL: this function does incorrect rounding if overflow */ |
386 | 162 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
163 { | |
0 | 164 __asm __volatile( |
387 | 165 MOVQ_BONE(%%mm7) |
386 | 166 "xorl %%eax, %%eax \n\t" |
167 "movq (%1), %%mm0 \n\t" | |
168 ".balign 16 \n\t" | |
169 "1: \n\t" | |
170 "movq (%2, %%eax), %%mm1 \n\t" | |
171 "movq (%3, %%eax), %%mm2 \n\t" | |
172 "psubusb %%mm7, %%mm1 \n\t" | |
173 PAVGB" %%mm1, %%mm0 \n\t" | |
174 PAVGB" %%mm2, %%mm1 \n\t" | |
175 "movq %%mm0, (%4, %%eax) \n\t" | |
176 "movq %%mm1, (%5, %%eax) \n\t" | |
177 "addl %6, %%eax \n\t" | |
178 "movq (%2, %%eax), %%mm1 \n\t" | |
179 "movq (%3, %%eax), %%mm0 \n\t" | |
180 "psubusb %%mm7, %%mm1 \n\t" | |
181 PAVGB" %%mm1, %%mm2 \n\t" | |
182 PAVGB" %%mm0, %%mm1 \n\t" | |
183 "movq %%mm2, (%4, %%eax) \n\t" | |
184 "movq %%mm1, (%5, %%eax) \n\t" | |
185 "addl %6, %%eax \n\t" | |
186 "subl $4, %0 \n\t" | |
187 " jnz 1b \n\t" | |
188 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
189 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
190 "r" (block+line_size), "g"(line_size<<1) |
386 | 191 :"%eax", "memory"); |
0 | 192 } |
193 | |
194 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
195 { | |
196 __asm __volatile( | |
386 | 197 "xorl %%eax, %%eax \n\t" |
198 ".balign 16 \n\t" | |
199 "1: \n\t" | |
200 "movq (%1, %%eax), %%mm0 \n\t" | |
201 "movq (%2, %%eax), %%mm2 \n\t" | |
202 "movq (%3, %%eax), %%mm3 \n\t" | |
203 "movq (%4, %%eax), %%mm4 \n\t" | |
204 PAVGB" %%mm3, %%mm0 \n\t" | |
205 PAVGB" %%mm4, %%mm2 \n\t" | |
206 "movq %%mm0, (%3, %%eax) \n\t" | |
207 "movq %%mm2, (%4, %%eax) \n\t" | |
208 "addl %5, %%eax \n\t" | |
209 "movq (%1, %%eax), %%mm0 \n\t" | |
210 "movq (%2, %%eax), %%mm2 \n\t" | |
211 "movq (%3, %%eax), %%mm3 \n\t" | |
212 "movq (%4, %%eax), %%mm4 \n\t" | |
213 PAVGB" %%mm3, %%mm0 \n\t" | |
214 PAVGB" %%mm4, %%mm2 \n\t" | |
215 "movq %%mm0, (%3, %%eax) \n\t" | |
216 "movq %%mm2, (%4, %%eax) \n\t" | |
217 "addl %5, %%eax \n\t" | |
218 "subl $4, %0 \n\t" | |
219 " jnz 1b \n\t" | |
220 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
221 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
222 "g"(line_size<<1) |
386 | 223 :"%eax", "memory"); |
0 | 224 } |
225 | |
386 | 226 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 227 { |
228 __asm __volatile( | |
386 | 229 "xorl %%eax, %%eax \n\t" |
230 ".balign 16 \n\t" | |
231 "1: \n\t" | |
232 "movq (%1, %%eax), %%mm0 \n\t" | |
233 "movq 1(%1, %%eax), %%mm1 \n\t" | |
234 "movq (%2, %%eax), %%mm2 \n\t" | |
235 "movq 1(%2, %%eax), %%mm3 \n\t" | |
236 PAVGB" %%mm1, %%mm0 \n\t" | |
237 PAVGB" %%mm3, %%mm2 \n\t" | |
238 "movq (%3, %%eax), %%mm3 \n\t" | |
239 "movq (%4, %%eax), %%mm4 \n\t" | |
240 PAVGB" %%mm3, %%mm0 \n\t" | |
241 PAVGB" %%mm4, %%mm2 \n\t" | |
242 "movq %%mm0, (%3, %%eax) \n\t" | |
243 "movq %%mm2, (%4, %%eax) \n\t" | |
244 "addl %5, %%eax \n\t" | |
245 "movq (%1, %%eax), %%mm0 \n\t" | |
246 "movq 1(%1, %%eax), %%mm1 \n\t" | |
247 "movq (%2, %%eax), %%mm2 \n\t" | |
248 "movq 1(%2, %%eax), %%mm3 \n\t" | |
249 PAVGB" %%mm1, %%mm0 \n\t" | |
250 PAVGB" %%mm3, %%mm2 \n\t" | |
251 "movq (%3, %%eax), %%mm3 \n\t" | |
252 "movq (%4, %%eax), %%mm4 \n\t" | |
253 PAVGB" %%mm3, %%mm0 \n\t" | |
254 PAVGB" %%mm4, %%mm2 \n\t" | |
255 "movq %%mm0, (%3, %%eax) \n\t" | |
256 "movq %%mm2, (%4, %%eax) \n\t" | |
257 "addl %5, %%eax \n\t" | |
258 "subl $4, %0 \n\t" | |
259 " jnz 1b \n\t" | |
260 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
261 :"D"(pixels), "S"(pixels+line_size), "r" (block), "r" (block+line_size), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
262 "g"(line_size<<1) |
386 | 263 :"%eax", "memory"); |
0 | 264 } |
265 | |
386 | 266 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
0 | 267 { |
268 __asm __volatile( | |
386 | 269 "xorl %%eax, %%eax \n\t" |
270 "movq (%1), %%mm0 \n\t" | |
271 ".balign 16 \n\t" | |
272 "1: \n\t" | |
273 "movq (%2, %%eax), %%mm1 \n\t" | |
274 "movq (%3, %%eax), %%mm2 \n\t" | |
275 PAVGB" %%mm1, %%mm0 \n\t" | |
276 PAVGB" %%mm2, %%mm1 \n\t" | |
277 "movq (%4, %%eax), %%mm3 \n\t" | |
278 "movq (%5, %%eax), %%mm4 \n\t" | |
279 PAVGB" %%mm3, %%mm0 \n\t" | |
280 PAVGB" %%mm4, %%mm1 \n\t" | |
281 "movq %%mm0, (%4, %%eax) \n\t" | |
282 "movq %%mm1, (%5, %%eax) \n\t" | |
283 "addl %6, %%eax \n\t" | |
284 "movq (%2, %%eax), %%mm1 \n\t" | |
285 "movq (%3, %%eax), %%mm0 \n\t" | |
286 PAVGB" %%mm1, %%mm2 \n\t" | |
287 PAVGB" %%mm0, %%mm1 \n\t" | |
288 "movq (%4, %%eax), %%mm3 \n\t" | |
289 "movq (%5, %%eax), %%mm4 \n\t" | |
290 PAVGB" %%mm3, %%mm2 \n\t" | |
291 PAVGB" %%mm4, %%mm1 \n\t" | |
292 "movq %%mm2, (%4, %%eax) \n\t" | |
293 "movq %%mm1, (%5, %%eax) \n\t" | |
294 "addl %6, %%eax \n\t" | |
295 "subl $4, %0 \n\t" | |
296 " jnz 1b \n\t" | |
297 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
298 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
299 "r" (block+line_size), "g"(line_size<<1) |
386 | 300 :"%eax", "memory"); |
0 | 301 } |
302 | |
386 | 303 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter |
304 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | |
0 | 305 { |
306 __asm __volatile( | |
387 | 307 MOVQ_BONE(%%mm7) |
386 | 308 "xorl %%eax, %%eax \n\t" |
309 "movq (%1), %%mm0 \n\t" | |
310 "movq 1(%1), %%mm1 \n\t" | |
311 PAVGB" %%mm1, %%mm0 \n\t" | |
312 ".balign 16 \n\t" | |
313 "1: \n\t" | |
314 "movq (%2, %%eax), %%mm1 \n\t" | |
315 "movq (%3, %%eax), %%mm2 \n\t" | |
316 "movq 1(%2, %%eax), %%mm3 \n\t" | |
317 "movq 1(%3, %%eax), %%mm4 \n\t" | |
318 "psubusb %%mm7, %%mm2 \n\t" | |
319 PAVGB" %%mm3, %%mm1 \n\t" | |
320 PAVGB" %%mm4, %%mm2 \n\t" | |
321 PAVGB" %%mm1, %%mm0 \n\t" | |
322 PAVGB" %%mm2, %%mm1 \n\t" | |
323 "movq (%4, %%eax), %%mm3 \n\t" | |
324 "movq (%5, %%eax), %%mm4 \n\t" | |
325 PAVGB" %%mm3, %%mm0 \n\t" | |
326 PAVGB" %%mm4, %%mm1 \n\t" | |
327 "movq %%mm0, (%4, %%eax) \n\t" | |
328 "movq %%mm1, (%5, %%eax) \n\t" | |
329 "addl %6, %%eax \n\t" | |
330 "movq (%2, %%eax), %%mm1 \n\t" | |
331 "movq (%3, %%eax), %%mm0 \n\t" | |
332 "movq 1(%2, %%eax), %%mm3 \n\t" | |
333 "movq 1(%3, %%eax), %%mm4 \n\t" | |
334 PAVGB" %%mm3, %%mm1 \n\t" | |
335 PAVGB" %%mm4, %%mm0 \n\t" | |
336 PAVGB" %%mm1, %%mm2 \n\t" | |
337 PAVGB" %%mm0, %%mm1 \n\t" | |
338 "movq (%4, %%eax), %%mm3 \n\t" | |
339 "movq (%5, %%eax), %%mm4 \n\t" | |
340 PAVGB" %%mm3, %%mm2 \n\t" | |
341 PAVGB" %%mm4, %%mm1 \n\t" | |
342 "movq %%mm2, (%4, %%eax) \n\t" | |
343 "movq %%mm1, (%5, %%eax) \n\t" | |
344 "addl %6, %%eax \n\t" | |
345 "subl $4, %0 \n\t" | |
346 " jnz 1b \n\t" | |
347 :"+g"(h) | |
413
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
348 :"D"(pixels), "S"(pixels+line_size), "r"(pixels+line_size*2), "r" (block), |
1548abb7bbed
* fix for -fPIC compilation - compiles with 2.95.2 as well - any
kabi
parents:
402
diff
changeset
|
349 "r" (block+line_size), "g"(line_size<<1) |
386 | 350 :"%eax", "memory"); |
0 | 351 } |