comparison i386/motion_est_mmx.c @ 294:944632089814 libavcodec

4MV motion estimation (not finished yet)
SAD functions rewritten (8x8 support & MMX2 optimizations)
HQ inter/intra decision
msmpeg4 encoding bugfix (MVs were too long)
author michaelni
date Wed, 27 Mar 2002 21:25:22 +0000
parents 3049d6d452a3
children 54d86f074a4b
comparison
293:6eaf5da091fa -> 294:944632089814
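
For orientation: the routines rewritten in this changeset compute the sum of absolute differences (SAD) between a current block and a (possibly half-pel interpolated) reference block, 8 pixels wide per call; the last argument of the sad8_* helpers is the log2 of the block height (3 for 8 rows, 4 for 16, since the asm uses stride<<h). The plain-C sketch below is a hypothetical reference for what the MMX/MMX2 code computes; the names and layout are illustrative assumptions, not part of the changeset.

/* Hypothetical plain-C reference for the SAD routines in this changeset.
 * Only the arithmetic mirrors the MMX/MMX2 code; the names are illustrative. */
#include <stdlib.h>

typedef unsigned char UINT8;

/* SAD of an 8-wide block, h rows. */
static int sad8_c(const UINT8 *blk1, const UINT8 *blk2, int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            sum += abs(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}

/* Half-pel variant: the reference is the rounded average of two source
 * blocks (blk1 and blk1+1 for x2, blk1 and blk1+stride for y2), which is
 * what the round_tab/psrlw path and pavgb approximate below. */
static int sad8_avg_c(const UINT8 *blk1a, const UINT8 *blk1b,
                      const UINT8 *blk2, int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            sum += abs(((blk1a[x] + blk1b[x] + 1) >> 1) - blk2[x]);
        blk1a += stride;
        blk1b += stride;
        blk2  += stride;
    }
    return sum;
}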
14 * 14 *
15 * You should have received a copy of the GNU General Public License 15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software 16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 * 18 *
19 * mostly by Michael Niedermayer <michaelni@gmx.at>
19 */ 20 */
20 #include "../dsputil.h" 21 #include "../dsputil.h"
21 #include "mmx.h" 22
22 23 static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={
23 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; 24 0x0000000000000000,
24 static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; 25 0x0001000100010001,
25 26 0x0002000200020002,
26 /* mm7 is accumulator, mm6 is zero */ 27 };
27 static inline void sad_add(const UINT8 *p1, const UINT8 *p2) 28
28 { 29 static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
29 movq_m2r(*p1, mm0); 30 {
30 movq_m2r(*p2, mm1); 31 int len= -(stride<<h);
31 movq_r2r(mm0, mm2); 32 asm volatile(
32 psubusb_r2r(mm1, mm0); 33 ".balign 16 \n\t"
33 psubusb_r2r(mm2, mm1); 34 "1: \n\t"
34 por_r2r(mm1, mm0); /* mm0 is absolute value */ 35 "movq (%1, %%eax), %%mm0 \n\t"
35 36 "movq (%2, %%eax), %%mm2 \n\t"
36 movq_r2r(mm0, mm1); 37 "movq (%2, %%eax), %%mm4 \n\t"
37 punpcklbw_r2r(mm6, mm0); 38 "addl %3, %%eax \n\t"
38 punpckhbw_r2r(mm6, mm1); 39 "psubusb %%mm0, %%mm2 \n\t"
39 paddusw_r2r(mm0, mm7); 40 "psubusb %%mm4, %%mm0 \n\t"
40 paddusw_r2r(mm1, mm7); 41 "movq (%1, %%eax), %%mm1 \n\t"
41 } 42 "movq (%2, %%eax), %%mm3 \n\t"
42 43 "movq (%2, %%eax), %%mm5 \n\t"
43 /* convert mm7 to value */ 44 "psubusb %%mm1, %%mm3 \n\t"
44 static inline int sad_end(void) 45 "psubusb %%mm5, %%mm1 \n\t"
45 { 46 "por %%mm2, %%mm0 \n\t"
46 int res; 47 "por %%mm1, %%mm3 \n\t"
47 48 "movq %%mm0, %%mm1 \n\t"
48 movq_r2r(mm7, mm0); 49 "movq %%mm3, %%mm2 \n\t"
49 psrlq_i2r(32, mm7); 50 "punpcklbw %%mm7, %%mm0 \n\t"
50 paddusw_r2r(mm0, mm7); 51 "punpckhbw %%mm7, %%mm1 \n\t"
51 52 "punpcklbw %%mm7, %%mm3 \n\t"
52 movq_r2r(mm7, mm0); 53 "punpckhbw %%mm7, %%mm2 \n\t"
53 psrlq_i2r(16, mm7); 54 "paddw %%mm1, %%mm0 \n\t"
54 paddusw_r2r(mm0, mm7); 55 "paddw %%mm3, %%mm2 \n\t"
55 __asm __volatile ("movd %%mm7, %0" : "=a" (res)); 56 "paddw %%mm2, %%mm0 \n\t"
56 return res & 0xffff; 57 "paddw %%mm0, %%mm6 \n\t"
57 } 58 "addl %3, %%eax \n\t"
58 59 " js 1b \n\t"
59 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) 60 : "+a" (len)
60 { 61 : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
61 const UINT8 *p1, *p2; 62 );
62 63 }
63 h >>= 1; 64
64 p1 = blk1; 65 static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
65 p2 = blk2; 66 {
66 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ 67 int len= -(stride<<h);
67 pxor_r2r(mm6, mm6); /* mm6 is zero constant */ 68 asm volatile(
68 do { 69 ".balign 16 \n\t"
69 sad_add(p1, p2); 70 "1: \n\t"
70 sad_add(p1 + 8, p2 + 8); 71 "movq (%1, %%eax), %%mm0 \n\t"
71 p1 += lx; 72 "movq (%2, %%eax), %%mm2 \n\t"
72 p2 += lx; 73 "psadbw %%mm2, %%mm0 \n\t"
73 sad_add(p1, p2); 74 "addl %3, %%eax \n\t"
74 sad_add(p1 + 8, p2 + 8); 75 "movq (%1, %%eax), %%mm1 \n\t"
75 p1 += lx; 76 "movq (%2, %%eax), %%mm3 \n\t"
76 p2 += lx; 77 "psadbw %%mm1, %%mm3 \n\t"
77 } while (--h); 78 "paddw %%mm3, %%mm0 \n\t"
78 return sad_end(); 79 "paddw %%mm0, %%mm6 \n\t"
79 } 80 "addl %3, %%eax \n\t"
80 81 " js 1b \n\t"
81 /* please test it ! */ 82 : "+a" (len)
82 static inline void sad_add_sse(const UINT8 *p1, const UINT8 *p2) 83 : "r" (blk1 - len), "r" (blk2 - len), "r" (stride)
83 { 84 );
84 movq_m2r(*(p1 + 0), mm0); 85 }
85 movq_m2r(*(p1 + 8), mm1); 86
86 psadbw_m2r(*(p2 + 0), mm0); 87 static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h)
87 psadbw_m2r(*(p2 + 8), mm1); 88 {
88 paddusw_r2r(mm0, mm7); 89 int len= -(stride<<h);
89 paddusw_r2r(mm1, mm7); 90 asm volatile(
90 } 91 ".balign 16 \n\t"
91 92 "1: \n\t"
92 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h) 93 "movq (%1, %%eax), %%mm0 \n\t"
93 { 94 "movq (%2, %%eax), %%mm2 \n\t"
94 const UINT8 *p1, *p2; 95 "pavgb %%mm2, %%mm0 \n\t"
95 96 "movq (%3, %%eax), %%mm2 \n\t"
96 h >>= 1; 97 "psadbw %%mm2, %%mm0 \n\t"
97 p1 = blk1; 98 "addl %4, %%eax \n\t"
98 p2 = blk2; 99 "movq (%1, %%eax), %%mm1 \n\t"
99 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ 100 "movq (%2, %%eax), %%mm3 \n\t"
100 do { 101 "pavgb %%mm1, %%mm3 \n\t"
101 sad_add_sse(p1, p2); 102 "movq (%3, %%eax), %%mm1 \n\t"
102 p1 += lx; 103 "psadbw %%mm1, %%mm3 \n\t"
103 p2 += lx; 104 "paddw %%mm3, %%mm0 \n\t"
104 sad_add_sse(p1, p2); 105 "paddw %%mm0, %%mm6 \n\t"
105 p1 += lx; 106 "addl %4, %%eax \n\t"
106 p2 += lx; 107 " js 1b \n\t"
107 } while (--h); 108 : "+a" (len)
108 return sad_end(); 109 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
109 } 110 );
110 111 }
111 #define DUMP(reg) { mmx_t tmp; movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); } 112
112 113 static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
113 /* mm7 is accumulator, mm6 is zero */ 114 { //FIXME reuse src
114 static inline void sad_add_x2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) 115 int len= -(stride<<h);
115 { 116 asm volatile(
116 movq_m2r(*(p2 + 0), mm0); 117 ".balign 16 \n\t"
117 movq_m2r(*(p3 + 0), mm1); 118 "1: \n\t"
118 movq_r2r(mm0, mm2); 119 "movq (%1, %%eax), %%mm0 \n\t"
119 movq_r2r(mm1, mm3); 120 "movq (%2, %%eax), %%mm2 \n\t"
120 punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ 121 "movq 1(%1, %%eax), %%mm1 \n\t"
121 punpcklbw_r2r(mm6, mm1); 122 "movq 1(%2, %%eax), %%mm3 \n\t"
122 punpckhbw_r2r(mm6, mm2); /* high */ 123 "pavgb %%mm2, %%mm0 \n\t"
123 punpckhbw_r2r(mm6, mm3); 124 "pavgb %%mm1, %%mm3 \n\t"
124 paddusw_r2r(mm1, mm0); 125 "pavgb %%mm3, %%mm0 \n\t"
125 paddusw_r2r(mm3, mm2); 126 "movq (%3, %%eax), %%mm2 \n\t"
126 movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ 127 "psadbw %%mm2, %%mm0 \n\t"
127 paddusw_r2r(mm5, mm0); /* + 1 */ 128 "addl %4, %%eax \n\t"
128 paddusw_r2r(mm5, mm2); /* + 1 */ 129 "movq (%1, %%eax), %%mm1 \n\t"
129 psrlw_i2r(1, mm0); 130 "movq (%2, %%eax), %%mm3 \n\t"
130 psrlw_i2r(1, mm2); 131 "movq 1(%1, %%eax), %%mm2 \n\t"
131 packuswb_r2r(mm2, mm0); /* average is in mm0 */ 132 "movq 1(%2, %%eax), %%mm4 \n\t"
132 133 "pavgb %%mm3, %%mm1 \n\t"
133 movq_r2r(mm1, mm2); 134 "pavgb %%mm4, %%mm2 \n\t"
134 psubusb_r2r(mm0, mm1); 135 "pavgb %%mm1, %%mm2 \n\t"
135 psubusb_r2r(mm2, mm0); 136 "movq (%3, %%eax), %%mm1 \n\t"
136 por_r2r(mm1, mm0); /* mm0 is absolute value */ 137 "psadbw %%mm1, %%mm2 \n\t"
137 138 "paddw %%mm2, %%mm0 \n\t"
138 movq_r2r(mm0, mm1); 139 "paddw %%mm0, %%mm6 \n\t"
139 punpcklbw_r2r(mm6, mm0); 140 "addl %4, %%eax \n\t"
140 punpckhbw_r2r(mm6, mm1); 141 " js 1b \n\t"
141 paddusw_r2r(mm0, mm7); 142 : "+a" (len)
142 paddusw_r2r(mm1, mm7); 143 : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride)
143 } 144 );
144 145 }
145 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) 146
146 { 147 static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h)
147 const UINT8 *p1, *p2; 148 {
148 149 int len= -(stride<<h);
149 p1 = blk1; 150 asm volatile(
150 p2 = blk2; 151 ".balign 16 \n\t"
151 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ 152 "1: \n\t"
152 pxor_r2r(mm6, mm6); /* mm6 is zero constant */ 153 "movq (%1, %%eax), %%mm0 \n\t"
153 movq_m2r(mm_wone, mm5); /* one constant */ 154 "movq (%2, %%eax), %%mm1 \n\t"
154 do { 155 "movq (%1, %%eax), %%mm2 \n\t"
155 sad_add_x2(p1, p2, p2 + 1); 156 "movq (%2, %%eax), %%mm3 \n\t"
156 sad_add_x2(p1 + 8, p2 + 8, p2 + 9); 157 "punpcklbw %%mm7, %%mm0 \n\t"
157 p1 += lx; 158 "punpcklbw %%mm7, %%mm1 \n\t"
158 p2 += lx; 159 "punpckhbw %%mm7, %%mm2 \n\t"
159 } while (--h); 160 "punpckhbw %%mm7, %%mm3 \n\t"
160 return sad_end(); 161 "paddw %%mm0, %%mm1 \n\t"
161 } 162 "paddw %%mm2, %%mm3 \n\t"
162 163 "movq (%3, %%eax), %%mm4 \n\t"
163 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) 164 "movq (%3, %%eax), %%mm2 \n\t"
164 { 165 "paddw %%mm5, %%mm1 \n\t"
165 const UINT8 *p1, *p2; 166 "paddw %%mm5, %%mm3 \n\t"
166 167 "psrlw $1, %%mm1 \n\t"
167 p1 = blk1; 168 "psrlw $1, %%mm3 \n\t"
168 p2 = blk2; 169 "packuswb %%mm3, %%mm1 \n\t"
169 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ 170 "psubusb %%mm1, %%mm4 \n\t"
170 pxor_r2r(mm6, mm6); /* mm6 is zero constant */ 171 "psubusb %%mm1, %%mm4 \n\t"
171 movq_m2r(mm_wone, mm5); /* one constant */ 172 "por %%mm4, %%mm1 \n\t"
172 do { 173 "movq %%mm1, %%mm0 \n\t"
173 sad_add_x2(p1, p2, p2 + lx); 174 "punpcklbw %%mm7, %%mm0 \n\t"
174 sad_add_x2(p1 + 8, p2 + 8, p2 + 8 + lx); 175 "punpckhbw %%mm7, %%mm1 \n\t"
175 p1 += lx; 176 "paddw %%mm1, %%mm0 \n\t"
176 p2 += lx; 177 "paddw %%mm0, %%mm6 \n\t"
177 } while (--h); 178 "addl %4, %%eax \n\t"
178 return sad_end(); 179 " js 1b \n\t"
179 } 180 : "+a" (len)
180 181 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride)
181 /* mm7 is accumulator, mm6 is zero */ 182 );
182 static inline void sad_add_xy2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) 183 }
183 { 184
184 movq_m2r(*(p2 + 0), mm0); 185 static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
185 movq_m2r(*(p3 + 0), mm1); 186 {
186 movq_r2r(mm0, mm2); 187 int len= -(stride<<h);
187 movq_r2r(mm1, mm3); 188 asm volatile(
188 punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ 189 ".balign 16 \n\t"
189 punpcklbw_r2r(mm6, mm1); 190 "1: \n\t"
190 punpckhbw_r2r(mm6, mm2); /* high */ 191 "movq (%1, %%eax), %%mm0 \n\t"
191 punpckhbw_r2r(mm6, mm3); 192 "movq (%2, %%eax), %%mm1 \n\t"
192 paddusw_r2r(mm1, mm0); 193 "movq %%mm0, %%mm4 \n\t"
193 paddusw_r2r(mm3, mm2); 194 "movq %%mm1, %%mm2 \n\t"
194 195 "punpcklbw %%mm7, %%mm0 \n\t"
195 movq_m2r(*(p2 + 1), mm1); 196 "punpcklbw %%mm7, %%mm1 \n\t"
196 movq_m2r(*(p3 + 1), mm3); 197 "punpckhbw %%mm7, %%mm4 \n\t"
197 movq_r2r(mm1, mm4); 198 "punpckhbw %%mm7, %%mm2 \n\t"
198 punpcklbw_r2r(mm6, mm1); /* low */ 199 "paddw %%mm1, %%mm0 \n\t"
199 punpckhbw_r2r(mm6, mm4); /* high */ 200 "paddw %%mm2, %%mm4 \n\t"
200 paddusw_r2r(mm1, mm0); 201 "movq 1(%1, %%eax), %%mm2 \n\t"
201 paddusw_r2r(mm4, mm2); 202 "movq 1(%2, %%eax), %%mm3 \n\t"
202 movq_r2r(mm3, mm4); 203 "movq %%mm2, %%mm1 \n\t"
203 punpcklbw_r2r(mm6, mm3); /* low */ 204 "punpcklbw %%mm7, %%mm2 \n\t"
204 punpckhbw_r2r(mm6, mm4); /* high */ 205 "punpckhbw %%mm7, %%mm1 \n\t"
205 paddusw_r2r(mm3, mm0); 206 "paddw %%mm0, %%mm2 \n\t"
206 paddusw_r2r(mm4, mm2); 207 "paddw %%mm4, %%mm1 \n\t"
207 208 "movq %%mm3, %%mm4 \n\t"
208 movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ 209 "punpcklbw %%mm7, %%mm3 \n\t"
209 paddusw_r2r(mm5, mm0); /* + 2 */ 210 "punpckhbw %%mm7, %%mm4 \n\t"
210 paddusw_r2r(mm5, mm2); /* + 2 */ 211 "paddw %%mm3, %%mm2 \n\t"
211 psrlw_i2r(2, mm0); 212 "paddw %%mm4, %%mm1 \n\t"
212 psrlw_i2r(2, mm2); 213 "movq (%3, %%eax), %%mm3 \n\t"
213 packuswb_r2r(mm2, mm0); /* average is in mm0 */ 214 "movq (%3, %%eax), %%mm4 \n\t"
214 215 "paddw %%mm5, %%mm2 \n\t"
215 movq_r2r(mm1, mm2); 216 "paddw %%mm5, %%mm1 \n\t"
216 psubusb_r2r(mm0, mm1); 217 "psrlw $2, %%mm2 \n\t"
217 psubusb_r2r(mm2, mm0); 218 "psrlw $2, %%mm1 \n\t"
218 por_r2r(mm1, mm0); /* mm0 is absolute value */ 219 "packuswb %%mm1, %%mm2 \n\t"
219 220 "psubusb %%mm2, %%mm3 \n\t"
220 movq_r2r(mm0, mm1); 221 "psubusb %%mm4, %%mm2 \n\t"
221 punpcklbw_r2r(mm6, mm0); 222 "por %%mm3, %%mm2 \n\t"
222 punpckhbw_r2r(mm6, mm1); 223 "movq %%mm2, %%mm0 \n\t"
223 paddusw_r2r(mm0, mm7); 224 "punpcklbw %%mm7, %%mm0 \n\t"
224 paddusw_r2r(mm1, mm7); 225 "punpckhbw %%mm7, %%mm2 \n\t"
225 } 226 "paddw %%mm2, %%mm0 \n\t"
226 227 "paddw %%mm0, %%mm6 \n\t"
227 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) 228 "addl %4, %%eax \n\t"
228 { 229 " js 1b \n\t"
229 const UINT8 *p1, *p2, *p3; 230 : "+a" (len)
230 231 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride)
231 p1 = blk1; 232 );
232 p2 = blk2; 233 }
233 p3 = blk2 + lx; 234
234 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ 235 static inline int sum_mmx()
235 pxor_r2r(mm6, mm6); /* mm6 is zero constant */ 236 {
236 movq_m2r(mm_wtwo, mm5); /* two constant */ 237 int ret;
237 do { 238 asm volatile(
238 sad_add_xy2(p1, p2, p2 + lx); 239 "movq %%mm6, %%mm0 \n\t"
239 sad_add_xy2(p1 + 8, p2 + 8, p2 + 8 + lx); 240 "psrlq $32, %%mm6 \n\t"
240 p1 += lx; 241 "paddw %%mm0, %%mm6 \n\t"
241 p2 += lx; 242 "movq %%mm6, %%mm0 \n\t"
242 } while (--h); 243 "psrlq $16, %%mm6 \n\t"
243 return sad_end(); 244 "paddw %%mm0, %%mm6 \n\t"
244 } 245 "movd %%mm6, %0 \n\t"
246 : "=r" (ret)
247 );
248 return ret&0xFFFF;
249 }
250
251 static inline int sum_mmx2()
252 {
253 int ret;
254 asm volatile(
255 "movd %%mm6, %0 \n\t"
256 : "=r" (ret)
257 );
258 return ret;
259 }
260
261 #define PIX_SAD(suf)\
262 int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
263 {\
264 asm volatile("pxor %%mm7, %%mm7 \n\t"\
265 "pxor %%mm6, %%mm6 \n\t":);\
266 \
267 sad8_ ## suf(blk1, blk2, stride, 3);\
268 \
269 return sum_ ## suf();\
270 }\
271 \
272 int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
273 {\
274 asm volatile("pxor %%mm7, %%mm7 \n\t"\
275 "pxor %%mm6, %%mm6 \n\t"\
276 "movq %0, %%mm5 \n\t"\
277 :: "m"(round_tab[1]) \
278 );\
279 \
280 sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 3);\
281 \
282 return sum_ ## suf();\
283 }\
284 \
285 int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
286 {\
287 asm volatile("pxor %%mm7, %%mm7 \n\t"\
288 "pxor %%mm6, %%mm6 \n\t"\
289 "movq %0, %%mm5 \n\t"\
290 :: "m"(round_tab[1]) \
291 );\
292 \
293 sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 3);\
294 \
295 return sum_ ## suf();\
296 }\
297 \
298 int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
299 {\
300 asm volatile("pxor %%mm7, %%mm7 \n\t"\
301 "pxor %%mm6, %%mm6 \n\t"\
302 "movq %0, %%mm5 \n\t"\
303 :: "m"(round_tab[2]) \
304 );\
305 \
306 sad8_4_ ## suf(blk1, blk2, stride, 3);\
307 \
308 return sum_ ## suf();\
309 }\
310 \
311 int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
312 {\
313 asm volatile("pxor %%mm7, %%mm7 \n\t"\
314 "pxor %%mm6, %%mm6 \n\t":);\
315 \
316 sad8_ ## suf(blk1 , blk2 , stride, 4);\
317 sad8_ ## suf(blk1+8, blk2+8, stride, 4);\
318 \
319 return sum_ ## suf();\
320 }\
321 int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
322 {\
323 asm volatile("pxor %%mm7, %%mm7 \n\t"\
324 "pxor %%mm6, %%mm6 \n\t"\
325 "movq %0, %%mm5 \n\t"\
326 :: "m"(round_tab[1]) \
327 );\
328 \
329 sad8_2_ ## suf(blk1 , blk1+1, blk2 , stride, 4);\
330 sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, 4);\
331 \
332 return sum_ ## suf();\
333 }\
334 int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
335 {\
336 asm volatile("pxor %%mm7, %%mm7 \n\t"\
337 "pxor %%mm6, %%mm6 \n\t"\
338 "movq %0, %%mm5 \n\t"\
339 :: "m"(round_tab[1]) \
340 );\
341 \
342 sad8_2_ ## suf(blk1 , blk1+stride, blk2 , stride, 4);\
343 sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, 4);\
344 \
345 return sum_ ## suf();\
346 }\
347 int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
348 {\
349 asm volatile("pxor %%mm7, %%mm7 \n\t"\
350 "pxor %%mm6, %%mm6 \n\t"\
351 "movq %0, %%mm5 \n\t"\
352 :: "m"(round_tab[2]) \
353 );\
354 \
355 sad8_4_ ## suf(blk1 , blk2 , stride, 4);\
356 sad8_4_ ## suf(blk1+8, blk2+8, stride, 4);\
357 \
358 return sum_ ## suf();\
359 }\
360
361 PIX_SAD(mmx)
362 PIX_SAD(mmx2)
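
A note on the loop structure used throughout the rewritten functions: the row counter is kept as a negative byte offset (int len= -(stride<<h)) in %eax, the block pointers are pre-biased by -len, and each row adds the stride until the offset becomes non-negative, so the "js 1b" falls through. %mm6 accumulates 16-bit partial sums, which sum_mmx() folds by shifting and adding, while sum_mmx2() can simply read %mm6 because psadbw already produces a single word sum per call. Below is a minimal C sketch of the same indexing pattern, under the assumption that the illustrative names are not part of the changeset.

/* Minimal C sketch of the negative-offset loop used by the asm above.
 * The index starts negative, the base pointers are biased by -len, and
 * adding the stride each row plays the role of "addl ..., %%eax" / "js 1b". */
#include <stdlib.h>

static int sad8_negidx_c(const unsigned char *blk1, const unsigned char *blk2,
                         int stride, int h /* log2 of the block height */)
{
    int len = -(stride << h);   /* negative byte offset covering the block */
    blk1 -= len;                /* same bias as "blk1 - len" in the asm */
    blk2 -= len;
    int sum = 0;
    while (len < 0) {           /* loop while the offset is still negative */
        for (int x = 0; x < 8; x++)
            sum += abs(blk1[len + x] - blk2[len + x]);
        len += stride;          /* advance one row */
    }
    return sum;
}

The point of the negative offset is that one register serves as both the loop counter and the address displacement, so the loop condition comes for free from the sign flag set by the final add.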