comparison: i386/motion_est_mmx.c @ 294:944632089814 (libavcodec)
4MV motion estimation (not finished yet)
SAD functions rewritten (8x8 support & MMX2 optimizations)
HQ inter/intra decision
msmpeg4 encoding bugfix (MVs were too long)
author | michaelni |
date | Wed, 27 Mar 2002 21:25:22 +0000 |
parents | 3049d6d452a3 |
children | 54d86f074a4b |
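The "SAD functions rewritten (8x8 support & MMX2 optimizations)" item refers to the sad8_*/sum_*/PIX_SAD code that revision 294 introduces in the right-hand column below. Each sad8_* routine accumulates a sum of absolute differences over an 8-pixel-wide strip of 1<<h rows; pix_abs8x8_* instantiates it with h=3, and pix_abs16x16_* sums two adjacent 8-wide strips with h=4. A minimal plain-C sketch of the quantity being computed (the reference function names here are illustrative, not part of the patch):

```c
#include <stdlib.h>

typedef unsigned char UINT8;

/* Illustrative reference for sad8_*(): SAD over an 8-pixel-wide strip, 1<<h rows. */
static int sad8_ref(const UINT8 *blk1, const UINT8 *blk2, int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < (1 << h); y++) {
        for (int x = 0; x < 8; x++)
            sum += abs(blk1[x] - blk2[x]);
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}

/* pix_abs8x8_* corresponds to sad8_ref(..., 3); pix_abs16x16_* sums two strips with h=4. */
static int pix_abs16x16_ref(const UINT8 *blk2, const UINT8 *blk1, int stride)
{
    return sad8_ref(blk1,     blk2,     stride, 4) +
           sad8_ref(blk1 + 8, blk2 + 8, stride, 4);
}
```

In the inline-asm versions the loop index %eax starts at len = -(stride<<h) and the block pointers are biased by -len, so each strip is walked with a single negative byte offset and the loop exits on the sign flag (js 1b). Partial sums accumulate in %mm6; sum_mmx() folds the four 16-bit lanes horizontally and masks the result to 16 bits, while sum_mmx2() can return %mm6 directly because psadbw already produces a single scalar sum per row.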
293:6eaf5da091fa | 294:944632089814 |
---|---|
14 * | 14 * |
15 * You should have received a copy of the GNU General Public License | 15 * You should have received a copy of the GNU General Public License |
16 * along with this program; if not, write to the Free Software | 16 * along with this program; if not, write to the Free Software |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
18 * | 18 * |
| 19 * mostly by Michael Niedermayer <michaelni@gmx.at> |
19 */ | 20 */ |
20 #include "../dsputil.h" | 21 #include "../dsputil.h" |
21 #include "mmx.h" | 22 |
22 | 23 static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={ |
23 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; | 24 0x0000000000000000, |
24 static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; | 25 0x0001000100010001, |
25 | 26 0x0002000200020002, |
26 /* mm7 is accumulator, mm6 is zero */ | 27 }; |
27 static inline void sad_add(const UINT8 *p1, const UINT8 *p2) | 28 |
28 { | 29 static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) |
29 movq_m2r(*p1, mm0); | 30 { |
30 movq_m2r(*p2, mm1); | 31 int len= -(stride<<h); |
31 movq_r2r(mm0, mm2); | 32 asm volatile( |
32 psubusb_r2r(mm1, mm0); | 33 ".balign 16 \n\t" |
33 psubusb_r2r(mm2, mm1); | 34 "1: \n\t" |
34 por_r2r(mm1, mm0); /* mm0 is absolute value */ | 35 "movq (%1, %%eax), %%mm0 \n\t" |
35 | 36 "movq (%2, %%eax), %%mm2 \n\t" |
36 movq_r2r(mm0, mm1); | 37 "movq (%2, %%eax), %%mm4 \n\t" |
37 punpcklbw_r2r(mm6, mm0); | 38 "addl %3, %%eax \n\t" |
38 punpckhbw_r2r(mm6, mm1); | 39 "psubusb %%mm0, %%mm2 \n\t" |
39 paddusw_r2r(mm0, mm7); | 40 "psubusb %%mm4, %%mm0 \n\t" |
40 paddusw_r2r(mm1, mm7); | 41 "movq (%1, %%eax), %%mm1 \n\t" |
41 } | 42 "movq (%2, %%eax), %%mm3 \n\t" |
42 | 43 "movq (%2, %%eax), %%mm5 \n\t" |
43 /* convert mm7 to value */ | 44 "psubusb %%mm1, %%mm3 \n\t" |
44 static inline int sad_end(void) | 45 "psubusb %%mm5, %%mm1 \n\t" |
45 { | 46 "por %%mm2, %%mm0 \n\t" |
46 int res; | 47 "por %%mm1, %%mm3 \n\t" |
47 | 48 "movq %%mm0, %%mm1 \n\t" |
48 movq_r2r(mm7, mm0); | 49 "movq %%mm3, %%mm2 \n\t" |
49 psrlq_i2r(32, mm7); | 50 "punpcklbw %%mm7, %%mm0 \n\t" |
50 paddusw_r2r(mm0, mm7); | 51 "punpckhbw %%mm7, %%mm1 \n\t" |
51 | 52 "punpcklbw %%mm7, %%mm3 \n\t" |
52 movq_r2r(mm7, mm0); | 53 "punpckhbw %%mm7, %%mm2 \n\t" |
53 psrlq_i2r(16, mm7); | 54 "paddw %%mm1, %%mm0 \n\t" |
54 paddusw_r2r(mm0, mm7); | 55 "paddw %%mm3, %%mm2 \n\t" |
55 __asm __volatile ("movd %%mm7, %0" : "=a" (res)); | 56 "paddw %%mm2, %%mm0 \n\t" |
56 return res & 0xffff; | 57 "paddw %%mm0, %%mm6 \n\t" |
57 } | 58 "addl %3, %%eax \n\t" |
58 | 59 " js 1b \n\t" |
59 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | 60 : "+a" (len) |
60 { | 61 : "r" (blk1 - len), "r" (blk2 - len), "r" (stride) |
61 const UINT8 *p1, *p2; | 62 ); |
62 | 63 } |
63 h >>= 1; | 64 |
64 p1 = blk1; | 65 static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) |
65 p2 = blk2; | 66 { |
66 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | 67 int len= -(stride<<h); |
67 pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | 68 asm volatile( |
68 do { | 69 ".balign 16 \n\t" |
69 sad_add(p1, p2); | 70 "1: \n\t" |
70 sad_add(p1 + 8, p2 + 8); | 71 "movq (%1, %%eax), %%mm0 \n\t" |
71 p1 += lx; | 72 "movq (%2, %%eax), %%mm2 \n\t" |
72 p2 += lx; | 73 "psadbw %%mm2, %%mm0 \n\t" |
73 sad_add(p1, p2); | 74 "addl %3, %%eax \n\t" |
74 sad_add(p1 + 8, p2 + 8); | 75 "movq (%1, %%eax), %%mm1 \n\t" |
75 p1 += lx; | 76 "movq (%2, %%eax), %%mm3 \n\t" |
76 p2 += lx; | 77 "psadbw %%mm1, %%mm3 \n\t" |
77 } while (--h); | 78 "paddw %%mm3, %%mm0 \n\t" |
78 return sad_end(); | 79 "paddw %%mm0, %%mm6 \n\t" |
79 } | 80 "addl %3, %%eax \n\t" |
80 | 81 " js 1b \n\t" |
81 /* please test it ! */ | 82 : "+a" (len) |
82 static inline void sad_add_sse(const UINT8 *p1, const UINT8 *p2) | 83 : "r" (blk1 - len), "r" (blk2 - len), "r" (stride) |
83 { | 84 ); |
84 movq_m2r(*(p1 + 0), mm0); | 85 } |
85 movq_m2r(*(p1 + 8), mm1); | 86 |
86 psadbw_m2r(*(p2 + 0), mm0); | 87 static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h) |
87 psadbw_m2r(*(p2 + 8), mm1); | 88 { |
88 paddusw_r2r(mm0, mm7); | 89 int len= -(stride<<h); |
89 paddusw_r2r(mm1, mm7); | 90 asm volatile( |
90 } | 91 ".balign 16 \n\t" |
91 | 92 "1: \n\t" |
92 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h) | 93 "movq (%1, %%eax), %%mm0 \n\t" |
93 { | 94 "movq (%2, %%eax), %%mm2 \n\t" |
94 const UINT8 *p1, *p2; | 95 "pavgb %%mm2, %%mm0 \n\t" |
95 | 96 "movq (%3, %%eax), %%mm2 \n\t" |
96 h >>= 1; | 97 "psadbw %%mm2, %%mm0 \n\t" |
97 p1 = blk1; | 98 "addl %4, %%eax \n\t" |
98 p2 = blk2; | 99 "movq (%1, %%eax), %%mm1 \n\t" |
99 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | 100 "movq (%2, %%eax), %%mm3 \n\t" |
100 do { | 101 "pavgb %%mm1, %%mm3 \n\t" |
101 sad_add_sse(p1, p2); | 102 "movq (%3, %%eax), %%mm1 \n\t" |
102 p1 += lx; | 103 "psadbw %%mm1, %%mm3 \n\t" |
103 p2 += lx; | 104 "paddw %%mm3, %%mm0 \n\t" |
104 sad_add_sse(p1, p2); | 105 "paddw %%mm0, %%mm6 \n\t" |
105 p1 += lx; | 106 "addl %4, %%eax \n\t" |
106 p2 += lx; | 107 " js 1b \n\t" |
107 } while (--h); | 108 : "+a" (len) |
108 return sad_end(); | 109 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) |
109 } | 110 ); |
110 | 111 } |
111 #define DUMP(reg) { mmx_t tmp; movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); } | 112 |
112 | 113 static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) |
113 /* mm7 is accumulator, mm6 is zero */ | 114 { //FIXME reuse src |
114 static inline void sad_add_x2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) | 115 int len= -(stride<<h); |
115 { | 116 asm volatile( |
116 movq_m2r(*(p2 + 0), mm0); | 117 ".balign 16 \n\t" |
117 movq_m2r(*(p3 + 0), mm1); | 118 "1: \n\t" |
118 movq_r2r(mm0, mm2); | 119 "movq (%1, %%eax), %%mm0 \n\t" |
119 movq_r2r(mm1, mm3); | 120 "movq (%2, %%eax), %%mm2 \n\t" |
120 punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ | 121 "movq 1(%1, %%eax), %%mm1 \n\t" |
121 punpcklbw_r2r(mm6, mm1); | 122 "movq 1(%2, %%eax), %%mm3 \n\t" |
122 punpckhbw_r2r(mm6, mm2); /* high */ | 123 "pavgb %%mm2, %%mm0 \n\t" |
123 punpckhbw_r2r(mm6, mm3); | 124 "pavgb %%mm1, %%mm3 \n\t" |
124 paddusw_r2r(mm1, mm0); | 125 "pavgb %%mm3, %%mm0 \n\t" |
125 paddusw_r2r(mm3, mm2); | 126 "movq (%3, %%eax), %%mm2 \n\t" |
126 movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ | 127 "psadbw %%mm2, %%mm0 \n\t" |
127 paddusw_r2r(mm5, mm0); /* + 1 */ | 128 "addl %4, %%eax \n\t" |
128 paddusw_r2r(mm5, mm2); /* + 1 */ | 129 "movq (%1, %%eax), %%mm1 \n\t" |
129 psrlw_i2r(1, mm0); | 130 "movq (%2, %%eax), %%mm3 \n\t" |
130 psrlw_i2r(1, mm2); | 131 "movq 1(%1, %%eax), %%mm2 \n\t" |
131 packuswb_r2r(mm2, mm0); /* average is in mm0 */ | 132 "movq 1(%2, %%eax), %%mm4 \n\t" |
132 | 133 "pavgb %%mm3, %%mm1 \n\t" |
133 movq_r2r(mm1, mm2); | 134 "pavgb %%mm4, %%mm2 \n\t" |
134 psubusb_r2r(mm0, mm1); | 135 "pavgb %%mm1, %%mm2 \n\t" |
135 psubusb_r2r(mm2, mm0); | 136 "movq (%3, %%eax), %%mm1 \n\t" |
136 por_r2r(mm1, mm0); /* mm0 is absolute value */ | 137 "psadbw %%mm1, %%mm2 \n\t" |
137 | 138 "paddw %%mm2, %%mm0 \n\t" |
138 movq_r2r(mm0, mm1); | 139 "paddw %%mm0, %%mm6 \n\t" |
139 punpcklbw_r2r(mm6, mm0); | 140 "addl %4, %%eax \n\t" |
140 punpckhbw_r2r(mm6, mm1); | 141 " js 1b \n\t" |
141 paddusw_r2r(mm0, mm7); | 142 : "+a" (len) |
142 paddusw_r2r(mm1, mm7); | 143 : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride) |
143 } | 144 ); |
144 | 145 } |
145 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | 146 |
146 { | 147 static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h) |
147 const UINT8 *p1, *p2; | 148 { |
148 | 149 int len= -(stride<<h); |
149 p1 = blk1; | 150 asm volatile( |
150 p2 = blk2; | 151 ".balign 16 \n\t" |
151 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | 152 "1: \n\t" |
152 pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | 153 "movq (%1, %%eax), %%mm0 \n\t" |
153 movq_m2r(mm_wone, mm5); /* one constant */ | 154 "movq (%2, %%eax), %%mm1 \n\t" |
154 do { | 155 "movq (%1, %%eax), %%mm2 \n\t" |
155 sad_add_x2(p1, p2, p2 + 1); | 156 "movq (%2, %%eax), %%mm3 \n\t" |
156 sad_add_x2(p1 + 8, p2 + 8, p2 + 9); | 157 "punpcklbw %%mm7, %%mm0 \n\t" |
157 p1 += lx; | 158 "punpcklbw %%mm7, %%mm1 \n\t" |
158 p2 += lx; | 159 "punpckhbw %%mm7, %%mm2 \n\t" |
159 } while (--h); | 160 "punpckhbw %%mm7, %%mm3 \n\t" |
160 return sad_end(); | 161 "paddw %%mm0, %%mm1 \n\t" |
161 } | 162 "paddw %%mm2, %%mm3 \n\t" |
162 | 163 "movq (%3, %%eax), %%mm4 \n\t" |
163 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | 164 "movq (%3, %%eax), %%mm2 \n\t" |
164 { | 165 "paddw %%mm5, %%mm1 \n\t" |
165 const UINT8 *p1, *p2; | 166 "paddw %%mm5, %%mm3 \n\t" |
166 | 167 "psrlw $1, %%mm1 \n\t" |
167 p1 = blk1; | 168 "psrlw $1, %%mm3 \n\t" |
168 p2 = blk2; | 169 "packuswb %%mm3, %%mm1 \n\t" |
169 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | 170 "psubusb %%mm1, %%mm4 \n\t" |
170 pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | 171 "psubusb %%mm2, %%mm1 \n\t" |
171 movq_m2r(mm_wone, mm5); /* one constant */ | 172 "por %%mm4, %%mm1 \n\t" |
172 do { | 173 "movq %%mm1, %%mm0 \n\t" |
173 sad_add_x2(p1, p2, p2 + lx); | 174 "punpcklbw %%mm7, %%mm0 \n\t" |
174 sad_add_x2(p1 + 8, p2 + 8, p2 + 8 + lx); | 175 "punpckhbw %%mm7, %%mm1 \n\t" |
175 p1 += lx; | 176 "paddw %%mm1, %%mm0 \n\t" |
176 p2 += lx; | 177 "paddw %%mm0, %%mm6 \n\t" |
177 } while (--h); | 178 "addl %4, %%eax \n\t" |
178 return sad_end(); | 179 " js 1b \n\t" |
179 } | 180 : "+a" (len) |
180 | 181 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) |
181 /* mm7 is accumulator, mm6 is zero */ | 182 ); |
182 static inline void sad_add_xy2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) | 183 } |
183 { | 184 |
184 movq_m2r(*(p2 + 0), mm0); | 185 static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) |
185 movq_m2r(*(p3 + 0), mm1); | 186 { |
186 movq_r2r(mm0, mm2); | 187 int len= -(stride<<h); |
187 movq_r2r(mm1, mm3); | 188 asm volatile( |
188 punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ | 189 ".balign 16 \n\t" |
189 punpcklbw_r2r(mm6, mm1); | 190 "1: \n\t" |
190 punpckhbw_r2r(mm6, mm2); /* high */ | 191 "movq (%1, %%eax), %%mm0 \n\t" |
191 punpckhbw_r2r(mm6, mm3); | 192 "movq (%2, %%eax), %%mm1 \n\t" |
192 paddusw_r2r(mm1, mm0); | 193 "movq %%mm0, %%mm4 \n\t" |
193 paddusw_r2r(mm3, mm2); | 194 "movq %%mm1, %%mm2 \n\t" |
194 | 195 "punpcklbw %%mm7, %%mm0 \n\t" |
195 movq_m2r(*(p2 + 1), mm1); | 196 "punpcklbw %%mm7, %%mm1 \n\t" |
196 movq_m2r(*(p3 + 1), mm3); | 197 "punpckhbw %%mm7, %%mm4 \n\t" |
197 movq_r2r(mm1, mm4); | 198 "punpckhbw %%mm7, %%mm2 \n\t" |
198 punpcklbw_r2r(mm6, mm1); /* low */ | 199 "paddw %%mm1, %%mm0 \n\t" |
199 punpckhbw_r2r(mm6, mm4); /* high */ | 200 "paddw %%mm2, %%mm4 \n\t" |
200 paddusw_r2r(mm1, mm0); | 201 "movq 1(%1, %%eax), %%mm2 \n\t" |
201 paddusw_r2r(mm4, mm2); | 202 "movq 1(%2, %%eax), %%mm3 \n\t" |
202 movq_r2r(mm3, mm4); | 203 "movq %%mm2, %%mm1 \n\t" |
203 punpcklbw_r2r(mm6, mm3); /* low */ | 204 "punpcklbw %%mm7, %%mm2 \n\t" |
204 punpckhbw_r2r(mm6, mm4); /* high */ | 205 "punpckhbw %%mm7, %%mm1 \n\t" |
205 paddusw_r2r(mm3, mm0); | 206 "paddw %%mm0, %%mm2 \n\t" |
206 paddusw_r2r(mm4, mm2); | 207 "paddw %%mm4, %%mm1 \n\t" |
207 | 208 "movq %%mm3, %%mm4 \n\t" |
208 movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ | 209 "punpcklbw %%mm7, %%mm3 \n\t" |
209 paddusw_r2r(mm5, mm0); /* + 2 */ | 210 "punpckhbw %%mm7, %%mm4 \n\t" |
210 paddusw_r2r(mm5, mm2); /* + 2 */ | 211 "paddw %%mm3, %%mm2 \n\t" |
211 psrlw_i2r(2, mm0); | 212 "paddw %%mm4, %%mm1 \n\t" |
212 psrlw_i2r(2, mm2); | 213 "movq (%3, %%eax), %%mm3 \n\t" |
213 packuswb_r2r(mm2, mm0); /* average is in mm0 */ | 214 "movq (%3, %%eax), %%mm4 \n\t" |
214 | 215 "paddw %%mm5, %%mm2 \n\t" |
215 movq_r2r(mm1, mm2); | 216 "paddw %%mm5, %%mm1 \n\t" |
216 psubusb_r2r(mm0, mm1); | 217 "psrlw $2, %%mm2 \n\t" |
217 psubusb_r2r(mm2, mm0); | 218 "psrlw $2, %%mm1 \n\t" |
218 por_r2r(mm1, mm0); /* mm0 is absolute value */ | 219 "packuswb %%mm1, %%mm2 \n\t" |
219 | 220 "psubusb %%mm2, %%mm3 \n\t" |
220 movq_r2r(mm0, mm1); | 221 "psubusb %%mm4, %%mm2 \n\t" |
221 punpcklbw_r2r(mm6, mm0); | 222 "por %%mm3, %%mm2 \n\t" |
222 punpckhbw_r2r(mm6, mm1); | 223 "movq %%mm2, %%mm0 \n\t" |
223 paddusw_r2r(mm0, mm7); | 224 "punpcklbw %%mm7, %%mm0 \n\t" |
224 paddusw_r2r(mm1, mm7); | 225 "punpckhbw %%mm7, %%mm2 \n\t" |
225 } | 226 "paddw %%mm2, %%mm0 \n\t" |
226 | 227 "paddw %%mm0, %%mm6 \n\t" |
227 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | 228 "addl %4, %%eax \n\t" |
228 { | 229 " js 1b \n\t" |
229 const UINT8 *p1, *p2, *p3; | 230 : "+a" (len) |
230 | 231 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride) |
231 p1 = blk1; | 232 ); |
232 p2 = blk2; | 233 } |
233 p3 = blk2 + lx; | 234 |
234 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | 235 static inline int sum_mmx() |
235 pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | 236 { |
236 movq_m2r(mm_wtwo, mm5); /* one constant */ | 237 int ret; |
237 do { | 238 asm volatile( |
238 sad_add_xy2(p1, p2, p2 + lx); | 239 "movq %%mm6, %%mm0 \n\t" |
239 sad_add_xy2(p1 + 8, p2 + 8, p2 + 8 + lx); | 240 "psrlq $32, %%mm6 \n\t" |
240 p1 += lx; | 241 "paddw %%mm0, %%mm6 \n\t" |
241 p2 += lx; | 242 "movq %%mm6, %%mm0 \n\t" |
242 } while (--h); | 243 "psrlq $16, %%mm6 \n\t" |
243 return sad_end(); | 244 "paddw %%mm0, %%mm6 \n\t" |
244 } | 245 "movd %%mm6, %0 \n\t" |
| 246 : "=r" (ret) |
| 247 ); |
| 248 return ret&0xFFFF; |
| 249 } |
| 250 |
| 251 static inline int sum_mmx2() |
| 252 { |
| 253 int ret; |
| 254 asm volatile( |
| 255 "movd %%mm6, %0 \n\t" |
| 256 : "=r" (ret) |
| 257 ); |
| 258 return ret; |
| 259 } |
| 260 |
| 261 #define PIX_SAD(suf)\ |
| 262 int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ |
| 263 {\ |
| 264 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
| 265 "pxor %%mm6, %%mm6 \n\t":);\ |
| 266 \ |
| 267 sad8_ ## suf(blk1, blk2, stride, 3);\ |
| 268 \ |
| 269 return sum_ ## suf();\ |
| 270 }\ |
| 271 \ |
| 272 int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ |
| 273 {\ |
| 274 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
| 275 "pxor %%mm6, %%mm6 \n\t"\ |
| 276 "movq %0, %%mm5 \n\t"\ |
| 277 :: "m"(round_tab[1]) \ |
| 278 );\ |
| 279 \ |
| 280 sad8_2_ ## suf(blk1, blk2+1, blk2, stride, 3);\ |
| 281 \ |
| 282 return sum_ ## suf();\ |
| 283 }\ |
| 284 \ |
| 285 int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ |
| 286 {\ |
| 287 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
| 288 "pxor %%mm6, %%mm6 \n\t"\ |
| 289 "movq %0, %%mm5 \n\t"\ |
| 290 :: "m"(round_tab[1]) \ |
| 291 );\ |
| 292 \ |
| 293 sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 3);\ |
| 294 \ |
| 295 return sum_ ## suf();\ |
| 296 }\ |
| 297 \ |
| 298 int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ |
| 299 {\ |
| 300 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
| 301 "pxor %%mm6, %%mm6 \n\t"\ |
| 302 "movq %0, %%mm5 \n\t"\ |
| 303 :: "m"(round_tab[2]) \ |
| 304 );\ |
| 305 \ |
| 306 sad8_4_ ## suf(blk1, blk2, stride, 3);\ |
| 307 \ |
| 308 return sum_ ## suf();\ |
| 309 }\ |
| 310 \ |
| 311 int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ |
| 312 {\ |
| 313 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
| 314 "pxor %%mm6, %%mm6 \n\t":);\ |
| 315 \ |
| 316 sad8_ ## suf(blk1 , blk2 , stride, 4);\ |
| 317 sad8_ ## suf(blk1+8, blk2+8, stride, 4);\ |
| 318 \ |
| 319 return sum_ ## suf();\ |
| 320 }\ |
| 321 int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ |
| 322 {\ |
| 323 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
| 324 "pxor %%mm6, %%mm6 \n\t"\ |
| 325 "movq %0, %%mm5 \n\t"\ |
| 326 :: "m"(round_tab[1]) \ |
| 327 );\ |
| 328 \ |
| 329 sad8_2_ ## suf(blk1 , blk1+1, blk2 , stride, 4);\ |
| 330 sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, 4);\ |
| 331 \ |
| 332 return sum_ ## suf();\ |
| 333 }\ |
| 334 int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ |
| 335 {\ |
| 336 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
| 337 "pxor %%mm6, %%mm6 \n\t"\ |
| 338 "movq %0, %%mm5 \n\t"\ |
| 339 :: "m"(round_tab[1]) \ |
| 340 );\ |
| 341 \ |
| 342 sad8_2_ ## suf(blk1 , blk1+stride, blk2 , stride, 4);\ |
| 343 sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, 4);\ |
| 344 \ |
| 345 return sum_ ## suf();\ |
| 346 }\ |
| 347 int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ |
| 348 {\ |
| 349 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
| 350 "pxor %%mm6, %%mm6 \n\t"\ |
| 351 "movq %0, %%mm5 \n\t"\ |
| 352 :: "m"(round_tab[2]) \ |
| 353 );\ |
| 354 \ |
| 355 sad8_4_ ## suf(blk1 , blk2 , stride, 4);\ |
| 356 sad8_4_ ## suf(blk1+8, blk2+8, stride, 4);\ |
| 357 \ |
| 358 return sum_ ## suf();\ |
| 359 }\ |
| 360 |
| 361 PIX_SAD(mmx) |
| 362 PIX_SAD(mmx2) |
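The _x2, _y2, and _xy2 functions generated by PIX_SAD compare blk2 against a half-pel interpolation of blk1: sad8_2_* averages its first two source arguments (a pixel and its horizontal or vertical neighbour) using round_tab[1] as the rounding term, and sad8_4_* averages four neighbours using round_tab[2] (1 and 2 in every 16-bit lane, respectively); the constant is loaded into %mm5 before the loop. A plain-C sketch of the four-point (xy2) case, again with an illustrative name:

```c
#include <stdlib.h>

typedef unsigned char UINT8;

/* Illustrative reference for sad8_4_*(): SAD against the four-point half-pel
 * average of blk1, using the +2 rounding term from round_tab[2].
 * 8 pixels wide, 1<<h rows, like the MMX version. */
static int sad8_xy2_ref(const UINT8 *blk1, const UINT8 *blk2, int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < (1 << h); y++) {
        for (int x = 0; x < 8; x++) {
            int avg = (blk1[x] + blk1[x + 1] +
                       blk1[x + stride] + blk1[x + stride + 1] + 2) >> 2;
            sum += abs(avg - blk2[x]);
        }
        blk1 += stride;
        blk2 += stride;
    }
    return sum;
}
```

The MMX2 path (sad8_4_mmx2) approximates the same average with nested pavgb instructions; because pavgb rounds each pairwise average up, its result can differ from (a+b+c+d+2)>>2 by one, which is generally tolerable for a motion-estimation cost.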