Mercurial > libavcodec.hg
comparison i386/motion_est_mmx.c @ 72:3049d6d452a3 libavcodec
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
author | glantau |
---|---|
date | Wed, 15 Aug 2001 22:25:32 +0000 |
parents | |
children | 944632089814 |
comparison
equal
deleted
inserted
replaced
71:79be2c581c01 | 72:3049d6d452a3 |
---|---|
1 /* | |
2 * MMX optimized motion estimation | |
3 * Copyright (c) 2001 Gerard Lantau. | |
4 * | |
5 * This program is free software; you can redistribute it and/or modify | |
6 * it under the terms of the GNU General Public License as published by | |
7 * the Free Software Foundation; either version 2 of the License, or | |
8 * (at your option) any later version. | |
9 * | |
10 * This program is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 * GNU General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU General Public License | |
16 * along with this program; if not, write to the Free Software | |
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | |
18 * | |
19 */ | |
20 #include "../dsputil.h" | |
21 #include "mmx.h" | |
22 | |
23 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; | |
24 static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; | |
25 | |
26 /* mm7 is accumulator, mm6 is zero */ | |
27 static inline void sad_add(const UINT8 *p1, const UINT8 *p2) | |
28 { | |
29 movq_m2r(*p1, mm0); | |
30 movq_m2r(*p2, mm1); | |
31 movq_r2r(mm0, mm2); | |
32 psubusb_r2r(mm1, mm0); | |
33 psubusb_r2r(mm2, mm1); | |
34 por_r2r(mm1, mm0); /* mm0 is absolute value */ | |
35 | |
36 movq_r2r(mm0, mm1); | |
37 punpcklbw_r2r(mm6, mm0); | |
38 punpckhbw_r2r(mm6, mm1); | |
39 paddusw_r2r(mm0, mm7); | |
40 paddusw_r2r(mm1, mm7); | |
41 } | |
42 | |
43 /* convert mm7 to value */ | |
44 static inline int sad_end(void) | |
45 { | |
46 int res; | |
47 | |
48 movq_r2r(mm7, mm0); | |
49 psrlq_i2r(32, mm7); | |
50 paddusw_r2r(mm0, mm7); | |
51 | |
52 movq_r2r(mm7, mm0); | |
53 psrlq_i2r(16, mm7); | |
54 paddusw_r2r(mm0, mm7); | |
55 __asm __volatile ("movd %%mm7, %0" : "=a" (res)); | |
56 return res & 0xffff; | |
57 } | |
58 | |
59 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |
60 { | |
61 const UINT8 *p1, *p2; | |
62 | |
63 h >>= 1; | |
64 p1 = blk1; | |
65 p2 = blk2; | |
66 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |
67 pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | |
68 do { | |
69 sad_add(p1, p2); | |
70 sad_add(p1 + 8, p2 + 8); | |
71 p1 += lx; | |
72 p2 += lx; | |
73 sad_add(p1, p2); | |
74 sad_add(p1 + 8, p2 + 8); | |
75 p1 += lx; | |
76 p2 += lx; | |
77 } while (--h); | |
78 return sad_end(); | |
79 } | |
80 | |
81 /* please test it ! */ | |
82 static inline void sad_add_sse(const UINT8 *p1, const UINT8 *p2) | |
83 { | |
84 movq_m2r(*(p1 + 0), mm0); | |
85 movq_m2r(*(p1 + 8), mm1); | |
86 psadbw_m2r(*(p2 + 0), mm0); | |
87 psadbw_m2r(*(p2 + 8), mm1); | |
88 paddusw_r2r(mm0, mm7); | |
89 paddusw_r2r(mm1, mm7); | |
90 } | |
91 | |
92 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |
93 { | |
94 const UINT8 *p1, *p2; | |
95 | |
96 h >>= 1; | |
97 p1 = blk1; | |
98 p2 = blk2; | |
99 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |
100 do { | |
101 sad_add_sse(p1, p2); | |
102 p1 += lx; | |
103 p2 += lx; | |
104 sad_add_sse(p1, p2); | |
105 p1 += lx; | |
106 p2 += lx; | |
107 } while (--h); | |
108 return sad_end(); | |
109 } | |
110 | |
111 #define DUMP(reg) { mmx_t tmp; movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); } | |
112 | |
113 /* mm7 is accumulator, mm6 is zero */ | |
114 static inline void sad_add_x2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) | |
115 { | |
116 movq_m2r(*(p2 + 0), mm0); | |
117 movq_m2r(*(p3 + 0), mm1); | |
118 movq_r2r(mm0, mm2); | |
119 movq_r2r(mm1, mm3); | |
120 punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ | |
121 punpcklbw_r2r(mm6, mm1); | |
122 punpckhbw_r2r(mm6, mm2); /* high */ | |
123 punpckhbw_r2r(mm6, mm3); | |
124 paddusw_r2r(mm1, mm0); | |
125 paddusw_r2r(mm3, mm2); | |
126 movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ | |
127 paddusw_r2r(mm5, mm0); /* + 1 */ | |
128 paddusw_r2r(mm5, mm2); /* + 1 */ | |
129 psrlw_i2r(1, mm0); | |
130 psrlw_i2r(1, mm2); | |
131 packuswb_r2r(mm2, mm0); /* average is in mm0 */ | |
132 | |
133 movq_r2r(mm1, mm2); | |
134 psubusb_r2r(mm0, mm1); | |
135 psubusb_r2r(mm2, mm0); | |
136 por_r2r(mm1, mm0); /* mm0 is absolute value */ | |
137 | |
138 movq_r2r(mm0, mm1); | |
139 punpcklbw_r2r(mm6, mm0); | |
140 punpckhbw_r2r(mm6, mm1); | |
141 paddusw_r2r(mm0, mm7); | |
142 paddusw_r2r(mm1, mm7); | |
143 } | |
144 | |
145 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |
146 { | |
147 const UINT8 *p1, *p2; | |
148 | |
149 p1 = blk1; | |
150 p2 = blk2; | |
151 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |
152 pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | |
153 movq_m2r(mm_wone, mm5); /* one constant */ | |
154 do { | |
155 sad_add_x2(p1, p2, p2 + 1); | |
156 sad_add_x2(p1 + 8, p2 + 8, p2 + 9); | |
157 p1 += lx; | |
158 p2 += lx; | |
159 } while (--h); | |
160 return sad_end(); | |
161 } | |
162 | |
163 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |
164 { | |
165 const UINT8 *p1, *p2; | |
166 | |
167 p1 = blk1; | |
168 p2 = blk2; | |
169 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |
170 pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | |
171 movq_m2r(mm_wone, mm5); /* one constant */ | |
172 do { | |
173 sad_add_x2(p1, p2, p2 + lx); | |
174 sad_add_x2(p1 + 8, p2 + 8, p2 + 8 + lx); | |
175 p1 += lx; | |
176 p2 += lx; | |
177 } while (--h); | |
178 return sad_end(); | |
179 } | |
180 | |
181 /* mm7 is accumulator, mm6 is zero */ | |
182 static inline void sad_add_xy2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) | |
183 { | |
184 movq_m2r(*(p2 + 0), mm0); | |
185 movq_m2r(*(p3 + 0), mm1); | |
186 movq_r2r(mm0, mm2); | |
187 movq_r2r(mm1, mm3); | |
188 punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ | |
189 punpcklbw_r2r(mm6, mm1); | |
190 punpckhbw_r2r(mm6, mm2); /* high */ | |
191 punpckhbw_r2r(mm6, mm3); | |
192 paddusw_r2r(mm1, mm0); | |
193 paddusw_r2r(mm3, mm2); | |
194 | |
195 movq_m2r(*(p2 + 1), mm1); | |
196 movq_m2r(*(p3 + 1), mm3); | |
197 movq_r2r(mm1, mm4); | |
198 punpcklbw_r2r(mm6, mm1); /* low */ | |
199 punpckhbw_r2r(mm6, mm4); /* high */ | |
200 paddusw_r2r(mm1, mm0); | |
201 paddusw_r2r(mm4, mm2); | |
202 movq_r2r(mm3, mm4); | |
203 punpcklbw_r2r(mm6, mm3); /* low */ | |
204 punpckhbw_r2r(mm6, mm4); /* high */ | |
205 paddusw_r2r(mm3, mm0); | |
206 paddusw_r2r(mm4, mm2); | |
207 | |
208 movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ | |
209 paddusw_r2r(mm5, mm0); /* + 2 */ | |
210 paddusw_r2r(mm5, mm2); /* + 2 */ | |
211 psrlw_i2r(2, mm0); | |
212 psrlw_i2r(2, mm2); | |
213 packuswb_r2r(mm2, mm0); /* average is in mm0 */ | |
214 | |
215 movq_r2r(mm1, mm2); | |
216 psubusb_r2r(mm0, mm1); | |
217 psubusb_r2r(mm2, mm0); | |
218 por_r2r(mm1, mm0); /* mm0 is absolute value */ | |
219 | |
220 movq_r2r(mm0, mm1); | |
221 punpcklbw_r2r(mm6, mm0); | |
222 punpckhbw_r2r(mm6, mm1); | |
223 paddusw_r2r(mm0, mm7); | |
224 paddusw_r2r(mm1, mm7); | |
225 } | |
226 | |
227 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) | |
228 { | |
229 const UINT8 *p1, *p2, *p3; | |
230 | |
231 p1 = blk1; | |
232 p2 = blk2; | |
233 p3 = blk2 + lx; | |
234 pxor_r2r(mm7, mm7); /* mm7 is accumulator */ | |
235 pxor_r2r(mm6, mm6); /* mm7 is zero constant */ | |
236 movq_m2r(mm_wtwo, mm5); /* one constant */ | |
237 do { | |
238 sad_add_xy2(p1, p2, p2 + lx); | |
239 sad_add_xy2(p1 + 8, p2 + 8, p2 + 8 + lx); | |
240 p1 += lx; | |
241 p2 += lx; | |
242 } while (--h); | |
243 return sad_end(); | |
244 } |