comparison i386/motion_est_mmx.c @ 72:3049d6d452a3 libavcodec

suppressed nasm dependancy - rewrote forward DCT and motion estimation code
author glantau
date Wed, 15 Aug 2001 22:25:32 +0000
parents
children 944632089814
comparison
equal deleted inserted replaced
71:79be2c581c01 72:3049d6d452a3
1 /*
2 * MMX optimized motion estimation
3 * Copyright (c) 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 */
20 #include "../dsputil.h"
21 #include "mmx.h"
22
23 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001;
24 static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002;
25
26 /* mm7 is accumulator, mm6 is zero */
27 static inline void sad_add(const UINT8 *p1, const UINT8 *p2)
28 {
29 movq_m2r(*p1, mm0);
30 movq_m2r(*p2, mm1);
31 movq_r2r(mm0, mm2);
32 psubusb_r2r(mm1, mm0);
33 psubusb_r2r(mm2, mm1);
34 por_r2r(mm1, mm0); /* mm0 is absolute value */
35
36 movq_r2r(mm0, mm1);
37 punpcklbw_r2r(mm6, mm0);
38 punpckhbw_r2r(mm6, mm1);
39 paddusw_r2r(mm0, mm7);
40 paddusw_r2r(mm1, mm7);
41 }
42
43 /* convert mm7 to value */
44 static inline int sad_end(void)
45 {
46 int res;
47
48 movq_r2r(mm7, mm0);
49 psrlq_i2r(32, mm7);
50 paddusw_r2r(mm0, mm7);
51
52 movq_r2r(mm7, mm0);
53 psrlq_i2r(16, mm7);
54 paddusw_r2r(mm0, mm7);
55 __asm __volatile ("movd %%mm7, %0" : "=a" (res));
56 return res & 0xffff;
57 }
58
59 int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
60 {
61 const UINT8 *p1, *p2;
62
63 h >>= 1;
64 p1 = blk1;
65 p2 = blk2;
66 pxor_r2r(mm7, mm7); /* mm7 is accumulator */
67 pxor_r2r(mm6, mm6); /* mm7 is zero constant */
68 do {
69 sad_add(p1, p2);
70 sad_add(p1 + 8, p2 + 8);
71 p1 += lx;
72 p2 += lx;
73 sad_add(p1, p2);
74 sad_add(p1 + 8, p2 + 8);
75 p1 += lx;
76 p2 += lx;
77 } while (--h);
78 return sad_end();
79 }
80
81 /* please test it ! */
82 static inline void sad_add_sse(const UINT8 *p1, const UINT8 *p2)
83 {
84 movq_m2r(*(p1 + 0), mm0);
85 movq_m2r(*(p1 + 8), mm1);
86 psadbw_m2r(*(p2 + 0), mm0);
87 psadbw_m2r(*(p2 + 8), mm1);
88 paddusw_r2r(mm0, mm7);
89 paddusw_r2r(mm1, mm7);
90 }
91
92 int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h)
93 {
94 const UINT8 *p1, *p2;
95
96 h >>= 1;
97 p1 = blk1;
98 p2 = blk2;
99 pxor_r2r(mm7, mm7); /* mm7 is accumulator */
100 do {
101 sad_add_sse(p1, p2);
102 p1 += lx;
103 p2 += lx;
104 sad_add_sse(p1, p2);
105 p1 += lx;
106 p2 += lx;
107 } while (--h);
108 return sad_end();
109 }
110
111 #define DUMP(reg) { mmx_t tmp; movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); }
112
113 /* mm7 is accumulator, mm6 is zero */
114 static inline void sad_add_x2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3)
115 {
116 movq_m2r(*(p2 + 0), mm0);
117 movq_m2r(*(p3 + 0), mm1);
118 movq_r2r(mm0, mm2);
119 movq_r2r(mm1, mm3);
120 punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */
121 punpcklbw_r2r(mm6, mm1);
122 punpckhbw_r2r(mm6, mm2); /* high */
123 punpckhbw_r2r(mm6, mm3);
124 paddusw_r2r(mm1, mm0);
125 paddusw_r2r(mm3, mm2);
126 movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */
127 paddusw_r2r(mm5, mm0); /* + 1 */
128 paddusw_r2r(mm5, mm2); /* + 1 */
129 psrlw_i2r(1, mm0);
130 psrlw_i2r(1, mm2);
131 packuswb_r2r(mm2, mm0); /* average is in mm0 */
132
133 movq_r2r(mm1, mm2);
134 psubusb_r2r(mm0, mm1);
135 psubusb_r2r(mm2, mm0);
136 por_r2r(mm1, mm0); /* mm0 is absolute value */
137
138 movq_r2r(mm0, mm1);
139 punpcklbw_r2r(mm6, mm0);
140 punpckhbw_r2r(mm6, mm1);
141 paddusw_r2r(mm0, mm7);
142 paddusw_r2r(mm1, mm7);
143 }
144
145 int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
146 {
147 const UINT8 *p1, *p2;
148
149 p1 = blk1;
150 p2 = blk2;
151 pxor_r2r(mm7, mm7); /* mm7 is accumulator */
152 pxor_r2r(mm6, mm6); /* mm7 is zero constant */
153 movq_m2r(mm_wone, mm5); /* one constant */
154 do {
155 sad_add_x2(p1, p2, p2 + 1);
156 sad_add_x2(p1 + 8, p2 + 8, p2 + 9);
157 p1 += lx;
158 p2 += lx;
159 } while (--h);
160 return sad_end();
161 }
162
163 int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
164 {
165 const UINT8 *p1, *p2;
166
167 p1 = blk1;
168 p2 = blk2;
169 pxor_r2r(mm7, mm7); /* mm7 is accumulator */
170 pxor_r2r(mm6, mm6); /* mm7 is zero constant */
171 movq_m2r(mm_wone, mm5); /* one constant */
172 do {
173 sad_add_x2(p1, p2, p2 + lx);
174 sad_add_x2(p1 + 8, p2 + 8, p2 + 8 + lx);
175 p1 += lx;
176 p2 += lx;
177 } while (--h);
178 return sad_end();
179 }
180
181 /* mm7 is accumulator, mm6 is zero */
182 static inline void sad_add_xy2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3)
183 {
184 movq_m2r(*(p2 + 0), mm0);
185 movq_m2r(*(p3 + 0), mm1);
186 movq_r2r(mm0, mm2);
187 movq_r2r(mm1, mm3);
188 punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */
189 punpcklbw_r2r(mm6, mm1);
190 punpckhbw_r2r(mm6, mm2); /* high */
191 punpckhbw_r2r(mm6, mm3);
192 paddusw_r2r(mm1, mm0);
193 paddusw_r2r(mm3, mm2);
194
195 movq_m2r(*(p2 + 1), mm1);
196 movq_m2r(*(p3 + 1), mm3);
197 movq_r2r(mm1, mm4);
198 punpcklbw_r2r(mm6, mm1); /* low */
199 punpckhbw_r2r(mm6, mm4); /* high */
200 paddusw_r2r(mm1, mm0);
201 paddusw_r2r(mm4, mm2);
202 movq_r2r(mm3, mm4);
203 punpcklbw_r2r(mm6, mm3); /* low */
204 punpckhbw_r2r(mm6, mm4); /* high */
205 paddusw_r2r(mm3, mm0);
206 paddusw_r2r(mm4, mm2);
207
208 movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */
209 paddusw_r2r(mm5, mm0); /* + 2 */
210 paddusw_r2r(mm5, mm2); /* + 2 */
211 psrlw_i2r(2, mm0);
212 psrlw_i2r(2, mm2);
213 packuswb_r2r(mm2, mm0); /* average is in mm0 */
214
215 movq_r2r(mm1, mm2);
216 psubusb_r2r(mm0, mm1);
217 psubusb_r2r(mm2, mm0);
218 por_r2r(mm1, mm0); /* mm0 is absolute value */
219
220 movq_r2r(mm0, mm1);
221 punpcklbw_r2r(mm6, mm0);
222 punpckhbw_r2r(mm6, mm1);
223 paddusw_r2r(mm0, mm7);
224 paddusw_r2r(mm1, mm7);
225 }
226
227 int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h)
228 {
229 const UINT8 *p1, *p2, *p3;
230
231 p1 = blk1;
232 p2 = blk2;
233 p3 = blk2 + lx;
234 pxor_r2r(mm7, mm7); /* mm7 is accumulator */
235 pxor_r2r(mm6, mm6); /* mm7 is zero constant */
236 movq_m2r(mm_wtwo, mm5); /* one constant */
237 do {
238 sad_add_xy2(p1, p2, p2 + lx);
239 sad_add_xy2(p1 + 8, p2 + 8, p2 + 8 + lx);
240 p1 += lx;
241 p2 += lx;
242 } while (--h);
243 return sad_end();
244 }