Mercurial > libavcodec.hg
annotate i386/motion_est_mmx.c @ 7229:f03d29b6fefe libavcodec
New full search ME
author | michael |
---|---|
date | Wed, 09 Jul 2008 18:59:52 +0000 |
parents | f7cbb7733146 |
children | eebc7209c47f |
rev | line source |
---|---|
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
1 /* |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
2 * MMX optimized motion estimation |
429 | 3 * Copyright (c) 2001 Fabrice Bellard. |
1739
07a484280a82
copyright year update of the files i touched and remembered, things look annoyingly unmaintained otherwise
michael
parents:
1708
diff
changeset
|
4 * Copyright (c) 2002-2004 Michael Niedermayer |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
5 * |
5214 | 6 * mostly by Michael Niedermayer <michaelni@gmx.at> |
7 * | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
8 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
9 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
10 * FFmpeg is free software; you can redistribute it and/or |
429 | 11 * modify it under the terms of the GNU Lesser General Public |
12 * License as published by the Free Software Foundation; either | |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
13 * version 2.1 of the License, or (at your option) any later version. |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
14 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
15 * FFmpeg is distributed in the hope that it will be useful, |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
429 | 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 * Lesser General Public License for more details. | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
19 * |
429 | 20 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3576
diff
changeset
|
21 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
23 */ |
6763 | 24 |
25 #include "libavutil/x86_cpu.h" | |
26 #include "libavcodec/dsputil.h" | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
27 |
6190
2aa536e36c89
Add and use DECLARE_ASM_CONST for constants used in assembler code.
reimar
parents:
5214
diff
changeset
|
/*
 * Rounding constants for the MMX averaging code. Entry n holds the value n
 * replicated in each of the four 16-bit words of the quadword.
 * Entry 1 is loaded into mm5 by the x2/y2 wrappers generated by PIX_SAD;
 * entry 2 is read by sad8_4_mmx through a byte offset of 16 from the symbol.
 */
28 DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={ |
1569
1f8d1e1173d8
Fixes GCC 3.3.2 warnings patch by (Panagiotis Issaris <takis at lumumba dot luc dot ac dot be>)
michael
parents:
1455
diff
changeset
|
29 0x0000000000000000ULL, |
1f8d1e1173d8
Fixes GCC 3.3.2 warnings patch by (Panagiotis Issaris <takis at lumumba dot luc dot ac dot be>)
michael
parents:
1455
diff
changeset
|
30 0x0001000100010001ULL, |
1f8d1e1173d8
Fixes GCC 3.3.2 warnings patch by (Panagiotis Issaris <takis at lumumba dot luc dot ac dot be>)
michael
parents:
1455
diff
changeset
|
31 0x0002000200020002ULL, |
294 | 32 }; |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
33 |
6190
2aa536e36c89
Add and use DECLARE_ASM_CONST for constants used in assembler code.
reimar
parents:
5214
diff
changeset
|
/*
 * The value 1 replicated in each of the eight bytes of a quadword.
 * sad8_4_mmx2 loads it into mm5 and subtracts it (with unsigned saturation)
 * from an averaged row before averaging again.
 */
34 DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; |
330 | 35 |
/*
 * Accumulate the sum of absolute differences of an 8-pixel-wide column of
 * h rows between blk1 and blk2 using plain MMX.
 *
 * Nothing is returned: the partial sums are added to the four 16-bit words
 * of mm6. The code reads mm7 as the zero operand for byte-to-word unpacking,
 * so the caller has to clear mm6 and mm7 first (the PIX_SAD wrappers do).
 *
 * The loop body handles two rows per pass (the offset register is advanced
 * by stride twice) and repeats while the offset is still negative.
 */
1708 | 36 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
37 { |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
/* Negative byte offset counting up towards zero; the pointers handed to the
 * asm below are biased by -len so that base+offset starts at the first row. */
38 x86_reg len= -(stride*h); |
294 | 39 asm volatile( |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
40 ASMALIGN(4) |
2979 | 41 "1: \n\t" |
42 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
43 "movq (%2, %%"REG_a"), %%mm2 \n\t" | |
44 "movq (%2, %%"REG_a"), %%mm4 \n\t" | |
45 "add %3, %%"REG_a" \n\t" | |
/* Absolute difference of unsigned bytes: OR of the two saturating
 * subtractions (one of them is always zero per byte). */
46 "psubusb %%mm0, %%mm2 \n\t" | |
47 "psubusb %%mm4, %%mm0 \n\t" | |
48 "movq (%1, %%"REG_a"), %%mm1 \n\t" | |
49 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
50 "movq (%2, %%"REG_a"), %%mm5 \n\t" | |
51 "psubusb %%mm1, %%mm3 \n\t" | |
52 "psubusb %%mm5, %%mm1 \n\t" | |
53 "por %%mm2, %%mm0 \n\t" | |
54 "por %%mm1, %%mm3 \n\t" | |
55 "movq %%mm0, %%mm1 \n\t" | |
56 "movq %%mm3, %%mm2 \n\t" | |
/* Widen the byte differences to words against mm7 and add them up. */
57 "punpcklbw %%mm7, %%mm0 \n\t" | |
58 "punpckhbw %%mm7, %%mm1 \n\t" | |
59 "punpcklbw %%mm7, %%mm3 \n\t" | |
60 "punpckhbw %%mm7, %%mm2 \n\t" | |
61 "paddw %%mm1, %%mm0 \n\t" | |
62 "paddw %%mm3, %%mm2 \n\t" | |
63 "paddw %%mm2, %%mm0 \n\t" | |
64 "paddw %%mm0, %%mm6 \n\t" | |
65 "add %3, %%"REG_a" \n\t" | |
66 " js 1b \n\t" | |
294 | 67 : "+a" (len) |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
68 : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) |
294 | 69 ); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
70 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
71 |
/*
 * MMX2 variant of the 8-pixel-wide SAD accumulation: uses psadbw to compute
 * the sum of absolute differences of one 8-byte row in a single instruction.
 *
 * Two rows (offsets 0 and stride) are handled per pass, both pointers are
 * advanced by 2*stride and h is decremented by 2; the loop repeats while h
 * is still greater than zero. The row sums are added to mm6, which the
 * caller must have cleared beforehand. Nothing is returned.
 */
1708 | 72 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
73 { |
294 | 74 asm volatile( |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
75 ASMALIGN(4) |
2979 | 76 "1: \n\t" |
4980 | 77 "movq (%1), %%mm0 \n\t" |
78 "movq (%1, %3), %%mm1 \n\t" | |
79 "psadbw (%2), %%mm0 \n\t" | |
80 "psadbw (%2, %3), %%mm1 \n\t" | |
2979 | 81 "paddw %%mm0, %%mm6 \n\t" |
4980 | 82 "paddw %%mm1, %%mm6 \n\t" |
83 "lea (%1,%3,2), %1 \n\t" | |
84 "lea (%2,%3,2), %2 \n\t" | |
85 "sub $2, %0 \n\t" | |
86 " jg 1b \n\t" | |
87 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
88 : "r" ((x86_reg)stride) |
294 | 89 ); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
90 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
91 |
/*
 * SSE2 sum of absolute differences over a 16-pixel-wide block of h rows.
 *
 * v is not used. Note the parameter order: blk2 comes before blk1.
 * Rows of blk1 are fetched with movdqu (unaligned load); rows of blk2 are
 * used directly as the memory operand of psadbw.
 * NOTE(review): a memory operand of an SSE2 psadbw presumably has to be
 * 16-byte aligned, so blk2 and stride would need that alignment - confirm
 * against the callers.
 *
 * Two rows are processed per pass and h is decremented by 2 until it is no
 * longer positive. Returns the total taken from xmm6.
 */
4981 | 92 static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) |
93 { | |
94 int ret; | |
95 asm volatile( | |
96 "pxor %%xmm6, %%xmm6 \n\t" | |
97 ASMALIGN(4) | |
98 "1: \n\t" | |
99 "movdqu (%1), %%xmm0 \n\t" | |
100 "movdqu (%1, %3), %%xmm1 \n\t" | |
101 "psadbw (%2), %%xmm0 \n\t" | |
102 "psadbw (%2, %3), %%xmm1 \n\t" | |
103 "paddw %%xmm0, %%xmm6 \n\t" | |
104 "paddw %%xmm1, %%xmm6 \n\t" | |
105 "lea (%1,%3,2), %1 \n\t" | |
106 "lea (%2,%3,2), %2 \n\t" | |
107 "sub $2, %0 \n\t" | |
108 " jg 1b \n\t" | |
109 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
110 : "r" ((x86_reg)stride) |
4981 | 111 ); |
/* Second asm statement: relies on xmm6 still holding the accumulator left by
 * the statement above. The upper quadword of xmm6 is copied down into xmm0
 * and added to the lower one before the low 32 bits are moved to ret. */
112 asm volatile( | |
113 "movhlps %%xmm6, %%xmm0 \n\t" | |
114 "paddw %%xmm0, %%xmm6 \n\t" | |
115 "movd %%xmm6, %0 \n\t" | |
116 : "=r"(ret) | |
117 ); | |
118 return ret; | |
119 } | |
120 | |
/*
 * MMX2 SAD accumulation for an 8-pixel-wide column where each blk1 row is
 * first averaged (pavgb) with the same row shifted by one byte, i.e. the
 * bytes at offset 0 and offset 1, and the result is compared with blk2.
 *
 * Two rows per pass; h is decremented by 2 until it is no longer positive.
 * The row sums are added to mm6 (to be cleared by the caller). Nothing is
 * returned. mm5 is not read here.
 */
4980 | 121 static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
122 { |
294 | 123 asm volatile( |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
124 ASMALIGN(4) |
2979 | 125 "1: \n\t" |
4980 | 126 "movq (%1), %%mm0 \n\t" |
127 "movq (%1, %3), %%mm1 \n\t" | |
128 "pavgb 1(%1), %%mm0 \n\t" | |
129 "pavgb 1(%1, %3), %%mm1 \n\t" | |
130 "psadbw (%2), %%mm0 \n\t" | |
131 "psadbw (%2, %3), %%mm1 \n\t" | |
2979 | 132 "paddw %%mm0, %%mm6 \n\t" |
4980 | 133 "paddw %%mm1, %%mm6 \n\t" |
134 "lea (%1,%3,2), %1 \n\t" | |
135 "lea (%2,%3,2), %2 \n\t" | |
136 "sub $2, %0 \n\t" | |
137 " jg 1b \n\t" | |
138 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
139 : "r" ((x86_reg)stride) |
4980 | 140 ); |
141 } | |
142 | |
/*
 * MMX2 SAD accumulation for an 8-pixel-wide column where each blk1 row is
 * first averaged (pavgb) with the row below it, and the result is compared
 * with blk2.
 *
 * The first blk1 row is loaded into mm0 before the loop and blk1 is advanced
 * by one stride. Each pass then loads the next two rows, forms the two
 * vertical averages, accumulates their SADs into mm6 and keeps the last
 * loaded row in mm0 for the following pass. h is decremented by 2 per pass
 * until it is no longer positive. Nothing is returned; mm6 must have been
 * cleared by the caller.
 */
143 static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) | |
144 { | |
145 asm volatile( | |
146 "movq (%1), %%mm0 \n\t" | |
147 "add %3, %1 \n\t" | |
148 ASMALIGN(4) | |
149 "1: \n\t" | |
150 "movq (%1), %%mm1 \n\t" | |
151 "movq (%1, %3), %%mm2 \n\t" | |
152 "pavgb %%mm1, %%mm0 \n\t" | |
153 "pavgb %%mm2, %%mm1 \n\t" | |
154 "psadbw (%2), %%mm0 \n\t" | |
155 "psadbw (%2, %3), %%mm1 \n\t" | |
156 "paddw %%mm0, %%mm6 \n\t" | |
157 "paddw %%mm1, %%mm6 \n\t" | |
158 "movq %%mm2, %%mm0 \n\t" | |
159 "lea (%1,%3,2), %1 \n\t" | |
160 "lea (%2,%3,2), %2 \n\t" | |
161 "sub $2, %0 \n\t" | |
162 " jg 1b \n\t" | |
163 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
164 : "r" ((x86_reg)stride) |
294 | 165 ); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
166 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
167 |
/*
 * MMX2 SAD accumulation for an 8-pixel-wide column against a blk1 that is
 * averaged both horizontally (bytes at offset 0 and 1) and vertically
 * (consecutive rows), built from chained pavgb instructions.
 *
 * mm5 is loaded with bone (1 in every byte). Before the loop, the horizontal
 * average of the first row is placed in mm0 and blk1 is advanced by one
 * stride. Each pass computes the horizontal averages of the next two rows,
 * subtracts 1 (saturating) from the first of them, forms the two vertical
 * averages, accumulates their SADs against blk2 into mm6 and carries the
 * last horizontal average over in mm0.
 * NOTE(review): the subtraction of 1 presumably offsets the upward rounding
 * of the chained pavgb steps; the result is an approximation of the exact
 * four-pixel average - the init code only installs the mmx2 xy2 functions
 * when CODEC_FLAG_BITEXACT is not set.
 *
 * h is decremented by 2 per pass until it is no longer positive. Nothing is
 * returned; mm6 must have been cleared by the caller.
 */
1064 | 168 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
4980 | 169 { |
294 | 170 asm volatile( |
4974 | 171 "movq "MANGLE(bone)", %%mm5 \n\t" |
4980 | 172 "movq (%1), %%mm0 \n\t" |
173 "pavgb 1(%1), %%mm0 \n\t" | |
174 "add %3, %1 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
175 ASMALIGN(4) |
2979 | 176 "1: \n\t" |
4980 | 177 "movq (%1), %%mm1 \n\t" |
178 "movq (%1,%3), %%mm2 \n\t" | |
179 "pavgb 1(%1), %%mm1 \n\t" | |
180 "pavgb 1(%1,%3), %%mm2 \n\t" | |
4974 | 181 "psubusb %%mm5, %%mm1 \n\t" |
182 "pavgb %%mm1, %%mm0 \n\t" | |
4980 | 183 "pavgb %%mm2, %%mm1 \n\t" |
184 "psadbw (%2), %%mm0 \n\t" | |
185 "psadbw (%2,%3), %%mm1 \n\t" | |
2979 | 186 "paddw %%mm0, %%mm6 \n\t" |
4980 | 187 "paddw %%mm1, %%mm6 \n\t" |
188 "movq %%mm2, %%mm0 \n\t" | |
189 "lea (%1,%3,2), %1 \n\t" | |
190 "lea (%2,%3,2), %2 \n\t" | |
191 "sub $2, %0 \n\t" | |
192 " jg 1b \n\t" | |
193 : "+r" (h), "+r" (blk1), "+r" (blk2) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
194 : "r" ((x86_reg)stride) |
294 | 195 ); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
196 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
197 |
/*
 * Plain MMX SAD accumulation for an 8-pixel-wide column of h rows, where the
 * reference is the per-pixel average of two sources blk1a and blk1b:
 * (a + b + mm5) >> 1, compared with blk2.
 *
 * Register contract with the caller: mm5 supplies the rounding term added
 * before the shift (the PIX_SAD wrappers load round_tab[1] into it), mm7 is
 * used as the zero operand for unpacking, and the partial sums are added to
 * the four 16-bit words of mm6. Nothing is returned.
 *
 * One row is handled per pass; the loop repeats while the offset register is
 * still negative.
 */
1064 | 198 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
199 { |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
/* Negative byte offset counting up towards zero; all three pointers passed
 * to the asm are biased by -len. */
200 x86_reg len= -(stride*h); |
294 | 201 asm volatile( |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
202 ASMALIGN(4) |
2979 | 203 "1: \n\t" |
204 "movq (%1, %%"REG_a"), %%mm0 \n\t" | |
205 "movq (%2, %%"REG_a"), %%mm1 \n\t" | |
206 "movq (%1, %%"REG_a"), %%mm2 \n\t" | |
207 "movq (%2, %%"REG_a"), %%mm3 \n\t" | |
208 "punpcklbw %%mm7, %%mm0 \n\t" | |
209 "punpcklbw %%mm7, %%mm1 \n\t" | |
210 "punpckhbw %%mm7, %%mm2 \n\t" | |
211 "punpckhbw %%mm7, %%mm3 \n\t" | |
212 "paddw %%mm0, %%mm1 \n\t" | |
213 "paddw %%mm2, %%mm3 \n\t" | |
214 "movq (%3, %%"REG_a"), %%mm4 \n\t" | |
215 "movq (%3, %%"REG_a"), %%mm2 \n\t" | |
/* Add the rounding term from mm5, halve, and pack back to bytes. */
216 "paddw %%mm5, %%mm1 \n\t" | |
217 "paddw %%mm5, %%mm3 \n\t" | |
218 "psrlw $1, %%mm1 \n\t" | |
219 "psrlw $1, %%mm3 \n\t" | |
220 "packuswb %%mm3, %%mm1 \n\t" | |
/* Absolute difference against blk2 via two saturating subtractions. */
221 "psubusb %%mm1, %%mm4 \n\t" | |
222 "psubusb %%mm2, %%mm1 \n\t" | |
223 "por %%mm4, %%mm1 \n\t" | |
224 "movq %%mm1, %%mm0 \n\t" | |
225 "punpcklbw %%mm7, %%mm0 \n\t" | |
226 "punpckhbw %%mm7, %%mm1 \n\t" | |
227 "paddw %%mm1, %%mm0 \n\t" | |
228 "paddw %%mm0, %%mm6 \n\t" | |
229 "add %4, %%"REG_a" \n\t" | |
230 " js 1b \n\t" | |
294 | 231 : "+a" (len) |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
232 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) |
294 | 233 ); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
234 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
235 |
/*
 * Plain MMX SAD accumulation for an 8-pixel-wide column of h rows, where the
 * reference is the average of four neighbouring blk1 pixels (offsets 0 and 1
 * in the current row and in the row one stride below):
 * (a + b + c + d + 2) >> 2, compared with blk2.
 *
 * Before the loop, the horizontal pair sums of the first row are computed as
 * 16-bit words in mm0 (low four pixels) and mm1 (high four pixels). Each
 * pass computes the pair sums of the next row into mm2/mm3, combines them
 * with mm0/mm1 and the rounding constant, accumulates the absolute
 * differences into mm6, and then carries mm2/mm3 over as the new mm0/mm1.
 *
 * mm7 is used as the zero operand for unpacking and mm6 as the accumulator;
 * both have to be cleared by the caller. mm5 is overwritten here. One row is
 * handled per pass; the loop repeats while the offset register is negative.
 * Nothing is returned.
 */
1064 | 236 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
237 { |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
238 x86_reg len= -(stride*h); |
294 | 239 asm volatile( |
4980 | 240 "movq (%1, %%"REG_a"), %%mm0 \n\t" |
241 "movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |
242 "movq %%mm0, %%mm1 \n\t" | |
243 "movq %%mm2, %%mm3 \n\t" | |
244 "punpcklbw %%mm7, %%mm0 \n\t" | |
245 "punpckhbw %%mm7, %%mm1 \n\t" | |
246 "punpcklbw %%mm7, %%mm2 \n\t" | |
247 "punpckhbw %%mm7, %%mm3 \n\t" | |
248 "paddw %%mm2, %%mm0 \n\t" | |
249 "paddw %%mm3, %%mm1 \n\t" | |
3576
f7125bf10892
Support for MacIntel, last part: balign directives
gpoirier
parents:
3398
diff
changeset
|
250 ASMALIGN(4) |
2979 | 251 "1: \n\t" |
4980 | 252 "movq (%2, %%"REG_a"), %%mm2 \n\t" |
253 "movq 1(%2, %%"REG_a"), %%mm4 \n\t" | |
254 "movq %%mm2, %%mm3 \n\t" | |
255 "movq %%mm4, %%mm5 \n\t" | |
2979 | 256 "punpcklbw %%mm7, %%mm2 \n\t" |
4980 | 257 "punpckhbw %%mm7, %%mm3 \n\t" |
258 "punpcklbw %%mm7, %%mm4 \n\t" | |
259 "punpckhbw %%mm7, %%mm5 \n\t" | |
260 "paddw %%mm4, %%mm2 \n\t" | |
261 "paddw %%mm5, %%mm3 \n\t" | |
/* Byte offset 16 into round_tab selects its third 8-byte entry (the value 2
 * in every word), the rounding term for the divide by four below. */
262 "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" | |
263 "paddw %%mm2, %%mm0 \n\t" | |
264 "paddw %%mm3, %%mm1 \n\t" | |
265 "paddw %%mm5, %%mm0 \n\t" | |
2979 | 266 "paddw %%mm5, %%mm1 \n\t" |
4980 | 267 "movq (%3, %%"REG_a"), %%mm4 \n\t" |
268 "movq (%3, %%"REG_a"), %%mm5 \n\t" | |
269 "psrlw $2, %%mm0 \n\t" | |
2979 | 270 "psrlw $2, %%mm1 \n\t" |
4980 | 271 "packuswb %%mm1, %%mm0 \n\t" |
272 "psubusb %%mm0, %%mm4 \n\t" | |
273 "psubusb %%mm5, %%mm0 \n\t" | |
274 "por %%mm4, %%mm0 \n\t" | |
275 "movq %%mm0, %%mm4 \n\t" | |
2979 | 276 "punpcklbw %%mm7, %%mm0 \n\t" |
4980 | 277 "punpckhbw %%mm7, %%mm4 \n\t" |
2979 | 278 "paddw %%mm0, %%mm6 \n\t" |
4980 | 279 "paddw %%mm4, %%mm6 \n\t" |
/* Keep the pair sums of this row for the next pass. */
280 "movq %%mm2, %%mm0 \n\t" | |
281 "movq %%mm3, %%mm1 \n\t" | |
2979 | 282 "add %4, %%"REG_a" \n\t" |
283 " js 1b \n\t" | |
294 | 284 : "+a" (len) |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6190
diff
changeset
|
285 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) |
294 | 286 ); |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
287 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
288 |
/*
 * Reduce the accumulator left in mm6 by the plain MMX helpers to a scalar.
 *
 * The four 16-bit words of mm6 are added together by two shift-and-add
 * steps (by 32 bits, then by 16 bits), so the total ends up in the lowest
 * word. Only that word is meaningful, hence the mask on return.
 * mm0 and mm6 are overwritten.
 */
1057 | 289 static inline int sum_mmx(void) |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
290 { |
294 | 291 int ret; |
292 asm volatile( | |
2979 | 293 "movq %%mm6, %%mm0 \n\t" |
294 "psrlq $32, %%mm6 \n\t" | |
295 "paddw %%mm0, %%mm6 \n\t" | |
296 "movq %%mm6, %%mm0 \n\t" | |
297 "psrlq $16, %%mm6 \n\t" | |
298 "paddw %%mm0, %%mm6 \n\t" | |
299 "movd %%mm6, %0 \n\t" | |
294 | 300 : "=r" (ret) |
301 ); | |
302 return ret&0xFFFF; | |
303 } | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
304 |
/*
 * Fetch the result accumulated in mm6 by the MMX2 helpers.
 *
 * Unlike sum_mmx, no horizontal reduction or masking is done: the low
 * 32 bits of mm6 are returned as they are.
 */
1057 | 305 static inline int sum_mmx2(void) |
294 | 306 { |
307 int ret; | |
308 asm volatile( | |
2979 | 309 "movd %%mm6, %0 \n\t" |
294 | 310 : "=r" (ret) |
311 ); | |
312 return ret; | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
313 } |
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
314 |
/*
 * Plain MMX counterpart of sad8_x2a_mmx2: averages blk1 with blk1 shifted
 * by one byte by delegating to sad8_2_mmx.
 */
4980 | 315 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
316 { | |
317 sad8_2_mmx(blk1, blk1+1, blk2, stride, h); | |
318 } | |
/*
 * Plain MMX counterpart of sad8_y2a_mmx2: averages blk1 with the row one
 * stride below it by delegating to sad8_2_mmx.
 */
319 static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) | |
320 { | |
321 sad8_2_mmx(blk1, blk1+stride, blk2, stride, h); | |
322 } | |
323 | |
900 | 324 |
/*
 * PIX_SAD(suf) generates the eight public SAD entry points for one
 * instruction-set suffix:
 *   sad8_suf,  sad8_x2_suf,  sad8_y2_suf,  sad8_xy2_suf
 *   sad16_suf, sad16_x2_suf, sad16_y2_suf, sad16_xy2_suf
 *
 * Every generated function takes (v, blk2, blk1, stride, h); v is unused and
 * the helpers are called with the two block pointers swapped back to
 * (blk1, blk2). Each function
 *   - clears mm7 (zero operand for unpacking) and mm6 (accumulator),
 *   - for the x2 and y2 variants also loads round_tab[1] into mm5, which is
 *     the rounding term read by sad8_2_mmx (the mmx2 x2a/y2a helpers do not
 *     read mm5),
 *   - calls the matching 8-pixel-wide helper, and
 *   - returns sum_suf().
 *
 * The sad8_* variants assert h==8 and pass the literal 8 to the helper.
 * The sad16_* variants pass h through and call the 8-wide helper twice, the
 * second time with both pointers advanced by 8 bytes, before summing once.
 */
294 | 325 #define PIX_SAD(suf)\ |
1708 | 326 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
327 {\ | |
328 assert(h==8);\ | |
2979 | 329 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
330 "pxor %%mm6, %%mm6 \n\t":);\ | |
1708 | 331 \ |
332 sad8_1_ ## suf(blk1, blk2, stride, 8);\ | |
333 \ | |
334 return sum_ ## suf();\ | |
335 }\ | |
336 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
337 {\ | |
338 assert(h==8);\ | |
2979 | 339 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
340 "pxor %%mm6, %%mm6 \n\t"\ | |
4982 | 341 "movq %0, %%mm5 \n\t"\ |
1708 | 342 :: "m"(round_tab[1]) \ |
343 );\ | |
344 \ | |
4980 | 345 sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ |
1708 | 346 \ |
347 return sum_ ## suf();\ | |
348 }\ | |
349 \ | |
350 static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
351 {\ | |
352 assert(h==8);\ | |
2979 | 353 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
354 "pxor %%mm6, %%mm6 \n\t"\ | |
355 "movq %0, %%mm5 \n\t"\ | |
1708 | 356 :: "m"(round_tab[1]) \ |
357 );\ | |
358 \ | |
4980 | 359 sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ |
1708 | 360 \ |
361 return sum_ ## suf();\ | |
362 }\ | |
363 \ | |
364 static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
365 {\ | |
366 assert(h==8);\ | |
2979 | 367 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
368 "pxor %%mm6, %%mm6 \n\t"\ | |
4980 | 369 ::);\ |
1708 | 370 \ |
371 sad8_4_ ## suf(blk1, blk2, stride, 8);\ | |
372 \ | |
373 return sum_ ## suf();\ | |
374 }\ | |
375 \ | |
376 static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ | |
294 | 377 {\ |
2979 | 378 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
379 "pxor %%mm6, %%mm6 \n\t":);\ | |
294 | 380 \ |
1708 | 381 sad8_1_ ## suf(blk1 , blk2 , stride, h);\ |
382 sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ | |
294 | 383 \ |
384 return sum_ ## suf();\ | |
385 }\ | |
1708 | 386 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
294 | 387 {\ |
2979 | 388 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
389 "pxor %%mm6, %%mm6 \n\t"\ | |
390 "movq %0, %%mm5 \n\t"\ | |
294 | 391 :: "m"(round_tab[1]) \ |
392 );\ | |
393 \ | |
4980 | 394 sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ |
395 sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ | |
294 | 396 \ |
397 return sum_ ## suf();\ | |
398 }\ | |
1708 | 399 static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
294 | 400 {\ |
2979 | 401 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
402 "pxor %%mm6, %%mm6 \n\t"\ | |
403 "movq %0, %%mm5 \n\t"\ | |
294 | 404 :: "m"(round_tab[1]) \ |
405 );\ | |
406 \ | |
4980 | 407 sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ |
408 sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ | |
294 | 409 \ |
410 return sum_ ## suf();\ | |
411 }\ | |
1708 | 412 static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
294 | 413 {\ |
2979 | 414 asm volatile("pxor %%mm7, %%mm7 \n\t"\ |
415 "pxor %%mm6, %%mm6 \n\t"\ | |
4980 | 416 ::);\ |
294 | 417 \ |
1708 | 418 sad8_4_ ## suf(blk1 , blk2 , stride, h);\ |
419 sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ | |
294 | 420 \ |
421 return sum_ ## suf();\ | |
422 }\ | |
72
3049d6d452a3
suppressed nasm dependancy - rewrote forward DCT and motion estimation code
glantau
parents:
diff
changeset
|
423 |
/* Instantiate the entry points once for the plain MMX helpers and once for
 * the MMX2 helpers. */
294 | 424 PIX_SAD(mmx) |
425 PIX_SAD(mmx2) | |
1057 | 426 |
/*
 * Install the SAD functions defined in this file into the DSPContext c,
 * depending on the CPU capability bits in mm_flags.
 *
 * Judging by the function names assigned below, pix_abs[0][*] holds the
 * 16-pixel-wide and pix_abs[1][*] the 8-pixel-wide functions, with the
 * second index selecting the plain, x2, y2 and xy2 variant in that order;
 * sad[0] and sad[1] receive the plain 16-wide and 8-wide functions.
 *
 * The checks are applied in sequence, so a later matching branch overrides
 * what an earlier one installed.
 */
1092 | 427 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) |
1057 | 428 { |
4197 | 429 if (mm_flags & MM_MMX) { |
1708 | 430 c->pix_abs[0][0] = sad16_mmx; |
431 c->pix_abs[0][1] = sad16_x2_mmx; | |
432 c->pix_abs[0][2] = sad16_y2_mmx; | |
433 c->pix_abs[0][3] = sad16_xy2_mmx; | |
434 c->pix_abs[1][0] = sad8_mmx; | |
435 c->pix_abs[1][1] = sad8_x2_mmx; | |
436 c->pix_abs[1][2] = sad8_y2_mmx; | |
437 c->pix_abs[1][3] = sad8_xy2_mmx; | |
1057 | 438 |
2979 | 439 c->sad[0]= sad16_mmx; |
1708 | 440 c->sad[1]= sad8_mmx; |
1057 | 441 } |
4197 | 442 if (mm_flags & MM_MMXEXT) { |
2979 | 443 c->pix_abs[0][0] = sad16_mmx2; |
444 c->pix_abs[1][0] = sad8_mmx2; | |
1057 | 445 |
2979 | 446 c->sad[0]= sad16_mmx2; |
447 c->sad[1]= sad8_mmx2; | |
2967 | 448 |
/* The MMX2 x2/y2/xy2 variants are only installed when bit-exact output is
 * not requested; otherwise the plain MMX versions set above stay in place. */
1092 | 449 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
1708 | 450 c->pix_abs[0][1] = sad16_x2_mmx2; |
451 c->pix_abs[0][2] = sad16_y2_mmx2; | |
452 c->pix_abs[0][3] = sad16_xy2_mmx2; | |
453 c->pix_abs[1][1] = sad8_x2_mmx2; | |
454 c->pix_abs[1][2] = sad8_y2_mmx2; | |
455 c->pix_abs[1][3] = sad8_xy2_mmx2; | |
1092 | 456 } |
1057 | 457 } |
/* The SSE2 16-wide SAD replaces sad[0] only; it is skipped when the 3DNow
 * flag is also set. NOTE(review): the reason for excluding 3DNow CPUs is not
 * stated here - confirm before changing the condition. */
4981 | 458 if ((mm_flags & MM_SSE2) && !(mm_flags & MM_3DNOW)) { |
459 c->sad[0]= sad16_sse2; | |
460 } | |
1057 | 461 } |