Mercurial repository: libavcodec.hg
comparison of x86/dsputilenc_yasm.asm @ 12498:c997f09d1e10 (libavcodec)

Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
which will hopefully solve the Win64/FATE failures caused by these functions.

author:   rbultje
date:     Fri, 17 Sep 2010 01:56:06 +0000
parents:  c5ffa8b81f9c
children: 9575307cbb82
comparing 12497:c5ffa8b81f9c with 12498:c997f09d1e10: the changeset inserts the
hadamard8_diff implementation shown below into x86/dsputilenc_yasm.asm, between
the file's existing %include/SECTION .text header and its pre-existing
sse16_sse2() function.
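For orientation: an 8x8 hadamard_diff function scores how well one block predicts
another by taking the pixel-wise difference, applying an 8-point Hadamard
(butterfly) transform along rows and columns, and summing the absolute values of
the resulting coefficients (a SATD-style metric). The scalar sketch below is only
illustrative of that computation; it is not the project's C reference, and the
function name is invented for the example:

    #include <stdint.h>
    #include <stdlib.h>

    /* illustrative scalar equivalent of an 8x8 Hadamard difference score */
    static int hadamard8_diff_sketch(const uint8_t *src1, const uint8_t *src2,
                                     int stride)
    {
        int16_t d[8][8];
        int sum = 0;

        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                d[y][x] = src1[y * stride + x] - src2[y * stride + x];

        /* unnormalized 8-point Walsh-Hadamard transform, rows then columns */
        for (int pass = 0; pass < 2; pass++) {
            for (int i = 0; i < 8; i++) {
                int16_t v[8];
                for (int j = 0; j < 8; j++)
                    v[j] = pass ? d[j][i] : d[i][j];
                for (int step = 1; step < 8; step <<= 1)
                    for (int j = 0; j < 8; j += 2 * step)
                        for (int k = j; k < j + step; k++) {
                            int16_t a = v[k], b = v[k + step];
                            v[k]        = a + b;
                            v[k + step] = a - b;
                        }
                for (int j = 0; j < 8; j++)
                    if (pass) d[j][i] = v[j]; else d[i][j] = v[j];
            }
        }

        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                sum += abs(d[y][x]);
        return sum;
    }

The assembly below computes the same kind of score in 16-bit SIMD lanes with
saturating accumulation, which is why each routine ends by masking the result to
16 bits (and eax, 0xFFFF).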
%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

%macro DIFF_PIXELS_1 4
    movh          %1, %3
    movh          %2, %4
    punpcklbw     %2, %1
    punpcklbw     %1, %1
    psubw         %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1 m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1 m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add           %1, %5
    add           %2, %5
    DIFF_PIXELS_1 m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1 m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1 m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova        [%6], m0
    DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova          m0, [%6]
%endif
    sub           %1, %5
    sub           %2, %5
%endmacro

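The punpcklbw/psubw sequence in DIFF_PIXELS_1 widens the pixels from both sources
to 16-bit lanes and subtracts them without needing a zero register: after the two
interleaves, each lane holds p1*256+p1 and p1*256+p2 respectively, so the word
subtraction leaves exactly p1-p2. Per byte lane (illustrative C, name invented
for the example):

    #include <stdint.h>

    /* effect of DIFF_PIXELS_1 on a single byte lane */
    static inline int16_t diff_pixels_1_lane(uint8_t p1, uint8_t p2)
    {
        uint16_t a = (uint16_t)((p1 << 8) | p1);  /* punpcklbw %1, %1 */
        uint16_t b = (uint16_t)((p1 << 8) | p2);  /* punpcklbw %2, %1 */
        return (int16_t)(a - b);                  /* psubw %1, %2  ->  p1 - p2 */
    }
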
%macro HADAMARD8 0
    SUMSUB_BADC m0, m1, m2, m3
    SUMSUB_BADC m4, m5, m6, m7
    SUMSUB_BADC m0, m2, m1, m3
    SUMSUB_BADC m4, m6, m5, m7
    SUMSUB_BADC m0, m4, m1, m5
    SUMSUB_BADC m2, m6, m3, m7
%endmacro

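HADAMARD8 applies the vertical 8-point Hadamard transform across m0..m7 as three
rounds of butterflies with pair distances 1, 2 and 4. SUMSUB_BADC (from
x86util.asm) replaces two register pairs by, up to a sign convention, their sums
and differences; the exact signs do not matter because absolute values are taken
afterwards. One butterfly step, sketched in C with a simplified sign convention:

    #include <stdint.h>

    /* one SUMSUB_BADC-style step: each of the two pairs becomes (sum, diff);
     * the real macro may negate the difference, which is irrelevant once
     * absolute values are summed later on */
    static inline void sumsub_badc(int16_t *a, int16_t *b,
                                   int16_t *c, int16_t *d)
    {
        int16_t ta = *a, tc = *c;
        *a = ta + *b;  *b = ta - *b;
        *c = tc + *d;  *d = tc - *d;
    }
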
%macro ABS1_SUM 3
    ABS1     %1, %2
    paddusw  %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2     %1, %2, %3, %4
    paddusw  %5, %1
    paddusw  %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2     m0, m1, m8, m9
    ABS2_SUM m2, m3, m8, m9, m0, m1
    ABS2_SUM m4, m5, m8, m9, m0, m1
    ABS2_SUM m6, m7, m8, m9, m0, m1
    paddusw  m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova   [%1], m7
    ABS1     m0, m7
    ABS1     m1, m7
    ABS1_SUM m2, m7, m0
    ABS1_SUM m3, m7, m1
    ABS1_SUM m4, m7, m0
    ABS1_SUM m5, m7, m1
    ABS1_SUM m6, m7, m0
    mova     m2, [%1]
    ABS1_SUM m2, m7, m1
    paddusw  m0, m1
%endmacro

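Both ABS_SUM_8x8 variants reduce the eight transformed registers to a single
register of per-lane absolute-value sums in m0, accumulating with paddusw so each
16-bit lane saturates at 0xFFFF. The _64 variant uses m8/m9 as scratch (only
available on x86-64), while the 32-bit variant has to spill m7 to the stack slot
passed in %1. Per lane, the result amounts to (illustrative C):

    #include <stdint.h>
    #include <stdlib.h>

    /* per-lane result of ABS_SUM_8x8_*: saturating sum of absolute values of
     * one 16-bit lane across the eight registers m0..m7 */
    static uint16_t abs_sum_lane(const int16_t m[8])
    {
        uint32_t sum = 0;
        for (int i = 0; i < 8; i++) {
            sum += abs(m[i]);
            if (sum > 0xFFFF)   /* paddusw clamps each addition at 0xFFFF */
                sum = 0xFFFF;
        }
        return (uint16_t)sum;
    }
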
; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM_MMX 3
    mova     %2, %1
    psrlq    %1, 32
    paddusw  %1, %2
    mova     %2, %1
    psrlq    %1, 16
    paddusw  %1, %2
    movd     %3, %1
%endmacro

%macro HSUM_MMX2 3
    pshufw   %2, %1, 0xE
    paddusw  %1, %2
    pshufw   %2, %1, 0x1
    paddusw  %1, %2
    movd     %3, %1
%endmacro

%macro HSUM_SSE2 3
    movhlps  %2, %1
    paddusw  %1, %2
    pshuflw  %2, %1, 0xE
    paddusw  %1, %2
    pshuflw  %2, %1, 0x1
    paddusw  %1, %2
    movd     %3, %1
%endmacro

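All three HSUM variants fold a register of 16-bit words down to one scalar, again
adding with paddusw; since unsigned saturating addition is associative, the value
that reaches movd is exactly min(true sum, 0xFFFF), which is the 64k clamp the
FIXME above refers to. In scalar terms (illustrative C; n is 4 for the MMX
variants and 8 for SSE2):

    #include <stdint.h>

    /* saturating horizontal sum of n 16-bit words, as done by HSUM_* */
    static uint16_t hsum_saturating(const uint16_t *w, int n)
    {
        uint32_t sum = 0;
        for (int i = 0; i < n; i++) {
            sum += w[i];
            if (sum > 0xFFFF)   /* paddusw saturates each addition */
                sum = 0xFFFF;
        }
        return (uint16_t)sum;
    }
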
%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova %2, [%1+mmsize*0]
    mova %3, [%1+mmsize*1]
    mova %4, [%1+mmsize*2]
    mova %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 3
cglobal hadamard8_diff_%1, 4, 4, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB  rsp, pad
%endif
    call hadamard8x8_diff_%1
%ifndef m8
    ADD  rsp, pad
%endif
    RET

cglobal hadamard8_diff16_%1, 5, 6, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB  rsp, pad
%endif

    call hadamard8x8_diff_%1
    mov  r5d, eax

    add  r1, 8
    add  r2, 8
    call hadamard8x8_diff_%1
    add  r5d, eax

    cmp  r4d, 16
    jne  .done

    lea  r1, [r1+r3*8-8]
    lea  r2, [r2+r3*8-8]
    call hadamard8x8_diff_%1
    add  r5d, eax

    add  r1, 8
    add  r2, 8
    call hadamard8x8_diff_%1
    add  r5d, eax

.done:
    mov  eax, r5d
%ifndef m8
    ADD  rsp, pad
%endif
    RET
%endmacro

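hadamard8_16_wrapper emits the two public entry points: hadamard8_diff_<cpu>
reserves aligned stack scratch when no spare registers (m8 and up) are available
and calls the 8x8 core once, while hadamard8_diff16_<cpu> scores the two upper
8x8 blocks of a 16-pixel-wide area and, when h is 16, the two lower blocks as
well, accumulating the partial sums in r5d. The control flow of the 16-wide
variant corresponds roughly to the following C, where hadamard8x8() stands in
for the hadamard8x8_diff_<cpu> core and the signature is invented for the
example:

    #include <stdint.h>

    /* illustrative control flow of hadamard8_diff16_<cpu> */
    static int hadamard8_diff16_sketch(const uint8_t *src1, const uint8_t *src2,
                                       int stride, int h,
                                       int (*hadamard8x8)(const uint8_t *,
                                                          const uint8_t *, int))
    {
        int score = hadamard8x8(src1,     src2,     stride)    /* top-left  */
                  + hadamard8x8(src1 + 8, src2 + 8, stride);   /* top-right */
        if (h == 16) {                                         /* bottom row */
            score += hadamard8x8(src1 + 8 * stride,     src2 + 8 * stride,     stride)
                   + hadamard8x8(src1 + 8 * stride + 8, src2 + 8 * stride + 8, stride);
        }
        return score;
    }

The pad computed from stack_offset reserves roughly %3*mmsize bytes of aligned
scratch space; it is only needed when m8 is not defined, i.e. when the core has
to spill temporaries to the stack.
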
%macro HADAMARD8_DIFF_MMX 1
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; variant can simply call this 2x2 times (and that is why we access
; rsp+gprsize everywhere: it is the rsp of the calling function)
hadamard8x8_diff_%1:
    lea  r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8  r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W  0, 1, 2, 3, 7
    STORE4 rsp+gprsize, m0, m1, m2, m3
    mova m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W  4, 5, 6, 7, 0
    STORE4 rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8  r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W  0, 1, 2, 3, 7
    STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
    mova m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W  4, 5, 6, 7, 0

    LOAD4  rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova [rsp+gprsize+0x60], m0

    LOAD4  rsp+gprsize,      m0, m1, m2, m3
    LOAD4  rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw m0, [rsp+gprsize+0x60]

    HSUM m0, m1, eax
    and  rax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 0, 14
%endmacro

%macro HADAMARD8_DIFF_SSE2 2
hadamard8x8_diff_%1:
    lea  r0, [r3*3]
    DIFF_PIXELS_8  r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%ifdef ARCH_X86_64
    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8    rsp+gprsize
    HSUM_SSE2 m0, m1, eax
    and  eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, %2, 3
%endmacro

INIT_MMX
%define ABS1 ABS1_MMX
%define HSUM HSUM_MMX
HADAMARD8_DIFF_MMX mmx

%define ABS1 ABS1_MMX2
%define HSUM HSUM_MMX2
HADAMARD8_DIFF_MMX mmx2

INIT_XMM
%define ABS2 ABS2_MMX2
%ifdef ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF_SSE2 sse2, 10

%define ABS2 ABS2_SSSE3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF_SSE2 ssse3, 9

The inserted block ends here; the file continues with the pre-existing
sse16_sse2() implementation:

INIT_XMM
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
cglobal sse16_sse2, 5, 5, 8
    shr  r4, 1
    pxor m0, m0 ; mm0 = 0