x86/dsputilenc_yasm.asm @ 12498:c997f09d1e10 (libavcodec)

Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm, which will hopefully solve the Win64/FATE failures caused by these functions.
author rbultje
date Fri, 17 Sep 2010 01:56:06 +0000
parents c5ffa8b81f9c
children 9575307cbb82

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

; %1 = result (pix1 - pix2 as signed 16-bit words), %2 = scratch (clobbered),
; %3 = pix1 address, %4 = pix2 address
; trick: ((a << 8) | a) - ((a << 8) | b) == a - b, so the interleaved words
; subtract directly to the per-pixel difference
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires mmsize bytes of aligned stack space at %6
; (except on SSE + x86-64, where the spare m8 register is used instead)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

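; For reference, DIFF_PIXELS_8 just computes the signed per-pixel difference of
; 8 rows (two 4-pixel-wide halves with mmx, full 8-pixel rows with sse2). A
; plain-C sketch of the 8x8 case, with an illustrative helper name that does
; not exist in the source:
;
;   #include <stdint.h>
;   #include <stddef.h>
;
;   static void diff_pixels_8x8_ref(int16_t diff[64], const uint8_t *pix1,
;                                   const uint8_t *pix2, ptrdiff_t stride)
;   {
;       for (int y = 0; y < 8; y++) {
;           for (int x = 0; x < 8; x++)
;               diff[y * 8 + x] = pix1[x] - pix2[x]; /* signed 16-bit result */
;           pix1 += stride;
;           pix2 += stride;
;       }
;   }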
; three butterfly stages over m0..m7: a 1-D 8-point Hadamard transform applied
; to every lane; the callers transpose and apply it twice for the 2-D transform
%macro HADAMARD8 0
    SUMSUB_BADC m0, m1, m2, m3
    SUMSUB_BADC m4, m5, m6, m7
    SUMSUB_BADC m0, m2, m1, m3
    SUMSUB_BADC m4, m6, m5, m7
    SUMSUB_BADC m0, m4, m1, m5
    SUMSUB_BADC m2, m6, m3, m7
%endmacro

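; SUMSUB_BADC does two add/sub butterflies at once, so the six invocations
; above are three butterfly stages: an unnormalized 8-point Walsh-Hadamard
; transform. A scalar sketch (illustrative name; individual output signs may
; differ from the asm's SUMSUB ordering, which does not matter once absolute
; values are summed):
;
;   static void hadamard8_1d_ref(int16_t v[8])
;   {
;       for (int step = 1; step < 8; step <<= 1)       /* 3 stages */
;           for (int i = 0; i < 8; i += 2 * step)
;               for (int j = i; j < i + step; j++) {
;                   int16_t a = v[j], b = v[j + step];
;                   v[j]        = a + b;
;                   v[j + step] = a - b;
;               }
;   }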
%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

; %1 (the scratch memory address) is unused: this variant keeps everything in
; registers; the argument only exists to match ABS_SUM_8x8_32's signature
%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM_MMX 3
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro HSUM_MMX2 3
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro HSUM_SSE2 3
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endmacro

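; The FIXME above refers to paddusw: the horizontal sum saturates each packed
; 16-bit addition at 0xFFFF instead of wrapping. A rough scalar equivalent
; (the SIMD code reduces the lanes pairwise rather than sequentially, but both
; clamp at 0xFFFF; the name is illustrative):
;
;   static unsigned hsum_saturate_ref(const uint16_t *lane, int n) /* n = 4 (mmx), 8 (sse2) */
;   {
;       unsigned sum = 0;
;       for (int i = 0; i < n; i++) {
;           sum += lane[i];
;           if (sum > 0xFFFF)
;               sum = 0xFFFF;      /* unsigned saturation, like paddusw */
;       }
;       return sum;
;   }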
%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova %2, [%1+mmsize*0]
    mova %3, [%1+mmsize*1]
    mova %4, [%1+mmsize*2]
    mova %5, [%1+mmsize*3]
%endmacro

; %1 = cpu name suffix, %2 = number of xmm registers used (forwarded to cglobal),
; %3 = scratch stack space needed, in units of mmsize
%macro hadamard8_16_wrapper 3
cglobal hadamard8_diff_%1, 4, 4, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif
    call hadamard8x8_diff_%1
%ifndef m8
    ADD             rsp, pad
%endif
    RET

cglobal hadamard8_diff16_%1, 5, 6, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif

    call hadamard8x8_diff_%1
    mov             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add             r5d, eax

    cmp             r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff_%1
    add             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add             r5d, eax

.done:
    mov             eax, r5d
%ifndef m8
    ADD             rsp, pad
%endif
    RET
%endmacro

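; In C terms, the 16x16 entry point generated by this wrapper just sums the
; 8x8 core over a 2x2 grid of 8x8 blocks, with the bottom half done only when
; h == 16 (sketch only; hadamard8x8_diff_ref is the hypothetical scalar core
; outlined after the mmx version below):
;
;   static int hadamard8x8_diff_ref(const uint8_t *s1, const uint8_t *s2,
;                                   ptrdiff_t stride);
;
;   static int hadamard8_diff16_ref(const uint8_t *src1, const uint8_t *src2,
;                                   ptrdiff_t stride, int h)
;   {
;       int sum = hadamard8x8_diff_ref(src1,     src2,     stride)
;               + hadamard8x8_diff_ref(src1 + 8, src2 + 8, stride);
;       if (h == 16) {
;           src1 += 8 * stride;
;           src2 += 8 * stride;
;           sum  += hadamard8x8_diff_ref(src1,     src2,     stride)
;                 + hadamard8x8_diff_ref(src1 + 8, src2 + 8, stride);
;       }
;       return sum;
;   }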
%macro HADAMARD8_DIFF_MMX 1
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; wrapper can simply call it 2x2 times (and that's why we access rsp+gprsize
; everywhere: that is the rsp of the calling function)
hadamard8x8_diff_%1:
    lea             r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova            [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W   0, 1, 2, 3, 7
    STORE4          rsp+gprsize, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W   4, 5, 6, 7, 0
    STORE4          rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova            [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W   0, 1, 2, 3, 7
    STORE4          rsp+gprsize+0x20, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W   4, 5, 6, 7, 0

    LOAD4           rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize+0x60
    mova            [rsp+gprsize+0x60], m0

    LOAD4           rsp+gprsize,      m0, m1, m2, m3
    LOAD4           rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize
    paddusw         m0, [rsp+gprsize+0x60]

    HSUM            m0, m1, eax
    and             rax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 0, 14
%endmacro

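; Putting the pieces together, the 8x8 core above is a Hadamard SATD:
; per-pixel differences, a 1-D transform on the rows, a transpose, the same
; transform on the columns, then the sum of absolute values. A scalar sketch
; using the hypothetical helpers from the earlier comments (exact arithmetic
; here, whereas the asm saturates its sums; the final mask mirrors the
; 'and rax, 0xFFFF' above):
;
;   static int hadamard8x8_diff_ref(const uint8_t *src1, const uint8_t *src2,
;                                   ptrdiff_t stride)
;   {
;       int16_t  blk[64], col[8];
;       unsigned sum = 0;
;
;       diff_pixels_8x8_ref(blk, src1, src2, stride);
;
;       for (int y = 0; y < 8; y++)            /* transform rows */
;           hadamard8_1d_ref(blk + y * 8);
;
;       for (int x = 0; x < 8; x++) {          /* transform columns, sum |.| */
;           for (int y = 0; y < 8; y++)
;               col[y] = blk[y * 8 + x];
;           hadamard8_1d_ref(col);
;           for (int y = 0; y < 8; y++)
;               sum += col[y] < 0 ? -col[y] : col[y];
;       }
;       return sum & 0xFFFF;
;   }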
%macro HADAMARD8_DIFF_SSE2 2
; with 16-byte xmm registers the whole 8x8 block of differences fits in m0..m7,
; so unlike the mmx version a single pass (with a full 8x8 transpose) suffices
hadamard8x8_diff_%1:
    lea             r0, [r3*3]
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%ifdef ARCH_X86_64
    TRANSPOSE8x8W   0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W   0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8     rsp+gprsize
    HSUM_SSE2       m0, m1, eax
    and             eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, %2, 3
%endmacro

INIT_MMX
%define ABS1 ABS1_MMX
%define HSUM HSUM_MMX
HADAMARD8_DIFF_MMX mmx

%define ABS1 ABS1_MMX2
%define HSUM HSUM_MMX2
HADAMARD8_DIFF_MMX mmx2

INIT_XMM
%define ABS2 ABS2_MMX2
%ifdef ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF_SSE2 sse2, 10

%define ABS2 ABS2_SSSE3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF_SSE2 ssse3, 9

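; From the C side these entry points are plain functions with the signature
; given in the comment above hadamard8x8_diff; cglobal prepends the ff_
; program prefix, so the mmx pair, for example, would be declared roughly as
; (prototype reconstructed from that comment, not copied from the C file):
;
;   int ff_hadamard8_diff_mmx  (void *s, uint8_t *src1, uint8_t *src2,
;                               int stride, int h);
;   int ff_hadamard8_diff16_mmx(void *s, uint8_t *src1, uint8_t *src2,
;                               int stride, int h);
;
; with the mmx2, sse2 and ssse3 flavours following the same pattern.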
INIT_XMM
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
cglobal sse16_sse2, 5, 5, 8
    shr             r4, 1
    pxor            m0, m0          ; mm0 = 0