Mercurial > libavcodec.hg
comparison x86/h264_deblock_sse2.asm @ 8430:7768bdfd4f7b libavcodec
Rename libavcodec/i386/ --> libavcodec/x86/.
It contains optimizations that are not specific to i386, and
libavutil already uses this naming scheme.
author | diego |
---|---|
date | Mon, 22 Dec 2008 09:12:42 +0000 |
parents | |
children | e5c9a3a813ea |
comparison
equal
deleted
inserted
replaced
8429:b3ecaba81501 | 8430:7768bdfd4f7b |
---|---|
1 ;***************************************************************************** | |
2 ;* deblock-a.asm: h264 encoder library | |
3 ;***************************************************************************** | |
4 ;* Copyright (C) 2005-2008 x264 project | |
5 ;* | |
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> | |
7 ;* | |
8 ;* This program is free software; you can redistribute it and/or modify | |
9 ;* it under the terms of the GNU General Public License as published by | |
10 ;* the Free Software Foundation; either version 2 of the License, or | |
11 ;* (at your option) any later version. | |
12 ;* | |
13 ;* This program is distributed in the hope that it will be useful, | |
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 ;* GNU General Public License for more details. | |
17 ;* | |
18 ;* You should have received a copy of the GNU General Public License | |
19 ;* along with this program; if not, write to the Free Software | |
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
21 ;***************************************************************************** | |
22 | |
23 %include "x86inc.asm" | |
24 | |
SECTION_RODATA
; byte-broadcast constants used by the pavgb-based rounding tricks below
pb_00: times 16 db 0x00
pb_01: times 16 db 0x01   ; lsb-correction term for exact rounded averages
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1   ; 0xa1 = 128+33, the bias removed in DEBLOCK_P0_Q0

SECTION .text
32 | |
; expands to [base],...,[base+7*stride]
; callers must pass base3 = base+3*stride and stride3 = 3*stride, so the
; last two entries address rows 6 and 7 (base3+stride3 = base+6*stride,
; base3+stride*4 = base+7*stride) without extra address arithmetic
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
37 | |
; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
; clobbers: m4-m7
; Standard byte -> word -> dword interleave transpose of a 4x8 tile.
%macro TRANSPOSE4x8_LOAD 8
    ; rows 0-3
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2            ; interleave rows 0/1
    punpcklbw  m1, m3            ; interleave rows 2/3
    movq       m2, m0
    punpcklwd  m0, m1            ; columns 0-1 of rows 0-3
    punpckhwd  m2, m1            ; columns 2-3 of rows 0-3

    ; rows 4-7, same pattern
    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6            ; interleave rows 4/5
    punpcklbw  m5, m7            ; interleave rows 6/7
    movq       m6, m4
    punpcklwd  m4, m5            ; columns 0-1 of rows 4-7
    punpckhwd  m6, m5            ; columns 2-3 of rows 4-7

    ; merge the two halves: one output row per source column
    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4            ; column 0 of all 8 rows
    punpckhdq  m1, m4            ; column 1
    punpckldq  m2, m6            ; column 2
    punpckhdq  m3, m6            ; column 3
%endmacro
68 | |
; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
; clobbers: m4-m6 (inputs are destroyed too)
; Inverse of TRANSPOSE4x8_LOAD: the high dwords are copied aside first
; (m4-m6; m3 keeps its own high half) so the low halves can be
; interleaved and scattered, then the high halves get the same treatment.
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4            ; save high dword of m0 (rows 4-7 data)
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    ; low halves -> rows 0-3
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0            ; row 0
    punpckhdq  m0, m0
    movd       %2, m0            ; row 1
    movd       %3, m1            ; row 2
    punpckhdq  m1, m1
    movd       %4, m1            ; row 3

    ; high halves -> rows 4-7
    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4            ; row 4
    punpckhdq  m4, m4
    movd       %6, m4            ; row 5
    movd       %7, m5            ; row 6
    punpckhdq  m5, m5
    movd       %8, m5            ; row 7
%endmacro
104 | |
; SBUTTERFLY size, a, b, tmp
; Interleave a and b at granularity %1 (bw/wd/dq):
;   %2 <- low-half interleave, %4 <- high-half interleave.
; %3 is read-only (may be a memory operand); %4 is clobbered.
%macro SBUTTERFLY 4
    movq       %4, %2            ; keep a copy of a for the high half
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro
110 | |
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
; clobbers: m0-m7
; Only 8 mm registers are available, so one intermediate is spilled to
; [%9+0x10] mid-transpose and re-read as a memory operand.
%macro TRANSPOSE6x8_MEM 9
    movq       m0, %1
    movq       m1, %2
    movq       m2, %3
    movq       m3, %4
    movq       m4, %5
    movq       m5, %6
    movq       m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    movq       [%9+0x10], m1     ; spill: frees a register for row 8
    SBUTTERFLY bw, m6, %8, m5
    SBUTTERFLY wd, m0, m2, m1
    SBUTTERFLY wd, m4, m6, m2
    punpckhdq  m0, m4
    movq       [%9+0x00], m0
    SBUTTERFLY wd, m7, [%9+0x10], m6  ; reuse the spilled value
    SBUTTERFLY wd, m3, m5, m4
    SBUTTERFLY dq, m7, m3, m0
    SBUTTERFLY dq, m1, m2, m5
    punpckldq  m6, m4
    movq       [%9+0x10], m1
    movq       [%9+0x20], m5
    movq       [%9+0x30], m7
    movq       [%9+0x40], m0
    movq       [%9+0x50], m6
%endmacro
141 | |
; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
; clobbers: m0-m7
; Full 8x8 byte transpose; output slots %9 and %11 double as scratch
; spill space while the register file is exhausted.
%macro TRANSPOSE8x8_MEM 16
    movq       m0, %1
    movq       m1, %2
    movq       m2, %3
    movq       m3, %4
    movq       m4, %5
    movq       m5, %6
    movq       m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    SBUTTERFLY bw, m6, %8, m5
    movq       %9, m3            ; spill
    SBUTTERFLY wd, m0, m2, m3
    SBUTTERFLY wd, m4, m6, m2
    SBUTTERFLY wd, m7, m1, m6
    movq       %11, m2           ; spill
    movq       m2, %9            ; reload
    SBUTTERFLY wd, m2, m5, m1
    SBUTTERFLY dq, m0, m4, m5
    SBUTTERFLY dq, m7, m2, m4
    movq       %9, m0
    movq       %10, m5
    movq       %13, m7
    movq       %14, m4
    SBUTTERFLY dq, m3, %11, m0   ; reuse spilled value as memory operand
    SBUTTERFLY dq, m6, m1, m5
    movq       %11, m3
    movq       %12, m0
    movq       %15, m6
    movq       %16, m5
%endmacro
176 | |
; out: %4 = nonzero bytes where |%1-%2| > %3, zero elsewhere
; clobbers: %5
; Unsigned-saturation trick: sat(a-b) | sat(b-a) == |a-b| (one side is
; always 0), and a final saturated subtract of the threshold leaves a
; nonzero byte exactly where the absolute difference exceeds it.
%macro DIFF_GT 5
    mova     %5, %2
    mova     %4, %1
    psubusb  %5, %1              ; sat(%2-%1)
    psubusb  %4, %2              ; sat(%1-%2)
    por      %4, %5              ; |%1-%2|
    psubusb  %4, %3              ; nonzero iff |%1-%2| > %3
%endmacro
187 | |
; out: %4 = 0xff bytes where |%1-%2| <= %3, 0x00 where greater
; clobbers: %5
; NOTE(review): the original header claimed "%4 = |%1-%2|>%3", but the
; final pcmpeqb yields the complement of that test: the two one-sided
; saturated differences compare equal only when both are zero, i.e. when
; the absolute difference does not exceed %3.  Call sites consume it
; that way (as the "filter p1/q1" / "use strong intra filter" masks).
%macro DIFF_GT2 5
    mova     %5, %2
    mova     %4, %1
    psubusb  %5, %1              ; sat(%2-%1)
    psubusb  %4, %2              ; sat(%1-%2)
    psubusb  %5, %3              ; sat(sat(%2-%1) - %3)
    psubusb  %4, %3              ; sat(sat(%1-%2) - %3)
    pcmpeqb  %4, %5              ; 0xff iff both zero <=> |%1-%2| <= %3
%endmacro
199 | |
; broadcast the low word of %1 to every word lane of %1
; (assembles differently depending on the active register set:
;  pshuflw+punpcklqdq after INIT_XMM, pshufw after INIT_MMX)
%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw    %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw     %1, %1, 0
%endif
%endmacro
208 | |
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1 (byte-broadcast), m7=mask, %3=alpha-1 (broadcast, optional)
; clobbers: m4,m6
; m7 = 0xff where the edge passes the H.264 filter conditions:
;   |p0-q0| <= alpha-1 && |p1-p0| <= beta-1 && |q1-q0| <= beta-1
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4
    SPLATW   m5
    packuswb m4, m4              ; 16x alpha-1
    packuswb m5, m5              ; 16x beta-1
%if %0>2
    mova     %3, m4              ; hand the broadcast alpha-1 back to the caller
%endif
    DIFF_GT  m1, m2, m4, m7, m6  ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6  ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6  ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6              ; invert: 0xff where no threshold was exceeded
%endmacro
230 | |
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
; Normal-strength p0/q0 filter.  Computes the H.264 delta
;   clip3(-tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3)
; entirely in bytes using pavgb rounding identities; the clip is done by
; splitting delta into saturated positive/negative parts bounded by m7.
%macro DEBLOCK_P0_Q0 0
    mova    m5, m1
    pxor    m5, m2               ; p0^q0
    pand    m5, [pb_01 GLOBAL]   ; (p0^q0)&1, lsb correction term
    pcmpeqb m4, m4               ; m4 = all-ones
    pxor    m3, m4               ; 255 - q1
    pavgb   m3, m0               ; (p1 - q1 + 256)>>1
    pavgb   m3, [pb_03 GLOBAL]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor    m4, m1               ; 255 - p0
    pavgb   m4, m2               ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    paddusb m3, m4               ; d+128+33
    mova    m6, [pb_a1 GLOBAL]
    psubusb m6, m3               ; -delta, saturated (bias 0xa1 = 128+33 removed)
    psubusb m3, [pb_a1 GLOBAL]   ; +delta, saturated
    pminub  m6, m7               ; clip both parts to tc (m7 already masked)
    pminub  m3, m7
    psubusb m1, m6               ; p0' = p0 - max(-delta,0) ...
    psubusb m2, m3               ; q0' = q0 - max(+delta,0) ...
    paddusb m1, m3               ; ... + max(+delta,0)
    paddusb m2, m6               ; ... + max(-delta,0)
%endmacro
256 | |
; in: m1=p0 m2=q0
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
; (Applied symmetrically to p1 by swapping the p/q arguments.)
%macro LUMA_Q1 6
    mova    %6, m1
    pavgb   %6, m2               ; (p0+q0+1)>>1
    pavgb   %2, %6               ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_01 GLOBAL]   ; (p2^avg(p0,q0))&1, rounding correction
    psubusb %2, %6               ; (p2+((p0+q0+1)>>1))>>1
    mova    %6, %1
    psubusb %6, %5               ; lower clip bound: q1-tc0
    paddusb %5, %1               ; upper clip bound: q1+tc0
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2               ; write the filtered q1
%endmacro
275 | |
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; x86inc arg registers: r0=pix r1=stride r2d=alpha r3d=beta r4=tc0
; xmm roles: m0=p1 m1=p0 m2=q0 m3=q1 m5=beta-1 m8=tc m9=combined mask
INIT_XMM
cglobal x264_deblock_v_luma_sse2
    movd    m8, [r4]             ; tc0: four int8 thresholds, one per 4-pel group
    lea     r4, [r1*3]
    dec     r2d                  ; alpha-1
    neg     r4
    dec     r3d                  ; beta-1
    add     r4, r0               ; pix-3*stride

    mova    m0, [r4+r1]          ; p1
    mova    m1, [r4+2*r1]        ; p0
    mova    m2, [r0]             ; q0
    mova    m3, [r0+r1]          ; q1
    LOAD_MASK r2d, r3d           ; m7 = edge mask, m5 = broadcast beta-1

    punpcklbw m8, m8
    punpcklbw m8, m8             ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8               ; m9 = 0xff where tc0 == -1 (group not filtered)
    pandn   m9, m7               ; m9 = edge mask & (tc0 >= 0)
    pand    m8, m9               ; zero tc wherever the edge is skipped

    movdqa  m3, [r4]             ; p2
    DIFF_GT2 m1, m3, m5, m6, m7  ; m6 = 0xff where |p2-p0| <= beta-1 (filter p1)
    pand    m6, m9
    mova    m7, m8
    psubb   m7, m6               ; tc++ (subtracting 0xff adds 1) where p1 is filtered
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1]        ; q2
    DIFF_GT2 m2, m4, m5, m6, m3  ; m6 = 0xff where |q2-q0| <= beta-1 (filter q1)
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6               ; tc++ where q1 is filtered
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0                ; uses m7 = final masked tc
    mova    [r4+2*r1], m1        ; store p0'
    mova    [r0], m2             ; store q0'
    ret
322 | |
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; Horizontal-edge variant: transposes a 16x6 strip into a stack buffer,
; reuses the vertical filter on it, then transposes the 4 changed rows back.
; Uses scratch regs rax, r9, r10, r11 directly (this path assumes the
; unix64 convention where r1d == esi holds the stride).
INIT_MMX
cglobal x264_deblock_h_luma_sse2
    movsxd r10, esi              ; sign-extend the 32-bit stride
    lea    r11, [r10+r10*2]      ; 3*stride
    lea    rax, [r0-4]           ; pix-4: leftmost of the 8 transposed columns
    lea    r9,  [r0-4+r11]
    sub    rsp, 0x68             ; 0x68 keeps rsp 16-aligned (entry rsp%16==8)
                                 ; for the movdqa accesses on the temp buffer
%define pix_tmp rsp

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
    lea    rax, [rax+r10*8]      ; advance to rows 8-15
    lea    r9,  [r9 +r10*8]
    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    esi, 0x10             ; stride of the temp buffer
    call   x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add    rax, 2                ; step to column 2: only p1..q1 were modified
    add    r9, 2
    movq   m0, [pix_tmp+0x18]    ; high halves first: rows 8-15
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)

    shl    r10, 3                ; step back 8 rows for the low halves
    sub    rax, r10
    sub    r9, r10
    shr    r10, 3
    movq   m0, [pix_tmp+0x10]    ; rows 0-7
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)

    add    rsp, 0x68
    ret
369 | |
370 %else | |
371 | |
; DEBLOCK_LUMA cpu_suffix, width_suffix(v/v8), mmsize
; x86_32 luma deblock.  %3 is the register width in bytes and also the
; stride of the two stack spill slots: [esp]=mask, [esp+%3]=tc.
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea     r4, [r1*3]
    dec     r2                   ; alpha-1
    neg     r4
    dec     r3                   ; beta-1
    add     r4, r0               ; pix-3*stride
    %assign pad 2*%3+12-(stack_offset&15)  ; two spill slots, kept %3-aligned
    SUB     esp, pad

    mova    m0, [r4+r1]          ; p1
    mova    m1, [r4+2*r1]        ; p0
    mova    m2, [r0]             ; q0
    mova    m3, [r0+r1]          ; q1
    LOAD_MASK r2, r3             ; m7 = edge mask, m5 = broadcast beta-1

    mov     r3, r4m              ; reload 5th arg (tc0) from the stack
    movd    m4, [r3]             ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4             ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova    [esp+%3], m4         ; tc
    pcmpeqb m3, m3               ; m3 = all-ones = -1 per byte
    pcmpgtb m4, m3               ; 0xff where tc0 >= 0 (tc0 == -1 means skip)
    pand    m4, m7
    mova    [esp], m4            ; mask = edge mask & (tc0 >= 0)

    mova    m3, [r4]             ; p2
    DIFF_GT2 m1, m3, m5, m6, m7  ; m6 = 0xff where |p2-p0| <= beta-1 (filter p1)
    pand    m6, m4
    pand    m4, [esp+%3]         ; tc
    mova    m7, m4
    psubb   m7, m6               ; tc++ where p1 is filtered
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1]        ; q2
    DIFF_GT2 m2, m4, m5, m6, m3  ; m6 = 0xff where |q2-q0| <= beta-1 (filter q1)
    mova    m5, [esp]            ; mask
    pand    m6, m5
    mova    m5, [esp+%3]         ; tc
    pand    m5, m6
    psubb   m7, m6               ; tc++ where q1 is filtered
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0                ; uses m7 = final masked tc
    mova    [r4+2*r1], m1        ; store p0'
    mova    [r0], m2             ; store q0'
    ADD     esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
; Horizontal-edge wrapper: transpose to a stack buffer, call the vertical
; filter through the cdecl-style stack args, transpose the changed rows back.
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov    r0, r0m               ; pix
    mov    r3, r1m               ; stride
    lea    r4, [r3*3]
    sub    r0, 4                 ; pix-4: leftmost transposed column
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]         ; rows 8-15
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter: pass (pix_tmp+0x30, 16, alpha, beta, tc0) on the stack
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8
    ; the 8-wide filter only did half the strip: redo for the second half
    add    dword [esp   ], 8     ; pix_tmp+0x38
    add    dword [esp+16], 2     ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20               ; pop the five pushed args

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov    r0, r0m
    sub    r0, 2                 ; column 2 (p1): only p1..q1 were modified
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]    ; rows 0-7
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]         ; rows 8-15
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA
483 | |
INIT_XMM
; x86_32 build: instantiate the full-width (16-pixel) sse2 luma deblock
DEBLOCK_LUMA sse2, v, 16
486 | |
487 %endif ; ARCH | |
488 | |
489 | |
490 | |
; LUMA_INTRA_P012 dst_p0, dst_p1, dst_p2, p3
; Strong (intra) filter for the p side of the edge; pixel/temp names
; (p0..p2, q0, q1, t0..t5, mask0, mask1p, mpb_00, mpb_01) are the
; %defines set up by DEBLOCK_LUMA_INTRA.  All arithmetic stays in bytes:
; exact rounded sums are rebuilt from pavgb plus an ((x^y)&1) lsb
; correction, with psrlw/pavgb handling the /4 and /8 divides.
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova    t0, p2
    mova    t1, p0
    pavgb   t0, p1
    pavgb   t1, q0
    pavgb   t0, t1               ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova    t5, t1               ; keep (p0+q0+1)/2 for later
    mova    t2, p2
    mova    t3, p0
    paddb   t2, p1
    paddb   t3, q0
    paddb   t2, t3               ; p2+p1+p0+q0 (mod 256)
    mova    t3, t2
    mova    t4, t2
    psrlw   t2, 1
    pavgb   t2, mpb_00
    pxor    t2, t0
    pand    t2, mpb_01           ; rounding-error bit of the pavgb chain
    psubb   t0, t2               ; p1' = (p2+p1+p0+q0+2)/4

    mova    t1, p2
    mova    t2, p2
    pavgb   t1, q1
    psubb   t2, q1
    paddb   t3, t3
    psubb   t3, t2               ; p2+2*p1+2*p0+2*q0+q1
    pand    t2, mpb_01
    psubb   t1, t2
    pavgb   t1, p1
    pavgb   t1, t5               ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw   t3, 2
    pavgb   t3, mpb_00
    pxor    t3, t1
    pand    t3, mpb_01
    psubb   t1, t3               ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova    t3, p0
    mova    t2, p0
    pxor    t3, q1
    pavgb   t2, q1
    pand    t3, mpb_01
    psubb   t2, t3
    pavgb   t2, p1               ; p0'b = (2*p1+p0+q0+2)/4

    ; xor-blend: p0'a where mask1p (strong condition), p0'b where only
    ; mask0 (edge active), original p0 elsewhere
    pxor    t1, t2
    pxor    t2, p0
    pand    t1, mask1p
    pand    t2, mask0
    pxor    t1, t2
    pxor    t1, p0
    mova    %1, t1               ; store p0

    mova    t1, %4               ; p3
    mova    t2, t1
    pavgb   t1, p2
    paddb   t2, p2
    pavgb   t1, t0               ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb   t2, t2
    paddb   t2, t4               ; 2*p3+3*p2+p1+p0+q0
    psrlw   t2, 2
    pavgb   t2, mpb_00
    pxor    t2, t1
    pand    t2, mpb_01
    psubb   t1, t2               ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    ; p1'/p2' are written only under mask1p (xor-blend with the originals)
    pxor    t0, p1
    pxor    t1, p2
    pand    t0, mask1p
    pand    t1, mask1p
    pxor    t0, p1
    pxor    t1, p2
    mova    %2, t0               ; store p1
    mova    %3, t1               ; store p2
%endmacro
565 | |
; Redefine the p/q names with roles swapped so LUMA_INTRA_P012 can be
; reused verbatim for the q side of the edge (q2 stands in for p2,
; mask1q for mask1p).
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro
574 | |
; DEBLOCK_LUMA_INTRA cpu_suffix, width_suffix(v/v8)
; Intra (strong) luma deblock.  On x86_64 everything lives in xmm8-15
; (mask1q spills into the red zone at [rsp-24]); on x86_32 p2/q2 are
; re-read from the picture and temporaries spill to aligned stack slots.
%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]      ; red-zone spill (leaf-function use)
    %define mpb_00 m14
    %define mpb_01 m15
%else
    ; 16-byte-aligned stack spill slots, alignment computed from x86inc's
    ; stack_offset tracking
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_00 [pb_00 GLOBAL]
    %define mpb_01 [pb_01 GLOBAL]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; r0=pix r1=stride r2d=alpha r3d=beta
cglobal x264_deblock_%2_luma_intra_%1, 4,6
%ifndef ARCH_X86_64
    sub     esp, 0x60            ; room for the five spill slots
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3]           ; 3*stride
    dec     r2d                  ; alpha-1
    jl .end                      ; alpha == 0: nothing to filter
    neg     r4
    dec     r3d                  ; beta-1
    jl .end                      ; beta == 0: nothing to filter
    add     r4, r0               ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_00, mpb_00
    mova    mpb_01, [pb_01 GLOBAL]
    LOAD_MASK r2d, r3d, t5       ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12                ; m12=mask0
    pavgb   t5, mpb_00
    pavgb   t5, mpb_01           ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3  ; t0 = 0xff where |p0-q0| < alpha/4+2 (strong)
    DIFF_GT2 p0, p2, m5, t2, t5  ; t2 = 0xff where |p2-p0| <= beta-1
    DIFF_GT2 q0, q2, m5, t4, t5  ; t4 = 0xff where |q2-q0| <= beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4           ; strong-filter mask, q side
    mova    mask1p, t2           ; strong-filter mask, p side
%else
    LOAD_MASK r2d, r3d, t5       ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_00 GLOBAL]
    pavgb   m4, [pb_01 GLOBAL]   ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7  ; m6 = 0xff where |p0-q0| < alpha/4+2 (strong)
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7  ; m4 = 0xff where |p2-p0| <= beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7  ; m4 = 0xff where |q2-q0| <= beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; Horizontal-edge wrapper: transpose 16x8 to a stack buffer, run the
; vertical intra filter on it, transpose back.
cglobal x264_deblock_h_luma_intra_%1
    movsxd  r10, r1d             ; sign-extend 32-bit stride
    lea     r11, [r10*3]
    lea     rax, [r0-4]          ; pix-4: leftmost transposed column
    lea     r9, [r0-4+r11]
    sub     rsp, 0x88            ; keeps rsp 16-aligned for the temp buffer
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea     rax, [rax+r10*8]     ; rows 8-15
    lea     r9, [r9+r10*8]
    TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea     r0, [pix_tmp+0x40]
    mov     r1, 0x10             ; stride of the temp buffer
    call    x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea     r9, [rax+r11]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
    shl     r10, 3               ; step back 8 rows for the first half
    sub     rax, r10
    sub     r9, r10
    shr     r10, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
    add     rsp, 0x88
    ret
%else
; x86_32 wrapper; calls the filter through stack args, twice for v8 halves.
; NOTE(review): mixes rsp/esp names — relies on x86inc aliasing them on
; x86_32; confirm against this revision's x86inc.asm.
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea     r3,  [r1*3]
    sub     r0,  4               ; pix-4: leftmost transposed column
    lea     r2,  [r0+r3]
    %assign pad 0x8c-(stack_offset&15)
    SUB     rsp, pad
%define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea     r0,  [r0+r1*8]       ; rows 8-15
    lea     r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    ; vertical filter: pass (pix_tmp+0x40, 16, alpha, beta) on the stack
    lea     r0,  [pix_tmp+0x40]
    PUSH    dword r3m
    PUSH    dword r2m
    PUSH    dword 16
    PUSH    r0
    call    x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add     dword [rsp], 8       ; pix_tmp+8: second 8-pixel half
    call    x264_deblock_%2_luma_intra_%1
%endif
    ADD     esp, 16              ; pop the four pushed args

    mov     r1,  r1m
    mov     r0,  r0m
    lea     r3,  [r1*3]
    sub     r0,  4
    lea     r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea     r0,  [r0+r1*8]       ; rows 8-15
    lea     r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD     rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA
741 | |
INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v       ; full 16-pixel-wide sse2 version (both arches)
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8    ; 8-pixel-wide mmx halves, x86_32 only
%endif