x86/h264_deblock.asm @ 12454:f4355cd85faa (libavcodec)

Port latest x264 deblock asm (before they moved to using NV12 as internal format), LGPL'ed with permission from Jason and Loren. This includes mmx2 code, so remove inline asm from h264dsp_mmx.c accordingly.
author rbultje
date Fri, 03 Sep 2010 16:52:46 +0000
parent 12453:35e1de8243c6
;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

cextern pb_0
cextern pb_1
cextern pb_3
cextern pb_A1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
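; (base3 is expected to be base+3*stride and stride3 to be 3*stride, so the
;  eight operands cover base+0*stride .. base+7*stride without needing extra
;  address registers for the 5*stride..7*stride rows)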

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 8
movd m0, %1
movd m2, %2
movd m1, %3
movd m3, %4
punpcklbw m0, m2
punpcklbw m1, m3
movq m2, m0
punpcklwd m0, m1
punpckhwd m2, m1

movd m4, %5
movd m6, %6
movd m5, %7
movd m7, %8
punpcklbw m4, m6
punpcklbw m5, m7
movq m6, m4
punpcklwd m4, m5
punpckhwd m6, m5

movq m1, m0
movq m3, m2
punpckldq m0, m4
punpckhdq m1, m4
punpckldq m2, m6
punpckhdq m3, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
movq m4, m0
movq m5, m1
movq m6, m2
punpckhdq m4, m4
punpckhdq m5, m5
punpckhdq m6, m6

punpcklbw m0, m1
punpcklbw m2, m3
movq m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
movd %1, m0
punpckhdq m0, m0
movd %2, m0
movd %3, m1
punpckhdq m1, m1
movd %4, m1

punpckhdq m3, m3
punpcklbw m4, m5
punpcklbw m6, m3
movq m5, m4
punpcklwd m4, m6
punpckhwd m5, m6
movd %5, m4
punpckhdq m4, m4
movd %6, m4
movd %7, m5
punpckhdq m5, m5
movd %8, m5
%endmacro

%macro SBUTTERFLY3 4
movq %4, %2
punpckl%1 %2, %3
punpckh%1 %4, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
movq m0, %1
movq m1, %2
movq m2, %3
movq m3, %4
movq m4, %5
movq m5, %6
movq m6, %7
SBUTTERFLY3 bw, m0, m1, m7
SBUTTERFLY3 bw, m2, m3, m1
SBUTTERFLY3 bw, m4, m5, m3
movq [%9+0x10], m1
SBUTTERFLY3 bw, m6, %8, m5
SBUTTERFLY3 wd, m0, m2, m1
SBUTTERFLY3 wd, m4, m6, m2
punpckhdq m0, m4
movq [%9+0x00], m0
SBUTTERFLY3 wd, m7, [%9+0x10], m6
SBUTTERFLY3 wd, m3, m5, m4
SBUTTERFLY3 dq, m7, m3, m0
SBUTTERFLY3 dq, m1, m2, m5
punpckldq m6, m4
movq [%9+0x10], m1
movq [%9+0x20], m5
movq [%9+0x30], m7
movq [%9+0x40], m0
movq [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
movq m0, %1
movq m1, %2
movq m2, %3
movq m3, %4
movq m4, %5
movq m5, %6
movq m6, %7
SBUTTERFLY3 bw, m0, m1, m7
SBUTTERFLY3 bw, m2, m3, m1
SBUTTERFLY3 bw, m4, m5, m3
SBUTTERFLY3 bw, m6, %8, m5
movq %9, m3
SBUTTERFLY3 wd, m0, m2, m3
SBUTTERFLY3 wd, m4, m6, m2
SBUTTERFLY3 wd, m7, m1, m6
movq %11, m2
movq m2, %9
SBUTTERFLY3 wd, m2, m5, m1
SBUTTERFLY3 dq, m0, m4, m5
SBUTTERFLY3 dq, m7, m2, m4
movq %9, m0
movq %10, m5
movq %13, m7
movq %14, m4
SBUTTERFLY3 dq, m3, %11, m0
SBUTTERFLY3 dq, m6, m1, m5
movq %11, m3
movq %12, m0
movq %15, m6
movq %16, m5
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
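; (psubusb saturates at 0, so or-ing the two one-sided differences gives
;  |%1-%2|; the final psubusb then leaves a nonzero byte only where
;  |%1-%2| > %3. The result is zero/nonzero, not a 0/0xFF mask.)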
%macro DIFF_GT 5
mova %5, %2
mova %4, %1
psubusb %5, %1
psubusb %4, %2
por %4, %5
psubusb %4, %3
%endmacro

; out: %4 = 0xFF where |%1-%2| <= %3 (note the polarity: the mask is set
;      where the compare named at each call site does NOT hold)
; clobbers: %5
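; (both one-sided saturated differences are reduced by %3 and compared with
;  pcmpeqb; they can only match, both being zero, when |%1-%2| <= %3, which
;  is the "still close enough to filter" condition the callers AND into
;  their masks)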
%macro DIFF_GT2 5
mova %5, %2
mova %4, %1
psubusb %5, %1
psubusb %4, %2
psubusb %5, %3
psubusb %4, %3
pcmpeqb %4, %5
%endmacro

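; broadcast the low word of %1 to all lanes: pshufw is enough for an MMX
; register, an XMM register needs pshuflw followed by punpcklqdq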
%macro SPLATW 1
%ifidn m0, xmm0
pshuflw %1, %1, 0
punpcklqdq %1, %1
%else
pshufw %1, %1, 0
%endif
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
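; (i.e. m7 = 0xFF where |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta;
;  the three DIFF_GT results are or-ed and then inverted with pcmpeqb)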
%macro LOAD_MASK 2-3
movd m4, %1
movd m5, %2
SPLATW m4
SPLATW m5
packuswb m4, m4 ; 16x alpha-1
packuswb m5, m5 ; 16x beta-1
%if %0>2
mova %3, m4
%endif
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
por m7, m4
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
por m7, m4
pxor m6, m6
pcmpeqb m7, m6
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
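; This evaluates the standard weak-filter delta
;   delta = clip( (((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc )
;   p0 += delta,  q0 -= delta
; with unsigned byte averages: the running value carries a bias of 128+33
; (hence pb_A1 = 0xA1), the two psubusb against [pb_A1] split delta into its
; positive and negative halves, pminub clamps each half to tc (m7), and the
; saturated add/sub pairs apply them to p0 and q0.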
%macro DEBLOCK_P0_Q0 0
mova m5, m1
pxor m5, m2 ; p0^q0
pand m5, [pb_1] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
pavgb m3, m0 ; (p1 - q1 + 256)>>1
pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pxor m4, m1
pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
paddusb m3, m4 ; d+128+33
mova m6, [pb_A1]
psubusb m6, m3
psubusb m3, [pb_A1]
pminub m6, m7
pminub m3, m7
psubusb m1, m6
psubusb m2, m3
paddusb m1, m3
paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
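; (pavgb rounds up, so the (p2^avg(p0,q0))&1 term is subtracted once to get
;  the truncating shift the formula above requires; the final clip to
;  [q1-tc0, q1+tc0] is done with saturated sub/add plus pmaxub/pminub)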
%macro LUMA_Q1 6
mova %6, m1
pavgb %6, m2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
pand %6, [pb_1] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
mova %6, %1
psubusb %6, %5
paddusb %5, %1
pmaxub %2, %6
pminub %2, %5
mova %4, %2
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
neg r4
dec r3d ; beta-1
add r4, r0 ; pix-3*stride

mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LOAD_MASK r2d, r3d

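; expand the four tc0 bytes to one per pixel; a tc0 of -1 (0xFF) marks edge
; segments that must not be filtered, so pcmpeqb against all-ones finds those
; bytes, pandn removes them from the threshold mask and the corresponding tc
; values are zeroed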
punpcklbw m8, m8
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
pcmpeqb m9, m9
pcmpeqb m9, m8
pandn m9, m7
pand m8, m9

movdqa m3, [r4] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m9
mova m7, m8
psubb m7, m6
pand m6, m8
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

movdqa m4, [r0+2*r1] ; q2
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
pand m6, m9
pand m8, m6
psubb m7, m6
mova m3, [r0+r1]
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
movsxd r10, r1d
lea r11, [r10+r10*2]
lea r6, [r0-4]
lea r5, [r0-4+r11]
%ifdef WIN64
sub rsp, 0x98
%define pix_tmp rsp+0x30
%else
sub rsp, 0x68
%define pix_tmp rsp
%endif

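; strategy: reuse the vertical filter for the horizontal edge. The 8 pixels
; straddling the edge are transposed into pix_tmp for 16 rows (only p2..q2
; are ever read, hence the 6-wide transpose), filtered there, and only the
; modified p1..q1 columns are transposed back (the 16x4 stores below).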
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
lea r6, [r6+r10*8]
lea r5, [r5+r10*8]
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%ifdef WIN64
mov [rsp+0x20], r4
%endif
call x264_deblock_v_luma_sse2

; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add r6, 2
add r5, 2
movq m0, [pix_tmp+0x18]
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)

shl r10, 3
sub r6, r10
sub r5, r10
shr r10, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
add rsp, 0x98
%else
add rsp, 0x68
%endif
RET

%else

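; DEBLOCK_LUMA %1=cpu suffix, %2=v (16 pixels per call, XMM) or v8 (8 pixels
; per call, MMX), %3=mmsize used for the stack spill slots; for v8 the
; horizontal wrapper calls the vertical filter twice, advancing pix_tmp by 8
; and tc0 by 2 for the second half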
%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
dec r3 ; beta-1
add r4, r0 ; pix-3*stride
%assign pad 2*%3+12-(stack_offset&15)
SUB esp, pad
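; (the pad also rounds esp so the [esp] and [esp+%3] spill slots below stay
;  suitably aligned for mova in the mmsize=16 instantiation)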

mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LOAD_MASK r2, r3

mov r3, r4mp
movd m4, [r3] ; tc0
punpcklbw m4, m4
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
mova [esp+%3], m4 ; tc
pcmpeqb m3, m3
pcmpgtb m4, m3
pand m4, m7
mova [esp], m4 ; mask

mova m3, [r4] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m4
pand m4, [esp+%3] ; tc
mova m7, m4
psubb m7, m6
pand m6, m4
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

mova m4, [r0+2*r1] ; q2
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
mova m5, [esp] ; mask
pand m6, m5
mova m5, [esp+%3] ; tc
pand m5, m6
psubb m7, m6
mova m3, [r0+r1]
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
ADD esp, pad
RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
lea r1, [r0+r4]
%assign pad 0x78-(stack_offset&15)
SUB esp, pad
%define pix_tmp esp+12

; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
lea r0, [r0+r3*8]
lea r1, [r1+r3*8]
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

; vertical filter
lea r0, [pix_tmp+0x30]
PUSH dword r4m
PUSH dword r3m
PUSH dword r2m
PUSH dword 16
PUSH dword r0
call x264_deblock_%2_luma_%1
%ifidn %2, v8
add dword [esp ], 8 ; pix_tmp+0x38
add dword [esp+16], 2 ; tc0+2
call x264_deblock_%2_luma_%1
%endif
ADD esp, 20

; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
mov r0, r0mp
sub r0, 2
lea r1, [r0+r4]

movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

lea r0, [r0+r3*8]
lea r1, [r1+r3*8]
movq m0, [pix_tmp+0x18]
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

ADD esp, pad
RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH



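; Strong (intra) filter for one side of the edge; the p registers below are
; remapped to the q side by LUMA_INTRA_SWAP_PQ for the second call. Computes
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
; using byte averages (the psrlw/pavgb/pxor/pand sequences repair the
; rounding), then per pixel stores p0'/p1'/p2' where mask1p is set, the weak
; (2*p1 + p0 + q1 + 2) >> 2 for p0 where only mask0 is set, and leaves the
; pixels untouched elsewhere.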
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
mova t0, p2
mova t1, p0
pavgb t0, p1
pavgb t1, q0
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
mova t5, t1
mova t2, p2
mova t3, p0
paddb t2, p1
paddb t3, q0
paddb t2, t3
mova t3, t2
mova t4, t2
psrlw t2, 1
pavgb t2, mpb_0
pxor t2, t0
pand t2, mpb_1
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

mova t1, p2
mova t2, p2
pavgb t1, q1
psubb t2, q1
paddb t3, t3
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
pand t2, mpb_1
psubb t1, t2
pavgb t1, p1
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
psrlw t3, 2
pavgb t3, mpb_0
pxor t3, t1
pand t3, mpb_1
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

mova t3, p0
mova t2, p0
pxor t3, q1
pavgb t2, q1
pand t3, mpb_1
psubb t2, t3
pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

pxor t1, t2
pxor t2, p0
pand t1, mask1p
pand t2, mask0
pxor t1, t2
pxor t1, p0
mova %1, t1 ; store p0

mova t1, %4 ; p3
mova t2, t1
pavgb t1, p2
paddb t2, p2
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
paddb t2, t2
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
psrlw t2, 2
pavgb t2, mpb_0
pxor t2, t1
pand t2, mpb_1
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

pxor t0, p1
pxor t1, p2
pand t0, mask1p
pand t1, mask1p
pxor t0, p1
pxor t1, p2
mova %2, t0 ; store p1
mova %3, t1 ; store p2
%endmacro

%macro LUMA_INTRA_SWAP_PQ 0
%define q1 m0
%define q0 m1
%define p0 m2
%define p1 m3
%define p2 q2
%define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%ifdef ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%define t5 m11
%define mask0 m12
%define mask1p m13
%define mask1q [rsp-24]
%define mpb_0 m14
%define mpb_1 m15
%else
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
%define t5 spill(1)
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_0 [pb_0]
%define mpb_1 [pb_1]
%endif
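; (on x86_32 there are not enough registers: p2/q2 are re-read from memory
;  and t4/t5 plus the masks are spilled to 16-byte slots on the stack)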

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
dec r2d ; alpha-1
jl .end
neg r4
dec r3d ; beta-1
jl .end
add r4, r0 ; pix-4*stride
mova p1, [r4+2*r1]
mova p0, [r4+r5]
mova q0, [r0]
mova q1, [r0+r1]
%ifdef ARCH_X86_64
pxor mpb_0, mpb_0
mova mpb_1, [pb_1]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
SWAP 7, 12 ; m12=mask0
pavgb t5, mpb_0
pavgb t5, mpb_1 ; alpha/4+1
movdqa p2, [r4+r1]
movdqa q2, [r0+2*r1]
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
pand t0, mask0
pand t4, t0
pand t2, t0
mova mask1q, t4
mova mask1p, t2
%else
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
mova m4, t5
mova mask0, m7
pavgb m4, [pb_0]
pavgb m4, [pb_1] ; alpha/4+1
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
pand m6, mask0
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
pand m4, m6
mova mask1p, m4
DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
pand m4, m6
mova mask1q, m4
%endif
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
add esp, 0x60
%endif
RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1, 4,7
movsxd r10, r1d
lea r11, [r10*3]
lea r6, [r0-4]
lea r5, [r0-4+r11]
sub rsp, 0x88
%define pix_tmp rsp

; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea r6, [r6+r10*8]
lea r5, [r5+r10*8]
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

lea r0, [pix_tmp+0x40]
mov r1, 0x10
call x264_deblock_v_luma_intra_%1

; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r5, [r6+r11]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
sub r6, r10
sub r5, r10
shr r10, 3
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
add rsp, 0x88
RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
%assign pad 0x8c-(stack_offset&15)
SUB rsp, pad
%define pix_tmp rsp

; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea r0, [r0+r1*8]
lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

lea r0, [pix_tmp+0x40]
PUSH dword r3m
PUSH dword r2m
PUSH dword 16
PUSH r0
call x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
add dword [rsp], 8 ; pix_tmp+8
call x264_deblock_%2_luma_intra_%1
%endif
ADD esp, 16

mov r1, r1m
mov r0, r0mp
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
lea r0, [r0+r1*8]
lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
ADD rsp, pad
RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif



INIT_MMX

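; chroma deblocking: MMX, 8 pixels per call. The v and h entry points share
; the x264_chroma_*_body_mmxext helpers below; the h variants transpose an
; 8x4 region in and out with TRANSPOSE4x8_LOAD / TRANSPOSE8x4_STORE around
; the same vertical-filter core.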
%macro CHROMA_V_START 0
dec r2d ; alpha-1
dec r3d ; beta-1
mov t5, r0
sub t5, r1
sub t5, r1
%endmacro

%macro CHROMA_H_START 0
dec r2d
dec r3d
sub r0, 2
lea t6, [r1*3]
mov t5, r0
add r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_mmxext, 5,6
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
movq m2, [r0]
movq m3, [r0+r1]
call x264_chroma_inter_body_mmxext
movq [t5+r1], m1
movq [r0], m2
RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_mmxext, 5,7
%ifdef ARCH_X86_64
%define buf0 [rsp-24]
%define buf1 [rsp-16]
%else
%define buf0 r0m
%define buf1 r2m
%endif
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
call x264_chroma_inter_body_mmxext
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
RET

ALIGN 16
x264_chroma_inter_body_mmxext:
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
ret



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
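; ((p0^q1)&1 undoes the round-up of the inner pavgb, so the nested averages
;  reproduce the exact (p0 + q1 + 2*p1 + 2) >> 2 above)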
%macro CHROMA_INTRA_P0 3
movq m4, %1
pxor m4, %3
pand m4, [pb_1] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

%define t5 r4
%define t6 r5

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
movq m2, [r0]
movq m3, [r0+r1]
call x264_chroma_intra_body_mmxext
movq [t5+r1], m1
movq [r0], m2
RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
CHROMA_H_START
TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
call x264_chroma_intra_body_mmxext
TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
RET

ALIGN 16
x264_chroma_intra_body_mmxext:
LOAD_MASK r2d, r3d
movq m5, m1
movq m6, m2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
psubb m2, m6
pand m1, m7
pand m2, m7
paddb m1, m5
paddb m2, m6
ret