comparison x86/h264_deblock_sse2.asm @ 8430:7768bdfd4f7b libavcodec

Rename libavcodec/i386/ --> libavcodec/x86/. It contains optimizations that are not specific to i386 and libavutil uses this naming scheme already.
author diego
date Mon, 22 Dec 2008 09:12:42 +0000
parents
children e5c9a3a813ea
comparison 8429:b3ecaba81501 -> 8430:7768bdfd4f7b
;*****************************************************************************
;* deblock-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_00: times 16 db 0x00
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1

SECTION .text

; expands to [base],...,[base+7*stride]
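; base3 = base+3*stride and stride3 = 3*stride are precomputed by the callers
; because x86 addressing modes can only scale an index register by 1, 2, 4 or 8;
; with the second base pointer every one of the 8 rows is reachable with a
; single [reg+reg*scale] address.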
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
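; The transpose is built from three interleave passes: punpcklbw merges byte
; columns of row pairs into words, punpck[lh]wd merges those into dwords, and
; punpck[lh]dq assembles the four 8-byte output rows.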
%macro TRANSPOSE4x8_LOAD 8
    movd m0, %1
    movd m2, %2
    movd m1, %3
    movd m3, %4
    punpcklbw m0, m2
    punpcklbw m1, m3
    movq m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1

    movd m4, %5
    movd m6, %6
    movd m5, %7
    movd m7, %8
    punpcklbw m4, m6
    punpcklbw m5, m7
    movq m6, m4
    punpcklwd m4, m5
    punpckhwd m6, m5

    movq m1, m0
    movq m3, m2
    punpckldq m0, m4
    punpckhdq m1, m4
    punpckldq m2, m6
    punpckhdq m3, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq m4, m0
    movq m5, m1
    movq m6, m2
    punpckhdq m4, m4
    punpckhdq m5, m5
    punpckhdq m6, m6

    punpcklbw m0, m1
    punpcklbw m2, m3
    movq m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2
    movd %1, m0
    punpckhdq m0, m0
    movd %2, m0
    movd %3, m1
    punpckhdq m1, m1
    movd %4, m1

    punpckhdq m3, m3
    punpcklbw m4, m5
    punpcklbw m6, m3
    movq m5, m4
    punpcklwd m4, m6
    punpckhwd m5, m6
    movd %5, m4
    punpckhdq m4, m4
    movd %6, m4
    movd %7, m5
    punpckhdq m5, m5
    movd %8, m5
%endmacro

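; SBUTTERFLY: interleave %2 and %3 at %1 granularity (bw = bytes, wd = words,
; dq = dwords); %2 receives the low-half interleave, %4 the high half.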
%macro SBUTTERFLY 4
    movq %4, %2
    punpckl%1 %2, %3
    punpckh%1 %4, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    movq [%9+0x10], m1
    SBUTTERFLY bw, m6, %8, m5
    SBUTTERFLY wd, m0, m2, m1
    SBUTTERFLY wd, m4, m6, m2
    punpckhdq m0, m4
    movq [%9+0x00], m0
    SBUTTERFLY wd, m7, [%9+0x10], m6
    SBUTTERFLY wd, m3, m5, m4
    SBUTTERFLY dq, m7, m3, m0
    SBUTTERFLY dq, m1, m2, m5
    punpckldq m6, m4
    movq [%9+0x10], m1
    movq [%9+0x20], m5
    movq [%9+0x30], m7
    movq [%9+0x40], m0
    movq [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    SBUTTERFLY bw, m6, %8, m5
    movq %9, m3
    SBUTTERFLY wd, m0, m2, m3
    SBUTTERFLY wd, m4, m6, m2
    SBUTTERFLY wd, m7, m1, m6
    movq %11, m2
    movq m2, %9
    SBUTTERFLY wd, m2, m5, m1
    SBUTTERFLY dq, m0, m4, m5
    SBUTTERFLY dq, m7, m2, m4
    movq %9, m0
    movq %10, m5
    movq %13, m7
    movq %14, m4
    SBUTTERFLY dq, m3, %11, m0
    SBUTTERFLY dq, m6, m1, m5
    movq %11, m3
    movq %12, m0
    movq %15, m6
    movq %16, m5
%endmacro

; out: %4 = nonzero where |%1-%2| > %3, zero elsewhere
; clobbers: %5
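; The absolute difference is formed with unsigned saturating subtracts:
; psubusb(a,b) and psubusb(b,a) zero whichever direction is negative, so
; their OR is |a-b|; a final psubusb against the threshold leaves a nonzero
; byte exactly where |a-b| > threshold.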
%macro DIFF_GT 5
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
    por %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = 0xff where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
; (i.e. a mask of the pixels that stay within the threshold)
; clobbers: %5
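; Unlike DIFF_GT this produces a full 0x00/0xff byte mask (via pcmpeqb),
; which the callers AND directly with the tc and filter-enable masks.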
%macro DIFF_GT2 5
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

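; SPLATW: broadcast the low word of %1 to every word lane. The xmm path needs
; pshuflw+punpcklqdq because pshufw only exists for mmx registers; the %ifidn
; on m0 checks whether INIT_XMM or INIT_MMX is currently in effect.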
%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw %1, %1, 0
%endif
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
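; m7 is 0xff exactly where |p0-q0| <= alpha-1, |p1-p0| <= beta-1 and
; |q1-q0| <= beta-1 all hold, i.e. where the H.264 filterSamplesFlag
; condition is met, and 0x00 elsewhere.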
%macro LOAD_MASK 2-3
    movd m4, %1
    movd m5, %2
    SPLATW m4
    SPLATW m5
    packuswb m4, m4 ; 16x alpha-1
    packuswb m5, m5 ; 16x beta-1
%if %0>2
    mova %3, m4
%endif
    DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por m7, m4
    DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por m7, m4
    pxor m6, m6
    pcmpeqb m7, m6
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
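; Reference formula (H.264 normal-filter delta for the p0/q0 pair):
;   delta = clip3( -tc, tc, (((q0-p0)<<2) + (p1-q1) + 4) >> 3 )
;   p0' = p0 + delta,  q0' = q0 - delta
; The pavgb chain below evaluates this delta in a biased form (the "d+128+33"
; noted further down); subtracting pb_a1 (0xa1 = 128+33) in both directions
; splits it into its positive and negative parts, each clipped to tc (m7)
; with pminub and applied with saturating adds/subtracts.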
%macro DEBLOCK_P0_Q0 0
    mova m5, m1
    pxor m5, m2 ; p0^q0
    pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor m3, m4
    pavgb m3, m0 ; (p1 - q1 + 256)>>1
    pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor m4, m1
    pavgb m4, m2 ; (q0 - p0 + 256)>>1
    pavgb m3, m5
    paddusb m3, m4 ; d+128+33
    mova m6, [pb_a1 GLOBAL]
    psubusb m6, m3
    psubusb m3, [pb_a1 GLOBAL]
    pminub m6, m7
    pminub m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
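; pavgb rounds up, so avg(q2, avg(p0,q0)) can be one too large; the
; (q2 ^ avg(p0,q0)) & 1 term subtracted afterwards undoes that rounding,
; giving the truncating (q2+((p0+q0+1)>>1))>>1 before the clip to q1+/-tc0.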
%macro LUMA_Q1 6
    mova %6, m1
    pavgb %6, m2
    pavgb %2, %6 ; avg(p2,avg(p0,q0))
    pxor %6, %3
    pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
    psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
    mova %6, %1
    psubusb %6, %5
    paddusb %5, %1
    pmaxub %2, %6
    pminub %2, %5
    mova %4, %2
%endmacro

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2
    movd m8, [r4] ; tc0
    lea r4, [r1*3]
    dec r2d ; alpha-1
    neg r4
    dec r3d ; beta-1
    add r4, r0 ; pix-3*stride

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2d, r3d

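; Expand the four tc0 bytes to 16 lanes, build m9 = (tc0 != -1) & mask, and
; clear the tc lanes that are disabled, so a tc0 of -1 (skip this edge)
; never modifies the corresponding pixels.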
    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn m9, m7
    pand m8, m9

    movdqa m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m9
    mova m7, m8
    psubb m7, m6
    pand m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand m6, m9
    pand m8, m6
    psubb m7, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    ret

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2
    movsxd r10, esi
    lea r11, [r10+r10*2]
    lea rax, [r0-4]
    lea r9, [r0-4+r11]
    sub rsp, 0x68
%define pix_tmp rsp

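; Strategy: the horizontal (vertical-edge) filter transposes the pixels
; around the edge into a contiguous temporary buffer with stride 16, calls
; the vertical filter on that buffer, then transposes back only the four
; middle columns (p1,p0,q0,q1), the only ones the filter can change.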
; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
    lea rax, [rax+r10*8]
    lea r9, [r9+r10*8]
    TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8

; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea r0, [pix_tmp+0x30]
    mov esi, 0x10
    call x264_deblock_v_luma_sse2

; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add rax, 2
    add r9, 2
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)

    shl r10, 3
    sub rax, r10
    sub r9, r10
    shr r10, 3
    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)

    add rsp, 0x68
    ret

%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea r4, [r1*3]
    dec r2 ; alpha-1
    neg r4
    dec r3 ; beta-1
    add r4, r0 ; pix-3*stride
%assign pad 2*%3+12-(stack_offset&15)
    SUB esp, pad

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2, r3

    mov r3, r4m
    movd m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand m4, m7
    mova [esp], m4 ; mask

    mova m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m4
    pand m4, [esp+%3] ; tc
    mova m7, m4
    psubb m7, m6
    pand m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova m5, [esp] ; mask
    pand m6, m5
    mova m5, [esp+%3] ; tc
    pand m5, m6
    psubb m7, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    ADD esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov r0, r0m
    mov r3, r1m
    lea r4, [r3*3]
    sub r0, 4
    lea r1, [r0+r4]
%assign pad 0x78-(stack_offset&15)
    SUB esp, pad
%define pix_tmp esp+12

; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

; vertical filter
    lea r0, [pix_tmp+0x30]
    PUSH dword r4m
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH dword r0
    call x264_deblock_%2_luma_%1
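; In the v8 (mmxext) case the vertical filter only covers 8 pixels per call,
; so it is called a second time below on the other half of the temporary
; buffer, with the pix argument advanced by 8 and tc0 advanced by 2.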
%ifidn %2, v8
    add dword [esp], 8 ; pix_tmp+0x38
    add dword [esp+16], 2 ; tc0+2
    call x264_deblock_%2_luma_%1
%endif
    ADD esp, 20

; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov r0, r0m
    sub r0, 2
    lea r1, [r0+r4]

    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

    ADD esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH


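; LUMA_INTRA_P012: strong (intra) luma filtering of p0, p1 and p2. Where the
; strong-filter condition holds (|p2-p0| < beta and |p0-q0| < (alpha>>2)+2,
; on top of the basic filter mask) the 4- and 8-sample averages computed
; below are selected via mask1p; elsewhere p0 falls back to
; p0'b = (2*p1+p0+q0+2)/4 (selected via mask0) and p1/p2 are left unchanged.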
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova t0, p2
    mova t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova t5, t1
    mova t2, p2
    mova t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova t3, t2
    mova t4, t2
    psrlw t2, 1
    pavgb t2, mpb_00
    pxor t2, t0
    pand t2, mpb_01
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova t1, p2
    mova t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand t2, mpb_01
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_00
    pxor t3, t1
    pand t3, mpb_01
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova t3, p0
    mova t2, p0
    pxor t3, q1
    pavgb t2, q1
    pand t3, mpb_01
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    pxor t1, t2
    pxor t2, p0
    pand t1, mask1p
    pand t2, mask0
    pxor t1, t2
    pxor t1, p0
    mova %1, t1 ; store p0

    mova t1, %4 ; p3
    mova t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_00
    pxor t2, t1
    pand t2, mpb_01
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor t0, p1
    pxor t1, p2
    pand t0, mask1p
    pand t1, mask1p
    pxor t0, p1
    pxor t1, p2
    mova %2, t0 ; store p1
    mova %3, t1 ; store p2
%endmacro

%macro LUMA_INTRA_SWAP_PQ 0
%define q1 m0
%define q0 m1
%define p0 m2
%define p1 m3
%define p2 q2
%define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 2
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%ifdef ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%define t5 m11
%define mask0 m12
%define mask1p m13
%define mask1q [rsp-24]
%define mpb_00 m14
%define mpb_01 m15
%else
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
%define t5 spill(1)
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_00 [pb_00 GLOBAL]
%define mpb_01 [pb_01 GLOBAL]
%endif
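; On x86_64 all of the above live in xmm8-xmm15; on x86_32 only 8 registers
; are available, so t4/t5 and the masks are spilled to aligned stack slots
; (spill(x)) and p2/q2 are simply re-read from the pixel rows.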

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6
%ifndef ARCH_X86_64
    sub esp, 0x60
%endif
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    dec r2d ; alpha-1
    jl .end
    neg r4
    dec r3d ; beta-1
    jl .end
    add r4, r0 ; pix-4*stride
    mova p1, [r4+2*r1]
    mova p0, [r4+r5]
    mova q0, [r0]
    mova q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor mpb_00, mpb_00
    mova mpb_01, [pb_01 GLOBAL]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP 7, 12 ; m12=mask0
    pavgb t5, mpb_00
    pavgb t5, mpb_01 ; alpha/4+1
    movdqa p2, [r4+r1]
    movdqa q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand t0, mask0
    pand t4, t0
    pand t2, t0
    mova mask1q, t4
    mova mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova m4, t5
    mova mask0, m7
    pavgb m4, [pb_00 GLOBAL]
    pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand m4, m6
    mova mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand m4, m6
    mova mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1
    movsxd r10, r1d
    lea r11, [r10*3]
    lea rax, [r0-4]
    lea r9, [r0-4+r11]
    sub rsp, 0x88
%define pix_tmp rsp

; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea rax, [rax+r10*8]
    lea r9, [r9+r10*8]
    TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    mov r1, 0x10
    call x264_deblock_v_luma_intra_%1

; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea r9, [rax+r11]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
    shl r10, 3
    sub rax, r10
    sub r9, r10
    shr r10, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
    add rsp, 0x88
    ret
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB rsp, pad
%define pix_tmp rsp

; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH r0
    call x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add dword [rsp], 8 ; pix_tmp+8
    call x264_deblock_%2_luma_intra_%1
%endif
    ADD esp, 16

    mov r1, r1m
    mov r0, r0m
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif