comparison x86/h264_deblock_sse2.asm @ 11931:980030a3e315 libavcodec

Update x264asm header files to latest versions. Modify the asm accordingly. GLOBAL is now no longer necessary for PIC-compliant loads.
author darkshikari
date Wed, 23 Jun 2010 19:20:46 +0000
parents c08ca946c80a
children c6509c205922
comparison
equal deleted inserted replaced
11930:1e8556438209 11931:980030a3e315
232 ; out: m1=p0' m2=q0' 232 ; out: m1=p0' m2=q0'
233 ; clobbers: m0,3-6 233 ; clobbers: m0,3-6
234 %macro DEBLOCK_P0_Q0 0 234 %macro DEBLOCK_P0_Q0 0
235 mova m5, m1 235 mova m5, m1
236 pxor m5, m2 ; p0^q0 236 pxor m5, m2 ; p0^q0
237 pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 237 pand m5, [pb_01] ; (p0^q0)&1
238 pcmpeqb m4, m4 238 pcmpeqb m4, m4
239 pxor m3, m4 239 pxor m3, m4
240 pavgb m3, m0 ; (p1 - q1 + 256)>>1 240 pavgb m3, m0 ; (p1 - q1 + 256)>>1
241 pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 241 pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
242 pxor m4, m1 242 pxor m4, m1
243 pavgb m4, m2 ; (q0 - p0 + 256)>>1 243 pavgb m4, m2 ; (q0 - p0 + 256)>>1
244 pavgb m3, m5 244 pavgb m3, m5
245 paddusb m3, m4 ; d+128+33 245 paddusb m3, m4 ; d+128+33
246 mova m6, [pb_a1 GLOBAL] 246 mova m6, [pb_a1]
247 psubusb m6, m3 247 psubusb m6, m3
248 psubusb m3, [pb_a1 GLOBAL] 248 psubusb m3, [pb_a1]
249 pminub m6, m7 249 pminub m6, m7
250 pminub m3, m7 250 pminub m3, m7
251 psubusb m1, m6 251 psubusb m1, m6
252 psubusb m2, m3 252 psubusb m2, m3
253 paddusb m1, m3 253 paddusb m1, m3
261 %macro LUMA_Q1 6 261 %macro LUMA_Q1 6
262 mova %6, m1 262 mova %6, m1
263 pavgb %6, m2 263 pavgb %6, m2
264 pavgb %2, %6 ; avg(p2,avg(p0,q0)) 264 pavgb %2, %6 ; avg(p2,avg(p0,q0))
265 pxor %6, %3 265 pxor %6, %3
266 pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 266 pand %6, [pb_01] ; (p2^avg(p0,q0))&1
267 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 267 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
268 mova %6, %1 268 mova %6, %1
269 psubusb %6, %5 269 psubusb %6, %5
270 paddusb %5, %1 270 paddusb %5, %1
271 pmaxub %2, %6 271 pmaxub %2, %6
610 %define t4 spill(0) 610 %define t4 spill(0)
611 %define t5 spill(1) 611 %define t5 spill(1)
612 %define mask0 spill(2) 612 %define mask0 spill(2)
613 %define mask1p spill(3) 613 %define mask1p spill(3)
614 %define mask1q spill(4) 614 %define mask1q spill(4)
615 %define mpb_00 [pb_00 GLOBAL] 615 %define mpb_00 [pb_00]
616 %define mpb_01 [pb_01 GLOBAL] 616 %define mpb_01 [pb_01]
617 %endif 617 %endif
618 618
619 ;----------------------------------------------------------------------------- 619 ;-----------------------------------------------------------------------------
620 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) 620 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
621 ;----------------------------------------------------------------------------- 621 ;-----------------------------------------------------------------------------
635 mova p0, [r4+r5] 635 mova p0, [r4+r5]
636 mova q0, [r0] 636 mova q0, [r0]
637 mova q1, [r0+r1] 637 mova q1, [r0+r1]
638 %ifdef ARCH_X86_64 638 %ifdef ARCH_X86_64
639 pxor mpb_00, mpb_00 639 pxor mpb_00, mpb_00
640 mova mpb_01, [pb_01 GLOBAL] 640 mova mpb_01, [pb_01]
641 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 641 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
642 SWAP 7, 12 ; m12=mask0 642 SWAP 7, 12 ; m12=mask0
643 pavgb t5, mpb_00 643 pavgb t5, mpb_00
644 pavgb t5, mpb_01 ; alpha/4+1 644 pavgb t5, mpb_01 ; alpha/4+1
645 movdqa p2, [r4+r1] 645 movdqa p2, [r4+r1]
654 mova mask1p, t2 654 mova mask1p, t2
655 %else 655 %else
656 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 656 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
657 mova m4, t5 657 mova m4, t5
658 mova mask0, m7 658 mova mask0, m7
659 pavgb m4, [pb_00 GLOBAL] 659 pavgb m4, [pb_00]
660 pavgb m4, [pb_01 GLOBAL] ; alpha/4+1 660 pavgb m4, [pb_01] ; alpha/4+1
661 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 661 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
662 pand m6, mask0 662 pand m6, mask0
663 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 663 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
664 pand m4, m6 664 pand m4, m6
665 mova mask1p, m4 665 mova mask1p, m4