Mercurial > libavcodec.hg
comparison x86/h264_deblock_sse2.asm @ 11931:980030a3e315 libavcodec
Update x264asm header files to latest versions.
Modify the asm accordingly.
GLOBAL is now no longer necessary for PIC-compliant loads.
author | darkshikari |
---|---|
date | Wed, 23 Jun 2010 19:20:46 +0000 |
parents | c08ca946c80a |
children | c6509c205922 |
comparison
equal
deleted
inserted
replaced
11930:1e8556438209 | 11931:980030a3e315 |
---|---|
232 ; out: m1=p0' m2=q0' | 232 ; out: m1=p0' m2=q0' |
233 ; clobbers: m0,3-6 | 233 ; clobbers: m0,3-6 |
234 %macro DEBLOCK_P0_Q0 0 | 234 %macro DEBLOCK_P0_Q0 0 |
235 mova m5, m1 | 235 mova m5, m1 |
236 pxor m5, m2 ; p0^q0 | 236 pxor m5, m2 ; p0^q0 |
237 pand m5, [pb_01 GLOBAL] ; (p0^q0)&1 | 237 pand m5, [pb_01] ; (p0^q0)&1 |
238 pcmpeqb m4, m4 | 238 pcmpeqb m4, m4 |
239 pxor m3, m4 | 239 pxor m3, m4 |
240 pavgb m3, m0 ; (p1 - q1 + 256)>>1 | 240 pavgb m3, m0 ; (p1 - q1 + 256)>>1 |
241 pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 | 241 pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 |
242 pxor m4, m1 | 242 pxor m4, m1 |
243 pavgb m4, m2 ; (q0 - p0 + 256)>>1 | 243 pavgb m4, m2 ; (q0 - p0 + 256)>>1 |
244 pavgb m3, m5 | 244 pavgb m3, m5 |
245 paddusb m3, m4 ; d+128+33 | 245 paddusb m3, m4 ; d+128+33 |
246 mova m6, [pb_a1 GLOBAL] | 246 mova m6, [pb_a1] |
247 psubusb m6, m3 | 247 psubusb m6, m3 |
248 psubusb m3, [pb_a1 GLOBAL] | 248 psubusb m3, [pb_a1] |
249 pminub m6, m7 | 249 pminub m6, m7 |
250 pminub m3, m7 | 250 pminub m3, m7 |
251 psubusb m1, m6 | 251 psubusb m1, m6 |
252 psubusb m2, m3 | 252 psubusb m2, m3 |
253 paddusb m1, m3 | 253 paddusb m1, m3 |
261 %macro LUMA_Q1 6 | 261 %macro LUMA_Q1 6 |
262 mova %6, m1 | 262 mova %6, m1 |
263 pavgb %6, m2 | 263 pavgb %6, m2 |
264 pavgb %2, %6 ; avg(p2,avg(p0,q0)) | 264 pavgb %2, %6 ; avg(p2,avg(p0,q0)) |
265 pxor %6, %3 | 265 pxor %6, %3 |
266 pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1 | 266 pand %6, [pb_01] ; (p2^avg(p0,q0))&1 |
267 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 | 267 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 |
268 mova %6, %1 | 268 mova %6, %1 |
269 psubusb %6, %5 | 269 psubusb %6, %5 |
270 paddusb %5, %1 | 270 paddusb %5, %1 |
271 pmaxub %2, %6 | 271 pmaxub %2, %6 |
610 %define t4 spill(0) | 610 %define t4 spill(0) |
611 %define t5 spill(1) | 611 %define t5 spill(1) |
612 %define mask0 spill(2) | 612 %define mask0 spill(2) |
613 %define mask1p spill(3) | 613 %define mask1p spill(3) |
614 %define mask1q spill(4) | 614 %define mask1q spill(4) |
615 %define mpb_00 [pb_00 GLOBAL] | 615 %define mpb_00 [pb_00] |
616 %define mpb_01 [pb_01 GLOBAL] | 616 %define mpb_01 [pb_01] |
617 %endif | 617 %endif |
618 | 618 |
619 ;----------------------------------------------------------------------------- | 619 ;----------------------------------------------------------------------------- |
620 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) | 620 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) |
621 ;----------------------------------------------------------------------------- | 621 ;----------------------------------------------------------------------------- |
635 mova p0, [r4+r5] | 635 mova p0, [r4+r5] |
636 mova q0, [r0] | 636 mova q0, [r0] |
637 mova q1, [r0+r1] | 637 mova q1, [r0+r1] |
638 %ifdef ARCH_X86_64 | 638 %ifdef ARCH_X86_64 |
639 pxor mpb_00, mpb_00 | 639 pxor mpb_00, mpb_00 |
640 mova mpb_01, [pb_01 GLOBAL] | 640 mova mpb_01, [pb_01] |
641 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 | 641 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |
642 SWAP 7, 12 ; m12=mask0 | 642 SWAP 7, 12 ; m12=mask0 |
643 pavgb t5, mpb_00 | 643 pavgb t5, mpb_00 |
644 pavgb t5, mpb_01 ; alpha/4+1 | 644 pavgb t5, mpb_01 ; alpha/4+1 |
645 movdqa p2, [r4+r1] | 645 movdqa p2, [r4+r1] |
654 mova mask1p, t2 | 654 mova mask1p, t2 |
655 %else | 655 %else |
656 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 | 656 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |
657 mova m4, t5 | 657 mova m4, t5 |
658 mova mask0, m7 | 658 mova mask0, m7 |
659 pavgb m4, [pb_00 GLOBAL] | 659 pavgb m4, [pb_00] |
660 pavgb m4, [pb_01 GLOBAL] ; alpha/4+1 | 660 pavgb m4, [pb_01] ; alpha/4+1 |
661 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 | 661 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 |
662 pand m6, mask0 | 662 pand m6, mask0 |
663 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 | 663 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 |
664 pand m4, m6 | 664 pand m4, m6 |
665 mova mask1p, m4 | 665 mova mask1p, m4 |