annotate x86/h264_deblock.asm @ 12454:f4355cd85faa libavcodec

Port latest x264 deblock asm (before they moved to using NV12 as internal format), LGPL'ed with permission from Jason and Loren. This includes mmx2 code, so remove inline asm from h264dsp_mmx.c accordingly.
author rbultje
date Fri, 03 Sep 2010 16:52:46 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12454
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
1 ;*****************************************************************************
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
2 ;* MMX/SSE2-optimized H.264 deblocking code
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
3 ;*****************************************************************************
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
4 ;* Copyright (C) 2005-2008 x264 project
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
5 ;*
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
7 ;* Jason Garrett-Glaser <darkshikari@gmail.com>
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
8 ;*
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
9 ;* This file is part of FFmpeg.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
10 ;*
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
11 ;* FFmpeg is free software; you can redistribute it and/or
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
12 ;* modify it under the terms of the GNU Lesser General Public
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
13 ;* License as published by the Free Software Foundation; either
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
14 ;* version 2.1 of the License, or (at your option) any later version.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
15 ;*
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
16 ;* FFmpeg is distributed in the hope that it will be useful,
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
19 ;* Lesser General Public License for more details.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
20 ;*
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
21 ;* You should have received a copy of the GNU Lesser General Public
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
22 ;* License along with FFmpeg; if not, write to the Free Software
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
24 ;******************************************************************************
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
25
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
26 %include "x86inc.asm"
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
27 %include "x86util.asm"
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
28
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
29 SECTION_RODATA
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
30
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
31 cextern pb_0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
32 cextern pb_1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
33 cextern pb_3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
34 cextern pb_A1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
35
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
36 SECTION .text
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
37
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
38 ; expands to [base],...,[base+7*stride]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
39 %define PASS8ROWS(base, base3, stride, stride3) \
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
40 [base], [base+stride], [base+stride*2], [base3], \
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
41 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
42
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
43 ; in: 8 rows of 4 bytes in %1..%8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
44 ; out: 4 rows of 8 bytes in m0..m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
45 %macro TRANSPOSE4x8_LOAD 8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
46 movd m0, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
47 movd m2, %2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
48 movd m1, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
49 movd m3, %4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
50 punpcklbw m0, m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
51 punpcklbw m1, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
52 movq m2, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
53 punpcklwd m0, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
54 punpckhwd m2, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
55
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
56 movd m4, %5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
57 movd m6, %6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
58 movd m5, %7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
59 movd m7, %8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
60 punpcklbw m4, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
61 punpcklbw m5, m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
62 movq m6, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
63 punpcklwd m4, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
64 punpckhwd m6, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
65
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
66 movq m1, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
67 movq m3, m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
68 punpckldq m0, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
69 punpckhdq m1, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
70 punpckldq m2, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
71 punpckhdq m3, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
72 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
73
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
74 ; in: 4 rows of 8 bytes in m0..m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
75 ; out: 8 rows of 4 bytes in %1..%8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
76 %macro TRANSPOSE8x4_STORE 8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
77 movq m4, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
78 movq m5, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
79 movq m6, m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
80 punpckhdq m4, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
81 punpckhdq m5, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
82 punpckhdq m6, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
83
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
84 punpcklbw m0, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
85 punpcklbw m2, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
86 movq m1, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
87 punpcklwd m0, m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
88 punpckhwd m1, m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
89 movd %1, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
90 punpckhdq m0, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
91 movd %2, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
92 movd %3, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
93 punpckhdq m1, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
94 movd %4, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
95
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
96 punpckhdq m3, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
97 punpcklbw m4, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
98 punpcklbw m6, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
99 movq m5, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
100 punpcklwd m4, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
101 punpckhwd m5, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
102 movd %5, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
103 punpckhdq m4, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
104 movd %6, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
105 movd %7, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
106 punpckhdq m5, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
107 movd %8, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
108 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
109
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
110 %macro SBUTTERFLY3 4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
111 movq %4, %2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
112 punpckl%1 %2, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
113 punpckh%1 %4, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
114 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
115
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
116 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
117 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
118 %macro TRANSPOSE6x8_MEM 9
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
119 movq m0, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
120 movq m1, %2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
121 movq m2, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
122 movq m3, %4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
123 movq m4, %5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
124 movq m5, %6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
125 movq m6, %7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
126 SBUTTERFLY3 bw, m0, m1, m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
127 SBUTTERFLY3 bw, m2, m3, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
128 SBUTTERFLY3 bw, m4, m5, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
129 movq [%9+0x10], m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
130 SBUTTERFLY3 bw, m6, %8, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
131 SBUTTERFLY3 wd, m0, m2, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
132 SBUTTERFLY3 wd, m4, m6, m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
133 punpckhdq m0, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
134 movq [%9+0x00], m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
135 SBUTTERFLY3 wd, m7, [%9+0x10], m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
136 SBUTTERFLY3 wd, m3, m5, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
137 SBUTTERFLY3 dq, m7, m3, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
138 SBUTTERFLY3 dq, m1, m2, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
139 punpckldq m6, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
140 movq [%9+0x10], m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
141 movq [%9+0x20], m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
142 movq [%9+0x30], m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
143 movq [%9+0x40], m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
144 movq [%9+0x50], m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
145 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
146
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
147 ; in: 8 rows of 8 in %1..%8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
148 ; out: 8 rows of 8 in %9..%16
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
149 %macro TRANSPOSE8x8_MEM 16
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
150 movq m0, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
151 movq m1, %2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
152 movq m2, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
153 movq m3, %4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
154 movq m4, %5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
155 movq m5, %6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
156 movq m6, %7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
157 SBUTTERFLY3 bw, m0, m1, m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
158 SBUTTERFLY3 bw, m2, m3, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
159 SBUTTERFLY3 bw, m4, m5, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
160 SBUTTERFLY3 bw, m6, %8, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
161 movq %9, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
162 SBUTTERFLY3 wd, m0, m2, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
163 SBUTTERFLY3 wd, m4, m6, m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
164 SBUTTERFLY3 wd, m7, m1, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
165 movq %11, m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
166 movq m2, %9
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
167 SBUTTERFLY3 wd, m2, m5, m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
168 SBUTTERFLY3 dq, m0, m4, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
169 SBUTTERFLY3 dq, m7, m2, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
170 movq %9, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
171 movq %10, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
172 movq %13, m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
173 movq %14, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
174 SBUTTERFLY3 dq, m3, %11, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
175 SBUTTERFLY3 dq, m6, m1, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
176 movq %11, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
177 movq %12, m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
178 movq %15, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
179 movq %16, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
180 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
181
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
182 ; out: %4 = |%1-%2|>%3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
183 ; clobbers: %5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
184 %macro DIFF_GT 5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
185 mova %5, %2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
186 mova %4, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
187 psubusb %5, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
188 psubusb %4, %2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
189 por %4, %5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
190 psubusb %4, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
191 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
192
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
193 ; out: %4 = per-byte mask: 0xFF where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
194 ; clobbers: %5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
195 %macro DIFF_GT2 5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
196 mova %5, %2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
197 mova %4, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
198 psubusb %5, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
199 psubusb %4, %2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
200 psubusb %5, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
201 psubusb %4, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
202 pcmpeqb %4, %5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
203 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
204
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
205 %macro SPLATW 1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
206 %ifidn m0, xmm0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
207 pshuflw %1, %1, 0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
208 punpcklqdq %1, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
209 %else
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
210 pshufw %1, %1, 0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
211 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
212 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
213
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
214 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 (%1/%2 are used as movd sources)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
215 ; out: m5=beta-1, m7=mask, %3=alpha-1 (%3 is optional: it is written only when a third argument is passed)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
216 ; clobbers: m4,m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
217 %macro LOAD_MASK 2-3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
218 movd m4, %1 ; m4 low dword = alpha-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
219 movd m5, %2 ; m5 low dword = beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
220 SPLATW m4 ; replicate the low word into every word of m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
221 SPLATW m5 ; replicate the low word into every word of m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
222 packuswb m4, m4 ; 16x alpha-1 (words packed to bytes with unsigned saturation)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
223 packuswb m5, m5 ; 16x beta-1 (words packed to bytes with unsigned saturation)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
224 %if %0>2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
225 mova %3, m4 ; hand the broadcast alpha-1 to the caller before m4 is reused below
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
226 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
227 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 (DIFF_GT is defined outside this excerpt; result lands in m7, m6 is scratch)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
228 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 (result in m4, which no longer holds alpha-1)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
229 por m7, m4 ; accumulate the second test into m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
230 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
231 por m7, m4 ; accumulate the third test into m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
232 pxor m6, m6 ; m6 = 0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
233 pcmpeqb m7, m6 ; m7 = 0xFF in every byte where the OR of the three test results is zero, 0x00 elsewhere
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
234 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
235
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
236 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
237 ; out: m1=p0' m2=q0' (p0/q0 moved towards each other by a delta whose magnitude is limited to m7)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
238 ; clobbers: m0,3-6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
239 %macro DEBLOCK_P0_Q0 0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
240 mova m5, m1 ; m5 = p0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
241 pxor m5, m2 ; p0^q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
242 pand m5, [pb_1] ; (p0^q0)&1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
243 pcmpeqb m4, m4 ; m4 = all ones
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
244 pxor m3, m4 ; m3 = ~q1 (= 255-q1 per byte)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
245 pavgb m3, m0 ; (p1 - q1 + 256)>>1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
246 pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
247 pxor m4, m1 ; m4 = ~p0 (m4 was still all ones)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
248 pavgb m4, m2 ; (q0 - p0 + 256)>>1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
249 pavgb m3, m5 ; average the p1/q1 term with the (p0^q0)&1 bit held in m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
250 paddusb m3, m4 ; d+128+33
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
251 mova m6, [pb_A1] ; pb_A1 is defined outside this excerpt; by its name presumably bytes of 0xA1 (= 128+33, the bias above) - confirm
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
252 psubusb m6, m3 ; m6 = saturating (bias - biased d): nonzero only where d is negative, holding |d|
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
253 psubusb m3, [pb_A1] ; m3 = saturating (biased d - bias): nonzero only where d is positive
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
254 pminub m6, m7 ; limit the negative magnitude to tc&mask
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
255 pminub m3, m7 ; limit the positive magnitude to tc&mask
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
256 psubusb m1, m6 ; p0 -= negative magnitude (saturating)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
257 psubusb m2, m3 ; q0 -= positive magnitude (saturating)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
258 paddusb m1, m3 ; p0 += positive magnitude (saturating) -> p0'
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
259 paddusb m2, m6 ; q0 += negative magnitude (saturating) -> q0'
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
260 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
261
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
262 ; in: m1=p0 m2=q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
263 ; %1=q1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp (the callers in this file also pass the p-side equivalents p1/p2/[p2]/[p1])
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
264 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
265 ; clobbers: q2, tmp, tc0 (note: the comments inside the body use the p-side names, p2 standing for %2/%3)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
266 %macro LUMA_Q1 6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
267 mova %6, m1 ; tmp = p0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
268 pavgb %6, m2 ; tmp = (p0+q0+1)>>1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
269 pavgb %2, %6 ; avg(p2,avg(p0,q0))
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
270 pxor %6, %3 ; tmp = avg(p0,q0) ^ the unmodified p2 read from %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
271 pand %6, [pb_1] ; (p2^avg(p0,q0))&1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
272 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
273 mova %6, %1 ; tmp = %1 (the unfiltered neighbour row)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
274 psubusb %6, %5 ; tmp = %1 - tc0 (saturating): lower clip bound
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
275 paddusb %5, %1 ; %5 = %1 + tc0 (saturating): upper clip bound, overwrites tc0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
276 pmaxub %2, %6 ; clip from below
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
277 pminub %2, %5 ; clip from above
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
278 mova %4, %2 ; store the filtered row to %4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
279 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
280
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
281 %ifdef ARCH_X86_64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
282 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
283 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -- filters the edge between row pix-stride (p0) and row pix (q0); rows p1, p0, q0 and q1 are rewritten
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
284 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
285 INIT_XMM
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
286 cglobal x264_deblock_v_luma_sse2, 5,5,10
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
287 movd m8, [r4] ; tc0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
288 lea r4, [r1*3] ; r4 = 3*stride (the tc0 pointer is no longer needed)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
289 dec r2d ; alpha-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
290 neg r4 ; r4 = -3*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
291 dec r3d ; beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
292 add r4, r0 ; pix-3*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
293
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
294 mova m0, [r4+r1] ; p1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
295 mova m1, [r4+2*r1] ; p0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
296 mova m2, [r0] ; q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
297 mova m3, [r0+r1] ; q1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
298 LOAD_MASK r2d, r3d ; m7 = filter mask, m5 = 16x beta-1; clobbers m4, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
299
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
300 punpcklbw m8, m8 ; duplicate each tc0 byte (2x each)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
301 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
302 pcmpeqb m9, m9 ; m9 = all ones
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
303 pcmpeqb m9, m8 ; m9 = 0xFF where the tc0 byte is 0xFF (i.e. -1)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
304 pandn m9, m7 ; m9 = mask with the tc0 == -1 lanes removed
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
305 pand m8, m9 ; m8 = tc0 in active lanes, 0 elsewhere
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
306
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
307 movdqa m3, [r4] ; p2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
308 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 (DIFF_GT2 is defined outside this excerpt; result in m6, m7 is scratch)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
309 pand m6, m9 ; keep the p-side result only in active lanes
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
310 mova m7, m8 ; m7 = tc0, the starting value of tc for the p0/q0 filter
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
311 psubb m7, m6 ; tc -= m6; presumably m6 bytes are 0x00/0xFF, so this adds 1 per selected lane - confirm against DIFF_GT2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
312 pand m6, m8 ; m6 = tc0 restricted to the lanes selected for the p1 filter
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
313 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 ; p side: filtered p1 is written to [r4+r1]; clobbers m3, m6, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
314
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
315 movdqa m4, [r0+2*r1] ; q2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
316 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
317 pand m6, m9 ; keep the q-side result only in active lanes
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
318 pand m8, m6 ; m8 = tc0 restricted to the lanes selected for the q1 filter
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
319 psubb m7, m6 ; second tc adjustment, same form as for the p side
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
320 mova m3, [r0+r1] ; reload q1 (m3 was reused for p2 and as scratch above)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
321 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 ; q side: filtered q1 is written to [r0+r1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
322
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
323 DEBLOCK_P0_Q0 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=tc -> out: m1=p0' m2=q0'
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
324 mova [r4+2*r1], m1 ; store p0'
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
325 mova [r0], m2 ; store q0'
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
326 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
327
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
328 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
329 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -- transposes the pixels around the edge into a stack buffer, runs x264_deblock_v_luma_sse2 on it, and transposes the changed pixels back
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
330 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
331 INIT_MMX
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
332 cglobal x264_deblock_h_luma_sse2, 5,7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
333 movsxd r10, r1d ; r10 = stride, sign-extended to 64 bits
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
334 lea r11, [r10+r10*2] ; r11 = 3*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
335 lea r6, [r0-4] ; r6 = pix-4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
336 lea r5, [r0-4+r11] ; r5 = pix-4 + 3*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
337 %ifdef WIN64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
338 sub rsp, 0x98 ; tmp buffer plus the 0x30 bytes below it; [rsp+0x20] carries the 5th argument for the call below
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
339 %define pix_tmp rsp+0x30
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
340 %else
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
341 sub rsp, 0x68 ; room for the tmp buffer (6 rows of 0x10 bytes are addressed below)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
342 %define pix_tmp rsp
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
343 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
344
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
345 ; transpose 6x16 -> tmp space
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
346 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp ; first 8 picture rows (macros defined outside this excerpt)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
347 lea r6, [r6+r10*8] ; advance both row pointers by 8 rows
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
348 lea r5, [r5+r10*8]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
349 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 ; next 8 picture rows, 8 bytes further into the tmp buffer
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
350
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
351 ; vertical filter
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
352 ; alpha, beta, tc0 are still in r2d, r3d, r4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
353 ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
354 lea r0, [pix_tmp+0x30] ; pix = tmp row 3; the callee reads [pix] as q0 and pix-3*stride as p2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
355 mov r1d, 0x10 ; stride of the tmp buffer = 16 bytes
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
356 %ifdef WIN64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
357 mov [rsp+0x20], r4 ; tc0 is placed on the stack for the callee
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
358 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
359 call x264_deblock_v_luma_sse2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
360
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
361 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
362 add r6, 2 ; r6/r5 still address the second group of 8 rows; move from column pix-4 to pix-2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
363 add r5, 2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
364 movq m0, [pix_tmp+0x18] ; upper 8 bytes of tmp rows 1..4 (the rows the v filter stores as p1, p0, q0, q1)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
365 movq m1, [pix_tmp+0x28]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
366 movq m2, [pix_tmp+0x38]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
367 movq m3, [pix_tmp+0x48]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
368 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
369
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
370 shl r10, 3 ; r10 = 8*stride (temporarily)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
371 sub r6, r10 ; step both row pointers back to the first group of 8 rows
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
372 sub r5, r10
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
373 sar r10, 3 ; restore stride; arithmetic shift so a negative (sign-extended) stride is recovered exactly
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
374 movq m0, [pix_tmp+0x10] ; lower 8 bytes of tmp rows 1..4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
375 movq m1, [pix_tmp+0x20]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
376 movq m2, [pix_tmp+0x30]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
377 movq m3, [pix_tmp+0x40]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
378 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
379
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
380 %ifdef WIN64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
381 add rsp, 0x98 ; release the stack space reserved above
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
382 %else
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
383 add rsp, 0x68 ; release the stack space reserved above
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
384 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
385 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
386
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
387 %else
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
388
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
389 %macro DEBLOCK_LUMA 3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
390 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
391 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
392 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
393 cglobal x264_deblock_%2_luma_%1, 5,5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
394 lea r4, [r1*3]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
395 dec r2 ; alpha-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
396 neg r4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
397 dec r3 ; beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
398 add r4, r0 ; pix-3*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
399 %assign pad 2*%3+12-(stack_offset&15)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
400 SUB esp, pad
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
401
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
402 mova m0, [r4+r1] ; p1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
403 mova m1, [r4+2*r1] ; p0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
404 mova m2, [r0] ; q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
405 mova m3, [r0+r1] ; q1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
406 LOAD_MASK r2, r3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
407
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
408 mov r3, r4mp
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
409 movd m4, [r3] ; tc0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
410 punpcklbw m4, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
411 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
412 mova [esp+%3], m4 ; tc
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
413 pcmpeqb m3, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
414 pcmpgtb m4, m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
415 pand m4, m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
416 mova [esp], m4 ; mask
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
417
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
418 mova m3, [r4] ; p2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
419 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
420 pand m6, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
421 pand m4, [esp+%3] ; tc
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
422 mova m7, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
423 psubb m7, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
424 pand m6, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
425 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
426
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
427 mova m4, [r0+2*r1] ; q2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
428 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
429 mova m5, [esp] ; mask
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
430 pand m6, m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
431 mova m5, [esp+%3] ; tc
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
432 pand m5, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
433 psubb m7, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
434 mova m3, [r0+r1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
435 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
436
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
437 DEBLOCK_P0_Q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
438 mova [r4+2*r1], m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
439 mova [r0], m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
440 ADD esp, pad
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
441 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
442
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;
; x86_32 horizontal-direction luma deblock, built on top of the
; vertical-direction filter emitted earlier by the same macro:
;   1. transpose 16 picture rows (starting 4 bytes left of pix) into a
;      scratch buffer on the stack whose rows are 16 bytes apart,
;   2. call the vertical luma filter on that buffer with stride 16,
;   3. transpose the four middle scratch rows back over the picture.
; The symbol suffix and the vertical filter that gets called are taken from
; the arguments of the enclosing macro.
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    ; No arguments are auto-loaded (0 above), so fetch them from the stack.
    mov     r3, r1m                     ; r3 = stride
    mov     r0, r0mp                    ; r0 = pix
    lea     r4, [r3*3]                  ; r4 = 3*stride
    sub     r0, 4                       ; r0 = pix-4
    lea     r1, [r0+r4]                 ; r1 = pix-4 + 3*stride
%assign pad 0x78-(stack_offset&15)
    SUB     esp, pad                    ; reserve the scratch area
    ; NOTE(review): the +12 presumably makes pix_tmp 16-byte aligned once
    ; the return address and stack_offset are accounted for -- confirm.
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space (two batches of 8 picture rows)
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea     r1, [r1+r3*8]               ; advance both row pointers by 8 rows
    lea     r0, [r0+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    ; r0 is computed before the PUSHes because pix_tmp is esp-relative.
    lea     r0, [pix_tmp+0x30]
    PUSH    dword r4m                   ; tc0
    PUSH    dword r3m                   ; beta
    PUSH    dword r2m                   ; alpha
    PUSH    dword 16                    ; stride of the scratch buffer
    PUSH    dword r0                    ; pix = pix_tmp+0x30
    call    x264_deblock_%2_luma_%1
%ifidn %2, v8
    ; The v8 filter is called a second time with the pushed arguments
    ; patched in place: pix advanced by 8 bytes, tc0 advanced by 2 entries.
    add     dword [esp   ], 8 ; pix_tmp+0x38
    add     dword [esp+16], 2 ; tc0+2
    call    x264_deblock_%2_luma_%1
%endif
    ADD     esp, 20                     ; drop the five pushed dwords

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov     r0, r0mp
    sub     r0, 2                       ; r0 = pix-2
    lea     r1, [r0+r4]                 ; r1 = pix-2 + 3*stride

    ; first 8 picture rows: low 8 bytes of scratch rows 1..4
    movq    m0, [pix_tmp+0x10]
    movq    m1, [pix_tmp+0x20]
    movq    m2, [pix_tmp+0x30]
    movq    m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ; next 8 picture rows: high 8 bytes of the same scratch rows
    lea     r0, [r0+r3*8]
    lea     r1, [r1+r3*8]
    movq    m0, [pix_tmp+0x18]
    movq    m1, [pix_tmp+0x28]
    movq    m2, [pix_tmp+0x38]
    movq    m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD     esp, pad                    ; release the scratch area
    RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
499 %endmacro ; DEBLOCK_LUMA
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
500
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; Emit the x86_32 luma deblock functions twice.
; Arguments to DEBLOCK_LUMA: symbol suffix, name of the vertical filter the
; horizontal wrapper calls, and the byte offset used for the on-stack tc slot.

; MMX registers: the vertical filter is the "v8" flavour, which the
; horizontal wrapper invokes twice.
INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8

; XMM registers: a single "v" call per edge.
INIT_XMM
DEBLOCK_LUMA sse2, v, 16
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
505
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
506 %endif ; ARCH
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
507
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
508
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
509
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; LUMA_INTRA_P012 dst_p0, dst_p1, dst_p2, src_p3
;
; Computes the intra-filtered p0/p1/p2 for one side of an edge.
;   arg 1..3 : memory operands that receive the new p0, p1, p2
;   arg 4    : memory operand holding p3 (read only)
; Expects the caller to have defined p0, p1, p2, q0, q1, the temporaries
; t0..t5, the masks mask0 / mask1p and the constants mpb_0 / mpb_1.
; Clobbers t0..t5.
;
; Selection per byte (mask1p is expected to be a subset of mask0, which is
; how the visible caller builds it):
;   mask1p set             -> p0'a, p1', p2'
;   only mask0 set         -> p0'b, p1 and p2 unchanged
;   neither set            -> everything unchanged
;
; Each exact quotient is produced as "cascaded pavgb estimate" minus a
; one-bit correction: the wrapped byte sum is shifted, its low bit is
; compared (pxor / pand mpb_1) with the estimate, and the difference is
; subtracted.
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    ; --- estimate of p1' and the shared (p0+q0+1)/2 term ---
    mova    t0, p2
    pavgb   t0, p1
    mova    t1, p0
    pavgb   t1, q0
    mova    t5, t1                      ; keep (p0+q0+1)/2 for the p0'a estimate
    pavgb   t0, t1                      ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2

    ; --- wrapped sum p2+p1+p0+q0, kept in t3 and t4 for later ---
    mova    t2, p2
    paddb   t2, p1
    mova    t3, p0
    paddb   t3, q0
    paddb   t2, t3
    mova    t4, t2
    mova    t3, t2

    ; --- rounding correction for p1' ---
    psrlw   t2, 1
    pavgb   t2, mpb_0
    pxor    t2, t0
    pand    t2, mpb_1
    psubb   t0, t2                      ; p1' = (p2+p1+p0+q0+2)/4;

    ; --- p0'a: estimate in t1, wrapped sum in t3 ---
    mova    t1, p2
    pavgb   t1, q1
    mova    t2, p2
    psubb   t2, q1
    paddb   t3, t3
    psubb   t3, t2                      ; p2+2*p1+2*p0+2*q0+q1
    pand    t2, mpb_1
    psubb   t1, t2
    pavgb   t1, p1
    pavgb   t1, t5                      ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw   t3, 2
    pavgb   t3, mpb_0
    pxor    t3, t1
    pand    t3, mpb_1
    psubb   t1, t3                      ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    ; --- p0'b ---
    mova    t3, p0
    pxor    t3, q1
    pand    t3, mpb_1                   ; low bit of p0^q1: turns pavgb's round-up into round-down
    mova    t2, p0
    pavgb   t2, q1
    psubb   t2, t3
    pavgb   t2, p1                      ; p0'b = (2*p1+p0+q0+2)/4

    ; --- branch-free select between p0'a, p0'b and the original p0 ---
    pxor    t1, t2                      ; p0'a ^ p0'b
    pand    t1, mask1p
    pxor    t2, p0                      ; p0'b ^ p0
    pand    t2, mask0
    pxor    t1, t2
    pxor    t1, p0
    mova    %1, t1                      ; store p0

    ; --- p2': estimate in t1, wrapped sum in t2 ---
    mova    t1, %4                      ; p3
    mova    t2, t1
    pavgb   t1, p2
    pavgb   t1, t0                      ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb   t2, p2
    paddb   t2, t2
    paddb   t2, t4                      ; 2*p3+3*p2+p1+p0+q0
    psrlw   t2, 2
    pavgb   t2, mpb_0
    pxor    t2, t1
    pand    t2, mask_fix_dummy_never_used_placeholder
    psubb   t1, t2                      ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    ; --- keep the originals wherever mask1p is clear ---
    pxor    t0, p1
    pand    t0, mask1p
    pxor    t0, p1
    pxor    t1, p2
    pand    t1, mask1p
    pxor    t1, p2
    mova    %2, t0                      ; store p1
    mova    %3, t1                      ; store p2
%endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
584
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; LUMA_INTRA_SWAP_PQ
;
; Re-points the p*/q* aliases at the opposite side of the edge and makes
; mask1p refer to mask1q, so that a following LUMA_INTRA_P012 expansion
; filters the q pixels with the same code. Emits no instructions; the
; definitions are lazy single-line macros, so their order does not matter.
%macro LUMA_INTRA_SWAP_PQ 0
    %define p0      m2
    %define p1      m3
    %define p2      q2
    %define q0      m1
    %define q1      m0
    %define mask1p  mask1q
%endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
593
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
594 %macro DEBLOCK_LUMA_INTRA 2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
595 %define p1 m0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
596 %define p0 m1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
597 %define q0 m2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
598 %define q1 m3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
599 %define t0 m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
600 %define t1 m5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
601 %define t2 m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
602 %define t3 m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
603 %ifdef ARCH_X86_64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
604 %define p2 m8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
605 %define q2 m9
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
606 %define t4 m10
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
607 %define t5 m11
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
608 %define mask0 m12
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
609 %define mask1p m13
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
610 %define mask1q [rsp-24]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
611 %define mpb_0 m14
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
612 %define mpb_1 m15
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
613 %else
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
614 %define spill(x) [esp+16*x+((stack_offset+4)&15)]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
615 %define p2 [r4+r1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
616 %define q2 [r0+2*r1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
617 %define t4 spill(0)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
618 %define t5 spill(1)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
619 %define mask0 spill(2)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
620 %define mask1p spill(3)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
621 %define mask1q spill(4)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
622 %define mpb_0 [pb_0]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
623 %define mpb_1 [pb_1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
624 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
625
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
626 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
627 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
628 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
629 cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
630 %ifndef ARCH_X86_64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
631 sub esp, 0x60
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
632 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
633 lea r4, [r1*4]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
634 lea r5, [r1*3] ; 3*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
635 dec r2d ; alpha-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
636 jl .end
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
637 neg r4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
638 dec r3d ; beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
639 jl .end
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
640 add r4, r0 ; pix-4*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
641 mova p1, [r4+2*r1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
642 mova p0, [r4+r5]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
643 mova q0, [r0]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
644 mova q1, [r0+r1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
645 %ifdef ARCH_X86_64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
646 pxor mpb_0, mpb_0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
647 mova mpb_1, [pb_1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
648 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
649 SWAP 7, 12 ; m12=mask0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
650 pavgb t5, mpb_0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
651 pavgb t5, mpb_1 ; alpha/4+1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
652 movdqa p2, [r4+r1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
653 movdqa q2, [r0+2*r1]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
654 DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
655 DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
656 DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
657 pand t0, mask0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
658 pand t4, t0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
659 pand t2, t0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
660 mova mask1q, t4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
661 mova mask1p, t2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
662 %else
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
663 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
664 mova m4, t5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
665 mova mask0, m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
666 pavgb m4, [pb_0]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
667 pavgb m4, [pb_1] ; alpha/4+1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
668 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
669 pand m6, mask0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
670 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
671 pand m4, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
672 mova mask1p, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
673 DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
674 pand m4, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
675 mova mask1q, m4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
676 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
677 LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
678 LUMA_INTRA_SWAP_PQ
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
679 LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
680 .end:
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
681 %ifndef ARCH_X86_64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
682 add esp, 0x60
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
683 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
684 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
685
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
686 INIT_MMX
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
687 %ifdef ARCH_X86_64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
688 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
689 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
690 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
691 cglobal x264_deblock_h_luma_intra_%1, 4,7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
692 movsxd r10, r1d
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
693 lea r11, [r10*3]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
694 lea r6, [r0-4]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
695 lea r5, [r0-4+r11]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
696 sub rsp, 0x88
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
697 %define pix_tmp rsp
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
698
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
699 ; transpose 8x16 -> tmp space
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
700 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
701 lea r6, [r6+r10*8]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
702 lea r5, [r5+r10*8]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
703 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
704
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
705 lea r0, [pix_tmp+0x40]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
706 mov r1, 0x10
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
707 call x264_deblock_v_luma_intra_%1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
708
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
709 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
710 lea r5, [r6+r11]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
711 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
712 shl r10, 3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
713 sub r6, r10
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
714 sub r5, r10
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
715 shr r10, 3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
716 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
717 add rsp, 0x88
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
718 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
719 %else
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
720 cglobal x264_deblock_h_luma_intra_%1, 2,4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
721 lea r3, [r1*3]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
722 sub r0, 4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
723 lea r2, [r0+r3]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
724 %assign pad 0x8c-(stack_offset&15)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
725 SUB rsp, pad
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
726 %define pix_tmp rsp
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
727
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
728 ; transpose 8x16 -> tmp space
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
729 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
730 lea r0, [r0+r1*8]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
731 lea r2, [r2+r1*8]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
732 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
733
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
734 lea r0, [pix_tmp+0x40]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
735 PUSH dword r3m
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
736 PUSH dword r2m
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
737 PUSH dword 16
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
738 PUSH r0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
739 call x264_deblock_%2_luma_intra_%1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
740 %ifidn %2, v8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
741 add dword [rsp], 8 ; pix_tmp+8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
742 call x264_deblock_%2_luma_intra_%1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
743 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
744 ADD esp, 16
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
745
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
746 mov r1, r1m
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
747 mov r0, r0mp
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
748 lea r3, [r1*3]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
749 sub r0, 4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
750 lea r2, [r0+r3]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
751 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
752 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
753 lea r0, [r0+r1*8]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
754 lea r2, [r2+r1*8]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
755 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
756 ADD rsp, pad
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
757 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
758 %endif ; ARCH_X86_64
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
759 %endmacro ; DEBLOCK_LUMA_INTRA
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
760
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; Instantiate the luma intra filters. %1 becomes the symbol suffix
; (x264_deblock_h_luma_intra_%1); %2 names the vertical routine that the
; 32-bit horizontal filter calls (x264_deblock_%2_luma_intra_%1).
761 INIT_XMM ; NOTE(review): presumably selects XMM registers for the mN aliases — confirm against x86inc
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
762 DEBLOCK_LUMA_INTRA sse2, v ; %1=sse2, %2=v
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
763 %ifndef ARCH_X86_64 ; the mmxext instance is only assembled when ARCH_X86_64 is not defined
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
764 INIT_MMX
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
765 DEBLOCK_LUMA_INTRA mmxext, v8 ; %2=v8: the h filter then calls the v8 routine twice, the second time at pix_tmp+8
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
766 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
767
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
768
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
769
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
770 INIT_MMX ; chroma code below moves 8 bytes per mN register (movq); NOTE(review): presumably maps mN to MMX registers — confirm against x86inc
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
771
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; CHROMA_V_START: common setup for the vertical chroma filters.
; On exit: r2d = alpha-1, r3d = beta-1, t5 = r0 - 2*r1; r0 and r1 are unchanged.
; NOTE(review): r1 is used at full register width here, whereas the 64-bit
; luma h filter in this file sign-extends r1d with movsxd first — confirm the
; upper half of r1 is valid for an `int stride` argument on x86-64.
772 %macro CHROMA_V_START 0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
773 dec r2d ; alpha-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
774 dec r3d ; beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
775 mov t5, r0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
776 sub t5, r1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
777 sub t5, r1 ; t5 = r0 - 2*r1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
778 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
779
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; CHROMA_H_START: common setup for the horizontal chroma filters.
; On exit: r2d and r3d are each decremented by one, t6 = 3*r1,
; t5 = (entry r0) - 2, r0 = t5 + 3*r1.
; The callers pass (t5, r0, r1, t6) to PASS8ROWS.
780 %macro CHROMA_H_START 0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
781 dec r2d ; alpha-1 (same adjustment as CHROMA_V_START)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
782 dec r3d ; beta-1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
783 sub r0, 2 ; start 2 bytes before the incoming pointer
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
784 lea t6, [r1*3] ; t6 = 3*r1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
785 mov t5, r0 ; t5 = adjusted base pointer
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
786 add r0, t6 ; r0 = t5 + 3*r1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
787 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
788
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; Scratch aliases consumed by CHROMA_V_START / CHROMA_H_START. The inter
; chroma entry points that follow declare 5 arguments (cglobal 5,6 / 5,7),
; so r5 and r6 are the registers used for scratch here.
789 %define t5 r5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
790 %define t6 r6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
791
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
792 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
793 ; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; Loads four 8-byte rows around pix (pix-2*stride .. pix+stride) into m0..m3,
; runs x264_chroma_inter_body_mmxext on them, and stores m1 and m2 back to
; the rows at pix-stride and pix. The rows loaded into m0 and m3 are never
; written back.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
794 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
795 cglobal x264_deblock_v_chroma_mmxext, 5,6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
796 CHROMA_V_START
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
797 movq m0, [t5] ; row at pix-2*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
798 movq m1, [t5+r1] ; row at pix-stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
799 movq m2, [r0] ; row at pix
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
800 movq m3, [r0+r1] ; row at pix+stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
801 call x264_chroma_inter_body_mmxext
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
802 movq [t5+r1], m1 ; store the row at pix-stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
803 movq [r0], m2 ; store the row at pix
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
804 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
805
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
806 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
807 ; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
; Horizontal counterpart of the v filter: loads through TRANSPOSE4x8_LOAD,
; calls the same x264_chroma_inter_body_mmxext, then writes back through
; TRANSPOSE8x4_STORE. m0 and m3 are parked in buf0/buf1 across the call and
; reloaded before the store.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
808 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
809 cglobal x264_deblock_h_chroma_mmxext, 5,7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
810 %ifdef ARCH_X86_64
; 64-bit: both 8-byte scratch slots sit below rsp and no stack space is
; reserved for them. The `call` below pushes its return address at [rsp-8],
; directly above buf1 ([rsp-16]..[rsp-9]), so the slots are not overwritten by it.
; NOTE(review): presumably relies on a red zone below rsp — confirm this is
; acceptable on every 64-bit ABI this file is built for (e.g. Win64).
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
811 %define buf0 [rsp-24]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
812 %define buf1 [rsp-16]
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
813 %else
; 32-bit: scratch is r0m / r2m.
; NOTE(review): presumably these are the stack slots of arguments 0 and 2
; (x86inc), so each 8-byte movq would also cover the following argument slot — confirm.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
814 %define buf0 r0m
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
815 %define buf1 r2m
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
816 %endif
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
817 CHROMA_H_START
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
818 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
819 movq buf0, m0 ; park m0 across the call
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
820 movq buf1, m3 ; park m3 across the call
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
821 call x264_chroma_inter_body_mmxext
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
822 movq m0, buf0 ; restore m0 for the store macro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
823 movq m3, buf1 ; restore m3 for the store macro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
824 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
825 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
826
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; Shared body of the inter chroma filters; reached only via `call` from the
; v and h entry points above, hence the plain `ret`.
; In:  m0..m3 = the four rows/columns loaded by the caller,
;      r2d/r3d = alpha-1/beta-1 (set up by CHROMA_*_START), r4 = tc0 pointer.
; Out: m1, m2 are what the callers store back.
827 ALIGN 16
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
828 x264_chroma_inter_body_mmxext:
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
829 LOAD_MASK r2d, r3d
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
830 movd m6, [r4] ; tc0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
831 punpcklbw m6, m6 ; duplicate each of the 4 tc0 bytes: one tc0 value per pair of adjacent bytes
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
832 pand m7, m6 ; m7 &= tc0 (m7 presumably holds the mask produced by LOAD_MASK — confirm)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
833 DEBLOCK_P0_Q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
834 ret
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
835
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
836
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
837
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
838 ; in: %1=p0 %2=p1 %3=q1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
839 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
; Result is written to %1; m4 is clobbered; %2 and %3 are only read.
; pavgb rounds up, so the first average is corrected down by (p0^q1)&1 to
; give the exact floor((p0+q1)/2) before the second, rounding average.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
840 %macro CHROMA_INTRA_P0 3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
841 movq m4, %1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
842 pxor m4, %3
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
843 pand m4, [pb_1] ; m4 = (p0^q1)&1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
844 pavgb %1, %3 ; %1 = (p0 + q1 + 1) >> 1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
845 psubusb %1, m4 ; remove the round-up when p0+q1 is odd: %1 = (p0 + q1) >> 1
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
846 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
847 %endmacro
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
848
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; Rebind the scratch aliases used by CHROMA_V_START / CHROMA_H_START. The
; intra entry points below declare only 4 arguments (cglobal 4,5 / 4,6),
; so r4 and r5 serve as scratch here.
849 %define t5 r4
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
850 %define t6 r5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
851
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
852 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
853 ; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
; Loads four 8-byte rows around pix into m0..m3 (p1, p0, q0, q1 as consumed
; by x264_chroma_intra_body_mmxext), filters them, and stores m1 (p0) and
; m2 (q0) back. The p1 and q1 rows are only read.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
854 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
855 cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
856 CHROMA_V_START
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
857 movq m0, [t5] ; p1: row at pix-2*stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
858 movq m1, [t5+r1] ; p0: row at pix-stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
859 movq m2, [r0] ; q0: row at pix
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
860 movq m3, [r0+r1] ; q1: row at pix+stride
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
861 call x264_chroma_intra_body_mmxext
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
862 movq [t5+r1], m1 ; store filtered p0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
863 movq [r0], m2 ; store filtered q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
864 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
865
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
866 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
867 ; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
; Horizontal counterpart of the v intra filter: load through
; TRANSPOSE4x8_LOAD, run x264_chroma_intra_body_mmxext, write back through
; TRANSPOSE8x4_STORE. Unlike the inter h filter, m0/m3 are not saved around
; the call.
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
868 ;-----------------------------------------------------------------------------
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
869 cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
870 CHROMA_H_START
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
871 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
872 call x264_chroma_intra_body_mmxext
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
873 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
874 RET
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
875
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
; Shared body of the intra chroma filters; reached only via `call` from the
; v and h intra entry points, hence the plain `ret`.
; In:  m0=p1, m1=p0, m2=q0, m3=q1; r2d/r3d as set up by CHROMA_*_START.
; Out: m1, m2 = filtered p0/q0 where m7 is set, original values elsewhere.
; Clobbers m4 (inside CHROMA_INTRA_P0), m5, m6, plus whatever LOAD_MASK uses.
876 ALIGN 16
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
877 x264_chroma_intra_body_mmxext:
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
878 LOAD_MASK r2d, r3d
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
879 movq m5, m1 ; keep the unfiltered p0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
880 movq m6, m2 ; keep the unfiltered q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
881 CHROMA_INTRA_P0 m1, m0, m3 ; m1 = (p0 + q1 + 2*p1 + 2) >> 2
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
882 CHROMA_INTRA_P0 m2, m3, m0 ; m2 = (q0 + p1 + 2*q1 + 2) >> 2 (same macro with the p/q roles mirrored)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
883 psubb m1, m5 ; m1 = filtered p0 - original p0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
884 psubb m2, m6 ; m2 = filtered q0 - original q0
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
885 pand m1, m7 ; zero the delta where m7 is clear (m7 presumably the LOAD_MASK result — confirm)
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
886 pand m2, m7
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
887 paddb m1, m5 ; original + masked delta
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
888 paddb m2, m6
f4355cd85faa Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff changeset
889 ret