Mercurial > libavcodec.hg
annotate x86/h264_deblock.asm @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | |
children |
rev | line source |
---|---|
12454
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
1 ;***************************************************************************** |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
2 ;* MMX/SSE2-optimized H.264 deblocking code |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
3 ;***************************************************************************** |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
4 ;* Copyright (C) 2005-2008 x264 project |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
5 ;* |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
7 ;* Jason Garrett-Glaser <darkshikari@gmail.com> |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
8 ;* |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
9 ;* This file is part of FFmpeg. |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
10 ;* |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
11 ;* FFmpeg is free software; you can redistribute it and/or |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
12 ;* modify it under the terms of the GNU Lesser General Public |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
13 ;* License as published by the Free Software Foundation; either |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
14 ;* version 2.1 of the License, or (at your option) any later version. |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
15 ;* |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
16 ;* FFmpeg is distributed in the hope that it will be useful, |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
19 ;* Lesser General Public License for more details. |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
20 ;* |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
21 ;* You should have received a copy of the GNU Lesser General Public |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
22 ;* License along with FFmpeg; if not, write to the Free Software |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
24 ;****************************************************************************** |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
25 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
26 %include "x86inc.asm" |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
27 %include "x86util.asm" |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
28 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
29 SECTION_RODATA |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
30 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
31 cextern pb_0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
32 cextern pb_1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
33 cextern pb_3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
34 cextern pb_A1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
35 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
36 SECTION .text |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
37 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
38 ; expands to the eight operands [base],...,[base+7*stride]; this holds when base3 = base+3*stride and stride3 = 3*stride |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
39 %define PASS8ROWS(base, base3, stride, stride3) \ |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
40 [base], [base+stride], [base+stride*2], [base3], \ |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
41 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
42 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
43 ; in: 8 rows of 4 bytes in %1..%8 (each one is read with movd) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
44 ; out: 4 rows of 8 bytes in m0..m3: m<i> holds byte i of input rows %1..%8, in that order from the least-significant byte up; m4-m7 are clobbered |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
45 %macro TRANSPOSE4x8_LOAD 8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
46 movd m0, %1 | ; m0, m2, m1, m3 = input rows 0, 1, 2, 3 (note the register order)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
47 movd m2, %2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
48 movd m1, %3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
49 movd m3, %4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
50 punpcklbw m0, m2 | ; m0 = bytes of rows 0 and 1 interleaved
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
51 punpcklbw m1, m3 | ; m1 = bytes of rows 2 and 3 interleaved
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
52 movq m2, m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
53 punpcklwd m0, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
54 punpckhwd m2, m1 | ; now m0 = bytes 0,1 and m2 = bytes 2,3 of rows 0-3, grouped per byte index
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
55 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
56 movd m4, %5 | ; same sequence for input rows 4-7, using m4-m7
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
57 movd m6, %6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
58 movd m5, %7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
59 movd m7, %8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
60 punpcklbw m4, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
61 punpcklbw m5, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
62 movq m6, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
63 punpcklwd m4, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
64 punpckhwd m6, m5 | ; now m4 = bytes 0,1 and m6 = bytes 2,3 of rows 4-7
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
65 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
66 movq m1, m0 | ; final stage: join the rows 0-3 and rows 4-7 halves with dword unpacks
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
67 movq m3, m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
68 punpckldq m0, m4 | ; m0 = byte 0 of all 8 rows
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
69 punpckhdq m1, m4 | ; m1 = byte 1 of all 8 rows
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
70 punpckldq m2, m6 | ; m2 = byte 2 of all 8 rows
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
71 punpckhdq m3, m6 | ; m3 = byte 3 of all 8 rows
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
72 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
73 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
74 ; in: 4 rows of 8 bytes in m0..m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
75 ; out: 8 rows of 4 bytes in %1..%8 (each one is written with movd): output row j receives byte j of m0, m1, m2, m3; m0-m6 are clobbered |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
76 %macro TRANSPOSE8x4_STORE 8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
77 movq m4, m0 | ; copy rows 0, 1, 2 so that their upper halves can be handled separately
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
78 movq m5, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
79 movq m6, m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
80 punpckhdq m4, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
81 punpckhdq m5, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
82 punpckhdq m6, m6 | ; low dwords of m4, m5, m6 = bytes 4-7 of rows 0, 1, 2
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
83 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
84 punpcklbw m0, m1 | ; interleave bytes 0-3 of rows 0 and 1
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
85 punpcklbw m2, m3 | ; interleave bytes 0-3 of rows 2 and 3
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
86 movq m1, m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
87 punpcklwd m0, m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
88 punpckhwd m1, m2 | ; m0 = output rows 0,1 and m1 = output rows 2,3 (4 bytes each)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
89 movd %1, m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
90 punpckhdq m0, m0 | ; bring the upper output row down into the low dword for movd
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
91 movd %2, m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
92 movd %3, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
93 punpckhdq m1, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
94 movd %4, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
95 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
96 punpckhdq m3, m3 | ; low dword of m3 = bytes 4-7 of row 3; then repeat the steps above on the upper halves
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
97 punpcklbw m4, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
98 punpcklbw m6, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
99 movq m5, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
100 punpcklwd m4, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
101 punpckhwd m5, m6 | ; m4 = output rows 4,5 and m5 = output rows 6,7
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
102 movd %5, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
103 punpckhdq m4, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
104 movd %6, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
105 movd %7, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
106 punpckhdq m5, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
107 movd %8, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
108 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
109 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
110 %macro SBUTTERFLY3 4 | ; SBUTTERFLY3 size, a, b, t: a = punpckl<size>(a, b), t = punpckh<size>(a, b); b is only read, t is overwritten
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
111 movq %4, %2 | ; keep a copy of %2 before it is overwritten
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
112 punpckl%1 %2, %3 | ; %2 = low-half interleave of %2 and %3 (%1 is the suffix: bw, wd or dq at the call sites in this file)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
113 punpckh%1 %4, %3 | ; %4 = high-half interleave of the original %2 and %3
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
114 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
115 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
116 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8; %1..%7 are loaded with movq, %8 is used directly as a punpck source operand |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
117 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]: row k holds byte k+1 of input rows 0..7; [%9+0x10] is also used as scratch before its final store; m0-m7 are clobbered |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
118 %macro TRANSPOSE6x8_MEM 9 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
119 movq m0, %1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
120 movq m1, %2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
121 movq m2, %3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
122 movq m3, %4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
123 movq m4, %5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
124 movq m5, %6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
125 movq m6, %7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
126 SBUTTERFLY3 bw, m0, m1, m7 | ; byte-interleave stage: row pairs (0,1), (2,3), (4,5) and, further down, (6,7)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
127 SBUTTERFLY3 bw, m2, m3, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
128 SBUTTERFLY3 bw, m4, m5, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
129 movq [%9+0x10], m1 | ; spill m1 (high interleave of rows 2,3); m1 is reused below and the spill is read back as a memory operand
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
130 SBUTTERFLY3 bw, m6, %8, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
131 SBUTTERFLY3 wd, m0, m2, m1 | ; word-interleave stage on the low halves (bytes 0-3 of each row)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
132 SBUTTERFLY3 wd, m4, m6, m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
133 punpckhdq m0, m4 | ; m0 = byte 1 of input rows 0..7 (byte 0 of every row is not kept)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
134 movq [%9+0x00], m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
135 SBUTTERFLY3 wd, m7, [%9+0x10], m6 | ; word-interleave stage on the high halves, using the value spilled above
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
136 SBUTTERFLY3 wd, m3, m5, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
137 SBUTTERFLY3 dq, m7, m3, m0 | ; m7 = byte 4, m0 = byte 5 of input rows 0..7
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
138 SBUTTERFLY3 dq, m1, m2, m5 | ; m1 = byte 2, m5 = byte 3 of input rows 0..7
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
139 punpckldq m6, m4 | ; m6 = byte 6 of input rows 0..7 (byte 7 of every row is not kept)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
140 movq [%9+0x10], m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
141 movq [%9+0x20], m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
142 movq [%9+0x30], m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
143 movq [%9+0x40], m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
144 movq [%9+0x50], m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
145 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
146 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
147 ; in: 8 rows of 8 in %1..%8; %1..%7 are loaded with movq, %8 is used directly as a punpck source operand |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
148 ; out: 8 rows of 8 in %9..%16 (output row k = byte k of input rows 0..7); %9 and %11 are also used as spill slots before their final stores; all inputs are read before the first store; m0-m7 are clobbered |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
149 %macro TRANSPOSE8x8_MEM 16 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
150 movq m0, %1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
151 movq m1, %2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
152 movq m2, %3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
153 movq m3, %4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
154 movq m4, %5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
155 movq m5, %6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
156 movq m6, %7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
157 SBUTTERFLY3 bw, m0, m1, m7 | ; byte-interleave stage: row pairs (0,1), (2,3), (4,5), (6,7)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
158 SBUTTERFLY3 bw, m2, m3, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
159 SBUTTERFLY3 bw, m4, m5, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
160 SBUTTERFLY3 bw, m6, %8, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
161 movq %9, m3 | ; spill m3 (high interleave of rows 4,5) into output slot %9; reloaded into m2 below
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
162 SBUTTERFLY3 wd, m0, m2, m3 | ; word-interleave stage
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
163 SBUTTERFLY3 wd, m4, m6, m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
164 SBUTTERFLY3 wd, m7, m1, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
165 movq %11, m2 | ; spill m2 into output slot %11; read back as a source operand in the dword stage
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
166 movq m2, %9 | ; reload the value spilled from m3 above
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
167 SBUTTERFLY3 wd, m2, m5, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
168 SBUTTERFLY3 dq, m0, m4, m5 | ; dword-interleave stage: m0 = byte 0, m5 = byte 1 of input rows 0..7
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
169 SBUTTERFLY3 dq, m7, m2, m4 | ; m7 = byte 4, m4 = byte 5 of input rows 0..7
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
170 movq %9, m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
171 movq %10, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
172 movq %13, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
173 movq %14, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
174 SBUTTERFLY3 dq, m3, %11, m0 | ; m3 = byte 2, m0 = byte 3 of input rows 0..7 (uses the value spilled to %11)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
175 SBUTTERFLY3 dq, m6, m1, m5 | ; m6 = byte 6, m5 = byte 7 of input rows 0..7
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
176 movq %11, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
177 movq %12, m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
178 movq %15, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
179 movq %16, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
180 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
181 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
182 ; out: %4 = |%1-%2|-%3 with unsigned saturation per byte, i.e. non-zero exactly in the bytes where |%1-%2|>%3 (not a 0x00/0xFF mask) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
183 ; clobbers: %5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
184 %macro DIFF_GT 5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
185 mova %5, %2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
186 mova %4, %1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
187 psubusb %5, %1 | ; %5 = %2-%1 where %2>%1, else 0
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
188 psubusb %4, %2 | ; %4 = %1-%2 where %1>%2, else 0
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
189 por %4, %5 | ; at most one of the two is non-zero per byte, so the OR is |%1-%2|
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
190 psubusb %4, %3 | ; saturating subtract of the threshold: result is non-zero only where |%1-%2|>%3
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
191 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
192 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
193 ; out: %4 = 0xFF in each byte where |%1-%2|<=%3 and 0x00 where |%1-%2|>%3 (i.e. the inverse of a "greater than" mask) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
194 ; clobbers: %5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
195 %macro DIFF_GT2 5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
196 mova %5, %2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
197 mova %4, %1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
198 psubusb %5, %1 | ; %5 = %2-%1 where %2>%1, else 0
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
199 psubusb %4, %2 | ; %4 = %1-%2 where %1>%2, else 0
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
200 psubusb %5, %3 | ; subtract the threshold from each one-sided difference, saturating at 0
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
201 psubusb %4, %3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
202 pcmpeqb %4, %5 | ; one side is always 0, so the bytes are equal only when both are 0, i.e. when |%1-%2|<=%3
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
203 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
204 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
205 %macro SPLATW 1 | ; SPLATW reg: copy the low 16-bit word of reg into every word of reg
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
206 %ifidn m0, xmm0 | ; taken when m0 currently expands to xmm0 (16-byte registers)
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
207 pshuflw %1, %1, 0 | ; low four words = word 0
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
208 punpcklqdq %1, %1 | ; duplicate the low qword into the high qword
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
209 %else |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
210 pshufw %1, %1, 0 | ; all four words = word 0
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
211 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
212 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
213 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
214 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
215 ; out: m5=beta-1, m7=mask, %3=alpha-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
216 ; clobbers: m4,m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
217 %macro LOAD_MASK 2-3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
218 movd m4, %1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
219 movd m5, %2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
220 SPLATW m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
221 SPLATW m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
222 packuswb m4, m4 ; 16x alpha-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
223 packuswb m5, m5 ; 16x beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
224 %if %0>2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
225 mova %3, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
226 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
227 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
228 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
229 por m7, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
230 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
231 por m7, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
232 pxor m6, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
233 pcmpeqb m7, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
234 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
235 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
236 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
237 ; out: m1=p0' m2=q0' |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
238 ; clobbers: m0,3-6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
239 %macro DEBLOCK_P0_Q0 0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
240 mova m5, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
241 pxor m5, m2 ; p0^q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
242 pand m5, [pb_1] ; (p0^q0)&1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
243 pcmpeqb m4, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
244 pxor m3, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
245 pavgb m3, m0 ; (p1 - q1 + 256)>>1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
246 pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
247 pxor m4, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
248 pavgb m4, m2 ; (q0 - p0 + 256)>>1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
249 pavgb m3, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
250 paddusb m3, m4 ; d+128+33 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
251 mova m6, [pb_A1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
252 psubusb m6, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
253 psubusb m3, [pb_A1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
254 pminub m6, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
255 pminub m3, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
256 psubusb m1, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
257 psubusb m2, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
258 paddusb m1, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
259 paddusb m2, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
260 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
261 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
262 ; in: m1=p0 m2=q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
263 ; %1=q1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
264 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
265 ; clobbers: q2, tmp, tc0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
266 %macro LUMA_Q1 6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
267 mova %6, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
268 pavgb %6, m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
269 pavgb %2, %6 ; avg(p2,avg(p0,q0)) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
270 pxor %6, %3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
271 pand %6, [pb_1] ; (p2^avg(p0,q0))&1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
272 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
273 mova %6, %1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
274 psubusb %6, %5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
275 paddusb %5, %1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
276 pmaxub %2, %6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
277 pminub %2, %5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
278 mova %4, %2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
279 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
280 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
281 %ifdef ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
282 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
283 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
284 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
285 INIT_XMM |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
286 cglobal x264_deblock_v_luma_sse2, 5,5,10 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
287 movd m8, [r4] ; tc0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
288 lea r4, [r1*3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
289 dec r2d ; alpha-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
290 neg r4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
291 dec r3d ; beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
292 add r4, r0 ; pix-3*stride |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
293 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
294 mova m0, [r4+r1] ; p1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
295 mova m1, [r4+2*r1] ; p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
296 mova m2, [r0] ; q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
297 mova m3, [r0+r1] ; q1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
298 LOAD_MASK r2d, r3d |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
299 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
300 punpcklbw m8, m8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
301 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
302 pcmpeqb m9, m9 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
303 pcmpeqb m9, m8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
304 pandn m9, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
305 pand m8, m9 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
306 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
307 movdqa m3, [r4] ; p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
308 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
309 pand m6, m9 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
310 mova m7, m8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
311 psubb m7, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
312 pand m6, m8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
313 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
314 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
315 movdqa m4, [r0+2*r1] ; q2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
316 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
317 pand m6, m9 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
318 pand m8, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
319 psubb m7, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
320 mova m3, [r0+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
321 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
322 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
323 DEBLOCK_P0_Q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
324 mova [r4+2*r1], m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
325 mova [r0], m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
326 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
327 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
328 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
329 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
330 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
331 INIT_MMX |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
332 cglobal x264_deblock_h_luma_sse2, 5,7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
333 movsxd r10, r1d |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
334 lea r11, [r10+r10*2] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
335 lea r6, [r0-4] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
336 lea r5, [r0-4+r11] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
337 %ifdef WIN64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
338 sub rsp, 0x98 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
339 %define pix_tmp rsp+0x30 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
340 %else |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
341 sub rsp, 0x68 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
342 %define pix_tmp rsp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
343 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
344 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
345 ; transpose 6x16 -> tmp space |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
346 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
347 lea r6, [r6+r10*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
348 lea r5, [r5+r10*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
349 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
350 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
351 ; vertical filter |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
352 ; alpha, beta, tc0 are still in r2d, r3d, r4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
353 ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
354 lea r0, [pix_tmp+0x30] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
355 mov r1d, 0x10 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
356 %ifdef WIN64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
357 mov [rsp+0x20], r4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
358 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
359 call x264_deblock_v_luma_sse2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
360 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
361 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
362 add r6, 2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
363 add r5, 2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
364 movq m0, [pix_tmp+0x18] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
365 movq m1, [pix_tmp+0x28] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
366 movq m2, [pix_tmp+0x38] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
367 movq m3, [pix_tmp+0x48] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
368 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
369 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
370 shl r10, 3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
371 sub r6, r10 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
372 sub r5, r10 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
373 shr r10, 3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
374 movq m0, [pix_tmp+0x10] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
375 movq m1, [pix_tmp+0x20] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
376 movq m2, [pix_tmp+0x30] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
377 movq m3, [pix_tmp+0x40] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
378 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
379 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
380 %ifdef WIN64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
381 add rsp, 0x98 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
382 %else |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
383 add rsp, 0x68 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
384 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
385 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
386 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
387 %else |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
388 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
389 %macro DEBLOCK_LUMA 3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
390 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
391 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
392 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
393 cglobal x264_deblock_%2_luma_%1, 5,5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
394 lea r4, [r1*3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
395 dec r2 ; alpha-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
396 neg r4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
397 dec r3 ; beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
398 add r4, r0 ; pix-3*stride |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
399 %assign pad 2*%3+12-(stack_offset&15) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
400 SUB esp, pad |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
401 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
402 mova m0, [r4+r1] ; p1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
403 mova m1, [r4+2*r1] ; p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
404 mova m2, [r0] ; q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
405 mova m3, [r0+r1] ; q1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
406 LOAD_MASK r2, r3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
407 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
408 mov r3, r4mp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
409 movd m4, [r3] ; tc0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
410 punpcklbw m4, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
411 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
412 mova [esp+%3], m4 ; tc |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
413 pcmpeqb m3, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
414 pcmpgtb m4, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
415 pand m4, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
416 mova [esp], m4 ; mask |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
417 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
418 mova m3, [r4] ; p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
419 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
420 pand m6, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
421 pand m4, [esp+%3] ; tc |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
422 mova m7, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
423 psubb m7, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
424 pand m6, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
425 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
426 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
427 mova m4, [r0+2*r1] ; q2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
428 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
429 mova m5, [esp] ; mask |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
430 pand m6, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
431 mova m5, [esp+%3] ; tc |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
432 pand m5, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
433 psubb m7, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
434 mova m3, [r0+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
435 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
436 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
437 DEBLOCK_P0_Q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
438 mova [r4+2*r1], m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
439 mova [r0], m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
440 ADD esp, pad |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
441 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
442 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
443 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
444 ; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
445 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
446 INIT_MMX |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
447 cglobal x264_deblock_h_luma_%1, 0,5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
448 mov r0, r0mp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
449 mov r3, r1m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
450 lea r4, [r3*3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
451 sub r0, 4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
452 lea r1, [r0+r4] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
453 %assign pad 0x78-(stack_offset&15) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
454 SUB esp, pad |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
455 %define pix_tmp esp+12 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
456 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
457 ; transpose 6x16 -> tmp space |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
458 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
459 lea r0, [r0+r3*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
460 lea r1, [r1+r3*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
461 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
462 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
463 ; vertical filter |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
464 lea r0, [pix_tmp+0x30] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
465 PUSH dword r4m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
466 PUSH dword r3m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
467 PUSH dword r2m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
468 PUSH dword 16 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
469 PUSH dword r0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
470 call x264_deblock_%2_luma_%1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
471 %ifidn %2, v8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
472 add dword [esp ], 8 ; pix_tmp+0x38 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
473 add dword [esp+16], 2 ; tc0+2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
474 call x264_deblock_%2_luma_%1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
475 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
476 ADD esp, 20 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
477 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
478 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
479 mov r0, r0mp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
480 sub r0, 2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
481 lea r1, [r0+r4] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
482 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
483 movq m0, [pix_tmp+0x10] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
484 movq m1, [pix_tmp+0x20] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
485 movq m2, [pix_tmp+0x30] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
486 movq m3, [pix_tmp+0x40] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
487 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
488 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
489 lea r0, [r0+r3*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
490 lea r1, [r1+r3*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
491 movq m0, [pix_tmp+0x18] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
492 movq m1, [pix_tmp+0x28] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
493 movq m2, [pix_tmp+0x38] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
494 movq m3, [pix_tmp+0x48] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
495 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
496 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
497 ADD esp, pad |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
498 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
499 %endmacro ; DEBLOCK_LUMA |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
500 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
501 INIT_MMX |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
502 DEBLOCK_LUMA mmxext, v8, 8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
503 INIT_XMM |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
504 DEBLOCK_LUMA sse2, v, 16 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
505 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
506 %endif ; ARCH |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
507 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
508 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
509 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
510 %macro LUMA_INTRA_P012 4 ; p0..p3 in memory |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
511 mova t0, p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
512 mova t1, p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
513 pavgb t0, p1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
514 pavgb t1, q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
515 pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
516 mova t5, t1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
517 mova t2, p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
518 mova t3, p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
519 paddb t2, p1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
520 paddb t3, q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
521 paddb t2, t3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
522 mova t3, t2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
523 mova t4, t2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
524 psrlw t2, 1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
525 pavgb t2, mpb_0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
526 pxor t2, t0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
527 pand t2, mpb_1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
528 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
529 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
530 mova t1, p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
531 mova t2, p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
532 pavgb t1, q1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
533 psubb t2, q1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
534 paddb t3, t3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
535 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
536 pand t2, mpb_1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
537 psubb t1, t2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
538 pavgb t1, p1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
539 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
540 psrlw t3, 2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
541 pavgb t3, mpb_0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
542 pxor t3, t1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
543 pand t3, mpb_1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
544 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
545 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
546 mova t3, p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
547 mova t2, p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
548 pxor t3, q1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
549 pavgb t2, q1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
550 pand t3, mpb_1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
551 psubb t2, t3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
552 pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
553 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
554 pxor t1, t2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
555 pxor t2, p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
556 pand t1, mask1p |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
557 pand t2, mask0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
558 pxor t1, t2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
559 pxor t1, p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
560 mova %1, t1 ; store p0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
561 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
562 mova t1, %4 ; p3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
563 mova t2, t1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
564 pavgb t1, p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
565 paddb t2, p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
566 pavgb t1, t0 ; ((p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 + 1)/2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
567 paddb t2, t2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
568 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
569 psrlw t2, 2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
570 pavgb t2, mpb_0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
571 pxor t2, t1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
572 pand t2, mpb_1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
573 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
574 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
575 pxor t0, p1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
576 pxor t1, p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
577 pand t0, mask1p |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
578 pand t1, mask1p |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
579 pxor t0, p1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
580 pxor t1, p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
581 mova %2, t0 ; store p1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
582 mova %3, t1 ; store p2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
583 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
584 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
585 %macro LUMA_INTRA_SWAP_PQ 0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
586 %define q1 m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
587 %define q0 m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
588 %define p0 m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
589 %define p1 m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
590 %define p2 q2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
591 %define mask1p mask1q |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
592 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
593 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
594 %macro DEBLOCK_LUMA_INTRA 2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
595 %define p1 m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
596 %define p0 m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
597 %define q0 m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
598 %define q1 m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
599 %define t0 m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
600 %define t1 m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
601 %define t2 m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
602 %define t3 m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
603 %ifdef ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
604 %define p2 m8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
605 %define q2 m9 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
606 %define t4 m10 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
607 %define t5 m11 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
608 %define mask0 m12 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
609 %define mask1p m13 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
610 %define mask1q [rsp-24] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
611 %define mpb_0 m14 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
612 %define mpb_1 m15 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
613 %else |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
614 %define spill(x) [esp+16*x+((stack_offset+4)&15)] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
615 %define p2 [r4+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
616 %define q2 [r0+2*r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
617 %define t4 spill(0) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
618 %define t5 spill(1) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
619 %define mask0 spill(2) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
620 %define mask1p spill(3) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
621 %define mask1q spill(4) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
622 %define mpb_0 [pb_0] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
623 %define mpb_1 [pb_1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
624 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
625 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
626 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
627 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
628 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
629 cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
630 %ifndef ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
631 sub esp, 0x60 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
632 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
633 lea r4, [r1*4] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
634 lea r5, [r1*3] ; 3*stride |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
635 dec r2d ; alpha-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
636 jl .end |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
637 neg r4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
638 dec r3d ; beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
639 jl .end |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
640 add r4, r0 ; pix-4*stride |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
641 mova p1, [r4+2*r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
642 mova p0, [r4+r5] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
643 mova q0, [r0] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
644 mova q1, [r0+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
645 %ifdef ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
646 pxor mpb_0, mpb_0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
647 mova mpb_1, [pb_1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
648 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
649 SWAP 7, 12 ; m12=mask0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
650 pavgb t5, mpb_0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
651 pavgb t5, mpb_1 ; alpha/4+1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
652 movdqa p2, [r4+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
653 movdqa q2, [r0+2*r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
654 DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
655 DIFF_GT2 p0, p2, m5, t2, t5 ; t2 = |p2-p0| > beta-1 (becomes mask1p below) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
656 DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
657 pand t0, mask0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
658 pand t4, t0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
659 pand t2, t0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
660 mova mask1q, t4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
661 mova mask1p, t2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
662 %else |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
663 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
664 mova m4, t5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
665 mova mask0, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
666 pavgb m4, [pb_0] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
667 pavgb m4, [pb_1] ; alpha/4+1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
668 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
669 pand m6, mask0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
670 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
671 pand m4, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
672 mova mask1p, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
673 DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
674 pand m4, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
675 mova mask1q, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
676 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
677 LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
678 LUMA_INTRA_SWAP_PQ |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
679 LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
680 .end: |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
681 %ifndef ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
682 add esp, 0x60 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
683 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
684 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
685 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
686 INIT_MMX |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
687 %ifdef ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
688 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
689 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
690 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
691 cglobal x264_deblock_h_luma_intra_%1, 4,7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
692 movsxd r10, r1d |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
693 lea r11, [r10*3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
694 lea r6, [r0-4] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
695 lea r5, [r0-4+r11] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
696 sub rsp, 0x88 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
697 %define pix_tmp rsp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
698 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
699 ; transpose 8x16 -> tmp space |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
700 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
701 lea r6, [r6+r10*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
702 lea r5, [r5+r10*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
703 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
704 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
705 lea r0, [pix_tmp+0x40] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
706 mov r1, 0x10 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
707 call x264_deblock_v_luma_intra_%1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
708 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
709 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
710 lea r5, [r6+r11] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
711 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
712 shl r10, 3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
713 sub r6, r10 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
714 sub r5, r10 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
715 shr r10, 3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
716 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
717 add rsp, 0x88 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
718 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
719 %else |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
720 cglobal x264_deblock_h_luma_intra_%1, 2,4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
721 lea r3, [r1*3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
722 sub r0, 4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
723 lea r2, [r0+r3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
724 %assign pad 0x8c-(stack_offset&15) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
725 SUB rsp, pad |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
726 %define pix_tmp rsp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
727 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
728 ; transpose 8x16 -> tmp space |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
729 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
730 lea r0, [r0+r1*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
731 lea r2, [r2+r1*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
732 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
733 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
734 lea r0, [pix_tmp+0x40] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
735 PUSH dword r3m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
736 PUSH dword r2m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
737 PUSH dword 16 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
738 PUSH r0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
739 call x264_deblock_%2_luma_intra_%1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
740 %ifidn %2, v8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
741 add dword [rsp], 8 ; pix argument: pix_tmp+0x40 -> pix_tmp+0x48 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
742 call x264_deblock_%2_luma_intra_%1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
743 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
744 ADD esp, 16 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
745 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
746 mov r1, r1m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
747 mov r0, r0mp |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
748 lea r3, [r1*3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
749 sub r0, 4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
750 lea r2, [r0+r3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
751 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
752 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
753 lea r0, [r0+r1*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
754 lea r2, [r2+r1*8] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
755 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
756 ADD rsp, pad |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
757 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
758 %endif ; ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
759 %endmacro ; DEBLOCK_LUMA_INTRA |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
760 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
761 INIT_XMM |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
762 DEBLOCK_LUMA_INTRA sse2, v |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
763 %ifndef ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
764 INIT_MMX |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
765 DEBLOCK_LUMA_INTRA mmxext, v8 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
766 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
767 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
768 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
769 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
770 INIT_MMX |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
771 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
772 %macro CHROMA_V_START 0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
773 dec r2d ; alpha-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
774 dec r3d ; beta-1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
775 mov t5, r0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
776 sub t5, r1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
777 sub t5, r1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
778 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
779 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
780 %macro CHROMA_H_START 0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
781 dec r2d |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
782 dec r3d |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
783 sub r0, 2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
784 lea t6, [r1*3] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
785 mov t5, r0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
786 add r0, t6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
787 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
788 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
789 %define t5 r5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
790 %define t6 r6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
791 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
792 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
793 ; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
794 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
795 cglobal x264_deblock_v_chroma_mmxext, 5,6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
796 CHROMA_V_START |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
797 movq m0, [t5] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
798 movq m1, [t5+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
799 movq m2, [r0] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
800 movq m3, [r0+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
801 call x264_chroma_inter_body_mmxext |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
802 movq [t5+r1], m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
803 movq [r0], m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
804 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
805 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
806 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
807 ; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
808 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
809 cglobal x264_deblock_h_chroma_mmxext, 5,7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
810 %ifdef ARCH_X86_64 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
811 %define buf0 [rsp-24] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
812 %define buf1 [rsp-16] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
813 %else |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
814 %define buf0 r0m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
815 %define buf1 r2m |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
816 %endif |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
817 CHROMA_H_START |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
818 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
819 movq buf0, m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
820 movq buf1, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
821 call x264_chroma_inter_body_mmxext |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
822 movq m0, buf0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
823 movq m3, buf1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
824 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
825 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
826 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
827 ALIGN 16 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
828 x264_chroma_inter_body_mmxext: |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
829 LOAD_MASK r2d, r3d |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
830 movd m6, [r4] ; tc0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
831 punpcklbw m6, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
832 pand m7, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
833 DEBLOCK_P0_Q0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
834 ret |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
835 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
836 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
837 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
838 ; in: %1=p0 %2=p1 %3=q1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
839 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
840 %macro CHROMA_INTRA_P0 3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
841 movq m4, %1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
842 pxor m4, %3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
843 pand m4, [pb_1] ; m4 = (p0^q1)&1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
844 pavgb %1, %3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
845 psubusb %1, m4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
846 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
847 %endmacro |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
848 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
849 %define t5 r4 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
850 %define t6 r5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
851 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
852 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
853 ; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
854 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Entry point for the "v" chroma intra deblock (MMXEXT).
; Loads four 8-byte rows into m0..m3, calls the shared filter body, then
; writes back only the two middle rows (m1 and m2).
; NOTE(review): the "4,5" on cglobal is presumably the x86inc
; "argument count, GPR count" pair - confirm against x86inc.asm.
855 cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; CHROMA_V_START is defined outside this excerpt; presumably it sets up t5
; (and the strides) relative to r0 - TODO confirm.
856 CHROMA_V_START |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; m0, m1 = the two rows at [t5] and [t5+r1]
857 movq m0, [t5] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
858 movq m1, [t5+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; m2, m3 = the two rows at [r0] and [r0+r1]
859 movq m2, [r0] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
860 movq m3, [r0+r1] |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Shared body: takes m0..m3 plus r2d/r3d, leaves its results in m1 and m2.
861 call x264_chroma_intra_body_mmxext |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Only m1 and m2 are stored; the rows loaded into m0 and m3 are not written back.
862 movq [t5+r1], m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
863 movq [r0], m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
864 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
865 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
866 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
867 ; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
868 ;----------------------------------------------------------------------------- |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Entry point for the "h" chroma intra deblock (MMXEXT).
; Same shared filter body as the "v" variant, but the pixels are brought in
; and written out through the TRANSPOSE* macros instead of plain row moves.
; NOTE(review): "4,6" is presumably the x86inc "argument count, GPR count"
; pair; one more GPR than the "v" variant, matching the extra use of t6 (r5).
869 cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; CHROMA_H_START is defined outside this excerpt; presumably it prepares
; t5/t6 for the PASS8ROWS addressing used below - TODO confirm.
870 CHROMA_H_START |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Presumably loads eight rows and transposes them into m0..m3, the registers
; the shared body reads - verify against the macro definition.
871 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
872 call x264_chroma_intra_body_mmxext |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Store uses the same PASS8ROWS operands as the load above.
873 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
874 RET |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
875 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Shared filter body, reached via "call" from both chroma intra entry points
; and ending in a plain "ret" (not the RET macro).
; In:  m0..m3 = the four pixel rows/columns loaded by the caller,
;      r2d, r3d = passed to LOAD_MASK (presumably alpha and beta, per the
;      prototype comments on the entry points - TODO confirm).
; Out: m1 and m2 updated; both callers store results from these registers.
876 ALIGN 16 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
877 x264_chroma_intra_body_mmxext: |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; LOAD_MASK is defined outside this excerpt. m7 is used as a mask below and
; is not written by any explicit instruction here, so presumably LOAD_MASK
; leaves the per-byte filter mask in m7 - TODO confirm.
878 LOAD_MASK r2d, r3d |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Keep copies of the unfiltered m1/m2 for the masked blend at the end.
879 movq m5, m1 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
880 movq m6, m2 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; CHROMA_INTRA_P0 is defined outside this excerpt; presumably it overwrites
; its first operand with the filtered value. The second invocation mirrors
; the first with the m0/m3 operands swapped.
881 CHROMA_INTRA_P0 m1, m0, m3 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
882 CHROMA_INTRA_P0 m2, m3, m0 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
; Masked blend, bytewise with wraparound:
;   m1 = m5 + ((m1 - m5) & m7),  m2 = m6 + ((m2 - m6) & m7)
; Where an m7 byte is 0xFF the new value is kept; where it is 0x00 the saved
; original is restored (assuming the macro above leaves m5/m6 intact).
883 psubb m1, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
884 psubb m2, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
885 pand m1, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
886 pand m2, m7 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
887 paddb m1, m5 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
888 paddb m2, m6 |
f4355cd85faa
Port latest x264 deblock asm (before they moved to using NV12 as internal
rbultje
parents:
diff
changeset
|
889 ret