Mercurial > libavcodec.hg
comparison x86/h264_deblock.asm @ 12454:f4355cd85faa libavcodec
Port latest x264 deblock asm (before they moved to using NV12 as internal
format), LGPL'ed with permission from Jason and Loren. This includes mmx2
code, so remove inline asm from h264dsp_mmx.c accordingly.
author | rbultje |
---|---|
date | Fri, 03 Sep 2010 16:52:46 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12453:35e1de8243c6 | 12454:f4355cd85faa |
---|---|
1 ;***************************************************************************** | |
2 ;* MMX/SSE2-optimized H.264 deblocking code | |
3 ;***************************************************************************** | |
4 ;* Copyright (C) 2005-2008 x264 project | |
5 ;* | |
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu> | |
7 ;* Jason Garrett-Glaser <darkshikari@gmail.com> | |
8 ;* | |
9 ;* This file is part of FFmpeg. | |
10 ;* | |
11 ;* FFmpeg is free software; you can redistribute it and/or | |
12 ;* modify it under the terms of the GNU Lesser General Public | |
13 ;* License as published by the Free Software Foundation; either | |
14 ;* version 2.1 of the License, or (at your option) any later version. | |
15 ;* | |
16 ;* FFmpeg is distributed in the hope that it will be useful, | |
17 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
19 ;* Lesser General Public License for more details. | |
20 ;* | |
21 ;* You should have received a copy of the GNU Lesser General Public | |
22 ;* License along with FFmpeg; if not, write to the Free Software | |
23 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
24 ;****************************************************************************** | |
25 | |
26 %include "x86inc.asm" | |
27 %include "x86util.asm" | |
28 | |
29 SECTION_RODATA | |
30 | |
31 cextern pb_0 | |
32 cextern pb_1 | |
33 cextern pb_3 | |
34 cextern pb_A1 | |
35 | |
36 SECTION .text | |
37 | |
38 ; expands to [base],...,[base+7*stride] | |
39 %define PASS8ROWS(base, base3, stride, stride3) \ | |
40 [base], [base+stride], [base+stride*2], [base3], \ | |
41 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] | |
42 | |
43 ; in: 8 rows of 4 bytes in %1..%8 | |
44 ; out: 4 rows of 8 bytes in m0..m3 | |
45 %macro TRANSPOSE4x8_LOAD 8 | |
46 movd m0, %1 | |
47 movd m2, %2 | |
48 movd m1, %3 | |
49 movd m3, %4 | |
50 punpcklbw m0, m2 | |
51 punpcklbw m1, m3 | |
52 movq m2, m0 | |
53 punpcklwd m0, m1 | |
54 punpckhwd m2, m1 | |
55 | |
56 movd m4, %5 | |
57 movd m6, %6 | |
58 movd m5, %7 | |
59 movd m7, %8 | |
60 punpcklbw m4, m6 | |
61 punpcklbw m5, m7 | |
62 movq m6, m4 | |
63 punpcklwd m4, m5 | |
64 punpckhwd m6, m5 | |
65 | |
66 movq m1, m0 | |
67 movq m3, m2 | |
68 punpckldq m0, m4 | |
69 punpckhdq m1, m4 | |
70 punpckldq m2, m6 | |
71 punpckhdq m3, m6 | |
72 %endmacro | |
73 | |
74 ; in: 4 rows of 8 bytes in m0..m3 | |
75 ; out: 8 rows of 4 bytes in %1..%8 | |
76 %macro TRANSPOSE8x4_STORE 8 | |
77 movq m4, m0 | |
78 movq m5, m1 | |
79 movq m6, m2 | |
80 punpckhdq m4, m4 | |
81 punpckhdq m5, m5 | |
82 punpckhdq m6, m6 | |
83 | |
84 punpcklbw m0, m1 | |
85 punpcklbw m2, m3 | |
86 movq m1, m0 | |
87 punpcklwd m0, m2 | |
88 punpckhwd m1, m2 | |
89 movd %1, m0 | |
90 punpckhdq m0, m0 | |
91 movd %2, m0 | |
92 movd %3, m1 | |
93 punpckhdq m1, m1 | |
94 movd %4, m1 | |
95 | |
96 punpckhdq m3, m3 | |
97 punpcklbw m4, m5 | |
98 punpcklbw m6, m3 | |
99 movq m5, m4 | |
100 punpcklwd m4, m6 | |
101 punpckhwd m5, m6 | |
102 movd %5, m4 | |
103 punpckhdq m4, m4 | |
104 movd %6, m4 | |
105 movd %7, m5 | |
106 punpckhdq m5, m5 | |
107 movd %8, m5 | |
108 %endmacro | |
109 | |
110 %macro SBUTTERFLY3 4 | |
111 movq %4, %2 | |
112 punpckl%1 %2, %3 | |
113 punpckh%1 %4, %3 | |
114 %endmacro | |
115 | |
116 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 | |
117 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] | |
118 %macro TRANSPOSE6x8_MEM 9 | |
119 movq m0, %1 | |
120 movq m1, %2 | |
121 movq m2, %3 | |
122 movq m3, %4 | |
123 movq m4, %5 | |
124 movq m5, %6 | |
125 movq m6, %7 | |
126 SBUTTERFLY3 bw, m0, m1, m7 | |
127 SBUTTERFLY3 bw, m2, m3, m1 | |
128 SBUTTERFLY3 bw, m4, m5, m3 | |
129 movq [%9+0x10], m1 | |
130 SBUTTERFLY3 bw, m6, %8, m5 | |
131 SBUTTERFLY3 wd, m0, m2, m1 | |
132 SBUTTERFLY3 wd, m4, m6, m2 | |
133 punpckhdq m0, m4 | |
134 movq [%9+0x00], m0 | |
135 SBUTTERFLY3 wd, m7, [%9+0x10], m6 | |
136 SBUTTERFLY3 wd, m3, m5, m4 | |
137 SBUTTERFLY3 dq, m7, m3, m0 | |
138 SBUTTERFLY3 dq, m1, m2, m5 | |
139 punpckldq m6, m4 | |
140 movq [%9+0x10], m1 | |
141 movq [%9+0x20], m5 | |
142 movq [%9+0x30], m7 | |
143 movq [%9+0x40], m0 | |
144 movq [%9+0x50], m6 | |
145 %endmacro | |
146 | |
147 ; in: 8 rows of 8 in %1..%8 | |
148 ; out: 8 rows of 8 in %9..%16 | |
149 %macro TRANSPOSE8x8_MEM 16 | |
150 movq m0, %1 | |
151 movq m1, %2 | |
152 movq m2, %3 | |
153 movq m3, %4 | |
154 movq m4, %5 | |
155 movq m5, %6 | |
156 movq m6, %7 | |
157 SBUTTERFLY3 bw, m0, m1, m7 | |
158 SBUTTERFLY3 bw, m2, m3, m1 | |
159 SBUTTERFLY3 bw, m4, m5, m3 | |
160 SBUTTERFLY3 bw, m6, %8, m5 | |
161 movq %9, m3 | |
162 SBUTTERFLY3 wd, m0, m2, m3 | |
163 SBUTTERFLY3 wd, m4, m6, m2 | |
164 SBUTTERFLY3 wd, m7, m1, m6 | |
165 movq %11, m2 | |
166 movq m2, %9 | |
167 SBUTTERFLY3 wd, m2, m5, m1 | |
168 SBUTTERFLY3 dq, m0, m4, m5 | |
169 SBUTTERFLY3 dq, m7, m2, m4 | |
170 movq %9, m0 | |
171 movq %10, m5 | |
172 movq %13, m7 | |
173 movq %14, m4 | |
174 SBUTTERFLY3 dq, m3, %11, m0 | |
175 SBUTTERFLY3 dq, m6, m1, m5 | |
176 movq %11, m3 | |
177 movq %12, m0 | |
178 movq %15, m6 | |
179 movq %16, m5 | |
180 %endmacro | |
181 | |
182 ; out: %4 = |%1-%2|>%3 | |
183 ; clobbers: %5 | |
184 %macro DIFF_GT 5 | |
185 mova %5, %2 | |
186 mova %4, %1 | |
187 psubusb %5, %1 | |
188 psubusb %4, %2 | |
189 por %4, %5 | |
190 psubusb %4, %3 | |
191 %endmacro | |
192 | |
193 ; out: %4 = |%1-%2|>%3 | |
194 ; clobbers: %5 | |
195 %macro DIFF_GT2 5 | |
196 mova %5, %2 | |
197 mova %4, %1 | |
198 psubusb %5, %1 | |
199 psubusb %4, %2 | |
200 psubusb %5, %3 | |
201 psubusb %4, %3 | |
202 pcmpeqb %4, %5 | |
203 %endmacro | |
204 | |
205 %macro SPLATW 1 | |
206 %ifidn m0, xmm0 | |
207 pshuflw %1, %1, 0 | |
208 punpcklqdq %1, %1 | |
209 %else | |
210 pshufw %1, %1, 0 | |
211 %endif | |
212 %endmacro | |
213 | |
214 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 | |
215 ; out: m5=beta-1, m7=mask, %3=alpha-1 | |
216 ; clobbers: m4,m6 | |
217 %macro LOAD_MASK 2-3 | |
218 movd m4, %1 | |
219 movd m5, %2 | |
220 SPLATW m4 | |
221 SPLATW m5 | |
222 packuswb m4, m4 ; 16x alpha-1 | |
223 packuswb m5, m5 ; 16x beta-1 | |
224 %if %0>2 | |
225 mova %3, m4 | |
226 %endif | |
227 DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 | |
228 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 | |
229 por m7, m4 | |
230 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 | |
231 por m7, m4 | |
232 pxor m6, m6 | |
233 pcmpeqb m7, m6 | |
234 %endmacro | |
235 | |
236 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) | |
237 ; out: m1=p0' m2=q0' | |
238 ; clobbers: m0,3-6 | |
239 %macro DEBLOCK_P0_Q0 0 | |
240 mova m5, m1 | |
241 pxor m5, m2 ; p0^q0 | |
242 pand m5, [pb_1] ; (p0^q0)&1 | |
243 pcmpeqb m4, m4 | |
244 pxor m3, m4 | |
245 pavgb m3, m0 ; (p1 - q1 + 256)>>1 | |
246 pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 | |
247 pxor m4, m1 | |
248 pavgb m4, m2 ; (q0 - p0 + 256)>>1 | |
249 pavgb m3, m5 | |
250 paddusb m3, m4 ; d+128+33 | |
251 mova m6, [pb_A1] | |
252 psubusb m6, m3 | |
253 psubusb m3, [pb_A1] | |
254 pminub m6, m7 | |
255 pminub m3, m7 | |
256 psubusb m1, m6 | |
257 psubusb m2, m3 | |
258 paddusb m1, m3 | |
259 paddusb m2, m6 | |
260 %endmacro | |
261 | |
262 ; in: m1=p0 m2=q0 | |
263 ; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp | |
264 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) | |
265 ; clobbers: q2, tmp, tc0 | |
266 %macro LUMA_Q1 6 | |
267 mova %6, m1 | |
268 pavgb %6, m2 | |
269 pavgb %2, %6 ; avg(p2,avg(p0,q0)) | |
270 pxor %6, %3 | |
271 pand %6, [pb_1] ; (p2^avg(p0,q0))&1 | |
272 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 | |
273 mova %6, %1 | |
274 psubusb %6, %5 | |
275 paddusb %5, %1 | |
276 pmaxub %2, %6 | |
277 pminub %2, %5 | |
278 mova %4, %2 | |
279 %endmacro | |
280 | |
281 %ifdef ARCH_X86_64 | |
282 ;----------------------------------------------------------------------------- | |
283 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | |
284 ;----------------------------------------------------------------------------- | |
285 INIT_XMM | |
286 cglobal x264_deblock_v_luma_sse2, 5,5,10 | |
287 movd m8, [r4] ; tc0 | |
288 lea r4, [r1*3] | |
289 dec r2d ; alpha-1 | |
290 neg r4 | |
291 dec r3d ; beta-1 | |
292 add r4, r0 ; pix-3*stride | |
293 | |
294 mova m0, [r4+r1] ; p1 | |
295 mova m1, [r4+2*r1] ; p0 | |
296 mova m2, [r0] ; q0 | |
297 mova m3, [r0+r1] ; q1 | |
298 LOAD_MASK r2d, r3d | |
299 | |
300 punpcklbw m8, m8 | |
301 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] | |
302 pcmpeqb m9, m9 | |
303 pcmpeqb m9, m8 | |
304 pandn m9, m7 | |
305 pand m8, m9 | |
306 | |
307 movdqa m3, [r4] ; p2 | |
308 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 | |
309 pand m6, m9 | |
310 mova m7, m8 | |
311 psubb m7, m6 | |
312 pand m6, m8 | |
313 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 | |
314 | |
315 movdqa m4, [r0+2*r1] ; q2 | |
316 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 | |
317 pand m6, m9 | |
318 pand m8, m6 | |
319 psubb m7, m6 | |
320 mova m3, [r0+r1] | |
321 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 | |
322 | |
323 DEBLOCK_P0_Q0 | |
324 mova [r4+2*r1], m1 | |
325 mova [r0], m2 | |
326 RET | |
327 | |
328 ;----------------------------------------------------------------------------- | |
329 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | |
330 ;----------------------------------------------------------------------------- | |
331 INIT_MMX | |
332 cglobal x264_deblock_h_luma_sse2, 5,7 | |
333 movsxd r10, r1d | |
334 lea r11, [r10+r10*2] | |
335 lea r6, [r0-4] | |
336 lea r5, [r0-4+r11] | |
337 %ifdef WIN64 | |
338 sub rsp, 0x98 | |
339 %define pix_tmp rsp+0x30 | |
340 %else | |
341 sub rsp, 0x68 | |
342 %define pix_tmp rsp | |
343 %endif | |
344 | |
345 ; transpose 6x16 -> tmp space | |
346 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp | |
347 lea r6, [r6+r10*8] | |
348 lea r5, [r5+r10*8] | |
349 TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 | |
350 | |
351 ; vertical filter | |
352 ; alpha, beta, tc0 are still in r2d, r3d, r4 | |
353 ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them | |
354 lea r0, [pix_tmp+0x30] | |
355 mov r1d, 0x10 | |
356 %ifdef WIN64 | |
357 mov [rsp+0x20], r4 | |
358 %endif | |
359 call x264_deblock_v_luma_sse2 | |
360 | |
361 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) | |
362 add r6, 2 | |
363 add r5, 2 | |
364 movq m0, [pix_tmp+0x18] | |
365 movq m1, [pix_tmp+0x28] | |
366 movq m2, [pix_tmp+0x38] | |
367 movq m3, [pix_tmp+0x48] | |
368 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) | |
369 | |
370 shl r10, 3 | |
371 sub r6, r10 | |
372 sub r5, r10 | |
373 shr r10, 3 | |
374 movq m0, [pix_tmp+0x10] | |
375 movq m1, [pix_tmp+0x20] | |
376 movq m2, [pix_tmp+0x30] | |
377 movq m3, [pix_tmp+0x40] | |
378 TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) | |
379 | |
380 %ifdef WIN64 | |
381 add rsp, 0x98 | |
382 %else | |
383 add rsp, 0x68 | |
384 %endif | |
385 RET | |
386 | |
387 %else | |
388 | |
389 %macro DEBLOCK_LUMA 3 | |
390 ;----------------------------------------------------------------------------- | |
391 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | |
392 ;----------------------------------------------------------------------------- | |
393 cglobal x264_deblock_%2_luma_%1, 5,5 | |
394 lea r4, [r1*3] | |
395 dec r2 ; alpha-1 | |
396 neg r4 | |
397 dec r3 ; beta-1 | |
398 add r4, r0 ; pix-3*stride | |
399 %assign pad 2*%3+12-(stack_offset&15) | |
400 SUB esp, pad | |
401 | |
402 mova m0, [r4+r1] ; p1 | |
403 mova m1, [r4+2*r1] ; p0 | |
404 mova m2, [r0] ; q0 | |
405 mova m3, [r0+r1] ; q1 | |
406 LOAD_MASK r2, r3 | |
407 | |
408 mov r3, r4mp | |
409 movd m4, [r3] ; tc0 | |
410 punpcklbw m4, m4 | |
411 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] | |
412 mova [esp+%3], m4 ; tc | |
413 pcmpeqb m3, m3 | |
414 pcmpgtb m4, m3 | |
415 pand m4, m7 | |
416 mova [esp], m4 ; mask | |
417 | |
418 mova m3, [r4] ; p2 | |
419 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 | |
420 pand m6, m4 | |
421 pand m4, [esp+%3] ; tc | |
422 mova m7, m4 | |
423 psubb m7, m6 | |
424 pand m6, m4 | |
425 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 | |
426 | |
427 mova m4, [r0+2*r1] ; q2 | |
428 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 | |
429 mova m5, [esp] ; mask | |
430 pand m6, m5 | |
431 mova m5, [esp+%3] ; tc | |
432 pand m5, m6 | |
433 psubb m7, m6 | |
434 mova m3, [r0+r1] | |
435 LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 | |
436 | |
437 DEBLOCK_P0_Q0 | |
438 mova [r4+2*r1], m1 | |
439 mova [r0], m2 | |
440 ADD esp, pad | |
441 RET | |
442 | |
443 ;----------------------------------------------------------------------------- | |
444 ; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | |
445 ;----------------------------------------------------------------------------- | |
446 INIT_MMX | |
447 cglobal x264_deblock_h_luma_%1, 0,5 | |
448 mov r0, r0mp | |
449 mov r3, r1m | |
450 lea r4, [r3*3] | |
451 sub r0, 4 | |
452 lea r1, [r0+r4] | |
453 %assign pad 0x78-(stack_offset&15) | |
454 SUB esp, pad | |
455 %define pix_tmp esp+12 | |
456 | |
457 ; transpose 6x16 -> tmp space | |
458 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp | |
459 lea r0, [r0+r3*8] | |
460 lea r1, [r1+r3*8] | |
461 TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 | |
462 | |
463 ; vertical filter | |
464 lea r0, [pix_tmp+0x30] | |
465 PUSH dword r4m | |
466 PUSH dword r3m | |
467 PUSH dword r2m | |
468 PUSH dword 16 | |
469 PUSH dword r0 | |
470 call x264_deblock_%2_luma_%1 | |
471 %ifidn %2, v8 | |
472 add dword [esp ], 8 ; pix_tmp+0x38 | |
473 add dword [esp+16], 2 ; tc0+2 | |
474 call x264_deblock_%2_luma_%1 | |
475 %endif | |
476 ADD esp, 20 | |
477 | |
478 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) | |
479 mov r0, r0mp | |
480 sub r0, 2 | |
481 lea r1, [r0+r4] | |
482 | |
483 movq m0, [pix_tmp+0x10] | |
484 movq m1, [pix_tmp+0x20] | |
485 movq m2, [pix_tmp+0x30] | |
486 movq m3, [pix_tmp+0x40] | |
487 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) | |
488 | |
489 lea r0, [r0+r3*8] | |
490 lea r1, [r1+r3*8] | |
491 movq m0, [pix_tmp+0x18] | |
492 movq m1, [pix_tmp+0x28] | |
493 movq m2, [pix_tmp+0x38] | |
494 movq m3, [pix_tmp+0x48] | |
495 TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) | |
496 | |
497 ADD esp, pad | |
498 RET | |
499 %endmacro ; DEBLOCK_LUMA | |
500 | |
501 INIT_MMX | |
502 DEBLOCK_LUMA mmxext, v8, 8 | |
503 INIT_XMM | |
504 DEBLOCK_LUMA sse2, v, 16 | |
505 | |
506 %endif ; ARCH | |
507 | |
508 | |
509 | |
510 %macro LUMA_INTRA_P012 4 ; p0..p3 in memory | |
511 mova t0, p2 | |
512 mova t1, p0 | |
513 pavgb t0, p1 | |
514 pavgb t1, q0 | |
515 pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 | |
516 mova t5, t1 | |
517 mova t2, p2 | |
518 mova t3, p0 | |
519 paddb t2, p1 | |
520 paddb t3, q0 | |
521 paddb t2, t3 | |
522 mova t3, t2 | |
523 mova t4, t2 | |
524 psrlw t2, 1 | |
525 pavgb t2, mpb_0 | |
526 pxor t2, t0 | |
527 pand t2, mpb_1 | |
528 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; | |
529 | |
530 mova t1, p2 | |
531 mova t2, p2 | |
532 pavgb t1, q1 | |
533 psubb t2, q1 | |
534 paddb t3, t3 | |
535 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 | |
536 pand t2, mpb_1 | |
537 psubb t1, t2 | |
538 pavgb t1, p1 | |
539 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 | |
540 psrlw t3, 2 | |
541 pavgb t3, mpb_0 | |
542 pxor t3, t1 | |
543 pand t3, mpb_1 | |
544 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 | |
545 | |
546 mova t3, p0 | |
547 mova t2, p0 | |
548 pxor t3, q1 | |
549 pavgb t2, q1 | |
550 pand t3, mpb_1 | |
551 psubb t2, t3 | |
552 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 | |
553 | |
554 pxor t1, t2 | |
555 pxor t2, p0 | |
556 pand t1, mask1p | |
557 pand t2, mask0 | |
558 pxor t1, t2 | |
559 pxor t1, p0 | |
560 mova %1, t1 ; store p0 | |
561 | |
562 mova t1, %4 ; p3 | |
563 mova t2, t1 | |
564 pavgb t1, p2 | |
565 paddb t2, p2 | |
566 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 | |
567 paddb t2, t2 | |
568 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 | |
569 psrlw t2, 2 | |
570 pavgb t2, mpb_0 | |
571 pxor t2, t1 | |
572 pand t2, mpb_1 | |
573 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 | |
574 | |
575 pxor t0, p1 | |
576 pxor t1, p2 | |
577 pand t0, mask1p | |
578 pand t1, mask1p | |
579 pxor t0, p1 | |
580 pxor t1, p2 | |
581 mova %2, t0 ; store p1 | |
582 mova %3, t1 ; store p2 | |
583 %endmacro | |
584 | |
585 %macro LUMA_INTRA_SWAP_PQ 0 | |
586 %define q1 m0 | |
587 %define q0 m1 | |
588 %define p0 m2 | |
589 %define p1 m3 | |
590 %define p2 q2 | |
591 %define mask1p mask1q | |
592 %endmacro | |
593 | |
594 %macro DEBLOCK_LUMA_INTRA 2 | |
595 %define p1 m0 | |
596 %define p0 m1 | |
597 %define q0 m2 | |
598 %define q1 m3 | |
599 %define t0 m4 | |
600 %define t1 m5 | |
601 %define t2 m6 | |
602 %define t3 m7 | |
603 %ifdef ARCH_X86_64 | |
604 %define p2 m8 | |
605 %define q2 m9 | |
606 %define t4 m10 | |
607 %define t5 m11 | |
608 %define mask0 m12 | |
609 %define mask1p m13 | |
610 %define mask1q [rsp-24] | |
611 %define mpb_0 m14 | |
612 %define mpb_1 m15 | |
613 %else | |
614 %define spill(x) [esp+16*x+((stack_offset+4)&15)] | |
615 %define p2 [r4+r1] | |
616 %define q2 [r0+2*r1] | |
617 %define t4 spill(0) | |
618 %define t5 spill(1) | |
619 %define mask0 spill(2) | |
620 %define mask1p spill(3) | |
621 %define mask1q spill(4) | |
622 %define mpb_0 [pb_0] | |
623 %define mpb_1 [pb_1] | |
624 %endif | |
625 | |
626 ;----------------------------------------------------------------------------- | |
627 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) | |
628 ;----------------------------------------------------------------------------- | |
629 cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 | |
630 %ifndef ARCH_X86_64 | |
631 sub esp, 0x60 | |
632 %endif | |
633 lea r4, [r1*4] | |
634 lea r5, [r1*3] ; 3*stride | |
635 dec r2d ; alpha-1 | |
636 jl .end | |
637 neg r4 | |
638 dec r3d ; beta-1 | |
639 jl .end | |
640 add r4, r0 ; pix-4*stride | |
641 mova p1, [r4+2*r1] | |
642 mova p0, [r4+r5] | |
643 mova q0, [r0] | |
644 mova q1, [r0+r1] | |
645 %ifdef ARCH_X86_64 | |
646 pxor mpb_0, mpb_0 | |
647 mova mpb_1, [pb_1] | |
648 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 | |
649 SWAP 7, 12 ; m12=mask0 | |
650 pavgb t5, mpb_0 | |
651 pavgb t5, mpb_1 ; alpha/4+1 | |
652 movdqa p2, [r4+r1] | |
653 movdqa q2, [r0+2*r1] | |
654 DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 | |
655 DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 | |
656 DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 | |
657 pand t0, mask0 | |
658 pand t4, t0 | |
659 pand t2, t0 | |
660 mova mask1q, t4 | |
661 mova mask1p, t2 | |
662 %else | |
663 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 | |
664 mova m4, t5 | |
665 mova mask0, m7 | |
666 pavgb m4, [pb_0] | |
667 pavgb m4, [pb_1] ; alpha/4+1 | |
668 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 | |
669 pand m6, mask0 | |
670 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 | |
671 pand m4, m6 | |
672 mova mask1p, m4 | |
673 DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 | |
674 pand m4, m6 | |
675 mova mask1q, m4 | |
676 %endif | |
677 LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] | |
678 LUMA_INTRA_SWAP_PQ | |
679 LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] | |
680 .end: | |
681 %ifndef ARCH_X86_64 | |
682 add esp, 0x60 | |
683 %endif | |
684 RET | |
685 | |
686 INIT_MMX | |
687 %ifdef ARCH_X86_64 | |
688 ;----------------------------------------------------------------------------- | |
689 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) | |
690 ;----------------------------------------------------------------------------- | |
691 cglobal x264_deblock_h_luma_intra_%1, 4,7 | |
692 movsxd r10, r1d | |
693 lea r11, [r10*3] | |
694 lea r6, [r0-4] | |
695 lea r5, [r0-4+r11] | |
696 sub rsp, 0x88 | |
697 %define pix_tmp rsp | |
698 | |
699 ; transpose 8x16 -> tmp space | |
700 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) | |
701 lea r6, [r6+r10*8] | |
702 lea r5, [r5+r10*8] | |
703 TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) | |
704 | |
705 lea r0, [pix_tmp+0x40] | |
706 mov r1, 0x10 | |
707 call x264_deblock_v_luma_intra_%1 | |
708 | |
709 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) | |
710 lea r5, [r6+r11] | |
711 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) | |
712 shl r10, 3 | |
713 sub r6, r10 | |
714 sub r5, r10 | |
715 shr r10, 3 | |
716 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) | |
717 add rsp, 0x88 | |
718 RET | |
719 %else | |
720 cglobal x264_deblock_h_luma_intra_%1, 2,4 | |
721 lea r3, [r1*3] | |
722 sub r0, 4 | |
723 lea r2, [r0+r3] | |
724 %assign pad 0x8c-(stack_offset&15) | |
725 SUB rsp, pad | |
726 %define pix_tmp rsp | |
727 | |
728 ; transpose 8x16 -> tmp space | |
729 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) | |
730 lea r0, [r0+r1*8] | |
731 lea r2, [r2+r1*8] | |
732 TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) | |
733 | |
734 lea r0, [pix_tmp+0x40] | |
735 PUSH dword r3m | |
736 PUSH dword r2m | |
737 PUSH dword 16 | |
738 PUSH r0 | |
739 call x264_deblock_%2_luma_intra_%1 | |
740 %ifidn %2, v8 | |
741 add dword [rsp], 8 ; pix_tmp+8 | |
742 call x264_deblock_%2_luma_intra_%1 | |
743 %endif | |
744 ADD esp, 16 | |
745 | |
746 mov r1, r1m | |
747 mov r0, r0mp | |
748 lea r3, [r1*3] | |
749 sub r0, 4 | |
750 lea r2, [r0+r3] | |
751 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) | |
752 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) | |
753 lea r0, [r0+r1*8] | |
754 lea r2, [r2+r1*8] | |
755 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) | |
756 ADD rsp, pad | |
757 RET | |
758 %endif ; ARCH_X86_64 | |
759 %endmacro ; DEBLOCK_LUMA_INTRA | |
760 | |
761 INIT_XMM | |
762 DEBLOCK_LUMA_INTRA sse2, v | |
763 %ifndef ARCH_X86_64 | |
764 INIT_MMX | |
765 DEBLOCK_LUMA_INTRA mmxext, v8 | |
766 %endif | |
767 | |
768 | |
769 | |
770 INIT_MMX | |
771 | |
772 %macro CHROMA_V_START 0 | |
773 dec r2d ; alpha-1 | |
774 dec r3d ; beta-1 | |
775 mov t5, r0 | |
776 sub t5, r1 | |
777 sub t5, r1 | |
778 %endmacro | |
779 | |
780 %macro CHROMA_H_START 0 | |
781 dec r2d | |
782 dec r3d | |
783 sub r0, 2 | |
784 lea t6, [r1*3] | |
785 mov t5, r0 | |
786 add r0, t6 | |
787 %endmacro | |
788 | |
789 %define t5 r5 | |
790 %define t6 r6 | |
791 | |
792 ;----------------------------------------------------------------------------- | |
793 ; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | |
794 ;----------------------------------------------------------------------------- | |
795 cglobal x264_deblock_v_chroma_mmxext, 5,6 | |
796 CHROMA_V_START | |
797 movq m0, [t5] | |
798 movq m1, [t5+r1] | |
799 movq m2, [r0] | |
800 movq m3, [r0+r1] | |
801 call x264_chroma_inter_body_mmxext | |
802 movq [t5+r1], m1 | |
803 movq [r0], m2 | |
804 RET | |
805 | |
806 ;----------------------------------------------------------------------------- | |
807 ; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | |
808 ;----------------------------------------------------------------------------- | |
809 cglobal x264_deblock_h_chroma_mmxext, 5,7 | |
810 %ifdef ARCH_X86_64 | |
811 %define buf0 [rsp-24] | |
812 %define buf1 [rsp-16] | |
813 %else | |
814 %define buf0 r0m | |
815 %define buf1 r2m | |
816 %endif | |
817 CHROMA_H_START | |
818 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) | |
819 movq buf0, m0 | |
820 movq buf1, m3 | |
821 call x264_chroma_inter_body_mmxext | |
822 movq m0, buf0 | |
823 movq m3, buf1 | |
824 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) | |
825 RET | |
826 | |
827 ALIGN 16 | |
828 x264_chroma_inter_body_mmxext: | |
829 LOAD_MASK r2d, r3d | |
830 movd m6, [r4] ; tc0 | |
831 punpcklbw m6, m6 | |
832 pand m7, m6 | |
833 DEBLOCK_P0_Q0 | |
834 ret | |
835 | |
836 | |
837 | |
838 ; in: %1=p0 %2=p1 %3=q1 | |
839 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 | |
840 %macro CHROMA_INTRA_P0 3 | |
841 movq m4, %1 | |
842 pxor m4, %3 | |
843 pand m4, [pb_1] ; m4 = (p0^q1)&1 | |
844 pavgb %1, %3 | |
845 psubusb %1, m4 | |
846 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) | |
847 %endmacro | |
848 | |
849 %define t5 r4 | |
850 %define t6 r5 | |
851 | |
852 ;----------------------------------------------------------------------------- | |
853 ; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) | |
854 ;----------------------------------------------------------------------------- | |
855 cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 | |
856 CHROMA_V_START | |
857 movq m0, [t5] | |
858 movq m1, [t5+r1] | |
859 movq m2, [r0] | |
860 movq m3, [r0+r1] | |
861 call x264_chroma_intra_body_mmxext | |
862 movq [t5+r1], m1 | |
863 movq [r0], m2 | |
864 RET | |
865 | |
866 ;----------------------------------------------------------------------------- | |
867 ; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) | |
868 ;----------------------------------------------------------------------------- | |
869 cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 | |
870 CHROMA_H_START | |
871 TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) | |
872 call x264_chroma_intra_body_mmxext | |
873 TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) | |
874 RET | |
875 | |
876 ALIGN 16 | |
877 x264_chroma_intra_body_mmxext: | |
878 LOAD_MASK r2d, r3d | |
879 movq m5, m1 | |
880 movq m6, m2 | |
881 CHROMA_INTRA_P0 m1, m0, m3 | |
882 CHROMA_INTRA_P0 m2, m3, m0 | |
883 psubb m1, m5 | |
884 psubb m2, m6 | |
885 pand m1, m7 | |
886 pand m2, m7 | |
887 paddb m1, m5 | |
888 paddb m2, m6 | |
889 ret |