x86/vp56dsp.asm @ 12418:e17840120b80 (libavcodec)

Move vp6_filter_diag4() x86 SIMD code from inline ASM to YASM. This should
help in fixing the Win64 fate failures.

author:     rbultje
date:       Wed, 25 Aug 2010 13:44:16 +0000
children:   37bb4de77908
comparison: 12417:9f06475db098 -> 12418:e17840120b80
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

cextern pw_64

SECTION .text

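; DIAG4_MMX  base, off0, off1, off2, off3, dst
; Applies the 4-tap filter to 8 pixels: for each x,
;   dst[x] = clip_uint8((p[off0]*w0 + p[off1]*w1 + p[off2]*w2 + p[off3]*w3 + 64) >> 7)
; with p = base + x and the word-replicated weights w0..w3 read from the
; stack slots rsp+8*11..rsp+8*14 filled by SPLAT4REGS_MMX. MMX registers are
; 64 bits wide, so the 8 pixels are widened to words as two 4-pixel halves
; (m0 low, m3 high), filtered separately and repacked with packuswb.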
%macro DIAG4_MMX 6
    movq      m0, [%1+%2]
    movq      m1, [%1+%3]
    movq      m3, m0
    movq      m4, m1
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpckhbw m3, m7
    punpckhbw m4, m7
    pmullw    m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw    m1, [rsp+8*12] ; src[x   ] * biweight [1]
    pmullw    m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw    m4, [rsp+8*12] ; src[x   ] * biweight [1]
    paddw     m0, m1
    paddw     m3, m4
    movq      m1, [%1+%4]
    movq      m2, [%1+%5]
    movq      m4, m1
    movq      m5, m2
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m4, m7
    punpckhbw m5, m7
    pmullw    m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw    m2, [rsp+8*14] ; src[x+16] * biweight [3]
    pmullw    m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw    m5, [rsp+8*14] ; src[x+16] * biweight [3]
    paddw     m1, m2
    paddw     m4, m5
    paddsw    m0, m1
    paddsw    m3, m4
    paddsw    m0, m6         ; Add 64
    paddsw    m3, m6         ; Add 64
    psraw     m0, 7
    psraw     m3, 7
    packuswb  m0, m3
    movq    [%6], m0
%endmacro

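; DIAG4_SSE2  base, off0, off1, off2, off3, dst
; Same computation as DIAG4_MMX, but a 128-bit register holds all 8 source
; pixels widened to words, so a single pass suffices. The weights are kept
; in m4/m5/m6/m3 (see SPLAT4REGS_SSE2); the +64 rounding constant is loaded
; from memory because all eight available mmregs are already in use.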
%macro DIAG4_SSE2 6
    movq      m0, [%1+%2]
    movq      m1, [%1+%3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw    m0, m4         ; src[x-8 ] * biweight [0]
    pmullw    m1, m5         ; src[x   ] * biweight [1]
    paddw     m0, m1
    movq      m1, [%1+%4]
    movq      m2, [%1+%5]
    punpcklbw m1, m7
    punpcklbw m2, m7
    pmullw    m1, m6         ; src[x+8 ] * biweight [2]
    pmullw    m2, m3         ; src[x+16] * biweight [3]
    paddw     m1, m2
    paddsw    m0, m1
    paddsw    m0, [pw_64]    ; Add 64
    psraw     m0, 7
    packuswb  m0, m0
    movq    [%6], m0
%endmacro

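; SPLAT4REGS_MMX: expands the four packed 16-bit weights in m3 so that each
; weight fills all four words of its own register, then spills them to the
; scratch slots rsp+8*11..rsp+8*14 (w0..w3) that DIAG4_MMX reads; MMX has
; too few registers to keep the weights resident during filtering.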
%macro SPLAT4REGS_MMX 0
    movq       m5, m3
    punpcklwd  m3, m3
    movq       m4, m3
    punpckldq  m3, m3
    punpckhdq  m4, m4
    punpckhwd  m5, m5
    movq       m6, m5
    punpckhdq  m6, m6
    punpckldq  m5, m5
    movq [rsp+8*11], m3
    movq [rsp+8*12], m4
    movq [rsp+8*13], m5
    movq [rsp+8*14], m6
%endmacro

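; SPLAT4REGS_SSE2: same idea, but each weight is broadcast across a full XMM
; register (pshuflw replicates it within the low quadword, punpcklqdq copies
; that into the high one) and stays in m4/m5/m6/m3 instead of being spilled.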
%macro SPLAT4REGS_SSE2 0
    pshuflw    m4, m3, 0x0
    pshuflw    m5, m3, 0x55
    pshuflw    m6, m3, 0xAA
    pshuflw    m3, m3, 0xFF
    punpcklqdq m4, m4
    punpcklqdq m5, m5
    punpcklqdq m6, m6
    punpcklqdq m3, m3
%endmacro

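; vp6_filter_diag4 runs the 4-tap filter twice: a horizontal pass over 11
; source rows (starting one row above src) into an 8x11 byte block in the
; aligned stack scratch space, then a vertical pass over that block into
; dst. Ignoring the saturating 16-bit intermediates, a C sketch of the
; result (variable names here are illustrative, not from the source):
;
;   for (y = 0; y < 11; y++)  // horizontal pass, taps at x-1 .. x+2
;       for (x = 0; x < 8; x++)
;           tmp[y][x] = clip_uint8((src[y-1][x-1]*h[0] + src[y-1][x  ]*h[1] +
;                                   src[y-1][x+1]*h[2] + src[y-1][x+2]*h[3] + 64) >> 7);
;   for (y = 0; y < 8; y++)   // vertical pass, taps at rows y .. y+3
;       for (x = 0; x < 8; x++)
;           dst[y][x] = clip_uint8((tmp[y  ][x]*v[0] + tmp[y+1][x]*v[1] +
;                                   tmp[y+2][x]*v[2] + tmp[y+3][x]*v[3] + 64) >> 7);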
%macro vp6_filter_diag4 2
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                const int16_t h_weights[4], const int16_t v_weights[4])
cglobal vp6_filter_diag4_%1, 5, 7, %2
    mov  r5, rsp              ; backup stack pointer
    and  rsp, ~(mmsize-1)     ; align stack
%ifidn %1, sse2
    sub  rsp, 8*11
%else
    sub  rsp, 8*15
    movq m6, [pw_64]
%endif

    sub  r1, r2

    pxor m7, m7
    movq m3, [r3]
    SPLAT4REGS

    mov  r3, rsp
    mov  r6, 11
.nextrow:
    DIAG4 r1, -1, 0, 1, 2, r3
    add  r3, 8
    add  r1, r2
    dec  r6
    jnz .nextrow

    movq m3, [r4]
    SPLAT4REGS

    lea  r3, [rsp+8]
    mov  r6, 8
.nextcol:
    DIAG4 r3, -8, 0, 8, 16, r0
    add  r3, 8
    add  r0, r2
    dec  r6
    jnz .nextcol

    mov  rsp, r5              ; restore stack pointer
    RET
%endmacro

INIT_MMX
%define DIAG4      DIAG4_MMX
%define SPLAT4REGS SPLAT4REGS_MMX
vp6_filter_diag4 mmx,  0

INIT_XMM
%define DIAG4      DIAG4_SSE2
%define SPLAT4REGS SPLAT4REGS_SSE2
vp6_filter_diag4 sse2, 8
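; The second argument of vp6_filter_diag4 (0 for MMX, 8 for SSE2) is passed
; through to cglobal as the number of XMM registers the function uses. On
; Win64, where xmm6 and up are callee-saved, x86inc uses that count to emit
; the required save/restore code, which is what moving this function out of
; inline ASM makes possible.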