Mercurial > libavcodec.hg
annotate i386/dsputil_h264_template_ssse3.c @ 8245:7e6ca1be9e40 libavcodec
Fix reading out of buffer during RV30/40 deblock mask calculation
author | kostya |
---|---|
date | Tue, 02 Dec 2008 18:14:53 +0000 |
parents | eebc7209c47f |
children |
rev | line source |
---|---|
6557 | 1 /* |
2 * Copyright (c) 2008 Loren Merritt | |
3 * | |
4 * This file is part of FFmpeg. | |
5 * | |
6 * FFmpeg is free software; you can redistribute it and/or | |
7 * modify it under the terms of the GNU Lesser General Public | |
8 * License as published by the Free Software Foundation; either | |
9 * version 2.1 of the License, or (at your option) any later version. | |
10 * | |
11 * FFmpeg is distributed in the hope that it will be useful, | |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with FFmpeg; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
19 */ | |
20 | |
21 /** | |
22 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. | |
23 * H264_CHROMA_MC8_TMPL must be defined to the desired function name | |
24 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function | |
25 * AVG_OP must be defined to empty for put and the identity for avg | |
26 */ | |
27 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) | |
28 { | |
29 if(y==0 && x==0) { | |
30 /* no filter needed */ | |
31 H264_CHROMA_MC8_MV0(dst, src, stride, h); | |
32 return; | |
33 } | |
34 | |
35 assert(x<8 && y<8 && x>=0 && y>=0); | |
36 | |
37 if(y==0 || x==0) | |
38 { | |
39 /* 1 dimensional filter only */ | |
8031 | 40 __asm__ volatile( |
6557 | 41 "movd %0, %%xmm7 \n\t" |
42 "movq %1, %%xmm6 \n\t" | |
43 "pshuflw $0, %%xmm7, %%xmm7 \n\t" | |
44 "movlhps %%xmm6, %%xmm6 \n\t" | |
45 "movlhps %%xmm7, %%xmm7 \n\t" | |
6575
d869966e57e5
Fix H.264 interframe decoding when compiling with icc. Patch by Loren
melanson
parents:
6557
diff
changeset
|
46 :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3)) |
d869966e57e5
Fix H.264 interframe decoding when compiling with icc. Patch by Loren
melanson
parents:
6557
diff
changeset
|
47 ); |
6557 | 48 |
49 if(x) { | |
8031 | 50 __asm__ volatile( |
6557 | 51 "1: \n\t" |
52 "movq (%1), %%xmm0 \n\t" | |
53 "movq 1(%1), %%xmm1 \n\t" | |
54 "movq (%1,%3), %%xmm2 \n\t" | |
55 "movq 1(%1,%3), %%xmm3 \n\t" | |
56 "punpcklbw %%xmm1, %%xmm0 \n\t" | |
57 "punpcklbw %%xmm3, %%xmm2 \n\t" | |
58 "pmaddubsw %%xmm7, %%xmm0 \n\t" | |
59 "pmaddubsw %%xmm7, %%xmm2 \n\t" | |
60 AVG_OP("movq (%0), %%xmm4 \n\t") | |
61 AVG_OP("movhps (%0,%3), %%xmm4 \n\t") | |
62 "paddw %%xmm6, %%xmm0 \n\t" | |
63 "paddw %%xmm6, %%xmm2 \n\t" | |
64 "psrlw $3, %%xmm0 \n\t" | |
65 "psrlw $3, %%xmm2 \n\t" | |
66 "packuswb %%xmm2, %%xmm0 \n\t" | |
67 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") | |
68 "movq %%xmm0, (%0) \n\t" | |
69 "movhps %%xmm0, (%0,%3) \n\t" | |
70 "sub $2, %2 \n\t" | |
71 "lea (%1,%3,2), %1 \n\t" | |
72 "lea (%0,%3,2), %0 \n\t" | |
73 "jg 1b \n\t" | |
74 :"+r"(dst), "+r"(src), "+r"(h) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6575
diff
changeset
|
75 :"r"((x86_reg)stride) |
6557 | 76 ); |
77 } else { | |
8031 | 78 __asm__ volatile( |
6557 | 79 "1: \n\t" |
80 "movq (%1), %%xmm0 \n\t" | |
81 "movq (%1,%3), %%xmm1 \n\t" | |
82 "movdqa %%xmm1, %%xmm2 \n\t" | |
83 "movq (%1,%3,2), %%xmm3 \n\t" | |
84 "punpcklbw %%xmm1, %%xmm0 \n\t" | |
85 "punpcklbw %%xmm3, %%xmm2 \n\t" | |
86 "pmaddubsw %%xmm7, %%xmm0 \n\t" | |
87 "pmaddubsw %%xmm7, %%xmm2 \n\t" | |
88 AVG_OP("movq (%0), %%xmm4 \n\t") | |
89 AVG_OP("movhps (%0,%3), %%xmm4 \n\t") | |
90 "paddw %%xmm6, %%xmm0 \n\t" | |
91 "paddw %%xmm6, %%xmm2 \n\t" | |
92 "psrlw $3, %%xmm0 \n\t" | |
93 "psrlw $3, %%xmm2 \n\t" | |
94 "packuswb %%xmm2, %%xmm0 \n\t" | |
95 AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") | |
96 "movq %%xmm0, (%0) \n\t" | |
97 "movhps %%xmm0, (%0,%3) \n\t" | |
98 "sub $2, %2 \n\t" | |
99 "lea (%1,%3,2), %1 \n\t" | |
100 "lea (%0,%3,2), %0 \n\t" | |
101 "jg 1b \n\t" | |
102 :"+r"(dst), "+r"(src), "+r"(h) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6575
diff
changeset
|
103 :"r"((x86_reg)stride) |
6557 | 104 ); |
105 } | |
106 return; | |
107 } | |
108 | |
109 /* general case, bilinear */ | |
8031 | 110 __asm__ volatile( |
6557 | 111 "movd %0, %%xmm7 \n\t" |
112 "movd %1, %%xmm6 \n\t" | |
113 "movdqa %2, %%xmm5 \n\t" | |
114 "pshuflw $0, %%xmm7, %%xmm7 \n\t" | |
115 "pshuflw $0, %%xmm6, %%xmm6 \n\t" | |
116 "movlhps %%xmm7, %%xmm7 \n\t" | |
117 "movlhps %%xmm6, %%xmm6 \n\t" | |
6575
d869966e57e5
Fix H.264 interframe decoding when compiling with icc. Patch by Loren
melanson
parents:
6557
diff
changeset
|
118 :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) |
6557 | 119 ); |
120 | |
8031 | 121 __asm__ volatile( |
6557 | 122 "movq (%1), %%xmm0 \n\t" |
123 "movq 1(%1), %%xmm1 \n\t" | |
124 "punpcklbw %%xmm1, %%xmm0 \n\t" | |
125 "add %3, %1 \n\t" | |
126 "1: \n\t" | |
127 "movq (%1), %%xmm1 \n\t" | |
128 "movq 1(%1), %%xmm2 \n\t" | |
129 "movq (%1,%3), %%xmm3 \n\t" | |
130 "movq 1(%1,%3), %%xmm4 \n\t" | |
131 "lea (%1,%3,2), %1 \n\t" | |
132 "punpcklbw %%xmm2, %%xmm1 \n\t" | |
133 "punpcklbw %%xmm4, %%xmm3 \n\t" | |
134 "movdqa %%xmm1, %%xmm2 \n\t" | |
135 "movdqa %%xmm3, %%xmm4 \n\t" | |
136 "pmaddubsw %%xmm7, %%xmm0 \n\t" | |
137 "pmaddubsw %%xmm6, %%xmm1 \n\t" | |
138 "pmaddubsw %%xmm7, %%xmm2 \n\t" | |
139 "pmaddubsw %%xmm6, %%xmm3 \n\t" | |
140 "paddw %%xmm5, %%xmm0 \n\t" | |
141 "paddw %%xmm5, %%xmm2 \n\t" | |
142 "paddw %%xmm0, %%xmm1 \n\t" | |
143 "paddw %%xmm2, %%xmm3 \n\t" | |
144 "movdqa %%xmm4, %%xmm0 \n\t" | |
145 "psrlw $6, %%xmm1 \n\t" | |
146 "psrlw $6, %%xmm3 \n\t" | |
147 AVG_OP("movq (%0), %%xmm2 \n\t") | |
148 AVG_OP("movhps (%0,%3), %%xmm2 \n\t") | |
149 "packuswb %%xmm3, %%xmm1 \n\t" | |
150 AVG_OP("pavgb %%xmm2, %%xmm1 \n\t") | |
151 "movq %%xmm1, (%0)\n\t" | |
152 "movhps %%xmm1, (%0,%3)\n\t" | |
153 "sub $2, %2 \n\t" | |
154 "lea (%0,%3,2), %0 \n\t" | |
155 "jg 1b \n\t" | |
156 :"+r"(dst), "+r"(src), "+r"(h) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6575
diff
changeset
|
157 :"r"((x86_reg)stride) |
6557 | 158 ); |
159 } | |
160 | |
161 static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) | |
162 { | |
8031 | 163 __asm__ volatile( |
6557 | 164 "movd %0, %%mm7 \n\t" |
165 "movd %1, %%mm6 \n\t" | |
166 "movq %2, %%mm5 \n\t" | |
167 "pshufw $0, %%mm7, %%mm7 \n\t" | |
168 "pshufw $0, %%mm6, %%mm6 \n\t" | |
169 :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) | |
170 ); | |
171 | |
8031 | 172 __asm__ volatile( |
6557 | 173 "movd (%1), %%mm0 \n\t" |
174 "punpcklbw 1(%1), %%mm0 \n\t" | |
175 "add %3, %1 \n\t" | |
176 "1: \n\t" | |
177 "movd (%1), %%mm1 \n\t" | |
178 "movd (%1,%3), %%mm3 \n\t" | |
179 "punpcklbw 1(%1), %%mm1 \n\t" | |
180 "punpcklbw 1(%1,%3), %%mm3 \n\t" | |
181 "lea (%1,%3,2), %1 \n\t" | |
182 "movq %%mm1, %%mm2 \n\t" | |
183 "movq %%mm3, %%mm4 \n\t" | |
184 "pmaddubsw %%mm7, %%mm0 \n\t" | |
185 "pmaddubsw %%mm6, %%mm1 \n\t" | |
186 "pmaddubsw %%mm7, %%mm2 \n\t" | |
187 "pmaddubsw %%mm6, %%mm3 \n\t" | |
188 "paddw %%mm5, %%mm0 \n\t" | |
189 "paddw %%mm5, %%mm2 \n\t" | |
190 "paddw %%mm0, %%mm1 \n\t" | |
191 "paddw %%mm2, %%mm3 \n\t" | |
192 "movq %%mm4, %%mm0 \n\t" | |
193 "psrlw $6, %%mm1 \n\t" | |
194 "psrlw $6, %%mm3 \n\t" | |
195 "packuswb %%mm1, %%mm1 \n\t" | |
196 "packuswb %%mm3, %%mm3 \n\t" | |
197 AVG_OP("pavgb (%0), %%mm1 \n\t") | |
198 AVG_OP("pavgb (%0,%3), %%mm3 \n\t") | |
199 "movd %%mm1, (%0)\n\t" | |
200 "movd %%mm3, (%0,%3)\n\t" | |
201 "sub $2, %2 \n\t" | |
202 "lea (%0,%3,2), %0 \n\t" | |
203 "jg 1b \n\t" | |
204 :"+r"(dst), "+r"(src), "+r"(h) | |
6755
33896780c612
Do not misuse long as the size of a register in x86.
ramiro
parents:
6575
diff
changeset
|
205 :"r"((x86_reg)stride) |
6557 | 206 ); |
207 } | |
208 |