comparison: i386/dsputil_h264_template_mmx.c @ 2922:d772011258ec (libavcodec)
faster h264_chroma_mc8_mmx, added h264_chroma_mc4_mmx.
2-4% overall speedup.
author | lorenm
---|---
date | Thu, 27 Oct 2005 06:45:29 +0000
parents | a49f140179e9
children | 0b546eab515d
2921:d22a3556292a | 2922:d772011258ec
---|---
1 /* | 1 /* |
2 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com> | 2 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, |
3 * Loren Merritt | |
3 * | 4 * |
4 * This library is free software; you can redistribute it and/or | 5 * This library is free software; you can redistribute it and/or |
5 * modify it under the terms of the GNU Lesser General Public | 6 * modify it under the terms of the GNU Lesser General Public |
6 * License as published by the Free Software Foundation; either | 7 * License as published by the Free Software Foundation; either |
7 * version 2 of the License, or (at your option) any later version. | 8 * version 2 of the License, or (at your option) any later version. |
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 */ | 18 */ |
18 | 19 |
19 /** | 20 /** |
20 * MMX optimized version of (put|avg)_h264_chroma_mc8. | 21 * MMX optimized version of (put|avg)_h264_chroma_mc8. |
21 * H264_CHROMA_MC8_TMPL must be defined to the desired function name and | 22 * H264_CHROMA_MC8_TMPL must be defined to the desired function name |
22 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg. | 23 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg |
24 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function | |
23 */ | 25 */ |
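Editorial note: for orientation, here is a minimal sketch of how the including dsputil file might instantiate this template for the plain-MMX "put" variant. The macro bodies and function names below are assumptions for illustration; the real definitions live in the including file and are not part of this diff.

```c
/* Hypothetical instantiation of the "put" variant (names assumed). */
#define H264_CHROMA_OP(S, D)                   /* empty for put */
#define H264_CHROMA_OP4(S, D, T)               /* empty for put */
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
#define H264_CHROMA_MC8_MV0  put_pixels8_mmx   /* straight 8xh copy */
#include "dsputil_h264_template_mmx.c"
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
```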
24 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) | 26 static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) |
25 { | 27 { |
26 uint64_t AA __align8; | 28 uint64_t AA __align8; |
27 uint64_t DD __align8; | 29 uint64_t DD __align8; |
28 unsigned long srcos = (long)src & 7; | |
29 uint64_t sh1 __align8 = srcos * 8; | |
30 uint64_t sh2 __align8 = 56 - sh1; | |
31 int i; | 30 int i; |
32 | 31 |
32 if(y==0 && x==0) { | |
33 /* no filter needed */ | |
34 H264_CHROMA_MC8_MV0(dst, src, stride, h); | |
35 return; | |
36 } | |
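Editorial note: the early return above is exact, not an approximation. With x == 0 and y == 0 the bilinear weights collapse to A = 8*8 = 64 and B = C = D = 0, so each output pixel is (64*src[j] + 32) >> 6 == src[j], i.e. a plain copy (or a pavgb average in the avg variant), which is precisely what a (put|avg)_pixels8 function computes.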
37 | |
33 assert(x<8 && y<8 && x>=0 && y>=0); | 38 assert(x<8 && y<8 && x>=0 && y>=0); |
34 | 39 |
35 asm volatile("movd %1, %%mm4\n\t" | 40 if(y==0) |
36 "movd %2, %%mm6\n\t" | 41 { |
42 /* horizontal filter only */ | |
43 asm volatile("movd %0, %%mm5\n\t" | |
44 "punpcklwd %%mm5, %%mm5\n\t" | |
45 "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ | |
46 "movq %1, %%mm4\n\t" | |
47 "pxor %%mm7, %%mm7\n\t" | |
48 "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */ | |
49 : : "rm" (x), "m" (ff_pw_8)); | |
50 | |
51 for(i=0; i<h; i++) { | |
52 asm volatile( | |
53 /* mm0 = src[0..7], mm1 = src[1..8] */ | |
54 "movq %0, %%mm0\n\t" | |
55 "movq %1, %%mm1\n\t" | |
56 : : "m" (src[0]), "m" (src[1])); | |
57 | |
58 asm volatile( | |
59 /* [mm2,mm3] = A * src[0..7] */ | |
60 "movq %%mm0, %%mm2\n\t" | |
61 "punpcklbw %%mm7, %%mm2\n\t" | |
62 "pmullw %%mm4, %%mm2\n\t" | |
63 "movq %%mm0, %%mm3\n\t" | |
64 "punpckhbw %%mm7, %%mm3\n\t" | |
65 "pmullw %%mm4, %%mm3\n\t" | |
66 | |
67 /* [mm2,mm3] += B * src[1..8] */ | |
68 "movq %%mm1, %%mm0\n\t" | |
69 "punpcklbw %%mm7, %%mm0\n\t" | |
70 "pmullw %%mm5, %%mm0\n\t" | |
71 "punpckhbw %%mm7, %%mm1\n\t" | |
72 "pmullw %%mm5, %%mm1\n\t" | |
73 "paddw %%mm0, %%mm2\n\t" | |
74 "paddw %%mm1, %%mm3\n\t" | |
75 | |
76 /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3) */ |
77 "paddw %1, %%mm2\n\t" | |
78 "paddw %1, %%mm3\n\t" | |
79 "psrlw $3, %%mm2\n\t" | |
80 "psrlw $3, %%mm3\n\t" | |
81 "packuswb %%mm3, %%mm2\n\t" | |
82 H264_CHROMA_OP(%0, %%mm2) | |
83 "movq %%mm2, %0\n\t" | |
84 : "=m" (dst[0]) : "m" (ff_pw_4)); | |
85 | |
86 src += stride; | |
87 dst += stride; | |
88 } | |
89 return; | |
90 } | |
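Editorial note: as a reading aid, the y == 0 branch above is equivalent to this scalar loop (my sketch, not code from the diff), with per-row weights A = 8-x and B = x, and 1-D rounding of +4 followed by a shift of 3:

```c
/* Scalar model of the horizontal-only (y == 0) path; illustrative. */
for (i = 0; i < h; i++) {
    int j;
    for (j = 0; j < 8; j++)
        dst[j] = ((8 - x) * src[j] + x * src[j + 1] + 4) >> 3;
    src += stride;
    dst += stride;
}
```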
91 | |
92 if(x==0) | |
93 { | |
94 /* vertical filter only */ | |
95 asm volatile("movd %0, %%mm6\n\t" | |
96 "punpcklwd %%mm6, %%mm6\n\t" | |
97 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = C = y */ | |
98 "movq %1, %%mm4\n\t" | |
99 "pxor %%mm7, %%mm7\n\t" | |
100 "psubw %%mm6, %%mm4\n\t" /* mm4 = A = 8-y */ | |
101 : : "rm" (y), "m" (ff_pw_8)); | |
102 | |
103 asm volatile( | |
104 /* mm0 = src[0..7] */ | |
105 "movq %0, %%mm0\n\t" | |
106 : : "m" (src[0])); | |
107 | |
108 for(i=0; i<h; i++) { | |
109 asm volatile( | |
110 /* [mm2,mm3] = A * src[0..7] */ | |
111 "movq %mm0, %mm2\n\t" | |
112 "punpcklbw %mm7, %mm2\n\t" | |
113 "pmullw %mm4, %mm2\n\t" | |
114 "movq %mm0, %mm3\n\t" | |
115 "punpckhbw %mm7, %mm3\n\t" | |
116 "pmullw %mm4, %mm3\n\t"); | |
117 | |
118 src += stride; | |
119 asm volatile( | |
120 /* mm0 = src[0..7] */ | |
121 "movq %0, %%mm0\n\t" | |
122 : : "m" (src[0])); | |
123 | |
124 asm volatile( | |
125 /* [mm2,mm3] += C * src[0..7] */ | |
126 "movq %mm0, %mm1\n\t" | |
127 "punpcklbw %mm7, %mm1\n\t" | |
128 "pmullw %mm6, %mm1\n\t" | |
129 "paddw %mm1, %mm2\n\t" | |
130 "movq %mm0, %mm5\n\t" | |
131 "punpckhbw %mm7, %mm5\n\t" | |
132 "pmullw %mm6, %mm5\n\t" | |
133 "paddw %mm5, %mm3\n\t"); | |
134 | |
135 asm volatile( | |
136 /* dst[0..7] = pack(([mm2,mm3] + 4) >> 3) */ |
137 "paddw %1, %%mm2\n\t" | |
138 "paddw %1, %%mm3\n\t" | |
139 "psrlw $3, %%mm2\n\t" | |
140 "psrlw $3, %%mm3\n\t" | |
141 "packuswb %%mm3, %%mm2\n\t" | |
142 H264_CHROMA_OP(%0, %%mm2) | |
143 "movq %%mm2, %0\n\t" | |
144 : "=m" (dst[0]) : "m" (ff_pw_4)); | |
145 | |
146 dst += stride; | |
147 } | |
148 return; | |
149 } | |
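Editorial note: the x == 0 branch is the vertical mirror of the same idea; note how mm0 is loaded once before the loop and then reloaded mid-iteration, so each row's next load overlaps the previous row's multiplies. The scalar equivalent (again a sketch, not diff code):

```c
/* Scalar model of the vertical-only (x == 0) path; illustrative. */
for (i = 0; i < h; i++) {
    int j;
    for (j = 0; j < 8; j++)
        dst[j] = ((8 - y) * src[j] + y * src[j + stride] + 4) >> 3;
    src += stride;
    dst += stride;
}
```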
150 | |
151 /* general case, bilinear */ | |
152 asm volatile("movd %2, %%mm4\n\t" | |
153 "movd %3, %%mm6\n\t" | |
37 "punpcklwd %%mm4, %%mm4\n\t" | 154 "punpcklwd %%mm4, %%mm4\n\t" |
38 "punpcklwd %%mm6, %%mm6\n\t" | 155 "punpcklwd %%mm6, %%mm6\n\t" |
39 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ | 156 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ |
40 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ | 157 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ |
41 "movq %%mm4, %%mm5\n\t" | 158 "movq %%mm4, %%mm5\n\t" |
42 "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ | 159 "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ |
43 "psllw $3, %%mm5\n\t" | 160 "psllw $3, %%mm5\n\t" |
44 "psllw $3, %%mm6\n\t" | 161 "psllw $3, %%mm6\n\t" |
45 "movq %%mm5, %%mm7\n\t" | 162 "movq %%mm5, %%mm7\n\t" |
46 "paddw %%mm6, %%mm7\n\t" | 163 "paddw %%mm6, %%mm7\n\t" |
47 "movq %%mm4, %0\n\t" /* DD = x * y */ | 164 "movq %%mm4, %1\n\t" /* DD = x * y */ |
48 "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ | 165 "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ |
49 "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ | 166 "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ |
50 "paddw %3, %%mm4\n\t" | 167 "paddw %4, %%mm4\n\t" |
51 "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ | 168 "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ |
52 "pxor %%mm7, %%mm7\n\t" | 169 "pxor %%mm7, %%mm7\n\t" |
53 : "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); | 170 "movq %%mm4, %0\n\t" |
54 | 171 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); |
55 asm volatile("movq %%mm4, %0" : "=m" (AA)); | 172 |
56 | |
57 src -= srcos; | |
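Editorial note: the register comments in the setup above follow from expanding the standard H.264 bilinear chroma weights, which is why the psubw/paddw sequence produces the right coefficients. A small check of that algebra (illustrative C, not part of the diff):

```c
/* A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, expanded as used above. */
int A = (8 - x) * (8 - y);   /* == x*y - (8*x + 8*y) + 64 */
int B = x * (8 - y);         /* == 8*x - x*y */
int C = (8 - x) * y;         /* == 8*y - x*y */
int D = x * y;
assert(A + B + C + D == 64); /* hence the +32, >>6 rounding below */
```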
58 asm volatile( | 173 asm volatile( |
59 /* mm0 = src[0..7], mm1 = src[1..8] */ | 174 /* mm0 = src[0..7], mm1 = src[1..8] */ |
60 "movq %0, %%mm1\n\t" | 175 "movq %0, %%mm0\n\t" |
61 "movq %1, %%mm0\n\t" | 176 "movq %1, %%mm1\n\t" |
62 "psrlq %2, %%mm1\n\t" | 177 : : "m" (src[0]), "m" (src[1])); |
63 "psllq %3, %%mm0\n\t" | |
64 "movq %%mm0, %%mm4\n\t" | |
65 "psllq $8, %%mm0\n\t" | |
66 "por %%mm1, %%mm0\n\t" | |
67 "psrlq $8, %%mm1\n\t" | |
68 "por %%mm4, %%mm1\n\t" | |
69 : : "m" (src[0]), "m" (src[8]), "m" (sh1), "m" (sh2)); | |
70 | 178 |
71 for(i=0; i<h; i++) { | 179 for(i=0; i<h; i++) { |
72 asm volatile( | 180 asm volatile( |
73 /* [mm2,mm3] = A * src[0..7] */ | 181 /* [mm2,mm3] = A * src[0..7] */ |
74 "movq %%mm0, %%mm2\n\t" | 182 "movq %%mm0, %%mm2\n\t" |
89 : : "m" (AA)); | 197 : : "m" (AA)); |
90 | 198 |
91 src += stride; | 199 src += stride; |
92 asm volatile( | 200 asm volatile( |
93 /* mm0 = src[0..7], mm1 = src[1..8] */ | 201 /* mm0 = src[0..7], mm1 = src[1..8] */ |
94 "movq %0, %%mm1\n\t" | 202 "movq %0, %%mm0\n\t" |
95 "movq %1, %%mm0\n\t" | 203 "movq %1, %%mm1\n\t" |
96 "psrlq %2, %%mm1\n\t" | 204 : : "m" (src[0]), "m" (src[1])); |
97 "psllq %3, %%mm0\n\t" | |
98 "movq %%mm0, %%mm4\n\t" | |
99 "psllq $8, %%mm0\n\t" | |
100 "por %%mm1, %%mm0\n\t" | |
101 "psrlq $8, %%mm1\n\t" | |
102 "por %%mm4, %%mm1\n\t" | |
103 : : "m" (src[0]), "m" (src[8]), "m" (sh1), "m" (sh2)); | |
104 | 205 |
105 asm volatile( | 206 asm volatile( |
106 /* [mm2,mm3] += C * src[0..7] */ | 207 /* [mm2,mm3] += C * src[0..7] */ |
107 "movq %mm0, %mm4\n\t" | 208 "movq %mm0, %mm4\n\t" |
108 "punpcklbw %mm7, %mm4\n\t" | 209 "punpcklbw %mm7, %mm4\n\t" |
136 "movq %%mm2, %0\n\t" | 237 "movq %%mm2, %0\n\t" |
137 : "=m" (dst[0]) : "m" (ff_pw_32)); | 238 : "=m" (dst[0]) : "m" (ff_pw_32)); |
138 dst+= stride; | 239 dst+= stride; |
139 } | 240 } |
140 } | 241 } |
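Editorial note: two things stand out in the rewritten general case. First, the old code emulated the unaligned src[1..8] load with an aligned movq plus a psrlq/psllq/por shift-and-merge sequence (hence srcos, sh1, sh2), whereas the new code simply issues two movq loads and lets the CPU handle misalignment. Second, the whole function computes this scalar reference (my sketch of the usual H.264 chroma interpolation formula):

```c
/* Scalar reference for the general bilinear 8-wide case; illustrative. */
static void chroma_mc8_ref(uint8_t *dst, uint8_t *src, int stride,
                           int h, int x, int y)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y);
    const int C = (8 - x) * y,       D = x * y;
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        src += stride;
        dst += stride;
    }
}
```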
242 | |
243 static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) | |
244 { | |
245 uint64_t AA __align8; | |
246 uint64_t DD __align8; | |
247 int i; | |
248 | |
249 /* no special case for mv=(0,0) in 4x*, since it's much less common than in 8x*. | |
250 * could still save a few cycles, but maybe not worth the complexity. */ | |
251 | |
252 assert(x<8 && y<8 && x>=0 && y>=0); | |
253 | |
254 asm volatile("movd %2, %%mm4\n\t" | |
255 "movd %3, %%mm6\n\t" | |
256 "punpcklwd %%mm4, %%mm4\n\t" | |
257 "punpcklwd %%mm6, %%mm6\n\t" | |
258 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ | |
259 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ | |
260 "movq %%mm4, %%mm5\n\t" | |
261 "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ | |
262 "psllw $3, %%mm5\n\t" | |
263 "psllw $3, %%mm6\n\t" | |
264 "movq %%mm5, %%mm7\n\t" | |
265 "paddw %%mm6, %%mm7\n\t" | |
266 "movq %%mm4, %1\n\t" /* DD = x * y */ | |
267 "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ | |
268 "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ | |
269 "paddw %4, %%mm4\n\t" | |
270 "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ | |
271 "pxor %%mm7, %%mm7\n\t" | |
272 "movq %%mm4, %0\n\t" | |
273 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); | |
274 | |
275 asm volatile( | |
276 /* mm0 = src[0..3], mm1 = src[1..4] */ | |
277 "movd %0, %%mm0\n\t" | |
278 "movd %1, %%mm1\n\t" | |
279 "punpcklbw %%mm7, %%mm0\n\t" | |
280 "punpcklbw %%mm7, %%mm1\n\t" | |
281 : : "m" (src[0]), "m" (src[1])); | |
282 | |
283 for(i=0; i<h; i++) { | |
284 asm volatile( | |
285 /* mm2 = A * src[0..3] + B * src[1..4] */ | |
286 "movq %%mm0, %%mm2\n\t" | |
287 "pmullw %0, %%mm2\n\t" | |
288 "pmullw %%mm5, %%mm1\n\t" | |
289 "paddw %%mm1, %%mm2\n\t" | |
290 : : "m" (AA)); | |
291 | |
292 src += stride; | |
293 asm volatile( | |
294 /* mm0 = src[0..3], mm1 = src[1..4] */ | |
295 "movd %0, %%mm0\n\t" | |
296 "movd %1, %%mm1\n\t" | |
297 "punpcklbw %%mm7, %%mm0\n\t" | |
298 "punpcklbw %%mm7, %%mm1\n\t" | |
299 : : "m" (src[0]), "m" (src[1])); | |
300 | |
301 asm volatile( | |
302 /* mm2 += C * src[0..3] + D * src[1..4] */ | |
303 "movq %%mm0, %%mm3\n\t" | |
304 "movq %%mm1, %%mm4\n\t" | |
305 "pmullw %%mm6, %%mm3\n\t" | |
306 "pmullw %0, %%mm4\n\t" | |
307 "paddw %%mm3, %%mm2\n\t" | |
308 "paddw %%mm4, %%mm2\n\t" | |
309 : : "m" (DD)); | |
310 | |
311 asm volatile( | |
312 /* dst[0..3] = pack((mm2 + 32) >> 6) */ | |
313 "paddw %1, %%mm2\n\t" | |
314 "psrlw $6, %%mm2\n\t" | |
315 "packuswb %%mm7, %%mm2\n\t" | |
316 H264_CHROMA_OP4(%0, %%mm2, %%mm3) | |
317 "movd %%mm2, %0\n\t" | |
318 : "=m" (dst[0]) : "m" (ff_pw_32)); | |
319 dst += stride; | |
320 } | |
321 } |
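Editorial note: the new 4-wide version keeps everything in word precision inside single registers: src[0..3] and src[1..4] are unpacked to words immediately on load, so there is no low/high half split, and each source row is loaded only once because the bottom row of one iteration is reused as the top row of the next. The OP4 macro takes a third, scratch-register argument; a plausible shape for the two variants (assumed, not shown in this diff) is:

```c
/* Hypothetical OP4 definitions (assumed, illustrative). The scratch
 * register T exists because the 4-pixel destination is only 32 bits
 * wide: it must be loaded with movd before pavgb, rather than used
 * as a 64-bit memory operand. */
#ifdef WANT_AVG /* hypothetical switch */
#define H264_CHROMA_OP4(S, D, T) "movd " #S ", " #T "\n\t" \
                                 "pavgb " #T ", " #D "\n\t"
#else
#define H264_CHROMA_OP4(S, D, T)
#endif
```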