Mercurial > libavcodec.hg
comparison i386/h264dsp_mmx.c @ 3105:2d35fb3cb940 libavcodec
h264: special case dc-only idct. ~1% faster overall
author | lorenm |
---|---|
date | Fri, 10 Feb 2006 06:55:25 +0000 |
parents | fcc2892eeab3 |
children | 5b6d0dd37ca7 |
comparison
equal
deleted
inserted
replaced
3104:78d6bfc238f3 | 3105:2d35fb3cb940 |
---|---|
100 "add %1, %0 \n\t" | 100 "add %1, %0 \n\t" |
101 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) | 101 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) |
102 : "+r"(dst) | 102 : "+r"(dst) |
103 : "r" ((long)stride) | 103 : "r" ((long)stride) |
104 ); | 104 ); |
105 } | |
106 | |
/**
 * Add the rounded DC term of a residual block to 4 rows of 4 bytes at dst.
 * Only block[0] is read. The value applied is (block[0] + 32) >> 6, and every
 * resulting byte saturates to the unsigned 0..255 range.
 * @param dst    first destination row; 4 bytes in each of 4 rows are updated in place
 * @param block  coefficient array; only element 0 is used here
 * @param stride byte offset between consecutive rows of dst
 * NOTE(review): the second asm statement consumes mm0/mm1 left behind by the
 * first one, neither statement declares MMX clobbers, and no emms is issued
 * here -- presumably callers manage MMX state; confirm.
 */
107 void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) | |
108 { | |
/* Round by adding 32 before the shift right by 6. */
109 int dc = (block[0] + 32) >> 6; | |
/* Split dc into two non-negative byte vectors: mm0 = max(dc, 0) and
 * mm1 = max(-dc, 0), each broadcast to all bytes. At most one is non-zero. */
110 asm volatile( | |
111 "movd %0, %%mm0 \n\t" | |
/* mm7 = 0, used as the lower bound for pmaxsw below */
112 "pxor %%mm7, %%mm7 \n\t" | |
/* replicate the low 16-bit word (dc) into all four words of mm0 */
113 "pshufw $0, %%mm0, %%mm0 \n\t" | |
/* mm1 = 0 - mm0, i.e. -dc in every word */
114 "pxor %%mm1, %%mm1 \n\t" | |
115 "psubw %%mm0, %%mm1 \n\t" | |
116 "pmaxsw %%mm7, %%mm0 \n\t" | |
117 "pmaxsw %%mm7, %%mm1 \n\t" | |
/* narrow words to bytes with unsigned saturation (values above 255 become 255) */
118 "packuswb %%mm0, %%mm0 \n\t" | |
119 "packuswb %%mm1, %%mm1 \n\t" | |
120 ::"r"(dc) | |
121 ); | |
/* Load 4 bytes from each of the 4 rows, add the positive part and subtract
 * the negative part with unsigned byte saturation, then store the rows back. */
122 asm volatile( | |
123 "movd %0, %%mm2 \n\t" | |
124 "movd %1, %%mm3 \n\t" | |
125 "movd %2, %%mm4 \n\t" | |
126 "movd %3, %%mm5 \n\t" | |
127 "paddusb %%mm0, %%mm2 \n\t" | |
128 "paddusb %%mm0, %%mm3 \n\t" | |
129 "paddusb %%mm0, %%mm4 \n\t" | |
130 "paddusb %%mm0, %%mm5 \n\t" | |
131 "psubusb %%mm1, %%mm2 \n\t" | |
132 "psubusb %%mm1, %%mm3 \n\t" | |
133 "psubusb %%mm1, %%mm4 \n\t" | |
134 "psubusb %%mm1, %%mm5 \n\t" | |
135 "movd %%mm2, %0 \n\t" | |
136 "movd %%mm3, %1 \n\t" | |
137 "movd %%mm4, %2 \n\t" | |
138 "movd %%mm5, %3 \n\t" | |
/* the four rows are read-write 32-bit memory operands at dst + k*stride */
139 :"+m"(*(uint32_t*)(dst+0*stride)), | |
140 "+m"(*(uint32_t*)(dst+1*stride)), | |
141 "+m"(*(uint32_t*)(dst+2*stride)), | |
142 "+m"(*(uint32_t*)(dst+3*stride)) | |
143 ); | |
144 } | |
145 | |
/**
 * Add the rounded DC term of a residual block to 8 rows of 8 bytes at dst.
 * Only block[0] is read. The value applied is (block[0] + 32) >> 6, and every
 * resulting byte saturates to the unsigned 0..255 range.
 * @param dst    first destination row; 8 bytes in each of 8 rows are updated in place
 * @param block  coefficient array; only element 0 is used here
 * @param stride byte offset between consecutive rows of dst
 * NOTE(review): the asm statement inside the loop consumes mm0/mm1 left behind
 * by the first asm statement, no MMX clobbers are declared, and no emms is
 * issued here -- presumably callers manage MMX state; confirm.
 */
146 void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) | |
147 { | |
/* Round by adding 32 before the shift right by 6. */
148 int dc = (block[0] + 32) >> 6; | |
149 int y; | |
/* Split dc into two non-negative byte vectors: mm0 = max(dc, 0) and
 * mm1 = max(-dc, 0), each broadcast to all 8 bytes. At most one is non-zero. */
150 asm volatile( | |
151 "movd %0, %%mm0 \n\t" | |
/* mm7 = 0, used as the lower bound for pmaxsw below */
152 "pxor %%mm7, %%mm7 \n\t" | |
/* replicate the low 16-bit word (dc) into all four words of mm0 */
153 "pshufw $0, %%mm0, %%mm0 \n\t" | |
/* mm1 = 0 - mm0, i.e. -dc in every word */
154 "pxor %%mm1, %%mm1 \n\t" | |
155 "psubw %%mm0, %%mm1 \n\t" | |
156 "pmaxsw %%mm7, %%mm0 \n\t" | |
157 "pmaxsw %%mm7, %%mm1 \n\t" | |
/* narrow words to bytes with unsigned saturation (values above 255 become 255) */
158 "packuswb %%mm0, %%mm0 \n\t" | |
159 "packuswb %%mm1, %%mm1 \n\t" | |
160 ::"r"(dc) | |
161 ); | |
/* Two iterations, each handling 4 rows; dst advances by 4*stride after each,
 * so 8 rows are processed in total. */
162 for(y=2; y--; dst += 4*stride){ | |
/* Load 8 bytes from each of 4 rows, add the positive part and subtract the
 * negative part with unsigned byte saturation, then store the rows back. */
163 asm volatile( | |
164 "movq %0, %%mm2 \n\t" | |
165 "movq %1, %%mm3 \n\t" | |
166 "movq %2, %%mm4 \n\t" | |
167 "movq %3, %%mm5 \n\t" | |
168 "paddusb %%mm0, %%mm2 \n\t" | |
169 "paddusb %%mm0, %%mm3 \n\t" | |
170 "paddusb %%mm0, %%mm4 \n\t" | |
171 "paddusb %%mm0, %%mm5 \n\t" | |
172 "psubusb %%mm1, %%mm2 \n\t" | |
173 "psubusb %%mm1, %%mm3 \n\t" | |
174 "psubusb %%mm1, %%mm4 \n\t" | |
175 "psubusb %%mm1, %%mm5 \n\t" | |
176 "movq %%mm2, %0 \n\t" | |
177 "movq %%mm3, %1 \n\t" | |
178 "movq %%mm4, %2 \n\t" | |
179 "movq %%mm5, %3 \n\t" | |
/* the four rows are read-write 64-bit memory operands at dst + k*stride */
180 :"+m"(*(uint64_t*)(dst+0*stride)), | |
181 "+m"(*(uint64_t*)(dst+1*stride)), | |
182 "+m"(*(uint64_t*)(dst+2*stride)), | |
183 "+m"(*(uint64_t*)(dst+3*stride)) | |
184 ); | |
185 } | |
105 } | 186 } |
106 | 187 |
107 | 188 |
108 /***********************************/ | 189 /***********************************/ |
109 /* deblocking */ | 190 /* deblocking */ |