comparison i386/h264dsp_mmx.c @ 3105:2d35fb3cb940 libavcodec

h264: special case dc-only idct. ~1% faster overall
author lorenm
date Fri, 10 Feb 2006 06:55:25 +0000
parents fcc2892eeab3
children 5b6d0dd37ca7
comparison
equal deleted inserted replaced
3104:78d6bfc238f3 3105:2d35fb3cb940
100 "add %1, %0 \n\t" 100 "add %1, %0 \n\t"
101 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) 101 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
102 : "+r"(dst) 102 : "+r"(dst)
103 : "r" ((long)stride) 103 : "r" ((long)stride)
104 ); 104 );
105 }
106
/**
 * DC-only inverse transform + add for a 4x4 block.
 *
 * Only block[0] is read. It is rounded and scaled, dc = (block[0] + 32) >> 6,
 * and that single value is added to each of the 4 bytes in 4 consecutive rows
 * of dst, with the result saturated to the 0..255 byte range.
 *
 * @param dst    top-left byte of the 4x4 destination area (read and written)
 * @param block  coefficient block; only element 0 is used here
 * @param stride distance in bytes between consecutive rows of dst
 *
 * NOTE(review): no emms is issued in this function; presumably the caller
 * is responsible for restoring the FPU state -- confirm.
 */
107 void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
108 {
109 int dc = (block[0] + 32) >> 6;
/* First asm statement: build two byte vectors from dc.
 *   mm0 = max(dc, 0)  replicated, saturated to unsigned bytes
 *   mm1 = max(-dc, 0) replicated, saturated to unsigned bytes
 * At most one of them is non-zero, depending on the sign of dc.
 * NOTE(review): mm0/mm1 are consumed by the next asm statement, but this
 * statement declares no outputs or clobbers, so the compiler is not told
 * about that dependency. */
110 asm volatile(
111 "movd %0, %%mm0 \n\t" /* low 32 bits of mm0 = dc */
112 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
113 "pshufw $0, %%mm0, %%mm0 \n\t" /* copy the low 16-bit word into all four words */
114 "pxor %%mm1, %%mm1 \n\t"
115 "psubw %%mm0, %%mm1 \n\t" /* mm1 = 0 - dc in every word */
116 "pmaxsw %%mm7, %%mm0 \n\t" /* keep only the positive part of dc */
117 "pmaxsw %%mm7, %%mm1 \n\t" /* keep only the positive part of -dc */
118 "packuswb %%mm0, %%mm0 \n\t" /* words -> bytes with unsigned saturation */
119 "packuswb %%mm1, %%mm1 \n\t"
120 ::"r"(dc)
121 );
/* Second asm statement: for each of the four rows, load 4 bytes, add the
 * positive part (saturating), subtract the negative part (saturating),
 * and store the 4 bytes back to the same location. */
122 asm volatile(
123 "movd %0, %%mm2 \n\t"
124 "movd %1, %%mm3 \n\t"
125 "movd %2, %%mm4 \n\t"
126 "movd %3, %%mm5 \n\t"
127 "paddusb %%mm0, %%mm2 \n\t"
128 "paddusb %%mm0, %%mm3 \n\t"
129 "paddusb %%mm0, %%mm4 \n\t"
130 "paddusb %%mm0, %%mm5 \n\t"
131 "psubusb %%mm1, %%mm2 \n\t"
132 "psubusb %%mm1, %%mm3 \n\t"
133 "psubusb %%mm1, %%mm4 \n\t"
134 "psubusb %%mm1, %%mm5 \n\t"
135 "movd %%mm2, %0 \n\t"
136 "movd %%mm3, %1 \n\t"
137 "movd %%mm4, %2 \n\t"
138 "movd %%mm5, %3 \n\t"
/* Each row is a read-write ("+m") 32-bit memory operand. */
139 :"+m"(*(uint32_t*)(dst+0*stride)),
140 "+m"(*(uint32_t*)(dst+1*stride)),
141 "+m"(*(uint32_t*)(dst+2*stride)),
142 "+m"(*(uint32_t*)(dst+3*stride))
143 );
144 }
145
/**
 * DC-only inverse transform + add for an 8x8 block.
 *
 * Only block[0] is read. It is rounded and scaled, dc = (block[0] + 32) >> 6,
 * and that single value is added to each of the 8 bytes in 8 consecutive rows
 * of dst, with the result saturated to the 0..255 byte range.
 *
 * @param dst    top-left byte of the 8x8 destination area (read and written)
 * @param block  coefficient block; only element 0 is used here
 * @param stride distance in bytes between consecutive rows of dst
 *
 * NOTE(review): no emms is issued in this function; presumably the caller
 * is responsible for restoring the FPU state -- confirm.
 */
146 void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
147 {
148 int dc = (block[0] + 32) >> 6;
149 int y;
/* Build two byte vectors from dc.
 *   mm0 = max(dc, 0)  replicated, saturated to unsigned bytes
 *   mm1 = max(-dc, 0) replicated, saturated to unsigned bytes
 * At most one of them is non-zero, depending on the sign of dc.
 * NOTE(review): mm0/mm1 are consumed by the asm statement inside the loop
 * below, but this statement declares no outputs or clobbers, so the
 * compiler is not told about that dependency. */
150 asm volatile(
151 "movd %0, %%mm0 \n\t" /* low 32 bits of mm0 = dc */
152 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */
153 "pshufw $0, %%mm0, %%mm0 \n\t" /* copy the low 16-bit word into all four words */
154 "pxor %%mm1, %%mm1 \n\t"
155 "psubw %%mm0, %%mm1 \n\t" /* mm1 = 0 - dc in every word */
156 "pmaxsw %%mm7, %%mm0 \n\t" /* keep only the positive part of dc */
157 "pmaxsw %%mm7, %%mm1 \n\t" /* keep only the positive part of -dc */
158 "packuswb %%mm0, %%mm0 \n\t" /* words -> bytes with unsigned saturation */
159 "packuswb %%mm1, %%mm1 \n\t"
160 ::"r"(dc)
161 );
/* Two iterations, four rows each; dst advances by 4 rows per iteration,
 * so 8 rows are processed in total. */
162 for(y=2; y--; dst += 4*stride){
/* Per row: load 8 bytes, add the positive part (saturating), subtract the
 * negative part (saturating), store the 8 bytes back. */
163 asm volatile(
164 "movq %0, %%mm2 \n\t"
165 "movq %1, %%mm3 \n\t"
166 "movq %2, %%mm4 \n\t"
167 "movq %3, %%mm5 \n\t"
168 "paddusb %%mm0, %%mm2 \n\t"
169 "paddusb %%mm0, %%mm3 \n\t"
170 "paddusb %%mm0, %%mm4 \n\t"
171 "paddusb %%mm0, %%mm5 \n\t"
172 "psubusb %%mm1, %%mm2 \n\t"
173 "psubusb %%mm1, %%mm3 \n\t"
174 "psubusb %%mm1, %%mm4 \n\t"
175 "psubusb %%mm1, %%mm5 \n\t"
176 "movq %%mm2, %0 \n\t"
177 "movq %%mm3, %1 \n\t"
178 "movq %%mm4, %2 \n\t"
179 "movq %%mm5, %3 \n\t"
/* Each row is a read-write ("+m") 64-bit memory operand. */
180 :"+m"(*(uint64_t*)(dst+0*stride)),
181 "+m"(*(uint64_t*)(dst+1*stride)),
182 "+m"(*(uint64_t*)(dst+2*stride)),
183 "+m"(*(uint64_t*)(dst+3*stride))
184 );
185 }
105 } 186 }
106 187
107 188
108 /***********************************/ 189 /***********************************/
109 /* deblocking */ 190 /* deblocking */