Mercurial > libavcodec.hg
comparison ppc/dsputil_ppc.c @ 1334:80c46c310a91 libavcodec
PPC970 patch + cpu-specific tuning support by (Romain Dolbeau <dolbeau at irisa dot fr>)
author | michaelni |
---|---|
date | Sun, 29 Jun 2003 00:39:57 +0000 |
parents | f59c3f66363b |
children | 09b8fe0f0139 |
comparison
equal
deleted
inserted
replaced
1333:a1cc1810d58f | 1334:80c46c310a91 |
---|---|
55 "avg_pixels8_altivec", | 55 "avg_pixels8_altivec", |
56 "put_pixels8_xy2_altivec", | 56 "put_pixels8_xy2_altivec", |
57 "put_no_rnd_pixels8_xy2_altivec", | 57 "put_no_rnd_pixels8_xy2_altivec", |
58 "put_pixels16_xy2_altivec", | 58 "put_pixels16_xy2_altivec", |
59 "put_no_rnd_pixels16_xy2_altivec", | 59 "put_no_rnd_pixels16_xy2_altivec", |
60 "clear_blocks_dcbz32_ppc" | 60 "clear_blocks_dcbz32_ppc", |
61 "clear_blocks_dcbz128_ppc" | |
61 }; | 62 }; |
62 #ifdef POWERPC_PERF_USE_PMC | 63 #ifdef POWERPC_PERF_USE_PMC |
63 unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total]; | 64 unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total]; |
64 #endif | 65 #endif |
65 #include <stdio.h> | 66 #include <stdio.h> |
108 use 32 bytes cache line. | 109 use 32 bytes cache line. |
109 This is due to the use of the 'dcbz' instruction. | 110 This is due to the use of the 'dcbz' instruction. |
110 It simply clears a single cache line to zero, | 111 It simply clears a single cache line to zero, |
111 so you need to know the cache line size to use it ! | 112 so you need to know the cache line size to use it ! |
112 It's absurd, but it's fast... | 113 It's absurd, but it's fast... |
114 | |
115 update 24/06/2003 : Apple released the G5 yesterday, | |
116 with a PPC970. Cache line size: 128 bytes. Oops. | |
117 The semantics of dcbz were changed: it now always clears | |
118 32 bytes, so the function below will work, but will | |
119 be slow. So I fixed check_dcbz_effect to use dcbzl, | |
120 which is defined to clear a cache line (as dcbz did before). | |
121 So we can still distinguish, and use dcbz (32 bytes) | |
122 or dcbzl (one cache line) as required. | |
123 | |
124 see <http://developer.apple.com/technotes/tn/tn2087.html> | |
125 and <http://developer.apple.com/technotes/tn/tn2086.html> | |
113 */ | 126 */ |
114 void clear_blocks_dcbz32_ppc(DCTELEM *blocks) | 127 void clear_blocks_dcbz32_ppc(DCTELEM *blocks) |
115 { | 128 { |
116 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1); | 129 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1); |
117 register int misal = ((unsigned long)blocks & 0x00000010); | 130 register int misal = ((unsigned long)blocks & 0x00000010); |
124 ((unsigned long*)blocks)[2] = 0L; | 137 ((unsigned long*)blocks)[2] = 0L; |
125 ((unsigned long*)blocks)[3] = 0L; | 138 ((unsigned long*)blocks)[3] = 0L; |
126 i += 16; | 139 i += 16; |
127 } | 140 } |
128 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) { | 141 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) { |
129 asm volatile("dcbz %0,%1" : : "r" (blocks), "r" (i) : "memory"); | 142 asm volatile("dcbz %0,%1" : : "r" (i), "r" (blocks) : "memory"); |
130 } | 143 } |
131 if (misal) { | 144 if (misal) { |
132 ((unsigned long*)blocks)[188] = 0L; | 145 ((unsigned long*)blocks)[188] = 0L; |
133 ((unsigned long*)blocks)[189] = 0L; | 146 ((unsigned long*)blocks)[189] = 0L; |
134 ((unsigned long*)blocks)[190] = 0L; | 147 ((unsigned long*)blocks)[190] = 0L; |
139 memset(blocks, 0, sizeof(DCTELEM)*6*64); | 152 memset(blocks, 0, sizeof(DCTELEM)*6*64); |
140 #endif | 153 #endif |
141 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); | 154 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); |
142 } | 155 } |
143 | 156 |
157 /* same as above, for when dcbzl clears a whole 128B cache line, | |
158 i.e. the PPC970 aka G5 */ | |
159 #ifndef NO_DCBZL | |
160 void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | |
161 { | |
162 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz128, 1); | |
163 register int misal = ((unsigned long)blocks & 0x0000007f); | |
164 register int i = 0; | |
165 POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz128, 1); | |
166 #if 1 | |
167 if (misal) { | |
168 // we could probably also optimize this case, | |
169 // but there's not much point as the machines | |
170 // aren't available yet (2003-06-26) | |
171 memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
172 } | |
173 else | |
174 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { | |
175 asm volatile("dcbzl %0,%1" : : "r" (i), "r" (blocks) : "memory"); | |
176 } | |
177 #else | |
178 memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
179 #endif | |
180 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); | |
181 } | |
182 #else | |
183 void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | |
184 { | |
185 memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
186 } | |
187 #endif | |
188 | |
189 #ifndef NO_DCBZL | |
144 /* check dcbz: report how many bytes are set to 0 by dcbz */ | 190 /* check dcbz: report how many bytes are set to 0 by dcbz */ |
145 long check_dcbz_effect(void) | 191 /* update 24/06/2003 : replace dcbz by dcbzl to get |
192 the intended effect (Apple "fixed" dcbz) | |
193 unfortunately this cannot be used unless the assembler | |
194 knows about dcbzl ... */ | |
195 long check_dcbzl_effect(void) | |
146 { | 196 { |
147 register char *fakedata = (char*)av_malloc(1024); | 197 register char *fakedata = (char*)av_malloc(1024); |
148 register char *fakedata_middle; | 198 register char *fakedata_middle; |
149 register long zero = 0; | 199 register long zero = 0; |
150 register long i = 0; | 200 register long i = 0; |
157 | 207 |
158 fakedata_middle = (fakedata + 512); | 208 fakedata_middle = (fakedata + 512); |
159 | 209 |
160 memset(fakedata, 0xFF, 1024); | 210 memset(fakedata, 0xFF, 1024); |
161 | 211 |
162 asm volatile("dcbz %0, %1" : : "r" (fakedata_middle), "r" (zero)); | 212 asm volatile("dcbzl %0, %1" : : "r" (fakedata_middle), "r" (zero)); |
163 | 213 |
164 for (i = 0; i < 1024 ; i ++) | 214 for (i = 0; i < 1024 ; i ++) |
165 { | 215 { |
166 if (fakedata[i] == (char)0) | 216 if (fakedata[i] == (char)0) |
167 count++; | 217 count++; |
169 | 219 |
170 av_free(fakedata); | 220 av_free(fakedata); |
171 | 221 |
172 return count; | 222 return count; |
173 } | 223 } |
224 #else | |
225 long check_dcbzl_effect(void) | |
226 { | |
227 return 0; | |
228 } | |
229 #endif | |
174 | 230 |
175 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) | 231 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) |
176 { | 232 { |
177 // Common optimisations whether Altivec or not | 233 // Common optimizations whether Altivec is available or not |
178 | 234 |
179 switch (check_dcbz_effect()) { | 235 switch (check_dcbzl_effect()) { |
180 case 32: | 236 case 32: |
181 c->clear_blocks = clear_blocks_dcbz32_ppc; | 237 c->clear_blocks = clear_blocks_dcbz32_ppc; |
238 break; | |
239 case 128: | |
240 c->clear_blocks = clear_blocks_dcbz128_ppc; | |
182 break; | 241 break; |
183 default: | 242 default: |
184 break; | 243 break; |
185 } | 244 } |
186 | 245 |