comparison ppc/dsputil_ppc.c @ 1334:80c46c310a91 libavcodec

PPC970 patch + cpu-specific tuning support by (Romain Dolbeau <dolbeau at irisa dot fr>)
author michaelni
date Sun, 29 Jun 2003 00:39:57 +0000
parents f59c3f66363b
children 09b8fe0f0139
comparison
equal deleted inserted replaced
1333:a1cc1810d58f 1334:80c46c310a91
55 "avg_pixels8_altivec", 55 "avg_pixels8_altivec",
56 "put_pixels8_xy2_altivec", 56 "put_pixels8_xy2_altivec",
57 "put_no_rnd_pixels8_xy2_altivec", 57 "put_no_rnd_pixels8_xy2_altivec",
58 "put_pixels16_xy2_altivec", 58 "put_pixels16_xy2_altivec",
59 "put_no_rnd_pixels16_xy2_altivec", 59 "put_no_rnd_pixels16_xy2_altivec",
60 "clear_blocks_dcbz32_ppc" 60 "clear_blocks_dcbz32_ppc",
61 "clear_blocks_dcbz128_ppc"
61 }; 62 };
62 #ifdef POWERPC_PERF_USE_PMC 63 #ifdef POWERPC_PERF_USE_PMC
63 unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total]; 64 unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total];
64 #endif 65 #endif
65 #include <stdio.h> 66 #include <stdio.h>
108 use 32 bytes cache line. 109 use 32 bytes cache line.
109 This is due to the use of the 'dcbz' instruction. 110 This is due to the use of the 'dcbz' instruction.
110 It simply clears to zero a single cache line, 111 It simply clears to zero a single cache line,
111 so you need to know the cache line size to use it ! 112 so you need to know the cache line size to use it !
112 It's absurd, but it's fast... 113 It's absurd, but it's fast...
114
115 update 24/06/2003 : Apple released the G5 yesterday,
116 with a PPC970. Cache line size: 128 bytes. Oops.
117 The semantics of dcbz were changed; it now always clears
118 32 bytes, so the function below will work, but will
119 be slow. So I fixed check_dcbz_effect to use dcbzl,
120 which is defined to clear a cache line (as dcbz did before).
121 So we can still distinguish, and use dcbz (32 bytes)
122 or dcbzl (one cache line) as required.
123
124 see <http://developer.apple.com/technotes/tn/tn2087.html>
125 and <http://developer.apple.com/technotes/tn/tn2086.html>
113 */ 126 */
114 void clear_blocks_dcbz32_ppc(DCTELEM *blocks) 127 void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
115 { 128 {
116 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1); 129 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1);
117 register int misal = ((unsigned long)blocks & 0x00000010); 130 register int misal = ((unsigned long)blocks & 0x00000010);
124 ((unsigned long*)blocks)[2] = 0L; 137 ((unsigned long*)blocks)[2] = 0L;
125 ((unsigned long*)blocks)[3] = 0L; 138 ((unsigned long*)blocks)[3] = 0L;
126 i += 16; 139 i += 16;
127 } 140 }
128 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) { 141 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) {
129 asm volatile("dcbz %0,%1" : : "r" (blocks), "r" (i) : "memory"); 142 asm volatile("dcbz %0,%1" : : "r" (i), "r" (blocks) : "memory");
130 } 143 }
131 if (misal) { 144 if (misal) {
132 ((unsigned long*)blocks)[188] = 0L; 145 ((unsigned long*)blocks)[188] = 0L;
133 ((unsigned long*)blocks)[189] = 0L; 146 ((unsigned long*)blocks)[189] = 0L;
134 ((unsigned long*)blocks)[190] = 0L; 147 ((unsigned long*)blocks)[190] = 0L;
139 memset(blocks, 0, sizeof(DCTELEM)*6*64); 152 memset(blocks, 0, sizeof(DCTELEM)*6*64);
140 #endif 153 #endif
141 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); 154 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
142 } 155 }
143 156
157 /* same as above, when dcbzl clears a whole 128B cache line
158 i.e. the PPC970 aka G5 */
159 #ifndef NO_DCBZL
160 void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
161 {
162 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz128, 1);
163 register int misal = ((unsigned long)blocks & 0x0000007f);
164 register int i = 0;
165 POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
166 #if 1
167 if (misal) {
168 // we could probably also optimize this case,
169 // but there's not much point as the machines
170 // aren't available yet (2003-06-26)
171 memset(blocks, 0, sizeof(DCTELEM)*6*64);
172 }
173 else
174 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
175 asm volatile("dcbzl %0,%1" : : "r" (i), "r" (blocks) : "memory");
176 }
177 #else
178 memset(blocks, 0, sizeof(DCTELEM)*6*64);
179 #endif
180 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
181 }
182 #else
183 void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
184 {
185 memset(blocks, 0, sizeof(DCTELEM)*6*64);
186 }
187 #endif
188
189 #ifndef NO_DCBZL
144 /* check dcbz report how many bytes are set to 0 by dcbz */ 190 /* check dcbz report how many bytes are set to 0 by dcbz */
145 long check_dcbz_effect(void) 191 /* update 24/06/2003 : replace dcbz by dcbzl to get
192 the intended effect (Apple "fixed" dcbz)
193 unfortunately this cannot be used unless the assembler
194 knows about dcbzl ... */
195 long check_dcbzl_effect(void)
146 { 196 {
147 register char *fakedata = (char*)av_malloc(1024); 197 register char *fakedata = (char*)av_malloc(1024);
148 register char *fakedata_middle; 198 register char *fakedata_middle;
149 register long zero = 0; 199 register long zero = 0;
150 register long i = 0; 200 register long i = 0;
157 207
158 fakedata_middle = (fakedata + 512); 208 fakedata_middle = (fakedata + 512);
159 209
160 memset(fakedata, 0xFF, 1024); 210 memset(fakedata, 0xFF, 1024);
161 211
162 asm volatile("dcbz %0, %1" : : "r" (fakedata_middle), "r" (zero)); 212 asm volatile("dcbzl %0, %1" : : "r" (fakedata_middle), "r" (zero));
163 213
164 for (i = 0; i < 1024 ; i ++) 214 for (i = 0; i < 1024 ; i ++)
165 { 215 {
166 if (fakedata[i] == (char)0) 216 if (fakedata[i] == (char)0)
167 count++; 217 count++;
169 219
170 av_free(fakedata); 220 av_free(fakedata);
171 221
172 return count; 222 return count;
173 } 223 }
224 #else
225 long check_dcbzl_effect(void)
226 {
227 return 0;
228 }
229 #endif
174 230
175 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) 231 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
176 { 232 {
177 // Common optimisations whether Altivec or not 233 // Common optimizations whether Altivec is available or not
178 234
179 switch (check_dcbz_effect()) { 235 switch (check_dcbzl_effect()) {
180 case 32: 236 case 32:
181 c->clear_blocks = clear_blocks_dcbz32_ppc; 237 c->clear_blocks = clear_blocks_dcbz32_ppc;
238 break;
239 case 128:
240 c->clear_blocks = clear_blocks_dcbz128_ppc;
182 break; 241 break;
183 default: 242 default:
184 break; 243 break;
185 } 244 }
186 245