comparison ppc/dsputil_ppc.c @ 1015:35cf2f4a0f8c libavcodec

PPC perf, PPC clear_block, AltiVec put_pixels8_xy2 patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
author michaelni
date Sun, 19 Jan 2003 19:00:45 +0000
parents 3b7cc8e4b83f
children 9cc1031e1864
comparison
equal deleted inserted replaced
1014:48349e11c9b2 1015:35cf2f4a0f8c
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */ 18 */
19 19
20 #include "../dsputil.h" 20 #include "../dsputil.h"
21 21
22 #include "dsputil_ppc.h"
23
22 #ifdef HAVE_ALTIVEC 24 #ifdef HAVE_ALTIVEC
23 #include "dsputil_altivec.h" 25 #include "dsputil_altivec.h"
24 #endif 26 #endif
25 27
26 int mm_flags = 0; 28 int mm_flags = 0;
34 } 36 }
35 #endif /* result */ 37 #endif /* result */
36 return result; 38 return result;
37 } 39 }
38 40
#ifdef POWERPC_TBL_PERFORMANCE_REPORT
/*
 * Performance counters: one row per profiled function, one column per
 * statistic (the report code reads the powerpc_data_min, _max, _sum and
 * _num columns).
 */
unsigned long long perfdata[powerpc_perf_total][powerpc_data_total];

/*
 * Human-readable names of the profiled functions.
 * The list below must match the enum in dsputil_altivec.h: the table is
 * indexed with the same index as the rows of perfdata.
 * The entries are string literals, hence pointers to const char.
 */
static const char *const perfname[] = {
    "fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels_clamped_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc"
};
#ifdef POWERPC_PERF_USE_PMC
/* Second set of counters, reported as "pmc2", same layout as perfdata. */
unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total];
#endif
#include <stdio.h>
#endif
62
#ifdef POWERPC_TBL_PERFORMANCE_REPORT
/*
 * Print the collected performance statistics to stderr.
 *
 * A header line states which counter source was compiled in, then every
 * profiled function with a non-zero sample count gets its min, max,
 * average (sum / num) and sample count printed. When POWERPC_PERF_USE_PMC
 * is defined, the same is done for the second counter table.
 */
void powerpc_display_perf_report(void)
{
    int fn;

#ifdef POWERPC_PERF_USE_PMC
    fprintf(stderr, "AltiVec performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
#else /* !POWERPC_PERF_USE_PMC */
    fprintf(stderr, "AltiVec performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
#endif /* POWERPC_PERF_USE_PMC */

    for (fn = 0; fn < powerpc_perf_total; fn++) {
        const unsigned long long *stats = perfdata[fn];
#ifdef POWERPC_PERF_USE_PMC
        const unsigned long long *miss = perfdata_miss[fn];
#endif

        /* Functions that were never sampled are left out of the report. */
        if (stats[powerpc_data_num] != 0) {
            const double mean = (double)stats[powerpc_data_sum] /
                                (double)stats[powerpc_data_num];

            fprintf(stderr, " Function \"%s\" (pmc1):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                    perfname[fn],
                    stats[powerpc_data_min],
                    stats[powerpc_data_max],
                    mean,
                    stats[powerpc_data_num]);
        }
#ifdef POWERPC_PERF_USE_PMC
        if (miss[powerpc_data_num] != 0) {
            const double miss_mean = (double)miss[powerpc_data_sum] /
                                     (double)miss[powerpc_data_num];

            fprintf(stderr, " Function \"%s\" (pmc2):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                    perfname[fn],
                    miss[powerpc_data_min],
                    miss[powerpc_data_max],
                    miss_mean,
                    miss[powerpc_data_num]);
        }
#endif
    }
}
#endif /* POWERPC_TBL_PERFORMANCE_REPORT */
95
96 /* ***** WARNING ***** WARNING ***** WARNING ***** */
97 /*
98 clear_blocks_dcbz32_ppc will not work properly
99 on PowerPC processors with a cache line size
100 not equal to 32 bytes.
101 Fortunately all processor used by Apple up to
102 at least the 7450 (aka second generation G4)
103 use 32 bytes cache line.
104 This is due to the use of the 'dcbz' instruction.
105 It simply clear to zero a single cache line,
106 so you need to know the cache line size to use it !
107 It's absurd, but it's fast...
108 */
109 void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
110 {
111 POWERPC_TBL_DECLARE(powerpc_clear_blocks_dcbz32, 1);
112 register int misal = ((unsigned long)blocks & 0x00000010);
113 register int i = 0;
114 POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
115 #if 1
116 if (misal) {
117 ((unsigned long*)blocks)[0] = 0L;
118 ((unsigned long*)blocks)[1] = 0L;
119 ((unsigned long*)blocks)[2] = 0L;
120 ((unsigned long*)blocks)[3] = 0L;
121 vec_st((vector short)(0), 0, blocks);
122 i += 16;
123 }
124 for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) {
125 asm volatile("dcbz %0,%1" : : "r" (blocks), "r" (i) : "memory");
126 }
127 if (misal) {
128 ((unsigned long*)blocks)[188] = 0L;
129 ((unsigned long*)blocks)[189] = 0L;
130 ((unsigned long*)blocks)[190] = 0L;
131 ((unsigned long*)blocks)[191] = 0L;
132 i += 16;
133 }
134 #else
135 memset(blocks, 0, sizeof(DCTELEM)*6*64);
136 #endif
137 POWERPC_TBL_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
138 }
139
140 /* check dcbz report how many bytes are set to 0 by dcbz */
141 long check_dcbz_effect(void)
142 {
143 register char *fakedata = (char*)malloc(1024);
144 register char *fakedata_middle;
145 register long zero = 0;
146 register long i = 0;
147 long count = 0;
148
149 if (fakedata == NULL)
150 {
151 return 0L;
152 }
153
154
155 fakedata_middle = (fakedata + 512);
156
157 memset(fakedata, 0xFF, 1024);
158
159 asm volatile("dcbz %0, %1" : : "r" (fakedata_middle), "r" (zero));
160
161 for (i = 0; i < 1024 ; i ++)
162 {
163 if (fakedata[i] == (char)0)
164 count++;
165 }
166
167 free(fakedata);
168
169 return count;
170 }
171
39 void dsputil_init_ppc(DSPContext* c, unsigned mask) 172 void dsputil_init_ppc(DSPContext* c, unsigned mask)
40 { 173 {
41 // Common optimisations whether Altivec or not 174 // Common optimisations whether Altivec or not
42 175
43 // ... pending ... 176 switch (check_dcbz_effect()) {
44 177 case 32:
178 c->clear_blocks = clear_blocks_dcbz32_ppc;
179 break;
180 default:
181 break;
182 }
183
45 #if HAVE_ALTIVEC 184 #if HAVE_ALTIVEC
46 if (has_altivec()) { 185 if (has_altivec()) {
47 mm_flags |= MM_ALTIVEC; 186 mm_flags |= MM_ALTIVEC;
48 187
49 // Altivec specific optimisations 188 // Altivec specific optimisations
65 c->add_bytes= add_bytes_altivec; 204 c->add_bytes= add_bytes_altivec;
66 c->put_pixels_clamped = put_pixels_clamped_altivec; 205 c->put_pixels_clamped = put_pixels_clamped_altivec;
67 #endif 206 #endif
68 c->put_pixels_tab[0][0] = put_pixels16_altivec; 207 c->put_pixels_tab[0][0] = put_pixels16_altivec;
69 c->avg_pixels_tab[0][0] = avg_pixels16_altivec; 208 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
209 // next one disabled as it's untested.
210 #if 0
211 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
212 #endif
213 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
214
70 c->gmc1 = gmc1_altivec; 215 c->gmc1 = gmc1_altivec;
71 216
72 #ifdef ALTIVEC_TBL_PERFORMANCE_REPORT 217 #ifdef POWERPC_TBL_PERFORMANCE_REPORT
73 { 218 {
74 int i; 219 int i;
75 for (i = 0 ; i < altivec_perf_total ; i++) 220 for (i = 0 ; i < powerpc_perf_total ; i++)
76 { 221 {
77 perfdata[i][altivec_data_min] = 0xFFFFFFFFFFFFFFFF; 222 perfdata[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF;
78 perfdata[i][altivec_data_max] = 0x0000000000000000; 223 perfdata[i][powerpc_data_max] = 0x0000000000000000;
79 perfdata[i][altivec_data_sum] = 0x0000000000000000; 224 perfdata[i][powerpc_data_sum] = 0x0000000000000000;
80 perfdata[i][altivec_data_num] = 0x0000000000000000; 225 perfdata[i][powerpc_data_num] = 0x0000000000000000;
226 #ifdef POWERPC_PERF_USE_PMC
227 perfdata_miss[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF;
228 perfdata_miss[i][powerpc_data_max] = 0x0000000000000000;
229 perfdata_miss[i][powerpc_data_sum] = 0x0000000000000000;
230 perfdata_miss[i][powerpc_data_num] = 0x0000000000000000;
231 #endif
81 } 232 }
82 } 233 }
83 #endif 234 #endif
84 } else 235 } else
85 #endif 236 #endif