Mercurial > libavcodec.hg
comparison ppc/imgresample_altivec.c @ 5750:09f99af1db40 libavcodec
Sanitize altivec code so it can be built with runtime check properly
author | lu_zero |
---|---|
date | Tue, 02 Oct 2007 11:39:32 +0000 |
parents | |
children | a8a79f5385f6 |
comparison
equal
deleted
inserted
replaced
5749:784dcbdc910f | 5750:09f99af1db40 |
---|---|
1 /* | |
2 * High quality image resampling with polyphase filters | |
3 * Copyright (c) 2001 Fabrice Bellard. | |
4 * | |
5 * This file is part of FFmpeg. | |
6 * | |
7 * FFmpeg is free software; you can redistribute it and/or | |
8 * modify it under the terms of the GNU Lesser General Public | |
9 * License as published by the Free Software Foundation; either | |
10 * version 2.1 of the License, or (at your option) any later version. | |
11 * | |
12 * FFmpeg is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 * Lesser General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU Lesser General Public | |
18 * License along with FFmpeg; if not, write to the Free Software | |
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
20 */ | |
21 | |
22 /** | |
23 * @file imgresample_altivec.c | |
24 * High quality image resampling with polyphase filters - AltiVec bits | |
25 */ | |
26 | |
27 #include "gcc_fixes.h" | |
28 | |
/* Union view of a 16-byte AltiVec register: lets scalar code fill or
   inspect individual bytes via c[] while the vector unit uses v. */
typedef union {
    vector unsigned char v;
    unsigned char c[16];
} vec_uc_t;
33 | |
/* Union view of a vector of 8 signed shorts: scalar code writes single
   lanes via s[] (e.g. to seed a splat), vector code reads v. */
typedef union {
    vector signed short v;
    signed short s[8];
} vec_ss_t;
38 | |
39 void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, | |
40 int wrap, int16_t *filter) | |
41 { | |
42 int sum, i; | |
43 const uint8_t *s; | |
44 vector unsigned char *tv, tmp, dstv, zero; | |
45 vec_ss_t srchv[4], srclv[4], fv[4]; | |
46 vector signed short zeros, sumhv, sumlv; | |
47 s = src; | |
48 | |
49 for(i=0;i<4;i++) | |
50 { | |
51 /* | |
52 The vec_madds later on does an implicit >>15 on the result. | |
53 Since FILTER_BITS is 8, and we have 15 bits of magnitude in | |
54 a signed short, we have just enough bits to pre-shift our | |
55 filter constants <<7 to compensate for vec_madds. | |
56 */ | |
57 fv[i].s[0] = filter[i] << (15-FILTER_BITS); | |
58 fv[i].v = vec_splat(fv[i].v, 0); | |
59 } | |
60 | |
61 zero = vec_splat_u8(0); | |
62 zeros = vec_splat_s16(0); | |
63 | |
64 | |
65 /* | |
66 When we're resampling, we'd ideally like both our input buffers, | |
67 and output buffers to be 16-byte aligned, so we can do both aligned | |
68 reads and writes. Sadly we can't always have this at the moment, so | |
69 we opt for aligned writes, as unaligned writes have a huge overhead. | |
70 To do this, do enough scalar resamples to get dst 16-byte aligned. | |
71 */ | |
72 i = (-(int)dst) & 0xf; | |
73 while(i>0) { | |
74 sum = s[0 * wrap] * filter[0] + | |
75 s[1 * wrap] * filter[1] + | |
76 s[2 * wrap] * filter[2] + | |
77 s[3 * wrap] * filter[3]; | |
78 sum = sum >> FILTER_BITS; | |
79 if (sum<0) sum = 0; else if (sum>255) sum=255; | |
80 dst[0] = sum; | |
81 dst++; | |
82 s++; | |
83 dst_width--; | |
84 i--; | |
85 } | |
86 | |
87 /* Do our altivec resampling on 16 pixels at once. */ | |
88 while(dst_width>=16) { | |
89 /* | |
90 Read 16 (potentially unaligned) bytes from each of | |
91 4 lines into 4 vectors, and split them into shorts. | |
92 Interleave the multipy/accumulate for the resample | |
93 filter with the loads to hide the 3 cycle latency | |
94 the vec_madds have. | |
95 */ | |
96 tv = (vector unsigned char *) &s[0 * wrap]; | |
97 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap])); | |
98 srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); | |
99 srclv[0].v = (vector signed short) vec_mergel(zero, tmp); | |
100 sumhv = vec_madds(srchv[0].v, fv[0].v, zeros); | |
101 sumlv = vec_madds(srclv[0].v, fv[0].v, zeros); | |
102 | |
103 tv = (vector unsigned char *) &s[1 * wrap]; | |
104 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap])); | |
105 srchv[1].v = (vector signed short) vec_mergeh(zero, tmp); | |
106 srclv[1].v = (vector signed short) vec_mergel(zero, tmp); | |
107 sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv); | |
108 sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv); | |
109 | |
110 tv = (vector unsigned char *) &s[2 * wrap]; | |
111 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap])); | |
112 srchv[2].v = (vector signed short) vec_mergeh(zero, tmp); | |
113 srclv[2].v = (vector signed short) vec_mergel(zero, tmp); | |
114 sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv); | |
115 sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv); | |
116 | |
117 tv = (vector unsigned char *) &s[3 * wrap]; | |
118 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap])); | |
119 srchv[3].v = (vector signed short) vec_mergeh(zero, tmp); | |
120 srclv[3].v = (vector signed short) vec_mergel(zero, tmp); | |
121 sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); | |
122 sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); | |
123 | |
124 /* | |
125 Pack the results into our destination vector, | |
126 and do an aligned write of that back to memory. | |
127 */ | |
128 dstv = vec_packsu(sumhv, sumlv) ; | |
129 vec_st(dstv, 0, (vector unsigned char *) dst); | |
130 | |
131 dst+=16; | |
132 s+=16; | |
133 dst_width-=16; | |
134 } | |
135 | |
136 /* | |
137 If there are any leftover pixels, resample them | |
138 with the slow scalar method. | |
139 */ | |
140 while(dst_width>0) { | |
141 sum = s[0 * wrap] * filter[0] + | |
142 s[1 * wrap] * filter[1] + | |
143 s[2 * wrap] * filter[2] + | |
144 s[3 * wrap] * filter[3]; | |
145 sum = sum >> FILTER_BITS; | |
146 if (sum<0) sum = 0; else if (sum>255) sum=255; | |
147 dst[0] = sum; | |
148 dst++; | |
149 s++; | |
150 dst_width--; | |
151 } | |
152 } | |
153 |