Mercurial > libavcodec.hg
comparison vp8dsp.c @ 12007:ec7be1d7d5b4 libavcodec
Use crop table in C implementations of VP8 DSP functions.
Much faster VP8 C DSP functions; ~5-10% faster overall with asm off.
author | darkshikari |
---|---|
date | Tue, 29 Jun 2010 03:34:24 +0000 |
parents | 1cffcc7b1470 |
children | 98fd80705850 |
comparison
equal
deleted
inserted
replaced
12006:d584c7373a64 | 12007:ec7be1d7d5b4 |
---|---|
59 #define MUL_35468(a) (((a)*35468) >> 16) | 59 #define MUL_35468(a) (((a)*35468) >> 16) |
60 | 60 |
61 static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], int stride) | 61 static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], int stride) |
62 { | 62 { |
63 int i, t0, t1, t2, t3; | 63 int i, t0, t1, t2, t3; |
64 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; | |
64 DCTELEM tmp[16]; | 65 DCTELEM tmp[16]; |
65 | 66 |
66 for (i = 0; i < 4; i++) { | 67 for (i = 0; i < 4; i++) { |
67 t0 = block[0*4+i] + block[2*4+i]; | 68 t0 = block[0*4+i] + block[2*4+i]; |
68 t1 = block[0*4+i] - block[2*4+i]; | 69 t1 = block[0*4+i] - block[2*4+i]; |
79 t0 = tmp[0*4+i] + tmp[2*4+i]; | 80 t0 = tmp[0*4+i] + tmp[2*4+i]; |
80 t1 = tmp[0*4+i] - tmp[2*4+i]; | 81 t1 = tmp[0*4+i] - tmp[2*4+i]; |
81 t2 = MUL_35468(tmp[1*4+i]) - MUL_20091(tmp[3*4+i]); | 82 t2 = MUL_35468(tmp[1*4+i]) - MUL_20091(tmp[3*4+i]); |
82 t3 = MUL_20091(tmp[1*4+i]) + MUL_35468(tmp[3*4+i]); | 83 t3 = MUL_20091(tmp[1*4+i]) + MUL_35468(tmp[3*4+i]); |
83 | 84 |
84 dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3)); | 85 dst[0] = cm[dst[0] + ((t0 + t3 + 4) >> 3)]; |
85 dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3)); | 86 dst[1] = cm[dst[1] + ((t1 + t2 + 4) >> 3)]; |
86 dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3)); | 87 dst[2] = cm[dst[2] + ((t1 - t2 + 4) >> 3)]; |
87 dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3)); | 88 dst[3] = cm[dst[3] + ((t0 - t3 + 4) >> 3)]; |
88 dst += stride; | 89 dst += stride; |
89 } | 90 } |
90 } | 91 } |
91 | 92 |
92 static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride) | 93 static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride) |
93 { | 94 { |
94 int i, dc = (block[0] + 4) >> 3; | 95 int i, dc = (block[0] + 4) >> 3; |
96 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc; | |
95 | 97 |
96 for (i = 0; i < 4; i++) { | 98 for (i = 0; i < 4; i++) { |
97 dst[0] = av_clip_uint8(dst[0] + dc); | 99 dst[0] = cm[dst[0]]; |
98 dst[1] = av_clip_uint8(dst[1] + dc); | 100 dst[1] = cm[dst[1]]; |
99 dst[2] = av_clip_uint8(dst[2] + dc); | 101 dst[2] = cm[dst[2]]; |
100 dst[3] = av_clip_uint8(dst[3] + dc); | 102 dst[3] = cm[dst[3]]; |
101 dst += stride; | 103 dst += stride; |
102 } | 104 } |
103 } | 105 } |
104 | 106 |
105 | 107 |
112 int av_unused q0 = p[ 0*stride];\ | 114 int av_unused q0 = p[ 0*stride];\ |
113 int av_unused q1 = p[ 1*stride];\ | 115 int av_unused q1 = p[ 1*stride];\ |
114 int av_unused q2 = p[ 2*stride];\ | 116 int av_unused q2 = p[ 2*stride];\ |
115 int av_unused q3 = p[ 3*stride]; | 117 int av_unused q3 = p[ 3*stride]; |
116 | 118 |
/* Clamp n to [-128, 127] via the crop table: bias the value into the
 * unsigned range, crop to [0, 255], then remove the bias.  Equivalent to
 * av_clip_int8(n) but reuses the table lookup already needed for pixel
 * cropping.  Requires a local `cm` pointing at ff_cropTbl + MAX_NEG_CROP.
 * The argument is parenthesized so expressions like `a - b` expand safely. */
#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
117 static av_always_inline void filter_common(uint8_t *p, int stride, int is4tap) | 121 static av_always_inline void filter_common(uint8_t *p, int stride, int is4tap) |
118 { | 122 { |
119 LOAD_PIXELS | 123 LOAD_PIXELS |
120 int a, f1, f2; | 124 int a, f1, f2; |
125 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; | |
121 | 126 |
122 a = 3*(q0 - p0); | 127 a = 3*(q0 - p0); |
123 | 128 |
124 if (is4tap) | 129 if (is4tap) |
125 a += av_clip_int8(p1 - q1); | 130 a += clip_int8(p1 - q1); |
126 | 131 |
127 a = av_clip_int8(a); | 132 a = clip_int8(a); |
128 | 133 |
129 // We deviate from the spec here with c(a+3) >> 3 | 134 // We deviate from the spec here with c(a+3) >> 3 |
130 // since that's what libvpx does. | 135 // since that's what libvpx does. |
131 f1 = FFMIN(a+4, 127) >> 3; | 136 f1 = FFMIN(a+4, 127) >> 3; |
132 f2 = FFMIN(a+3, 127) >> 3; | 137 f2 = FFMIN(a+3, 127) >> 3; |
133 | 138 |
134 // Despite what the spec says, we do need to clamp here to | 139 // Despite what the spec says, we do need to clamp here to |
135 // be bitexact with libvpx. | 140 // be bitexact with libvpx. |
136 p[-1*stride] = av_clip_uint8(p0 + f2); | 141 p[-1*stride] = cm[p0 + f2]; |
137 p[ 0*stride] = av_clip_uint8(q0 - f1); | 142 p[ 0*stride] = cm[q0 - f1]; |
138 | 143 |
139 // only used for _inner on blocks without high edge variance | 144 // only used for _inner on blocks without high edge variance |
140 if (!is4tap) { | 145 if (!is4tap) { |
141 a = (f1+1)>>1; | 146 a = (f1+1)>>1; |
142 p[-2*stride] = av_clip_uint8(p1 + a); | 147 p[-2*stride] = cm[p1 + a]; |
143 p[ 1*stride] = av_clip_uint8(q1 - a); | 148 p[ 1*stride] = cm[q1 - a]; |
144 } | 149 } |
145 } | 150 } |
146 | 151 |
147 static av_always_inline int simple_limit(uint8_t *p, int stride, int flim) | 152 static av_always_inline int simple_limit(uint8_t *p, int stride, int flim) |
148 { | 153 { |
170 } | 175 } |
171 | 176 |
172 static av_always_inline void filter_mbedge(uint8_t *p, int stride) | 177 static av_always_inline void filter_mbedge(uint8_t *p, int stride) |
173 { | 178 { |
174 int a0, a1, a2, w; | 179 int a0, a1, a2, w; |
180 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; | |
175 | 181 |
176 LOAD_PIXELS | 182 LOAD_PIXELS |
177 | 183 |
178 w = av_clip_int8(p1-q1); | 184 w = clip_int8(p1-q1); |
179 w = av_clip_int8(w + 3*(q0-p0)); | 185 w = clip_int8(w + 3*(q0-p0)); |
180 | 186 |
181 a0 = (27*w + 63) >> 7; | 187 a0 = (27*w + 63) >> 7; |
182 a1 = (18*w + 63) >> 7; | 188 a1 = (18*w + 63) >> 7; |
183 a2 = ( 9*w + 63) >> 7; | 189 a2 = ( 9*w + 63) >> 7; |
184 | 190 |
185 p[-3*stride] = av_clip_uint8(p2 + a2); | 191 p[-3*stride] = cm[p2 + a2]; |
186 p[-2*stride] = av_clip_uint8(p1 + a1); | 192 p[-2*stride] = cm[p1 + a1]; |
187 p[-1*stride] = av_clip_uint8(p0 + a0); | 193 p[-1*stride] = cm[p0 + a0]; |
188 p[ 0*stride] = av_clip_uint8(q0 - a0); | 194 p[ 0*stride] = cm[q0 - a0]; |
189 p[ 1*stride] = av_clip_uint8(q1 - a1); | 195 p[ 1*stride] = cm[q1 - a1]; |
190 p[ 2*stride] = av_clip_uint8(q2 - a2); | 196 p[ 2*stride] = cm[q2 - a2]; |
191 } | 197 } |
192 | 198 |
193 #define LOOP_FILTER(dir, size, stridea, strideb) \ | 199 #define LOOP_FILTER(dir, size, stridea, strideb) \ |
194 static void vp8_ ## dir ## _loop_filter ## size ## _c(uint8_t *dst, int stride,\ | 200 static void vp8_ ## dir ## _loop_filter ## size ## _c(uint8_t *dst, int stride,\ |
195 int flim_E, int flim_I, int hev_thresh)\ | 201 int flim_E, int flim_I, int hev_thresh)\ |
261 PUT_PIXELS(16) | 267 PUT_PIXELS(16) |
262 PUT_PIXELS(8) | 268 PUT_PIXELS(8) |
263 PUT_PIXELS(4) | 269 PUT_PIXELS(4) |
264 | 270 |
/* 6-tap subpel interpolation: weighted sum of six source pixels around
 * index x, rounded (+64, >> 7) and cropped to [0, 255] through `cm`,
 * which must be in scope as ff_cropTbl + MAX_NEG_CROP.
 * F holds the six filter taps; `stride` selects horizontal (1) or
 * vertical (srcstride) filtering. */
#define FILTER_6TAP(src, F, stride) \
    cm[(F[2]*src[x+0*stride] - F[1]*src[x-1*stride] + F[0]*src[x-2*stride] + \
        F[3]*src[x+1*stride] - F[4]*src[x+2*stride] + F[5]*src[x+3*stride] + 64) >> 7]

/* 4-tap variant of FILTER_6TAP (the two outermost taps are zero). */
#define FILTER_4TAP(src, F, stride) \
    cm[(F[2]*src[x+0*stride] - F[1]*src[x-1*stride] + \
        F[3]*src[x+1*stride] - F[4]*src[x+2*stride] + 64) >> 7]
272 | 278 |
273 #define VP8_EPEL_H(SIZE, FILTER, FILTERNAME) \ | 279 #define VP8_EPEL_H(SIZE, FILTER, FILTERNAME) \ |
274 static void put_vp8_epel ## SIZE ## _ ## FILTERNAME ## _c(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int mx, int my) \ | 280 static void put_vp8_epel ## SIZE ## _ ## FILTERNAME ## _c(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int mx, int my) \ |
275 { \ | 281 { \ |
276 const uint8_t *filter = subpel_filters[mx-1]; \ | 282 const uint8_t *filter = subpel_filters[mx-1]; \ |
283 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; \ | |
277 int x, y; \ | 284 int x, y; \ |
278 \ | 285 \ |
279 for (y = 0; y < h; y++) { \ | 286 for (y = 0; y < h; y++) { \ |
280 for (x = 0; x < SIZE; x++) \ | 287 for (x = 0; x < SIZE; x++) \ |
281 dst[x] = FILTER(src, filter, 1); \ | 288 dst[x] = FILTER(src, filter, 1); \ |
285 } | 292 } |
286 #define VP8_EPEL_V(SIZE, FILTER, FILTERNAME) \ | 293 #define VP8_EPEL_V(SIZE, FILTER, FILTERNAME) \ |
287 static void put_vp8_epel ## SIZE ## _ ## FILTERNAME ## _c(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int mx, int my) \ | 294 static void put_vp8_epel ## SIZE ## _ ## FILTERNAME ## _c(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int mx, int my) \ |
288 { \ | 295 { \ |
289 const uint8_t *filter = subpel_filters[my-1]; \ | 296 const uint8_t *filter = subpel_filters[my-1]; \ |
297 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; \ | |
290 int x, y; \ | 298 int x, y; \ |
291 \ | 299 \ |
292 for (y = 0; y < h; y++) { \ | 300 for (y = 0; y < h; y++) { \ |
293 for (x = 0; x < SIZE; x++) \ | 301 for (x = 0; x < SIZE; x++) \ |
294 dst[x] = FILTER(src, filter, srcstride); \ | 302 dst[x] = FILTER(src, filter, srcstride); \ |
298 } | 306 } |
299 #define VP8_EPEL_HV(SIZE, FILTERX, FILTERY, FILTERNAME) \ | 307 #define VP8_EPEL_HV(SIZE, FILTERX, FILTERY, FILTERNAME) \ |
300 static void put_vp8_epel ## SIZE ## _ ## FILTERNAME ## _c(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int mx, int my) \ | 308 static void put_vp8_epel ## SIZE ## _ ## FILTERNAME ## _c(uint8_t *dst, int dststride, uint8_t *src, int srcstride, int h, int mx, int my) \ |
301 { \ | 309 { \ |
302 const uint8_t *filter = subpel_filters[mx-1]; \ | 310 const uint8_t *filter = subpel_filters[mx-1]; \ |
311 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; \ | |
303 int x, y; \ | 312 int x, y; \ |
304 uint8_t tmp_array[(2*SIZE+5)*SIZE]; \ | 313 uint8_t tmp_array[(2*SIZE+5)*SIZE]; \ |
305 uint8_t *tmp = tmp_array; \ | 314 uint8_t *tmp = tmp_array; \ |
306 src -= 2*srcstride; \ | 315 src -= 2*srcstride; \ |
307 \ | 316 \ |